In [47]:
import numpy as np
import pandas as pd
import random
import copy

In [26]:
df = pd.read_csv('diabetes.csv')

In [27]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [28]:
df.shape

(768, 9)

In [29]:
def minMaxScalar(df):
    for i in range(len(df[0]) - 1):
        for j in range(len(df)):
            min_val = min(df[:,i])
            max_val = max(df[:,i])
            df[j][i] = (df[j][i] - min_val) / (max_val - min_val)
            

In [36]:
def crossValidation(df, k=5):
    df_copy = list(df)
    n = len(df)
    fold_size = n//k
    folds = []
    for i in range(k):
        fold = []
        while len(fold) < fold_size:
            index = random.randrange(len(df_copy))
            fold.append(df_copy.pop(index))
        folds.append(fold)
    return folds

In [37]:
def accuracy(y_test, y_pred):
    count = 0
    for i in range(len(y_test)):
        if y_test[i] == y_pred:
            count += 1
    return count / len(y_test) * 100

In [38]:
def predict(row, coef):
    x = coef[0]
    for i in range(len(row) - 1):
        x += coef[i+1] * row[i]
    return 1 / (1 + np.exp(-x))

In [41]:
def sgd(x_train, y_train, epochs, alpha):
    coef = np.zeros(x_train.shape[1])
    for epoch in range(epochs):
        for i in range(len(x_train)):
            y_pred = predict(x_train[i], coef)
            loss = y_pred - y_train[i]
            coef[0] = coef[0] - ((2/n) * loss) * alpha
            for j in range(len(x_train[0])):
                coef[j+1] = coef[j+1] - ((2/n) * (x_train[i] * loss)) * alpha
    return coef

In [42]:
def logistic(x_train, y_train, x_test, y_test, epochs, alpha):
    coef = sgd(x_train, y_train, epochs, alpha)
    predictions = []
    for row in x_test:
        y_pred = predict(row, coef)
        predictions.append(round(y_pred))
    return predictions

In [49]:
def evaluate(df, epochs, alpha):
    accuracies = []
    folds = crossValidation(df)
    for i in range(len(folds)):
        x_train = []
        y_train = []
        x_test = []
        y_test = []
        train = copy.deepcopy(folds)
        test = train.pop(folds[i])
        for data in train:
            x_train.append(data[:, :-1])
            y_train.append(data[:, -1])
        
        for data in test:
            x_test.append(data[:,:-1])
            y_test.append(data[:,-1])
        
        y_pred = logistic(x_train, y_train, x_test, y_test, epochs, alpha)
        acc = accuracy(y_test, y_pred)
        accuracies.append(acc)
    return accuracies

In [30]:
df = df.values

In [31]:
df

array([[  6.   , 148.   ,  72.   , ...,   0.627,  50.   ,   1.   ],
       [  1.   ,  85.   ,  66.   , ...,   0.351,  31.   ,   0.   ],
       [  8.   , 183.   ,  64.   , ...,   0.672,  32.   ,   1.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,   0.245,  30.   ,   0.   ],
       [  1.   , 126.   ,  60.   , ...,   0.349,  47.   ,   1.   ],
       [  1.   ,  93.   ,  70.   , ...,   0.315,  23.   ,   0.   ]])

In [32]:
minMaxScalar(df)

In [35]:
df[0]

array([0.35294118, 0.74371859, 0.59016393, 0.35353535, 0.        ,
       0.50074516, 0.23441503, 0.48333333, 1.        ])

In [43]:
folds = crossValidation(df)

In [46]:
folds

[[array([0.07692308, 0.48743719, 0.56140351, 0.19191919, 0.13666667,
         0.31762653, 0.16505125, 0.29743285, 0.        ]),
  array([0.07692308, 0.86934673, 0.64912281, 0.        , 0.        ,
         0.64223386, 0.03923595, 0.54118064, 1.        ]),
  array([0.        , 0.52763819, 0.52459016, 0.41414141, 0.19086022,
         0.61847988, 0.05667439, 0.26929757, 0.        ]),
  array([0.        , 0.48947368, 0.52631579, 0.        , 0.        ,
         0.71169355, 0.20758759, 0.35994622, 0.        ]),
  array([0.61538462, 0.56315789, 0.70175439, 0.        , 0.        ,
         0.4989858 , 0.71890485, 0.51326522, 0.        ]),
  array([0.15384615, 0.41708543, 0.57017544, 0.28282828, 0.11      ,
         0.64223386, 0.3618606 , 0.34044717, 0.        ]),
  array([0.46153846, 0.73869347, 0.70175439, 0.        , 0.        ,
         0.51483421, 0.11354086, 0.7132379 , 1.        ]),
  array([0.53846154, 0.75376884, 0.68421053, 0.59183673, 0.23333333,
         0.61431065, 0.39967165, 0.

In [None]:
epochs = 100
alpha = 0.01
evaluate(df, epochs, alpha)