In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
df = pd.read_csv('pima-indians-diabetes.data.csv', header=None)

In [3]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
columns = ['Pregnancies', 'Glucose', 'BP', 'Skin Thickness', 'Insulin', 'BMI',
          'Diabetes Pedigree', 'Age', 'Outcome']

In [5]:
df.columns = columns

In [6]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BP,Skin Thickness,Insulin,BMI,Diabetes Pedigree,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [7]:
df.shape

(768, 9)

In [8]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [9]:
X.shape

(768, 8)

In [10]:
y.shape

(768,)

In [11]:
# Feature Scaling - Normalization
def minMaxScaler(df):
    n = df.shape[0]
    df = df.values
    for i in range(0,df.shape[1] - 1):
        min_val = min(df[:,i])
        max_val = max(df[:,i])
        for j in range(n):
            numer = df[j][i] - min_val
            denom = max_val - min_val
            df[j][i] = numer / denom
    return df

In [12]:
df = minMaxScaler(df)

In [32]:
df[0:5]

array([[0.35294118, 0.74371859, 0.59016393, 0.35353535, 0.        ,
        0.50074516, 0.23441503, 0.48333333, 1.        ],
       [0.05882353, 0.42713568, 0.54098361, 0.29292929, 0.        ,
        0.39642325, 0.11656704, 0.16666667, 0.        ],
       [0.47058824, 0.91959799, 0.52459016, 0.        , 0.        ,
        0.34724292, 0.25362938, 0.18333333, 1.        ],
       [0.05882353, 0.44723618, 0.54098361, 0.23232323, 0.11111111,
        0.41877794, 0.03800171, 0.        , 0.        ],
       [0.        , 0.68844221, 0.32786885, 0.35353535, 0.19858156,
        0.64232489, 0.94363792, 0.2       , 1.        ]])

In [14]:
# Divide data - Train Test Split using K-Fold Cross Validation Technique
def kfold(dataset,k=5):
    dataset_copy = list(dataset)
    fold_size = dataset.shape[0] // k
    folds = []
    for i in range(k):
        fold = []
        while len(fold) < fold_size:
            index = random.randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        folds.append(fold)
    return folds

In [15]:
# Predict function to get the predictions
def predict(coef, row):
    x = coef[0]
    for i in range(len(row)):
        x += coef[i + 1] * row[i]
    return 1 / (1 + np.exp(-x))

In [16]:
# to find out accuracy of our model
def accuracy(y_test, y_pred):
    count = 0
    for i in range(len(y_pred)):
        if y_pred[i] == y_test[i]:
            count += 1
    return count / len(y_pred) * 100

In [21]:
# Gradient Descent
# - Batch
# - Stochastic
# - Mini Batch
def gradient_descent(x_train, y_train, epochs, alpha):
    coef = [0] * (x_train.shape[1] + 1)
    n = x_train.shape[0]
    # logic to calculate coef using stochastic gradient descent
    for epoch in range(epochs):
        for i in range(len(x_train)):
            output = predict(coef, x_train[i])
            loss = y_train[i] - output
            coef[0] = coef[0] - ((2/n) * loss) * alpha
            for j in range(x_train.shape[1]):
                coef[j + 1] = coef[j + 1] - ((2/n) * loss * x_train[i][j]) * alpha
    return coef

In [40]:
def logistic(x_train, y_train, x_test, y_test, epochs, alpha):
    coef = gradient_descent(x_train, y_train, epochs, alpha)
    predictions = []
    for row in x_test:
        y_pred = predict(coef, row)
        predictions.append(np.round(y_pred))
    #print(predictions, y_test)
    acc_score = accuracy(y_test, predictions)
    return acc_score

In [41]:
folds = kfold(df)

In [42]:
np.asarray(folds).shape

(5, 153, 9)

In [47]:
folds = np.asarray(folds)

In [48]:
folds.shape

(5, 153, 9)

In [51]:
folds[0]

array([[0.41176471, 0.57286432, 0.62295082, ..., 0.16567037, 0.16666667,
        0.        ],
       [0.64705882, 0.60301508, 0.6557377 , ..., 0.30187874, 0.45      ,
        1.        ],
       [0.29411765, 0.3919598 , 0.39344262, ..., 0.24594364, 0.06666667,
        0.        ],
       ...,
       [0.05882353, 0.70351759, 0.60655738, ..., 0.32023911, 0.03333333,
        0.        ],
       [0.52941176, 0.77386935, 0.63934426, ..., 0.03672075, 0.4       ,
        0.        ],
       [0.05882353, 0.48241206, 1.        , ..., 0.05508113, 0.1       ,
        0.        ]])

In [43]:
def evaluate(dataset,epochs,alpha):
    folds = kfold(dataset)
    for i in range(len(folds)):
        x_train = []
        x_test = []
        y_train = []
        y_test = []
        train = list(folds)
        train.pop(i)
        for train_fold in train:
            for data in train_fold:
                x_train.append(data[:-1])
                y_train.append(data[-1])
        
        for data in folds[i]:
            x_test.append(data[:-1])
            y_test.append(data[-1])
        
        x_train = np.asarray(x_train)
        y_train = np.asarray(y_train)
        x_test = np.asarray(x_test)
        y_test = np.asarray(y_test)
        acc = logistic(x_train, y_train, x_test, y_test, epochs, alpha)
        print("Fold : {} accuracy is {}".format(i,acc))

In [46]:
epochs = 1000
alpha = 0.1
evaluate(df,epochs,alpha)

Fold : 0 accuracy is 32.02614379084967
Fold : 1 accuracy is 41.17647058823529
Fold : 2 accuracy is 33.33333333333333


KeyboardInterrupt: 