In [1]:
import pandas as pd
import numpy as np
import random

In [4]:
# Load the dataset. header=None makes pandas treat the first line of the file
# as data and label the columns with integers instead.
df = pd.read_csv('pima-indians-diabetes.data.csv', header=None)

In [5]:
# Preview the first rows of the freshly loaded frame (integer column labels).
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
# Descriptive names for the nine columns, in file order. The last entry,
# 'Outcome', is the column later used as the label.
columns = ['Pregnancies', 'Glucose', 'BP', 'Skin Thickness', 'Insulin', 'BMI',
          'Diabetes Pedigree', 'Age', 'Outcome']

In [7]:
# Replace the default integer column labels with the descriptive names.
df.columns = columns

In [8]:
# Preview again to confirm the new column names are in place.
df.head()

Unnamed: 0,Pregnancies,Glucose,BP,Skin Thickness,Insulin,BMI,Diabetes Pedigree,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [9]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(768, 9)

In [26]:
# Split into a feature matrix (every column except the last) and a label
# vector (the last column), both as NumPy arrays.
# NOTE(review): X and y are not referenced again in the cells visible here.
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [27]:
# Feature matrix dimensions: (rows, feature columns).
X.shape

(768, 8)

In [28]:
# Label vector length; it should match the number of rows in X.
y.shape

(768,)

In [52]:
# Feature Scaling - Normalization
def minMaxScaler(df):
    """Min-max scale every column except the last into the range [0, 1].

    Parameters
    ----------
    df : pandas.DataFrame or 2-D array-like
        Table whose last column is left untouched (it is used as the label
        column elsewhere in this notebook). Any object exposing ``.values``
        is unwrapped; a plain array is accepted as well, so the function can
        be applied again to its own output.

    Returns
    -------
    numpy.ndarray
        A new array with the same shape as the input. The input itself is
        never modified.

    Notes
    -----
    * Integer and boolean inputs are promoted to float first; otherwise the
      scaled fractions would be truncated when written back.
    * A column whose minimum equals its maximum is mapped to 0.0 instead of
      dividing by zero.
    * An input with no rows is returned as-is (there is nothing to scale).
    """
    # np.array() always copies, so the caller's data (or a read-only view
    # handed out by pandas) is never written to.
    data = np.array(getattr(df, "values", df))
    if data.dtype.kind in "iub":
        data = data.astype(float)

    if data.shape[0] == 0:
        return data

    for col in range(data.shape[1] - 1):
        column = data[:, col]
        low = column.min()
        span = column.max() - low
        # Constant column: every value equals the minimum, so scale to 0.
        data[:, col] = (column - low) / span if span != 0 else 0.0
    return data

In [53]:
# Rebinds `df`: from this cell on it is the scaled NumPy array returned by
# minMaxScaler, no longer a pandas DataFrame.
df = minMaxScaler(df)

In [54]:
# `df` is a NumPy array here, so df[0] is the first row (scaled features
# followed by the unscaled label), not the first column.
df[0]

array([0.35294118, 0.74371859, 0.59016393, 0.35353535, 0.        ,
       0.50074516, 0.23441503, 0.48333333, 1.        ])

In [55]:
# Divide data - Train Test Split using K-Fold Cross Validation Technique
def kfold(dataset, k=5):
    """Randomly partition the rows of ``dataset`` into ``k`` equal folds.

    Each fold holds ``dataset.shape[0] // k`` rows drawn without replacement
    using the module-level ``random`` generator; rows left over after the
    integer division are not placed in any fold.

    Returns a list of ``k`` folds, each a list of rows.
    """
    remaining = list(dataset)
    rows_per_fold = dataset.shape[0] // k
    return [
        [remaining.pop(random.randrange(len(remaining)))
         for _ in range(rows_per_fold)]
        for _ in range(k)
    ]

In [56]:
# Predict function to get the predictions
def predict(coef, row):
    """Return the logistic (sigmoid) response for a single sample.

    ``coef[0]`` is the intercept and ``coef[k + 1]`` is the weight applied to
    ``row[k]``. The weighted sum is passed through ``1 / (1 + e**-z)``.
    """
    z = coef[0]
    for position, feature in enumerate(row, start=1):
        z += coef[position] * feature
    return 1 / (1 + np.exp(-z))

In [57]:
# to find out accuracy of our model
def accuracy(y_test, y_pred):
    """Return the percentage of predictions that equal the true labels.

    Parameters
    ----------
    y_test : sequence
        True labels, indexed in parallel with ``y_pred``.
    y_pred : sequence
        Predicted labels; its length determines how many pairs are compared.

    Returns
    -------
    float
        Value in [0, 100]. An empty ``y_pred`` yields 0.0 rather than raising
        ZeroDivisionError.
    """
    total = len(y_pred)
    if total == 0:
        return 0.0
    matches = sum(1 for i in range(total) if y_pred[i] == y_test[i])
    return matches / total * 100

In [58]:
# Gradient Descent
# - Batch
# - Stochastic
# - Mini Batch
def gradient_descent(x_train, y_train, epochs, alpha):
    """Fit logistic-regression coefficients with stochastic gradient descent.

    Parameters
    ----------
    x_train : numpy.ndarray
        2-D array of shape (n_samples, n_features).
    y_train : sequence
        Target value for each row of ``x_train``.
    epochs : int
        Number of full passes over the training rows.
    alpha : float
        Learning rate.

    Returns
    -------
    list of float
        ``[intercept, w_1, ..., w_n_features]``, in the layout ``predict``
        expects. All zeros when there are no training rows.
    """
    n, n_features = x_train.shape[0], x_train.shape[1]
    coef = [0.0] * (n_features + 1)
    if n == 0:
        return coef

    # NOTE(review): the 2/n factor is kept from the original update rule. It
    # shrinks every per-sample step by the training-set size, so the effective
    # learning rate is alpha * 2 / n -- confirm this is intended.
    step = alpha * (2 / n)

    for _ in range(epochs):
        for i in range(n):
            row = x_train[i]
            # error = target - prediction, so the coefficients must move
            # *along* error * feature to reduce the loss (hence the +=).
            error = y_train[i] - predict(coef, row)
            coef[0] += step * error
            for j in range(n_features):
                coef[j + 1] += step * error * row[j]
    return coef

In [59]:
def logistic(x_train, y_train, x_test, y_test, epochs, alpha):
    """Train on one split and return the accuracy on the held-out rows.

    Coefficients are fitted with ``gradient_descent`` on the training data,
    each test row is scored with ``predict``, and the probability is rounded
    to the nearest integer to obtain the class label. The labels are then
    compared with ``y_test`` by ``accuracy``.

    Returns the accuracy as a percentage.
    """
    coef = gradient_descent(x_train, y_train, epochs, alpha)
    predictions = [round(predict(coef, row)) for row in x_test]
    return accuracy(y_test, predictions)

In [60]:
def evaluate(dataset, epochs, alpha):
    """Cross-validate the logistic model and return one accuracy per fold.

    Parameters
    ----------
    dataset : numpy.ndarray
        2-D array whose last column is the label and whose other columns are
        the features.
    epochs : int
        Passes over the training rows for each fit.
    alpha : float
        Learning rate for each fit.

    Returns
    -------
    list of float
        Accuracy percentage for each fold, in fold order.

    Raises
    ------
    ValueError
        If the dataset has too few rows for ``kfold`` to build non-empty folds.
    """
    folds = kfold(dataset)
    if not folds or len(folds[0]) == 0:
        raise ValueError("dataset has too few rows to build non-empty folds")

    scores = []
    for held_out, test_fold in enumerate(folds):
        # Every fold except the held-out one contributes training rows.
        train_rows = [row
                      for position, fold in enumerate(folds)
                      if position != held_out
                      for row in fold]
        train = np.array(train_rows)
        test = np.array(test_fold)

        x_train, y_train = train[:, :-1], train[:, -1]
        x_test, y_test = test[:, :-1], test[:, -1]

        scores.append(logistic(x_train, y_train, x_test, y_test, epochs, alpha))
    return scores

In [62]:
# Training hyperparameters intended for evaluate().
epochs = 100  # number of full passes over the training rows
alpha = 0.01  # learning rate applied to every coefficient update
# evaluate(df,epochs,alpha)