In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import sklearn.model_selection as sk

In [2]:
# Load the dataset from a CSV file; the relative path is resolved against the
# kernel's current working directory.
df = pd.read_csv('diabetes2.csv')

In [3]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Separate the feature columns (everything except 'Outcome') from the label column.
X = df.drop(columns='Outcome')
Y = df['Outcome']

In [5]:
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [6]:
Y.head()

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

In [8]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = sk.train_test_split(X, Y, test_size= 0.3, random_state = 78)

In [9]:
# Per-column mean of the training features (column sum divided by the non-null count).
mean = x_train.sum()/x_train.count()

In [10]:
# Per-column population variance of the training features: the squared deviations
# are divided by the count itself (not count - 1). sd is its square root.
variance = (((x_train - mean)**2).sum())/x_train.count()
sd = np.sqrt(variance)

In [11]:
mean

Pregnancies                   4.050667
Glucose                     125.152000
BloodPressure                69.290667
SkinThickness                20.450667
Insulin                      76.757333
BMI                          32.355733
DiabetesPedigreeFunction      0.501819
Age                          33.946667
dtype: float64

In [12]:
sd

Pregnancies                   3.531964
Glucose                      33.773691
BloodPressure                21.049359
SkinThickness                16.840177
Insulin                     106.630489
BMI                           8.750093
DiabetesPedigreeFunction      0.350435
Age                          11.682344
dtype: float64

In [13]:
# Standardize the training features with the training mean/sd.
# NOTE: x_train is overwritten in place, so re-running only this cell would
# standardize the already-scaled data a second time using the stale mean/sd.
x_train = (x_train - mean)/sd

In [14]:
x_train.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
321,1.401298,0.913374,0.793817,0.448293,0.733774,0.2222,1.960935,0.689359
87,-0.863731,-0.892766,-0.441375,-0.145525,-0.17591,-0.966359,-0.690052,-1.022626
255,-0.014345,-0.448633,0.318743,-0.026761,0.217974,-0.452079,-1.095262,-0.59463
452,2.250684,-1.218463,0.128713,0.626438,-0.719844,-0.303509,-0.584469,1.031756
90,-1.14686,-1.218463,0.603787,0.626438,0.452428,0.667909,-0.767099,-0.937027
183,0.551912,-0.241371,1.268891,-1.214397,-0.719844,-0.417794,-0.983972,-0.337832
75,-0.863731,-0.892766,-0.156331,-0.442434,-0.363473,-1.457783,-0.478886,-0.765828
201,1.118169,-0.211762,0.128713,-0.086143,-0.719844,-1.057787,2.779916,1.031756
159,0.835041,-1.248072,0.413758,0.329529,-0.053993,-0.349223,0.756719,0.175764
41,-0.014345,0.084326,0.033699,-1.214397,-0.719844,0.2222,-0.567347,-0.851427


In [15]:
# Scale the test features with the statistics computed from the training set
# (mean and sd), not with statistics of the test set itself.
# NOTE: x_test is overwritten in place, so this cell is not safe to re-run alone.
x_test = (x_test - mean)/sd

In [16]:
def sigmoid(z):
    """Return the logistic function 1 / (1 + e^(-z)) of a scalar ``z``.

    The computation is split by sign so that ``math.exp`` is only ever called
    with a non-positive argument. The naive form ``1 / (1 + math.exp(-z))``
    raises ``OverflowError`` for large negative ``z`` (roughly ``z < -709``).

    Parameters
    ----------
    z : float
        Scalar input (a Python float or a numpy scalar).

    Returns
    -------
    float
        Value in the closed interval [0, 1]; saturates to exactly 0.0 or 1.0
        for inputs of very large magnitude.
    """
    if z >= 0:
        # exp(-z) <= 1 here, so it cannot overflow.
        return 1 / (1 + math.exp(-z))
    # For negative z use the algebraically equivalent form e^z / (1 + e^z);
    # exp(z) < 1, so it underflows harmlessly to 0.0 instead of overflowing.
    e = math.exp(z)
    return e / (1 + e)

In [17]:
def evaluate(TP, FP, FN, TN):
    """Return classification accuracy from confusion-matrix counts.

    Only accuracy is returned. Precision and recall used to be computed here
    and then discarded; those divisions raised ``ZeroDivisionError`` whenever
    the model predicted no positives (``TP + FP == 0``) or the data held no
    positives (``TP + FN == 0``), so they have been removed.

    Parameters
    ----------
    TP, FP, FN, TN : int
        Counts of true positives, false positives, false negatives and
        true negatives.

    Returns
    -------
    float
        ``(TP + TN) / (TP + FP + FN + TN)``.

    Raises
    ------
    ZeroDivisionError
        If all four counts are zero (no samples were evaluated).
    """
    correct = TN + TP
    total = TN + TP + FN + FP
    return correct / total

In [18]:
# Batch gradient descent for logistic regression on the standardized features.
# Every epoch: accumulate the training loss/gradient/confusion counts, take one
# weight step, then score the test set and (every 20 epochs) print a summary.
iteration = 400

# Smallest distance kept between a predicted probability and 0 or 1 when taking
# logs; without it math.log raises ValueError once the sigmoid saturates.
EPS = 1e-15

# Seed the weight initialisation so "Restart & Run All" reproduces the same run.
np.random.seed(78)
w = np.random.randn(len(x_train.columns) + 1)  # one weight per feature, bias last

# Row counts. len() replaces `count()[0]`, which relied on the deprecated
# integer-as-position Series lookup and counted non-null cells rather than rows.
m = len(x_train)
m_test = len(x_test)

# Loop-invariant work hoisted out of the epoch loop: design matrices with a
# trailing column of ones (bias term) and label arrays.
X_tr = np.hstack([np.asarray(x_train, dtype=float), np.ones((m, 1))])
X_te = np.hstack([np.asarray(x_test, dtype=float), np.ones((m_test, 1))])
y_tr = np.asarray(y_train)
y_te = np.asarray(y_test)

for itr in range(iteration + 1):

    # Step-decay schedule. The plain else also covers the final epoch
    # (itr == iteration), which the earlier if/elif pair left unhandled.
    lr = 0.2 if itr < 300 else 0.15

    J_train, J_test = 0.0, 0.0
    dw = np.zeros_like(w)
    TP_tr = FP_tr = FN_tr = TN_tr = 0
    TP_test = FP_test = FN_test = TN_test = 0

    # ---- training pass: loss, gradient and confusion counts with current w ----
    for x, y in zip(X_tr, y_tr):
        h = sigmoid(np.dot(x, w))

        # Clamp only for the log terms; gradient and threshold use the raw h.
        h_safe = min(max(h, EPS), 1 - EPS)
        J_train += (- y * math.log(h_safe)) - ((1 - y) * math.log(1 - h_safe))

        dw += x * (h - y)

        if h >= 0.5:
            if y == 1:
                TP_tr += 1
            else:
                FP_tr += 1
        else:
            if y == 1:
                FN_tr += 1
            else:
                TN_tr += 1

    J_train = J_train/m
    Acc_train = evaluate(TP_tr, FP_tr, FN_tr, TN_tr)

    # Gradient step on the mean gradient.
    w = w - (lr/m)*dw

    # ---- test pass ----
    # NOTE: this uses the weights *after* the step above, whereas the training
    # metrics of the same epoch were measured *before* it.
    for x, y in zip(X_te, y_te):
        h = sigmoid(np.dot(x, w))

        h_safe = min(max(h, EPS), 1 - EPS)
        J_test += (- y * math.log(h_safe)) - ((1 - y) * math.log(1 - h_safe))

        if h >= 0.5:
            if y == 1:
                TP_test += 1
            else:
                FP_test += 1
        else:
            if y == 1:
                FN_test += 1
            else:
                TN_test += 1

    J_test = J_test/m_test
    Acc_test = evaluate(TP_test, FP_test, FN_test, TN_test)

    # Report every 20th epoch, rounded to three decimals.
    if itr%20 == 0:

        J_train = round(J_train, 3)
        J_test = round(J_test, 3)
        Acc_train = round(Acc_train, 3)
        Acc_test = round(Acc_test, 3)

        p = f"{itr!s} - Tr loss : {J_train!s} Tr Acc : {Acc_train!s}"
        p += f" Test loss : {J_test!s} Test Acc : {Acc_test!s}"

        print(p)

0 - Tr loss : 1.109 Tr Acc : 0.603 Test loss : 1.123 Test Acc : 0.59
20 - Tr loss : 0.603 Tr Acc : 0.704 Test loss : 0.68 Test Acc : 0.652
40 - Tr loss : 0.521 Tr Acc : 0.747 Test loss : 0.609 Test Acc : 0.689
60 - Tr loss : 0.498 Tr Acc : 0.755 Test loss : 0.595 Test Acc : 0.665
80 - Tr loss : 0.486 Tr Acc : 0.755 Test loss : 0.591 Test Acc : 0.671
100 - Tr loss : 0.478 Tr Acc : 0.765 Test loss : 0.588 Test Acc : 0.677
120 - Tr loss : 0.473 Tr Acc : 0.765 Test loss : 0.586 Test Acc : 0.671
140 - Tr loss : 0.47 Tr Acc : 0.771 Test loss : 0.585 Test Acc : 0.677
160 - Tr loss : 0.467 Tr Acc : 0.771 Test loss : 0.584 Test Acc : 0.677
180 - Tr loss : 0.465 Tr Acc : 0.779 Test loss : 0.584 Test Acc : 0.677
200 - Tr loss : 0.464 Tr Acc : 0.776 Test loss : 0.584 Test Acc : 0.677
220 - Tr loss : 0.463 Tr Acc : 0.784 Test loss : 0.584 Test Acc : 0.677
240 - Tr loss : 0.463 Tr Acc : 0.787 Test loss : 0.584 Test Acc : 0.689
260 - Tr loss : 0.463 Tr Acc : 0.781 Test loss : 0.584 Test Acc : 0.689
2