In [1]:
import numpy as np
import pandas as pd
import random, copy

In [2]:
def load_data(path):
    """Read a CSV file that has no header row.

    Parameters
    ----------
    path : str or file-like
        Location of the CSV, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table; because ``header=None`` is used, the first line
        of the file is treated as data and columns are labelled 0, 1, 2, ...
    """
    return pd.read_csv(path, header=None)

In [3]:
def minMax(data):
    """Collect the smallest and largest value of every column.

    Parameters
    ----------
    data : sequence of rows
        Each row must be indexable by column position; the number of
        columns is taken from the first row.

    Returns
    -------
    list of [min, max]
        One two-element list per column, in column order.
    """
    # Materialise one column at a time, then reduce it to its two extremes.
    columns = ([record[idx] for record in data] for idx in range(len(data[0])))
    return [[min(values), max(values)] for values in columns]

def minMaxScaler(data, minMaxData):
    """Min-max normalise ``data`` in place using precomputed column bounds.

    Every value ``v`` in column ``i`` is replaced by
    ``(v - low_i) / (high_i - low_i)`` where ``low_i, high_i`` are
    ``minMaxData[i][0]`` and ``minMaxData[i][1]``.

    Parameters
    ----------
    data : sequence of mutable rows
        Rows are overwritten element by element; nothing is copied.
    minMaxData : sequence of [low, high]
        One pair per column, e.g. the result of ``minMax``.

    Returns
    -------
    None
        The function works purely by side effect on ``data``.
    """
    for row in data:
        for i in range(len(row)):
            low = minMaxData[i][0]
            span = minMaxData[i][1] - low
            if span == 0:
                # Constant column: there is no range to scale into, so map the
                # value to 0.0 instead of dividing by zero (which would raise
                # for plain lists or produce NaN for NumPy rows).
                row[i] = 0.0
            else:
                row[i] = (row[i] - low) / span

In [4]:
def cross_validation(data, k=5):
    """Randomly split ``data`` into ``k`` equally sized folds.

    Rows are drawn without replacement using the ``random`` module, so the
    result depends on the current global random state.

    Parameters
    ----------
    data : sequence of rows
        The input is not modified; a shallow list copy is consumed instead.
    k : int, optional
        Number of folds to build (default 5, the value that used to be
        hard-coded).

    Returns
    -------
    list of list
        ``k`` folds, each holding ``len(data) // k`` rows. When the row count
        is not divisible by ``k`` the leftover rows are not placed in any fold.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    remaining = list(data)
    fold_size = len(remaining) // k
    folds = []
    for _ in range(k):
        fold = []
        while len(fold) < fold_size:
            # Pop a random surviving row so no row can appear in two folds.
            fold.append(remaining.pop(random.randrange(len(remaining))))
        folds.append(fold)
    return folds

In [11]:
def stochastic_gd(x_train, y_train, epochs, alpha):
    """Fit logistic-regression coefficients with per-sample gradient steps.

    For every sample the sigmoid of ``bias + sum(w_k * x_k)`` is compared to
    the target, and each coefficient is moved against
    ``(2 / n) * error * x_k`` (``x_k = 1`` for the bias).

    Parameters
    ----------
    x_train : 2-D array with a ``.shape`` attribute
        One row per sample, one column per feature (no label column).
    y_train : sequence
        Target for each row of ``x_train``, indexable by row number.
    epochs : int
        Number of full passes over the training rows.
    alpha : float
        Learning rate.

    Returns
    -------
    list of float
        ``[bias, w_1, ..., w_m]`` with ``m = x_train.shape[1]``.
    """
    n, n_features = x_train.shape[0], x_train.shape[1]
    # One bias term plus one weight per feature. The parentheses matter:
    # ``[0] * m + 1`` tries to add an int to a list and raises TypeError.
    coef = [0.0] * (n_features + 1)
    if n == 0:
        return coef

    # Loop-invariant scale kept from the original update rule; it only
    # rescales the effective learning rate.
    scale = 2 / n
    for _ in range(epochs):
        for j in range(n):
            row = x_train[j]
            z = coef[0] + sum(coef[k + 1] * row[k] for k in range(n_features))
            pred = 1 / (1 + np.exp(-z))
            # The error is computed once, before any coefficient changes, so
            # every coefficient is updated from the same prediction.
            step = alpha * scale * (pred - y_train[j])
            coef[0] -= step
            for k in range(n_features):
                # Each weight uses only its own feature value, never the
                # whole feature vector.
                coef[k + 1] -= step * row[k]
    return coef

In [6]:
def prediction(coef, row):
    """Return the logistic (sigmoid) output for one row.

    The linear score is ``coef[0] + sum(coef[i + 1] * row[i])``. The number
    of features comes from the coefficient vector, not from the row length,
    so both calling conventions work:

    * ``row`` holds features only (``len(coef) == len(row) + 1``): every
      element of ``row`` contributes.
    * ``row`` has one extra trailing element such as a label
      (``len(coef) == len(row)``): that last element is ignored.

    Parameters
    ----------
    coef : sequence of float
        Bias followed by one weight per feature.
    row : sequence of float
        Feature values, optionally followed by extra trailing elements.

    Returns
    -------
    float
        A value in [0, 1] (rounding to exactly 0 or 1 is possible for very
        large scores).
    """
    # Never read past either sequence.
    n_features = min(len(coef) - 1, len(row))
    x = coef[0]
    for i in range(n_features):
        x += coef[i + 1] * row[i]
    return 1 / (1 + np.exp(-x))

In [7]:
def accuracy(predicted, actual):
    """Percentage of positions where ``predicted`` equals ``actual``.

    Parameters
    ----------
    predicted : sequence
        Predicted labels; must be at least as long as ``actual``.
    actual : sequence
        Reference labels; its length defines how many positions are compared.

    Returns
    -------
    float
        Share of matching positions, scaled to the range 0-100.
    """
    total = len(actual)
    hits = sum(1 for idx in range(total) if actual[idx] == predicted[idx])
    return hits / total * 100

In [9]:
def logistic_regssion(x_train, y_train, x_test, y_test, epochs, alpha):
    """Train on the training split and score the model on the test split.

    Coefficients come from ``stochastic_gd``; each test row is passed to
    ``prediction`` and the result is rounded to the nearest integer to get
    a class label.

    Parameters
    ----------
    x_train, y_train
        Training features and targets, forwarded to ``stochastic_gd``.
    x_test : iterable of rows
        Rows to classify.
    y_test : sequence
        Expected labels for ``x_test``.
    epochs, alpha
        Training settings forwarded to ``stochastic_gd``.

    Returns
    -------
    float
        Whatever ``accuracy`` reports for the rounded predictions.
    """
    weights = stochastic_gd(x_train, y_train, epochs, alpha)
    labels = [round(prediction(weights, sample)) for sample in x_test]
    return accuracy(labels, y_test)

In [10]:
def evaluate(data, epochs, alpha):
    """Cross-validate logistic regression and return one accuracy per fold.

    The rows are partitioned with ``cross_validation``. Each fold serves once
    as the test set while the rows of all other folds form the training set,
    and ``logistic_regssion`` is run on that split.

    Parameters
    ----------
    data : sequence of numeric rows
        NOTE(review): the last element of every row is treated as the class
        label and all preceding elements as features - confirm this matches
        the dataset layout.
    epochs : int
        Training passes, forwarded to ``logistic_regssion``.
    alpha : float
        Learning rate, forwarded to ``logistic_regssion``.

    Returns
    -------
    list of float
        Accuracy for each evaluated fold, in fold order.
    """
    folds = cross_validation(data)
    scores = []
    for held_out, test_fold in enumerate(folds):
        train_rows = [
            row
            for idx, fold in enumerate(folds)
            if idx != held_out
            for row in fold
        ]
        # Too few rows can produce empty folds; there is nothing to train or
        # score on in that case, so skip rather than fail on slicing.
        if len(test_fold) == 0 or len(train_rows) == 0:
            continue

        train = np.asarray(train_rows, dtype=float)
        test = np.asarray(test_fold, dtype=float)
        x_train, y_train = train[:, :-1], train[:, -1]
        x_test, y_test = test[:, :-1], test[:, -1]

        score = logistic_regssion(x_train, y_train, x_test, y_test, epochs, alpha)
        scores.append(score)
    return scores

In [16]:
# Load the raw CSV; load_data reads it with header=None, so the first line is data.
df = load_data('pima-indians-diabetes.data.csv')

In [17]:
# Preview the first rows; columns are labelled by position because no header was read.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [27]:
# Rebind df from the DataFrame to its underlying NumPy array.
# NOTE: not idempotent - running this cell a second time fails because an
# ndarray has no `.values` attribute.
df = df.values

In [29]:
# Per-column [min, max] pairs, computed before scaling so they describe the raw values.
minMaxData = minMax(df)

In [30]:
# Scale df in place (the function returns nothing). Every column is rescaled,
# including the last one. Re-running this cell rescales already-scaled data
# against the original bounds.
minMaxScaler(df, minMaxData)

In [31]:
# Spot-check the first row after scaling.
df[0]

array([0.35294118, 0.74371859, 0.59016393, 0.35353535, 0.        ,
       0.50074516, 0.23441503, 0.48333333, 1.        ])

In [33]:
# (rows, columns) of the scaled array.
df.shape

(768, 9)

In [34]:
# Rows per fold for 5 folds. The result is fractional, and cross_validation
# floors it (len(data) // k), so the leftover rows end up in no fold.
df.shape[0] / 5

153.6

In [36]:
# Randomly partition the rows into folds. No random seed is set in the cells
# visible here, so the split presumably differs between runs - confirm.
folds = cross_validation(df)

In [37]:
# Stack the list of folds into a single array; this works because
# cross_validation gives every fold the same number of rows.
folds = np.asarray(folds)

In [38]:
# Expect (number of folds, rows per fold, columns per row).
folds.shape

(5, 153, 9)

In [39]:
# Inspect the rows that landed in the first fold.
folds[0]

array([[0.17647059, 0.53266332, 0.59016393, ..., 0.05508113, 0.1       ,
        0.        ],
       [0.47058824, 0.93467337, 0.73770492, ..., 0.14730999, 0.26666667,
        1.        ],
       [0.05882353, 0.72361809, 0.67213115, ..., 0.22587532, 0.11666667,
        0.        ],
       ...,
       [0.05882353, 0.62311558, 0.60655738, ..., 0.00939368, 0.15      ,
        0.        ],
       [0.        , 0.63316583, 0.68852459, ..., 0.18872758, 0.05      ,
        0.        ],
       [0.        , 0.51256281, 0.63934426, ..., 0.06831768, 0.05      ,
        0.        ]])