In [48]:
import math
import numpy as np
import pandas as pd
from random import randrange
import random
import matplotlib.pyplot as plt

In [49]:
# Load the dataset from the working directory; header=0 makes pandas take the
# first CSV row as the column names.
df = pd.read_csv("dataset_LR.csv", header=0)

In [50]:
def Sigmoid(z):
    """Return the logistic function 1 / (1 + e^(-z)) of scalar ``z`` as a float.

    Two algebraically equivalent forms are used so that the argument handed to
    ``math.exp`` is never positive. The naive form calls ``math.exp(-z)``,
    which raises ``OverflowError`` once ``z`` is very negative; here such
    inputs underflow to 0.0 instead.
    """
    if z >= 0:
        return float(1.0 / (1.0 + math.exp(-1.0 * z)))
    # z < 0: exp(z) is in (0, 1), so it cannot overflow.
    exp_z = math.exp(z)
    return float(exp_z / (1.0 + exp_z))

def Hypothesis(theta, x):
    """Return Sigmoid of the weighted sum of ``x`` with weights ``theta``.

    The sum runs over ``len(theta)`` positions, so ``x`` must have at least
    that many entries; any extra entries of ``x`` are ignored.
    """
    weighted_sum = sum(x[k] * theta[k] for k in range(len(theta)))
    return Sigmoid(weighted_sum)

def Cost_Function(X,Y,theta,m):
    """Return the mean cross-entropy cost of ``theta`` over the first ``m`` samples.

    Parameters:
        X: indexable collection of feature rows; ``X[i]`` is passed to Hypothesis.
        Y: indexable collection of labels; ``Y[i]`` belongs to ``X[i]``.
        theta: weights passed to Hypothesis.
        m: number of samples to evaluate; must be non-zero.

    Returns:
        -(1/m) * sum_i [ y_i*log(h_i) + (1 - y_i)*log(1 - h_i) ].

    Raises:
        ZeroDivisionError: if ``m`` is 0.
    """
    # Keep the hypothesis strictly inside (0, 1) so math.log never sees 0 when
    # the sigmoid saturates to exactly 0.0 or 1.0.
    eps = 1e-15
    sumOfErrors = 0.0
    for i in range(m):
        hi = Hypothesis(theta, X[i])
        hi = min(max(hi, eps), 1.0 - eps)
        yi = Y[i]
        # The general form covers every label value. For labels 0 and 1 it
        # reduces to the usual single-term expression, and it leaves no path
        # on which the per-sample error is undefined.
        sumOfErrors += yi * math.log(hi) + (1 - yi) * math.log(1.0 - hi)
    const = -1.0 / m
    return const * sumOfErrors

def Cost_Function_Derivative(X,Y,theta,j,m,alpha):
    """Return the learning-rate-scaled gradient of the cost for ``theta[j]``.

    Computes (alpha / m) * sum_i (h_i - Y[i]) * X[i][j] over the first ``m``
    samples, where h_i = Hypothesis(theta, X[i]). Because the result already
    includes ``alpha``, the caller subtracts it from ``theta[j]`` directly.

    The same ``m`` is used for the loop bound and the normalisation, so the
    average always matches the samples that were actually summed.

    Raises:
        ZeroDivisionError: if ``m`` is 0.
    """
    sumErrors = 0
    for i in range(m):
        xi = X[i]
        sumErrors += (Hypothesis(theta, xi) - Y[i]) * xi[j]
    constant = float(alpha) / float(m)
    return constant * sumErrors

def Gradient_Descent(X,Y,theta,m,alpha):
    """Perform one batch gradient-descent step and return the new weights.

    Every component is updated from the same, unmodified ``theta`` (a
    simultaneous update). Cost_Function_Derivative returns a step that is
    already scaled by ``alpha``, so it is subtracted as-is.

    Returns:
        A new list with ``len(theta)`` entries; ``theta`` itself is not modified.
    """
    return [
        theta[j] - Cost_Function_Derivative(X, Y, theta, j, m, alpha)
        for j in range(len(theta))
    ]

def Stoc_Cost_Function_Derivative(X,Y,theta,j,m,alpha,i=None):
    """Return the scaled single-sample gradient step for ``theta[j]``.

    Parameters:
        X, Y: feature rows and labels; ``Y[i]`` belongs to ``X[i]``.
        theta: current weights, passed to Hypothesis.
        j: index of the weight whose step is computed.
        m: divisor applied to ``alpha``.
        alpha: learning rate.
        i: index of the sample to use. When None (the default), an index is
            drawn uniformly from ``range(len(X))`` with ``np.random.randint``.

    Returns:
        (alpha / m) * (Hypothesis(theta, X[i]) - Y[i]) * X[i][j]
    """
    if i is None:
        i = np.random.randint(len(X))
    xi = X[i]
    error = (Hypothesis(theta, xi) - Y[i]) * xi[j]
    # NOTE(review): the step keeps the 1/m factor used by the batch variant,
    # so the effective per-sample learning rate is alpha / m -- confirm that
    # this is intended before re-tuning alpha.
    constant = float(alpha) / float(m)
    return constant * error

def Stochastic_Gradient_Descent(X,Y,theta,m,alpha):
    """Perform one stochastic gradient-descent step and return the new weights.

    A single sample index is drawn per step and shared by every weight, so
    all components of the update come from the same sample. Every component
    is computed from the same, unmodified ``theta``.

    Returns:
        A new list with ``len(theta)`` entries; ``theta`` itself is not modified.
    """
    i = np.random.randint(len(X))
    return [
        theta[j] - Stoc_Cost_Function_Derivative(X, Y, theta, j, m, alpha, i)
        for j in range(len(theta))
    ]

def Declare_Winner(theta,X_test,Y_test):
    """Classify every row of ``X_test`` and score the result against ``Y_test``.

    Each prediction is ``round(Hypothesis(theta, row))``. The hypothesis is
    evaluated once per row, with the weights first and the row second.

    Returns:
        (accuracy, predictions): ``accuracy`` is the fraction of rows whose
        prediction equals the label, and ``predictions`` is the list of
        rounded hypothesis values. An empty ``X_test`` gives ``(0.0, [])``.
    """
    length = len(X_test)
    if length == 0:
        # Nothing to score; avoid dividing by zero below.
        return 0.0, []
    predict = [round(Hypothesis(theta, X_test[i])) for i in range(length)]
    score = sum(1 for i in range(length) if predict[i] == Y_test[i])
    my_score = float(score) / float(length)
    return my_score, predict

def Logistic_Regression(X,Y,alpha,theta,num_iters,model):
    """Train logistic-regression weights and return them with their cost.

    Parameters:
        X, Y: training feature rows and labels.
        alpha: learning rate forwarded to the update function.
        theta: initial weights.
        num_iters: number of update steps to run.
        model: 1 selects Gradient_Descent, 2 selects Stochastic_Gradient_Descent.

    Returns:
        (theta, cost): the weights after ``num_iters`` steps and
        ``Cost_Function`` evaluated at exactly those weights.

    Raises:
        ValueError: if ``model`` is neither 1 nor 2.
    """
    # Reject an unknown model up front rather than failing inside the loop
    # with an unbound local.
    if model == 1:
        step = Gradient_Descent
    elif model == 2:
        step = Stochastic_Gradient_Descent
    else:
        raise ValueError("model must be 1 (batch) or 2 (stochastic), got %r" % (model,))

    m = len(Y)
    for _ in range(num_iters):
        theta = step(X, Y, theta, m, alpha)

    # Evaluate the cost once, on the final weights, so the returned value
    # always corresponds to the returned theta (also when num_iters is 0).
    cost = Cost_Function(X, Y, theta, m)
    return theta, cost

In [51]:
def f_score_metric(actual,predicted):
    """Return ``(f_score, precision, recall)`` for binary labels 0/1.

    ``actual`` and ``predicted`` are indexed position by position over
    ``range(len(actual))``. A pair counts as a true positive when both are 1,
    a true negative when both are 0, a false positive when actual is 0 and
    predicted is 1, and a false negative otherwise.

    Any ratio whose denominator is zero (no predicted positives, no actual
    positives, or precision + recall == 0) is reported as 0.0 rather than
    raising ``ZeroDivisionError``.
    """
    true_pos = 0
    false_pos = 0
    false_neg = 0
    for i in range(len(actual)):
        a = actual[i]
        p = predicted[i]
        if a == p and p == 1:
            true_pos += 1
        elif a == p and p == 0:
            # True negative: not used by precision, recall or F-score.
            pass
        elif a == 0 and p == 1:
            false_pos += 1
        else:
            false_neg += 1

    predicted_pos = true_pos + false_pos
    actual_pos = true_pos + false_neg
    precision = true_pos / predicted_pos if predicted_pos else 0.0
    recall = true_pos / actual_pos if actual_pos else 0.0
    if precision + recall:
        f_score = 2 * precision * recall / (precision + recall)
    else:
        f_score = 0.0
    return f_score, precision, recall

In [59]:
# Per-split result accumulators: one entry is appended for each of the 10
# random train/test splits evaluated below.
accuracies_test = []
f_scores_test = []
recalls_test = []
precisions_test = []
accuracies_train = []
f_scores_train = []
recalls_train = []
precisions_train = []
costs = []

iterations = 5000
# Iteration indices 0, 50, 100, ... below `iterations`.
iters = list(range(0, iterations, 50))

df.columns = ["attr1","attr2","attr3","attr4","class"]
for i in range(10):
  # 70/30 split; the split's random_state is itself drawn at random.
  train_data = df.sample(frac=0.7, random_state=np.random.randint(1,1000,1)[0])
  test_data = df.drop(train_data.index)
  X_train = np.array(train_data[["attr1","attr2","attr3","attr4"]])
  X_test = np.array(test_data[["attr1","attr2","attr3","attr4"]])
  Y_train = np.array(train_data["class"])
  Y_test = np.array(test_data["class"])

  initial_theta = [0,0,0,0]
  alpha = 0.1
  # Model 1 selects batch gradient descent; `predicted` holds the fitted weights.
  predicted,cost = Logistic_Regression(X_train,Y_train,alpha,initial_theta,iterations,1)
  print(predicted)
  costs.append(cost)

  # Metrics on the training portion.
  accuracy_train,predict_train = Declare_Winner(predicted,X_train,Y_train)
  f_score_train,precision_train,recall_train = f_score_metric(Y_train,predict_train)
  f_scores_train.append(f_score_train)
  precisions_train.append(precision_train)
  recalls_train.append(recall_train)
  accuracies_train.append(accuracy_train*100)

  # Metrics on the held-out portion.
  accuracy_test,predict_test = Declare_Winner(predicted,X_test,Y_test)
  f_score_test,precision_test,recall_test = f_score_metric(Y_test,predict_test)
  f_scores_test.append(f_score_test)
  precisions_test.append(precision_test)
  recalls_test.append(recall_test)
  accuracies_test.append(accuracy_test*100)

# print('accuracies 50 iteration %s'% acc_epoch)
# print('cost 50 iteration %s'% cost_epoch) 
print('____TRAIN DATA____')
print('Accuracies over each iteration: %s' % accuracies_train)
print('Precisions over each iteration: %s' % precisions_train)
print('Recalls over each iteration: %s' % recalls_train)
print('Average Accuracy: %.3f' % (sum(accuracies_train) / float(len(accuracies_train))))
print('Average Cost: %.3f' % (sum(costs) / float(len(costs))))
print('____TEST DATA____')
print('Accuracies over each iteration: %s' % accuracies_test)
print('Precisions over each iteration: %s' % precisions_test)
print('Recalls over each iteration: %s' % recalls_test)
print('Average Accuracy: %.3f' % (sum(accuracies_test) / float(len(accuracies_test))))

# plt.plot(iters,acc_epoch)
# plt.xlabel("Epoches")
# plt.ylabel("Accuracies")
# plt.title("Accuracies vs Epoches")
# plt.show()
# # [0.2,] [95.955,]
# plt.plot(iters,cost_epoch)
# plt.xlabel("Epoches")
# plt.ylabel("Cost")
# plt.title("Cost vs Epoches")
# plt.show()

[-2.5053395011441206, -1.5859682701822717, -1.6346605115341555, -0.8388766163489728]
[-2.704049604206519, -1.683888043377677, -1.7638308643054328, -0.8467033073400532]
[-2.6526237059416333, -1.6181826054762563, -1.7271110903989826, -0.8950377356359108]
[-2.524579160389126, -1.5631203616973501, -1.6466537279776574, -0.7626993925337276]
[-2.722970481588964, -1.6885775231958045, -1.7752314676777556, -0.9238333671031972]
[-2.8958976783612576, -1.7385664316342644, -1.8496177662312232, -0.7900708434591098]
[-2.45459460653867, -1.5217658499684765, -1.6160546448458375, -0.7481905270663857]
[-2.589987756938105, -1.6873025162930757, -1.7750083464320852, -0.8168952518146475]
[-2.704034096737561, -1.5961025801815683, -1.6847759903775472, -0.7462002326631328]
[-2.6336624084455997, -1.6580176602160357, -1.7612392389452245, -0.8039344817461458]
____TRAIN DATA____
Accuracies over each iteration: [95.83333333333334, 95.9375, 96.04166666666667, 95.3125, 96.04166666666667, 95.9375, 95.0, 96.0416666666666

In [56]:
# Per-split result accumulators: one entry is appended for each of the 10
# random train/test splits evaluated below.
accuracies_test = []
f_scores_test = []
recalls_test = []
precisions_test = []
accuracies_train = []
f_scores_train = []
recalls_train = []
costs = []
precisions_train = []

iterations = 7500
# Iteration indices 0, 50, 100, ... below `iterations`.
iters = list(range(0, iterations, 50))

df.columns = ["attr1","attr2","attr3","attr4","class"]
for i in range(10):
  # 70/30 split; the split's random_state is itself drawn at random.
  train_data = df.sample(frac=0.7, random_state=np.random.randint(1,1000,1)[0])
  test_data = df.drop(train_data.index)
  X_train = np.array(train_data[["attr1","attr2","attr3","attr4"]])
  X_test = np.array(test_data[["attr1","attr2","attr3","attr4"]])
  Y_train = np.array(train_data["class"])
  Y_test = np.array(test_data["class"])

  initial_theta = [0,0,0,0]
  alpha = 0.9
  # Model 2 selects stochastic gradient descent; `predicted` holds the fitted weights.
  predicted,cost = Logistic_Regression(X_train,Y_train,alpha,initial_theta,iterations,2)
  print(predicted)
  costs.append(cost)

  # Metrics on the training portion.
  accuracy_train,predict_train = Declare_Winner(predicted,X_train,Y_train)
  f_score_train,precision_train,recall_train = f_score_metric(Y_train,predict_train)
  f_scores_train.append(f_score_train)
  precisions_train.append(precision_train)
  recalls_train.append(recall_train)
  accuracies_train.append(accuracy_train*100)

  # Metrics on the held-out portion.
  accuracy_test,predict_test = Declare_Winner(predicted,X_test,Y_test)
  f_score_test,precision_test,recall_test = f_score_metric(Y_test,predict_test)
  f_scores_test.append(f_score_test)
  precisions_test.append(precision_test)
  recalls_test.append(recall_test)
  accuracies_test.append(accuracy_test*100)

# print('accuracies 50 iteration %s'% acc_epoch)
# print('cost 50 iteration %s'% cost_epoch)  
print('____TRAIN DATA____')
print('Accuracies over each iteration: %s' % accuracies_train)
print('Precisions over each iteration: %s' % precisions_train)
print('Recalls over each iteration: %s' % recalls_train)
print('Average Accuracy: %.3f' % (sum(accuracies_train) / float(len(accuracies_train))))
print('Average Cost: %.3f' % (sum(costs) / float(len(costs))))
print('____TEST DATA____')
print('Accuracies over each iteration: %s' % accuracies_test)
print('Precisions over each iteration: %s' % precisions_test)
print('Recalls over each iteration: %s' % recalls_test)
print('Average Accuracy: %.3f' % (sum(accuracies_test) / float(len(accuracies_test))))

# plt.plot(iters,acc_epoch)
# plt.xlabel("Epoches")
# plt.ylabel("Accuracies")
# plt.title("Accuracies vs Epoches")
# plt.show()

# plt.plot(iters,cost_epoch)
# plt.xlabel("Epoches")
# plt.ylabel("Cost")
# plt.title("Cost vs Epoches")
# plt.show()

[-1.09359777659756, -0.5494752390214777, -0.543920584722556, -0.30457573398466076]
[-1.0710210380749527, -0.528814934452572, -0.4997054546712983, -0.2878070625031467]
[-1.077726338612576, -0.5572348452348411, -0.509781174466149, -0.31663463429538896]
[-1.0036077279114015, -0.5320222790995524, -0.48190764735818, -0.3041124582625317]
[-1.0917563399312118, -0.5568593475262846, -0.5406071599122001, -0.3327525418414671]
[-1.1181635463575674, -0.5580775086938273, -0.5434280932957295, -0.28700128979609957]
[-1.1212129413772254, -0.5236994901895218, -0.5452486724694245, -0.3068212307910015]
[-1.058791439785771, -0.5575179273747184, -0.5155531951420471, -0.31960944188470997]
[-1.060837382261309, -0.5713716856333497, -0.5431167848399537, -0.2964942383857477]
[-1.0957980531556661, -0.5354275497449095, -0.5593892385923712, -0.29521189301530915]
____TRAIN DATA____
Accuracies over each iteration: [95.52083333333333, 95.3125, 95.0, 94.27083333333334, 95.3125, 95.20833333333333, 95.10416666666667, 94.