# Logistic Regression

In [158]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [159]:
df = pd.read_csv("diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [160]:
X = df[['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']]
y = df['Outcome']

In [161]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.30, random_state=42)

## Using Sklearn

In [162]:
lr = LogisticRegression()
lr.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [163]:
y_pred = lr.predict(X_test)
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))

Accuracy:  0.7532467532467533


In [164]:
print("Accuracy: ", metrics.accuracy_score(y_test, pred))

Accuracy:  0.7532467532467533


In [165]:
print("Intercept: ", lr.intercept_[0])
#print("Coefficients/Weights: ", lr.coef_[0])
for i in range(len(lr.coef_[0])):
     print("Theta",i+1,": ", lr.coef_[0][i])

Intercept:  -5.780518811078252
Theta 1 :  0.06123394995299268
Theta 2 :  0.026202438959727323
Theta 3 :  -0.016642865660055923
Theta 4 :  -0.0021790391971866227
Theta 5 :  -0.0002831534980666913
Theta 6 :  0.06435603045980515
Theta 7 :  0.25165584811460323
Theta 8 :  0.022135381139038564


## From scratch

In [166]:
# sigmoid function
def sigmoid(X, theta):
    z = np.dot(X, theta)
    return 1 / (1 + np.exp(-z))

# cost function
def cost_function(y, h):
    j = (-y * np.log(h) - (1-y) * np.log(1-h)).mean()
    return j

# calculating gradient and updating theta
def gradient_descent(X, h, y):
    return np.dot(X.T, (h-y)) / X.shape[0]

def update(theta, learning_rate, gradient):
    return theta - learning_rate * gradient

In [167]:
X2 = X_train.to_numpy()
y2 = y_train.to_numpy()

In [168]:
# adding value of X0 = 1 in X2
intercept = np.ones((X2.shape[0],1))
X2 = np.concatenate((intercept, X2), axis=1)

# theta vector initialised with 0
theta = np.zeros(X2.shape[1])

In [169]:
eta = 0.0001
n_iter = 10000
for i in range(n_iter):
    # get predicted probability - p_hat
    # get gradient using the cost function
    # update the theta vector
    h = sigmoid(X2, theta)
    cost = cost_function(y2, h)
    gradient = gradient_descent(X2, h, y2)
    theta = update(theta, eta, gradient)

In [170]:
print("Intercept: ", theta[0])
for i in range(1,len(theta)):
    print("Theta",i,": ", theta[i])

Intercept:  -0.04004713525047281
Theta 1 :  0.06751655990043788
Theta 2 :  0.012452998617748786
Theta 3 :  -0.031519610271979887
Theta 4 :  -0.004215782070656332
Theta 5 :  0.0008288200439875959
Theta 6 :  -1.0816541614350414e-05
Theta 7 :  0.0017694085447978337
Theta 8 :  0.0006629378649117798


In [171]:
# testing on the test set
X3 = X_test.to_numpy()
intercept = np.ones((X3.shape[0],1))
X3 = np.concatenate((intercept, X3), axis=1)

In [172]:
result = sigmoid(X3, theta)
# changing values to 1 where prob is >= 0.5
result[result >= 0.5] = 1
result[result < 0.5] =0

In [173]:
print("Accuracy: ", metrics.accuracy_score(y_test, result))

Accuracy:  0.6926406926406926
