In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
df = pd.read_csv('synthetic_dataset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0,7,149,96,19,10,38.387409,0.561331,40,1
1,1,0,151,44,6,105,26.125923,0.463959,27,1
2,2,3,169,57,24,240,33.224573,0.541364,36,0
3,3,4,86,61,35,0,32.918264,0.526311,39,0
4,4,6,75,62,31,77,37.45383,0.178734,21,0


In [4]:
df.drop(['Unnamed: 0'],axis=1,inplace=True)

In [5]:
X= df.iloc[:,:-1]

In [6]:
y = df['Outcome']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.20,random_state=23)

In [8]:
def standardize(column):
    return (column - column.mean()) / column.std()

In [9]:
def apply(X):
    cols=list(X)
    for i in cols:
        X[i]=standardize(X[i])

In [10]:
apply(X_train)

In [11]:
apply(X_test)

In [12]:
clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

In [13]:
cnf_matrix = metrics.confusion_matrix(y_test, y_pred) 

cnf_matrix 

array([[64,  8],
       [12, 16]], dtype=int64)

In [14]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred)) 

print("Precision:",metrics.precision_score(y_test, y_pred)) 

print("Recall:",metrics.recall_score(y_test, y_pred))

Accuracy: 0.8
Precision: 0.6666666666666666
Recall: 0.5714285714285714


hi

In [15]:
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

In [16]:
weights = np.zeros(X.shape[1])  # One weight per feature
bias = 0.0

In [17]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [18]:
def compute_cost(X, y, weights, bias):
    m = len(y)
    h = sigmoid(X.dot(weights) + bias)
    cost = (-1 / m) * np.sum((y.dot(np.log(h)) + (1 - y).dot(np.log(1 - h))))    #log loss
    return cost

In [19]:
def gradient_descent(X, y, weights, bias, learning_rate, num_iterations):
    m = len(y)
    costs=[]

    for i in range(num_iterations):
        h = sigmoid(X.dot(weights)+bias)
        gradient = (1 / m) * X.T.dot(h - y)
        weights -= learning_rate * gradient
        bias -= learning_rate * (1/m) * np.sum(h - y)
        cost = compute_cost(X, y, weights, bias)
        if i%100000==0:
            costs.append(cost)

    return weights, bias,costs

In [20]:
def predict(X, weights, bias):
    probabilities = sigmoid(X.dot(weights) + bias)
    return (probabilities >= 0.5).astype(int)

In [21]:
lr = 0.001
itr = 1000000
#try reducing itr
tw, tb , c= gradient_descent(X_train, y_train, weights, bias, lr, itr)

In [22]:
c

[0.6930625107540088,
 0.5299230589480526,
 0.5299227378251675,
 0.5299227378186484,
 0.5299227378186483,
 0.5299227378186483,
 0.5299227378186483,
 0.5299227378186483,
 0.5299227378186483,
 0.5299227378186483]

In [23]:
def accuracy(y_pred,y_test):
    acc = np.sum(np.equal(y_pred,y_test))/y_pred.size
    return acc

In [24]:
pred = predict(X_test, tw, tb)

In [25]:
y_test

array([0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0], dtype=int64)

In [26]:
accuracy(y_test,pred)

0.8