In [1]:
import numpy as np
from numpy import log,dot,exp,shape
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

In [34]:
xd,yd = make_classification(n_features=4)
x_tr,x_te,y_tr,y_te = train_test_split(xd,yd,test_size=0.25)

In [35]:
def standardize(data):
    for i in range(shape(data)[1]):
        data[:,i] = (data[:,i]-np.mean(data[:,i]))/np.std(data[:,i])

The data sets are always multidimensional. We will need to use matrices for any kind of calculation. So, for input, we have two matrices to deal with. The first one is for feature vectors, and the second is for parameters or weights. Our first matrix is of the mxn dimension, where m is the number of observations while n is the dimension of observations. And the second one is of nx1 dimension. Here, we will add a bias column of ones to our feature vectors matrix and a corresponding parameter term to the weight vector. Bias is important to make the model more flexible.

Linear regression employs the least squared error as the cost function. But the least squared error function for logistic regression is non-convex. While performing gradient descent chances that we get stuck in a local minimum is more. So instead, we use log loss as the cost function.

In [37]:
class LogisticRegression:
    
    
    def sigmoid(self,z):
        sig = 1/(1+exp(-z))
        return sig


    def initialize(self,x):
        weights = np.zeros((shape(x)[1]+1,1))
        x = np.c_[np.ones((shape(x)[0],1)),x] #(100,5)
        return weights,x


    def standardize(data):
        for i in range(shape(data)[1]):
            data[:,i] = (data[:,i]-np.mean(data[:,i]))/np.std(data[:,i])
    
    def cost(self,x,y,theta):
        #print("shape cost" ,shape(x))
        z = dot(x,theta)
        cost0 = y.T.dot(log(self.sigmoid(z)))
        cost1 = (1-y.T).dot(log(1-self.sigmoid(z)))
        cost = -(cost0+cost1)/len(y)
        return cost


    
    
    #gradient descent
    def fit(self,x,y,alpha=0.001,iter=100):
        weights,x = self.initialize(x)
        cost_list = np.zeros(iter,)
        for i in range(iter):
            weights = weights - alpha * dot(x.T, self.sigmoid(dot(x,weights)) - np.reshape(y,(len(y),1)))
            cost_list[i] = self.cost(x,y,weights)
        self.weights = weights
        return cost_list
    

    
    # prediction
    def predict(self,x):
        z = dot(self.initialize(x)[1],self.weights)
        lis = []
        for i in self.sigmoid(z):
            if i>0.5:
                lis.append(1)
            else:
                lis.append(0)
        return lis

In [38]:
 # F1 scores
def f1_score(y,y_hat):
    tp,tn,fp,fn = 0,0,0,0
    for i in range(len(y)):
        if y[i] == 1 and y_hat[i] == 1:
            tp += 1
        elif y[i] == 1 and y_hat[i] == 0:
            fn += 1
        elif y[i] == 0 and y_hat[i] == 1:
            fp += 1
        elif y[i] == 0 and y_hat[i] == 0:
            tn += 1
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    f1_score = 2*precision*recall/(precision+recall)
    return f1_score

In [39]:
standardize(x_tr)
standardize(x_te)
test = LogisticRegression()
model= test.fit(x_tr,y_tr)
y_pred = test.predict(x_te)
y_train = test.predict(x_tr)
#Let's see the f1-score for training and testing data
f1_score_tr = f1_score(y_tr,y_train)
f1_score_te = f1_score(y_te,y_pred)
print(f1_score_tr)
print(f1_score_te)

0.9275362318840579
0.9655172413793104


### Now, let’s see how our logistic regression fares in comparison to sklearn’s logistic regression

In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
model = LogisticRegression().fit(x_tr,y_tr)
y_pred = model.predict(x_te)
print(f1_score(y_te,y_pred))

0.9655172413793104


#References
https://www.analyticsvidhya.com/blog/2022/02/implementing-logistic-regression-from-scratch-using-python/
https://www.kaggle.com/code/jagannathrk/logistic-regression-from-scratch-python