<b>Logistic-Regression (LogsReg) - Scratch</b> <br>
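<i>Implementing logistic regression step-by-step: the model itself uses only NumPy, while scikit-learn is used just to load the data and split it into train/test sets. </i>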

<b>requirements</b>

In [12]:
# example: %pip install numpy pandas matplotlib scikit-learn

<b>imports</b>

In [13]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import datasets

<b>(1) DATA PRE-PROCESSING</b>

In [14]:
# Synthetic demo data: feature matrix X and label vector y produced by make_blobs.
# NOTE: X, y and the four split variables are reassigned in the training cell,
# which loads the breast-cancer dataset instead.
# NOTE(review): centers=3 yields three label values, whereas the model below is
# binary -- confirm this is intended.
X, y = datasets.make_blobs(
    n_samples=500,
    n_features=2,
    centers=3,
    cluster_std=1.3,
    shuffle=True,
    random_state=42,
)

# Hold out 30% of the samples for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=134
)

# Data Pre-Processing
# Normalize (if needed)
# Data Visualization

<b>(2) ML ALGORITHM - SCRATCH</b>

In [15]:
class LogisticRegression:
    """
    Binary logistic regression trained with batch gradient descent (NumPy only).

    Attributes:
        learning_rate : (float) step size applied to every gradient-descent update
        iters         : (int) number of gradient-descent iterations run by fit()
        weights       : (np.ndarray or None) one coefficient per feature, None until fit() is called
        bias          : (float or None) intercept term, None until fit() is called
    """

    # Probabilities are clipped to [_EPS, 1 - _EPS] before taking logs in the loss,
    # so a saturated sigmoid (exactly 0.0 or 1.0) cannot produce log(0) / nan.
    _EPS = 1e-15

    # (2.1) Initialize Model-Parameters
    def __init__(self, learning_rate = 0.001, iters = 1000) -> None:
        """
        Initialize Model-Parameters

        Parameters-Variables:
            learning_rate : [param](float) learning rate of the model, default = 0.001
            iters         : [param](int) number of training iterations, default = 1000
            weights       : [vars](np.array) weights of the model, initially None
            bias          : [vars](float) bias of the model, initially None

        Returns:
            Nothing
        """

        self.iters = iters
        self.learning_rate = learning_rate
        self.weights = None
        self.bias = None


    # (2.2) Calculate Sigmoid Function
    def sigmoid(self, z) -> np.ndarray:
        """
        Calculates the sigmoid, fx_wb = g(wx+b) = g(z) = 1/(1+e^-z), given z

        Parameters:
            z : (np.array) linear scores, z = wx+b

        Returns:
            (np.array) the sigmoid g(z), element-wise, with the same shape as z
        """
        z = np.asarray(z, dtype=float)

        # exp(-|z|) always lies in (0, 1], so it cannot overflow whatever the size of z.
        # For z >= 0 : g(z) = 1 / (1 + e^-z)
        # For z <  0 : g(z) = e^z / (1 + e^z)   (same value, rewritten to avoid e^(+large))
        exp_neg_abs = np.exp(-np.abs(z))
        gz = np.where(z >= 0, 1.0 / (1.0 + exp_neg_abs), exp_neg_abs / (1.0 + exp_neg_abs))
        return gz


    # (2.3) Calculate Cost/Loss Function
    def cross_entropy_loss(self, X, y) -> float:
        """
        The loss function for Logistic Regression is log-loss / Cross-Entropy.
        The loss is computed for every datapoint and averaged, i.e. this is the cost.

        Parameters:
            X : (np.array) Independent-Variables (Features-Matrix), one row per datapoint
            y : (np.array) True labels (0 or 1), one per row of X

        Returns:
            The computed cost (mean cross-entropy over the rows of X)
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n = X.shape[0]  # number of datapoints (rows)

        # calculate z
        z = np.dot(X, self.weights) + self.bias

        # calculate sigmoid, fx_wb i.e. gz; keep it strictly inside (0, 1) so that
        # both log(gz) and log(1 - gz) stay finite
        gz = np.clip(self.sigmoid(z), self._EPS, 1 - self._EPS)

        # calculate cost (vectorized over all datapoints)
        cost = -np.dot(y, np.log(gz)) - np.dot((1 - y), np.log(1 - gz))
        cost /= n

        return cost


    # (2.4) Fit-Model
    def fit(self, X, y) -> tuple:
        """
        Fits and trains the model on the data (X, y).
        Calculates the gradients and applies the gradient-descent algorithm for
        `iters` iterations, starting from zero weights and zero bias.
        Every 100th iteration the current cost is printed and recorded.

        Parameters:
            X : (np.array) Independent-Variables (Features-Matrix), shape (n_data, n_features)
            y : (np.array) True labels (0 or 1), shape (n_data,)

        Returns:
            (history, weights, bias)
                history : (dict) iteration index -> cost, recorded every 100 iterations
                weights : (np.array) the learned weights (same object as self.weights)
                bias    : (float) the learned bias (same value as self.bias)
        """
        # accept anything array-like (lists, DataFrame values, ...)
        X = np.asarray(X)
        y = np.asarray(y)

        n_data, n_features = X.shape

        # init params
        self.weights = np.zeros(n_features)
        self.bias = 0.0
        history = {}

        for i in range(self.iters):

            z = np.dot(X, self.weights) + self.bias
            y_preds = self.sigmoid(z)
            error = y_preds - y

            # Calc Gradients/derivs of the cost w.r.t. weights and bias
            dw = (1/n_data) * np.dot(X.T, error)
            db = (1/n_data) * np.sum(error)

            # GD algo : update Params based on derivs
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

            # printing : the cost is evaluated after this iteration's update
            if i % 100 == 0:
                cost = self.cross_entropy_loss(X, y)
                history[i] = cost
                print(f"Iter\t{i}\tCost\t{cost}")

        return history, self.weights, self.bias


    # (2.5) Predicted labels ŷ
    def predict(self, X) -> tuple:
        """
        Predictions ŷ of the model: calculates the probabilities and the classes
        derived from those probabilities.

        Parameters:
            X : (np.array) Independent-Variables (Features-Matrix), X_test in this case

        Returns:
            (y_probs, y_preds)
                y_probs : (np.array) probability of class 1 for every row of X
                y_preds : (np.array) class labels, 1 where y_probs > 0.5 else 0
        """

        z = np.dot(X, self.weights) + self.bias
        y_probs = self.sigmoid(z)  # the probabilities of belonging to class 1
        y_preds = (y_probs > 0.5).astype(int)  # threshold at 0.5 -> class 0 or 1
        return y_probs, y_preds

<b>(3) MODEL TRAINING</b>

In [19]:
# Fit (Training) 
def accuracy(y_true, y_pred):
    """
    Fraction of positions at which y_pred equals y_true.

    Parameters:
        y_true : (np.array) true labels
        y_pred : (np.array) predicted labels, compared element-wise with y_true

    Returns:
        Number of matching entries divided by len(y_true)
    """
    matches = y_true == y_pred
    n_correct = np.sum(matches)
    return n_correct / len(y_true)

# Load the breast-cancer dataset: features in X, labels in y
bc = datasets.load_breast_cancer()
X = bc.data
y = bc.target

# Keep 20% of the samples aside for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)

# NOTE(review): the very small learning rate is presumably needed because the
# features are not normalized -- confirm.
regressor = LogisticRegression(iters=1000, learning_rate=0.00001)
regressor.fit(X_train, y_train)

Iter	0	Cost	0.7477317800761581
Iter	100	Cost	0.5080910033542689
Iter	200	Cost	0.3665557601894522
Iter	300	Cost	0.33580207493736225
Iter	400	Cost	0.31409867807566644
Iter	500	Cost	0.2982272196637791
Iter	600	Cost	0.286223167788391
Iter	700	Cost	0.2768251260067898
Iter	800	Cost	0.2692447931704441
Iter	900	Cost	0.26298239735450696


({0: 0.7477317800761581,
  100: 0.5080910033542689,
  200: 0.3665557601894522,
  300: 0.33580207493736225,
  400: 0.31409867807566644,
  500: 0.2982272196637791,
  600: 0.286223167788391,
  700: 0.2768251260067898,
  800: 0.2692447931704441,
  900: 0.26298239735450696},
 array([ 5.05153448e-03,  8.66524306e-03,  2.97521836e-02,  1.56091081e-02,
         5.12200001e-05, -8.16830112e-06, -6.82816539e-05, -3.03114194e-05,
         1.00783931e-04,  4.01512371e-05,  4.49374913e-05,  6.87556054e-04,
         4.86594446e-05, -1.04389482e-02,  3.99564326e-06, -7.90925221e-07,
        -1.87416173e-06,  6.21585683e-07,  1.20750046e-05,  1.17941004e-06,
         5.20685184e-03,  1.10272651e-02,  2.95784037e-02, -1.92611583e-02,
         6.59633597e-05, -6.05398322e-05, -1.40768918e-04, -3.30011471e-05,
         1.43409816e-04,  3.88102552e-05]),
 0.0006780733445365326)

<b>(4) PREDICTION</b>

In [20]:
# predict() returns a (probabilities, class labels) pair; unpack it so that
# `predictions` holds only the 0/1 labels. Passing the whole tuple to accuracy()
# would broadcast the comparison against both arrays and miscount the matches.
pred_probs, predictions = regressor.predict(X_test)


<b>(5) EVALUATION-VISUALIZATION</b>

In [21]:
# Share of test-set labels that match the model's predictions
test_accuracy = accuracy(y_test, predictions)
print("LR classification accuracy:", test_accuracy)

# Uncomment to inspect the class labels predicted for the test set by the trained model
#print(f"\nPredicted Class-Label : {predictions}")


LR classification accuracy: 0.9210526315789473


<b>CONCLUSION</b>
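- The from-scratch model reaches about 92% accuracy on the held-out breast-cancer test split (see the evaluation output above), so it performs well for this binary classification task.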