# Understanding Logistic Regression from Scratch


Import Section 


In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from  sklearn.model_selection import train_test_split

Lets work on the cancer dataset

In [4]:
!pip install -q kaggle
from google.colab import files
files.upload()
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json
!mkdir cancerdataset
!kaggle datasets download -d amandam1/breastcancerdataset
!unzip breastcancerdataset.zip -d cancerdataset

Saving kaggle.json to kaggle.json
Downloading breastcancerdataset.zip to /content
  0% 0.00/10.8k [00:00<?, ?B/s]
100% 10.8k/10.8k [00:00<00:00, 9.02MB/s]
Archive:  breastcancerdataset.zip
  inflating: cancerdataset/BRCA.csv  


Lets Load the dataset

In [152]:
cancerdata = pd.read_csv('/content/cancerdataset/BRCA.csv')
cancerdata.head()

Unnamed: 0,Patient_ID,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status,Surgery_type,Date_of_Surgery,Date_of_Last_Visit,Patient_Status
0,TCGA-D8-A1XD,36.0,FEMALE,0.080353,0.42638,0.54715,0.27368,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,15-Jan-17,19-Jun-17,Alive
1,TCGA-EW-A1OX,43.0,FEMALE,-0.42032,0.57807,0.61447,-0.031505,II,Mucinous Carcinoma,Positive,Positive,Negative,Lumpectomy,26-Apr-17,09-Nov-18,Dead
2,TCGA-A8-A079,69.0,FEMALE,0.21398,1.3114,-0.32747,-0.23426,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,08-Sep-17,09-Jun-18,Alive
3,TCGA-D8-A1XR,56.0,FEMALE,0.34509,-0.21147,-0.19304,0.12427,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,25-Jan-17,12-Jul-17,Alive
4,TCGA-BH-A0BF,56.0,FEMALE,0.22155,1.9068,0.52045,-0.31199,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,06-May-17,27-Jun-19,Dead


Lets do some preprocessing

In [153]:
# drop duplicates
print("data before duplicate removal : ",cancerdata.shape[0])
cancerdata = cancerdata.drop_duplicates()
print("data after duplicate remomval : ",cancerdata.shape[0])


data before duplicate removal :  341
data after duplicate remomval :  335


In [154]:
# lets only take few features which are important
cancerdata = cancerdata.iloc[:,[1,2,3,4,5,6,7,8,9,10,11,12,15]]
cancerdata.head()


Unnamed: 0,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status,Surgery_type,Patient_Status
0,36.0,FEMALE,0.080353,0.42638,0.54715,0.27368,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,Alive
1,43.0,FEMALE,-0.42032,0.57807,0.61447,-0.031505,II,Mucinous Carcinoma,Positive,Positive,Negative,Lumpectomy,Dead
2,69.0,FEMALE,0.21398,1.3114,-0.32747,-0.23426,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,Alive
3,56.0,FEMALE,0.34509,-0.21147,-0.19304,0.12427,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,Alive
4,56.0,FEMALE,0.22155,1.9068,0.52045,-0.31199,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,Dead


In [155]:
# null value analysis
print(cancerdata.isnull().sum())
print("data before null remomval : ",cancerdata.shape[0])
cancerdata = cancerdata.dropna()
print("data after null remomval : ",cancerdata.shape[0])
    

Age                1
Gender             1
Protein1           1
Protein2           1
Protein3           1
Protein4           1
Tumour_Stage       1
Histology          1
ER status          1
PR status          1
HER2 status        1
Surgery_type       1
Patient_Status    14
dtype: int64
data before null remomval :  335
data after null remomval :  321


In [156]:
# lets encode the categorical features
X = cancerdata.iloc[:,:-1]
y = cancerdata.iloc[:,-1]


In [157]:
y.shape

(321,)

In [158]:
features_with_noencoding = X.iloc[:,[0,2,3,4,5]]
feature_with_ordinal_encoding = X.iloc[:,[6]]
feature_with_one_hot_encoding = X.iloc[:,[1,7,8,9,10,11]]
feature_with_ordinal_encoding.Tumour_Stage = feature_with_ordinal_encoding.Tumour_Stage.map({'I':1,"II":2,"III":3})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [159]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
feature_with_one_hot_encoding = enc.fit_transform(feature_with_one_hot_encoding).toarray()
feature_with_one_hot_encoding = pd.DataFrame(feature_with_one_hot_encoding,columns = enc.get_feature_names_out())
feature_with_one_hot_encoding


Unnamed: 0,Gender_FEMALE,Gender_MALE,Histology_Infiltrating Ductal Carcinoma,Histology_Infiltrating Lobular Carcinoma,Histology_Mucinous Carcinoma,ER status_Positive,PR status_Positive,HER2 status_Negative,HER2 status_Positive,Surgery_type_Lumpectomy,Surgery_type_Modified Radical Mastectomy,Surgery_type_Other,Surgery_type_Simple Mastectomy
0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
4,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
316,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
317,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
318,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
319,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0


In [160]:
feature_with_one_hot_encoding.shape

(321, 13)

In [161]:
X = pd.concat([features_with_noencoding.reset_index(), feature_with_ordinal_encoding.reset_index(),feature_with_one_hot_encoding.reset_index()], axis=1)
X.shape
y = y.map({"Alive":0,"Dead":1})

In [162]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)


# Lets Implement Logistic Regression from Scratch

###Sigmoid function

<center><img src="https://qph.cf2.quoracdn.net/main-qimg-0c921e324b298fdc72027d25ee584db3.webp" width="200" height="100"></center>


###Optimisation function for logistic regression

<center><img src="https://sebastianraschka.com/images/faq/probablistic-logistic-regression/6.png" width="500" height="100"></center>

###stochastic gradient descent


<center><img src="https://media.geeksforgeeks.org/wp-content/uploads/20200611183120/1406-7.png" width="300" height="100"></center>




In [172]:
class Logistic:
  """
		Logistic Regression is a Classification ML model.
	"""
  def __init__(self, learning_rate = 1e-3, max_iter = 2000):
    self.num_feats = int
    self.train_size = int
    self.weights = np.array 
    self.y_train = np.array 
    self.input_matrix = np.array
    self.eps = np.finfo(float).eps
    self.learning_rate = learning_rate   #Learning rate for gradient descent
    self.max_iter = max_iter 	#Number of iterations to run gradient descent
    self.cost_threshold = 0.1 * learning_rate  #stopping criterion for gradien descent
  
  def sigmoid(self, x):

    """
    Logistic function for binary classification.
    Sigmoid = 1/(1 + e^(-x))  -> It outputs values between 0 and 1
    """

    return 1 / (1 + np.exp(-x))

  def fit(self, X, y, logistic_function = "sigmoid"):

    """
      Adjust weights to training data
    """

    self.train_size = X.shape[0]
    self.num_feats = X.shape[1]
    self.input_matrix = np.append(X, np.ones(self.train_size).reshape(-1, 1), axis = 1)   #Add Column with Ones for intercept term 
    self.y_train = y.to_numpy()
    self.weights = np.zeros(self.num_feats + 1) #Extra +1 for the intercept


    #optimize weights
    prev_cost = float("inf")
    for i in range(self.max_iter):
      cost = self._update_weights()

      if i%100 ==0 or i == self.max_iter:
        print("Cost after {} iterations is: {}".format(i, cost))
      if abs(prev_cost -cost) < self.cost_threshold*prev_cost:
        print("Cost after {} iterations is: {}".format(i, cost))
        break
      prev_cost = cost


  def _update_weights(self):

    """
      Cost Function:
        yhat = sigmoid(wX)
        l(w) = -(1/n) * (y*log(yhat) + (1-y)*log(1-yhat))
      Gradient:
        delta_w = dl/dw = (1/n)*((yhat - y) * X)) 

      Gradient Descent:
        w = w - (learning_rate * delta_w)
    """

    y_pred = self.sigmoid((self.weights * self.input_matrix).sum(axis = 1))  # y_pred = sigmoid(wX)

    cost = -(1/self.train_size) * (self.y_train*np.log(y_pred+self.eps) + (1-self.y_train)*np.log(1-y_pred+self.eps)).sum(axis = 0)

    delta_w = (1/self.train_size) * (((y_pred - self.y_train).reshape(-1, 1) * self.input_matrix).sum(axis = 0))  #delta_w = (1/n)*((yhat - y) * X)) 

    self.weights = self.weights - (self.learning_rate * delta_w) 

    return cost


  def predict(self, X):

    """ Make predictions on given X using trained model """

    size = X.shape[0]
    X = np.append(X, np.ones(size).reshape(-1, 1), axis = 1)

    y_pred = self.sigmoid((self.weights * X).sum(axis = 1))

    y_pred[np.where(y_pred >= 0.5)] = 1.0
    y_pred[np.where(y_pred < 0.5)] = 0.0

    return y_pred 

# lets fit and predict

In [164]:
# Create a Logistic Regression Model Object
logistic_clf = Logistic(learning_rate = 1e-2, max_iter = 2000)

#Train our Logistic Regression Model
logistic_clf.fit(X_train, y_train)


print("Train Accuracy: {}%".format(accuracy_score(y_train, logistic_clf.predict(X_train))))
print("Test Accuracy: {}%".format(accuracy_score(y_test, logistic_clf.predict(X_test))))

print('Logistic Regression Model Coefficients (W): {}'.format(logistic_clf.weights[:-1]))
print('Logistic Regression Model Intercept (b): {}'.format(logistic_clf.weights[-1]))



Cost after 0 iterations is: 0.6931471805599447
Cost after 100 iterations is: 7.380561665156636




Cost after 200 iterations is: 7.387811406005125
Cost after 300 iterations is: 7.387793853818394
Cost after 400 iterations is: 7.391016697914574
Cost after 500 iterations is: 7.39072040232841
Cost after 600 iterations is: 7.390613192271838
Cost after 700 iterations is: 7.390504667715654
Cost after 800 iterations is: 7.390394832361908
Cost after 900 iterations is: 7.390283689002641
Cost after 1000 iterations is: 7.3901712392577075
Cost after 1100 iterations is: 7.3900574835271176
Cost after 1200 iterations is: 7.389942420944494
Cost after 1300 iterations is: 7.389826049331911
Cost after 1400 iterations is: 7.389708365156291
Cost after 1500 iterations is: 7.389589363487597
Cost after 1600 iterations is: 7.389469037959065
Cost after 1700 iterations is: 7.389347380729644
Cost after 1800 iterations is: 7.389224382448958
Cost after 1900 iterations is: 7.389100032224967
Train Accuracy: 0.7901785714285714%
Test Accuracy: 0.8041237113402062%
Logistic Regression Model Coefficients (W): [-0.584158

# Lets Use SKlearn logistic regression 

In [171]:
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")
sklearn_clf = LogisticRegression(random_state=0).fit(X_train, y_train)
y_pred = sklearn_clf.predict(X_test)
y_pob = sklearn_clf.predict_proba(X_test)
print("Test Accuracy from sklearn :",accuracy_score(y_test,y_pred))
print("Logistic Regression Model Coefficients (W) : ",sklearn_clf.coef_)
print("Logistic Regression Model Intercept (b) : ",sklearn_clf.intercept_)

Test Accuracy from sklearn : 0.7835051546391752
Logistic Regression Model Coefficients (W) :  [[ 0.05223748  0.01435396  0.00389069  0.33604386  0.90113093  0.15092367
   0.05223748  0.20056033 -0.11007461 -0.66155718  0.20935248 -0.24738884
  -0.19188588 -0.01292998 -0.4522047  -0.4522047  -0.03359672 -0.41860798
  -0.52155284 -0.08428796  0.02335935  0.13027675]]
Logistic Regression Model Intercept (b) :  [-0.60563339]
