<a href="https://colab.research.google.com/github/sandipanpaul21/ML-Code-in-Python/blob/master/08_Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Libraries
from sklearn import datasets
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

  import pandas.util.testing as tm


In [2]:
# IRIS Dataset
iriss = datasets.load_iris()
iris = pd.DataFrame(iriss.data)
iris.columns = iriss.feature_names
iris['species'] = iriss.target

# Logistic only applies to 2 Classes but IRIS have 3 classes
# So filtering out 2 classes (0 & 1)
iris = iris[iris['species'] <= 1]
iris.columns = iris.columns.str.replace(" ","")
iris.columns = iris.columns.str.replace("(","")
iris.columns = iris.columns.str.replace(")","")
iris.head()

Unnamed: 0,sepallengthcm,sepalwidthcm,petallengthcm,petalwidthcm,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [3]:
# Target Column Distribution
iris['species'].value_counts()

1    50
0    50
Name: species, dtype: int64

In [4]:
# Distribution (mean) of Independent Columns respect to Dependent Column
iris.groupby('species').mean().round(2)

Unnamed: 0_level_0,sepallengthcm,sepalwidthcm,petallengthcm,petalwidthcm
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,5.01,3.43,1.46,0.25
1,5.94,2.77,4.26,1.33


In [5]:
# Independent Variables
Independent_Variable_Base_Set = iris[iris.columns[0:4]]
Independent_Variable_Base_Set.head()

Unnamed: 0,sepallengthcm,sepalwidthcm,petallengthcm,petalwidthcm
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [6]:
# Dependent Variable
Dependent_Variable = iris[iris.columns[-1:iris.columns.size]]
Dependent_Variable.head()

Unnamed: 0,species
0,0
1,0
2,0
3,0
4,0


In [7]:
# Fitting Logistic Model 

# Base Model
logit_model = sm.Logit(Dependent_Variable,Independent_Variable_Base_Set)
result = logit_model.fit(method = 'bfgs')
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.000001
         Iterations: 22
         Function evaluations: 24
         Gradient evaluations: 24
                          Results: Logit
Model:               Logit            Pseudo R-squared: 1.000      
Dependent Variable:  species          AIC:              8.0002     
Date:                2020-09-03 13:11 BIC:              18.4209    
No. Observations:    100              Log-Likelihood:   -0.00011118
Df Model:            3                LL-Null:          -69.315    
Df Residuals:        96               LLR p-value:      7.4648e-30 
Converged:           1.0000           Scale:            1.0000     
-------------------------------------------------------------------
               Coef.   Std.Err.    z    P>|z|    [0.025     0.975] 
-------------------------------------------------------------------
sepallengthcm -2.5944 1094.5370 -0.0024 0.9981 -2147.8474 2142.6587
sepalwidthcm  -8.5218 1836.4958 -0.0046

In [8]:
# Model Summary

print("Base Model Summary")
print("Iteration suggests how many loop model did to perform the fit")
print("Iterations : 22")
r_square = result.prsquared.round(2)
print("Pseudo R Square suggests overall effect size (ideal value is close to 1)")
print("Base Model, MacFadden Pseudo R Square : ",r_square)
base_model_aic = result.aic.round(2)
print("AIC compares Goodness of Fit, Lower AIC better is the Model")
print("Base Model, AIC  :",base_model_aic)
base_model_bic = result.bic.round(2)
print("BIC also work same as AIC, Lower BIC better is the Model")
print("Base Model, BIC :",base_model_bic)

Base Model Summary
Iteration suggests how many loop model did to perform the fit
Iterations : 22
Pseudo R Square suggests overall effect size (ideal value is close to 1)
Base Model, MacFadden Pseudo R Square :  1.0
AIC compares Goodness of Fit, Lower AIC better is the Model
Base Model, AIC  : 8.0
BIC also work same as AIC, Lower BIC better is the Model
Base Model, BIC : 18.42


In [9]:
# Split the Dataset how logistic works

# Lets start with selecting one variable
Independent_Variable_Set_v1 = iris[iris.columns[0:1]]
X_train, X_test, y_train, y_test = train_test_split(Independent_Variable_Set_v1,Dependent_Variable,test_size = 0.3,random_state = 21)
logit_model = sm.Logit(y_train,X_train)
result = logit_model.fit(method='bfgs')
print(result.summary2())

# Model Summary

print("Model 1 Summary")
print("Iteration suggests how many loop model did to perform the fit")
print("Iterations : 3")
r_square_1 = result.prsquared.round(2)
print("Pseudo R Square suggests overall effect size (ideal value is close to 1)")
print("Model 1, MacFadden Pseudo R Square : ",r_square_1)
base_model_aic_1 = result.aic.round(2)
print("AIC compares Goodness of Fit, Lower AIC better is the Model")
print("Model 1, AIC  :",base_model_aic_1)
base_model_bic_1 = result.bic.round(2)
print("BIC also work same as AIC, Lower BIC better is the Model")
print("Model 1, BIC :",base_model_bic_1)

Optimization terminated successfully.
         Current function value: 0.662428
         Iterations: 3
         Function evaluations: 5
         Gradient evaluations: 5
                        Results: Logit
Model:              Logit            Pseudo R-squared: 0.024  
Dependent Variable: species          AIC:              94.7399
Date:               2020-09-03 13:11 BIC:              96.9884
No. Observations:   70               Log-Likelihood:   -46.370
Df Model:           0                LL-Null:          -47.487
Df Residuals:       69               LLR p-value:      nan    
Converged:          1.0000           Scale:            1.0000 
---------------------------------------------------------------
               Coef.   Std.Err.    z     P>|z|   [0.025  0.975]
---------------------------------------------------------------
sepallengthcm  0.0902    0.0442  2.0399  0.0414  0.0035  0.1769

Model 1 Summary
Iteration suggests how many loop model did to perform the fit
Iterations : 3
P

In [10]:
# Model Prediction

pred = result.predict(X_test)
model_prediction = pd.DataFrame(pred.round(2),columns = ['Prediction'])
model_prediction['temp'] = 'temp'
model_prediction['Final_Class'] = np.where(model_prediction['Prediction'] > 0.5,1,0)
print(model_prediction.head())
temp = model_prediction.groupby('temp')['Final_Class'].apply(list)
y_pred = temp.loc['temp']
model_1_accuracy = accuracy_score(y_test,y_pred).round(2)
print("\nModel Performance")
print("Model 1, Accuracy :",model_1_accuracy)
model_1_precision = precision_score(y_test,y_pred).round(2)
print("Model 1, Precision :",model_1_precision)
model_1_recall = recall_score(y_test,y_pred).round(2)
print("Model 1, Recall :",model_1_recall)
model_1_fscore = f1_score(y_test,y_pred).round(2)
print("Model 1, F1 Score :",model_1_fscore)
model_1_roc = roc_auc_score(y_test,y_pred)
print("Model 1, AUC :",model_1_roc)
print("\nConfusion Matrix Model 1")
model_1_cm = confusion_matrix(y_test,y_pred)
print(model_1_cm)

    Prediction  temp  Final_Class
23        0.61  temp            1
81        0.62  temp            1
85        0.63  temp            1
34        0.61  temp            1
62        0.63  temp            1

Model Performance
Model 1, Accuracy : 0.3
Model 1, Precision : 0.3
Model 1, Recall : 1.0
Model 1, F1 Score : 0.46
Model 1, AUC : 0.5

Confusion Matrix Model 1
[[ 0 21]
 [ 0  9]]
