## Logistic Regression Implementation

In [93]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



In [94]:
from sklearn.datasets import make_classification

In [95]:
# create a binary classification dataset
X,y=make_classification(n_samples=1000, n_features=10, n_classes=2, random_state=5)


In [96]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.272936,0.692062,1.629791,1.585230,0.277696,0.219281,0.764519,0.820231,0.262557,0.488732
1,1.129914,1.226379,0.721052,-2.116677,-0.198220,0.092808,-0.442125,-0.779630,0.339266,0.399886
2,-0.646201,-0.288975,0.289428,0.214029,-0.899804,0.382664,-2.023049,-0.144888,1.458470,-0.135793
3,-0.775029,1.771516,0.729749,0.490634,0.024272,0.352327,0.205241,-0.482151,0.726705,0.387939
4,0.899904,0.842222,1.115633,0.531954,0.145408,0.510530,0.564724,-0.634429,0.973364,2.009438
...,...,...,...,...,...,...,...,...,...,...
995,-0.109542,1.315071,2.081221,0.070912,-0.126397,-0.314053,-0.436992,-0.032162,-0.572068,-0.551259
996,0.388405,0.750692,-1.201588,-1.138094,0.505130,0.181461,1.300335,-2.075176,0.018337,-0.470791
997,-1.242721,0.462528,-0.340826,0.266460,0.467969,0.605415,1.386358,1.564752,0.940731,1.616508
998,-1.009094,-0.090729,0.532980,0.521625,0.542108,0.624873,1.574231,0.276313,0.928266,-0.985994


In [97]:
pd.DataFrame(y)

Unnamed: 0,0
0,1
1,0
2,0
3,0
4,0
...,...
995,0
996,1
997,1
998,1


In [98]:
#train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [99]:
# Model Training
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)


In [100]:
y_pred=model.predict(X_test)

In [101]:
y_predprob=model.predict_proba(X_test)

In [102]:
# Performance Metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix: \n", confusion_matrix(y_test, y_pred))
print("\nClassification Report: \n", classification_report(y_test, y_pred))


Accuracy Score:  0.9133333333333333

Confusion Matrix: 
 [[137  10]
 [ 16 137]]

Classification Report: 
               precision    recall  f1-score   support

           0       0.90      0.93      0.91       147
           1       0.93      0.90      0.91       153

    accuracy                           0.91       300
   macro avg       0.91      0.91      0.91       300
weighted avg       0.91      0.91      0.91       300



### Hyperparameter Tuning and Cross Validation

In [103]:
model=LogisticRegression()

In [104]:
penalty = ['l1', 'l2', 'elasticnet']
c_values = [100, 10, 1.0, 0.1, 0.01]
solver=['liblinear', 'saga']

In [105]:
# params=dict(penalty=penalty, C=c_values, solver=solver)
# Defining parameters for GridSearchCV
params = {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'C': c_values,
    'solver': solver,
    'l1_ratio': [0.5]  # Required only for elasticnet
}

In [106]:
#straified kfold
from sklearn.model_selection import StratifiedKFold
cv=StratifiedKFold(n_splits=5)

In [107]:
#grid search cv
from sklearn.model_selection import GridSearchCV
grid=GridSearchCV(estimator=model, param_grid=params,scoring='accuracy', cv=cv, n_jobs=-1)

In [108]:
grid

In [109]:
grid.fit(X_train, y_train)

25 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\SaadS\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\SaadS\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\SaadS\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
    solver = _check_solver(se

In [110]:
grid.best_params_

{'C': 1.0, 'l1_ratio': 0.5, 'penalty': 'l2', 'solver': 'liblinear'}

In [111]:
grid.best_score_

0.9157142857142858

In [112]:
grid_pred=grid.predict(X_test)

In [113]:
# performance metrics
print("Accuracy Score: ", accuracy_score(y_test, grid_pred))
print("\nConfusion Matrix: \n", confusion_matrix(y_test, grid_pred))
print("\nClassification Report: \n", classification_report(y_test, grid_pred))


Accuracy Score:  0.9133333333333333

Confusion Matrix: 
 [[137  10]
 [ 16 137]]

Classification Report: 
               precision    recall  f1-score   support

           0       0.90      0.93      0.91       147
           1       0.93      0.90      0.91       153

    accuracy                           0.91       300
   macro avg       0.91      0.91      0.91       300
weighted avg       0.91      0.91      0.91       300



In [117]:
# RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
randomcv=RandomizedSearchCV(estimator=model, param_distributions=params,cv=5,scoring='accuracy')

In [118]:
randomcv.fit(X_train, y_train)

5 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\SaadS\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\SaadS\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\SaadS\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
    solver = _check_solver(self.

In [119]:
randomcv.best_score_

0.9157142857142858

In [120]:
randomcv.best_params_

{'solver': 'saga', 'penalty': 'l2', 'l1_ratio': 0.5, 'C': 1.0}

In [121]:
y_predr=randomcv.predict(X_test)

In [122]:
#performance metrics
print("Accuracy Score: ", accuracy_score(y_test, y_predr))
print("\nConfusion Matrix: \n", confusion_matrix(y_test, y_predr))
print("\nClassification Report: \n", classification_report(y_test, y_predr))


Accuracy Score:  0.9133333333333333

Confusion Matrix: 
 [[137  10]
 [ 16 137]]

Classification Report: 
               precision    recall  f1-score   support

           0       0.90      0.93      0.91       147
           1       0.93      0.90      0.91       153

    accuracy                           0.91       300
   macro avg       0.91      0.91      0.91       300
weighted avg       0.91      0.91      0.91       300



### Logistic Regression for Multiclass Classification

In [124]:
X,y=make_classification(n_samples=1000, n_features=10, n_classes=3, random_state=5,n_informative=3)

In [125]:
pd.DataFrame(y)

Unnamed: 0,0
0,2
1,2
2,2
3,2
4,0
...,...
995,2
996,2
997,0
998,0


In [126]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [129]:
model=LogisticRegression(multi_class="ovr")
model.fit(X_train, y_train)



In [130]:
y_predovr=model.predict(X_test)

In [131]:
# performance metrics
print("Accuracy Score: ", accuracy_score(y_test, y_predovr))
print("\nConfusion Matrix: \n", confusion_matrix(y_test, y_predovr))
print("\nClassification Report: \n", classification_report(y_test, y_predovr))


Accuracy Score:  0.7

Confusion Matrix: 
 [[60 21 18]
 [14 75 11]
 [17  9 75]]

Classification Report: 
               precision    recall  f1-score   support

           0       0.66      0.61      0.63        99
           1       0.71      0.75      0.73       100
           2       0.72      0.74      0.73       101

    accuracy                           0.70       300
   macro avg       0.70      0.70      0.70       300
weighted avg       0.70      0.70      0.70       300



In [132]:
# hyperparameter tuning for ovr
#grid search cv
grid=GridSearchCV(estimator=model, param_grid=params,scoring='accuracy', cv=cv, n_jobs=-1)

In [133]:
randomcv.fit(X_train, y_train)

10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\SaadS\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\SaadS\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\SaadS\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
    solver = _check_solver(sel

In [134]:
randomcv.best_score_

0.6771428571428572