## Logistic Regression for Multi-Class Classification

In [56]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [57]:
from sklearn.datasets import make_classification

In [58]:
X,y = make_classification(n_samples=1000,n_features=10,n_informative=3,n_classes=3,random_state=42)

In [59]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.771533,-1.474336,2.196646,-0.678992,0.767479,-1.226179,-0.237566,0.681743,1.089962,0.962503
1,1.858383,-3.680880,0.227496,-3.818368,0.635968,1.609521,-0.153135,-0.474178,1.341139,-0.771772
2,-0.987248,1.539168,0.585904,1.185281,-0.736770,-1.406815,0.684783,-0.322028,-1.451034,1.325432
3,-1.022420,1.381787,2.015275,0.861434,1.290644,-1.889649,1.009138,0.363116,-1.844238,0.311110
4,-0.131161,-2.079008,1.483744,1.636858,-0.734811,-0.640154,2.666422,-1.085748,-0.892851,0.029796
...,...,...,...,...,...,...,...,...,...,...
995,0.982677,-1.908917,-0.385673,-2.565989,0.434503,-1.334611,-0.060318,1.040062,0.433730,-0.739771
996,-1.144949,1.434090,-0.459657,0.358606,-1.079842,0.918915,1.339638,1.193113,-2.473007,1.021515
997,-1.227082,1.656028,0.472630,1.718125,-1.366858,0.018017,1.131626,-0.863494,-1.839323,-0.031203
998,1.285276,-0.297152,-0.671721,-1.843337,-1.254161,-0.276889,-2.913304,-1.315441,2.854465,1.309444


In [60]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.30,random_state=42)

In [61]:
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression(multi_class='ovr')
logistic.fit(X_train,y_train)
y_pred = logistic.predict(X_test)



In [62]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [63]:
score = accuracy_score(y_test,y_pred)
print(score)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

0.68
[[79 16  8]
 [31 38 27]
 [ 3 11 87]]
              precision    recall  f1-score   support

           0       0.70      0.77      0.73       103
           1       0.58      0.40      0.47        96
           2       0.71      0.86      0.78       101

    accuracy                           0.68       300
   macro avg       0.67      0.67      0.66       300
weighted avg       0.67      0.68      0.66       300



### Hyper Parameter Tuning

In [64]:
from sklearn.model_selection import GridSearchCV
model = LogisticRegression(multi_class='ovr')
penalty = ['l1', 'l2', 'elasticnet']
c_values = [100,10,1.0,0.1,0.01]
solver = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']

In [65]:
params = dict(penalty=penalty,C=c_values,solver=solver)

In [66]:
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold()
grid = GridSearchCV(estimator=model,param_grid=params,scoring='accuracy',cv=cv,n_jobs=-1)

In [67]:
grid

In [68]:
grid.fit(X_train,y_train)

250 fits failed out of a total of 450.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Paul\Desktop\Learnings\Machine Learning\venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Paul\Desktop\Learnings\Machine Learning\venv\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Paul\Desktop\Learnings\Machine Learning\venv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
    solver = _check_solver(self

In [69]:
y_pred = grid.predict(X_test)

In [70]:
grid.best_estimator_

In [71]:
grid.best_params_

{'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}

In [72]:
grid.best_score_

np.float64(0.6385714285714286)

In [73]:
score = accuracy_score(y_test,y_pred)
print(score)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

0.6766666666666666
[[81 13  9]
 [36 32 28]
 [ 2  9 90]]
              precision    recall  f1-score   support

           0       0.68      0.79      0.73       103
           1       0.59      0.33      0.43        96
           2       0.71      0.89      0.79       101

    accuracy                           0.68       300
   macro avg       0.66      0.67      0.65       300
weighted avg       0.66      0.68      0.65       300



**Here we can observe that the accuracy is decreased**

In [97]:
from sklearn.model_selection import RandomizedSearchCV

In [98]:
model = LogisticRegression(multi_class='ovr')
randomcv = RandomizedSearchCV(estimator=model,param_distributions=params,cv=5,scoring='accuracy')

In [99]:
randomcv.fit(X_train,y_train)

30 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Paul\Desktop\Learnings\Machine Learning\venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Paul\Desktop\Learnings\Machine Learning\venv\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Paul\Desktop\Learnings\Machine Learning\venv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
    solver = _check_solver(self.so

In [100]:
y_pred = randomcv.predict(X_test)

In [101]:
randomcv.best_estimator_

In [102]:
randomcv.best_params_

{'solver': 'liblinear', 'penalty': 'l2', 'C': 0.1}

In [103]:
randomcv.best_score_

np.float64(0.6285714285714286)

In [104]:
score = accuracy_score(y_test,y_pred)
print(score)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

0.6733333333333333
[[80 13 10]
 [35 33 28]
 [ 2 10 89]]
              precision    recall  f1-score   support

           0       0.68      0.78      0.73       103
           1       0.59      0.34      0.43        96
           2       0.70      0.88      0.78       101

    accuracy                           0.67       300
   macro avg       0.66      0.67      0.65       300
weighted avg       0.66      0.67      0.65       300



Read this for knowing about why the accuracy is decreased after using Randomized SearchCV and GridSearchCV \
https://chatgpt.com/c/6772c59e-c4ac-800a-bf13-c85ab8866a0e

In [136]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression

In [137]:
model = LogisticRegression(multi_class='ovr')
randomcv = RandomizedSearchCV(estimator=model,param_distributions=params,cv=15,scoring='accuracy')

In [138]:
randomcv.fit(X_train,y_train)

75 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Paul\Desktop\Learnings\Machine Learning\venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Paul\Desktop\Learnings\Machine Learning\venv\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Paul\Desktop\Learnings\Machine Learning\venv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
    solver = _check_solver(self.

In [139]:
y_pred = randomcv.predict(X_test)

In [140]:
score = accuracy_score(y_test,y_pred)
print(score)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

0.6833333333333333
[[80 15  8]
 [35 35 26]
 [ 3  8 90]]
              precision    recall  f1-score   support

           0       0.68      0.78      0.72       103
           1       0.60      0.36      0.45        96
           2       0.73      0.89      0.80       101

    accuracy                           0.68       300
   macro avg       0.67      0.68      0.66       300
weighted avg       0.67      0.68      0.66       300



After increasing some Cross Validations here we got some better performance... **:)**