In [None]:
import warnings
warnings.filterwarnings("ignore")

In [114]:
import sklearn
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, r2_score, classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score


print(sklearn.__version__)


1.5.2


In [75]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
glass_identification = fetch_ucirepo(id=42) 
  
# data (as pandas dataframes) 
X = glass_identification.data.features 
y = glass_identification.data.targets 
  
# metadata  & variable information 
# print(glass_identification.metadata) 
print(glass_identification.variables) 

             name     role         type demographic       description  \
0       Id_number       ID      Integer        None              None   
1              RI  Feature   Continuous        None  refractive index   
2              Na  Feature   Continuous        None            Sodium   
3              Mg  Feature   Continuous        None         Magnesium   
4              Al  Feature   Continuous        None          Aluminum   
5              Si  Feature   Continuous        None           Silicon   
6               K  Feature   Continuous        None         Potassium   
7              Ca  Feature   Continuous        None           Calcium   
8              Ba  Feature   Continuous        None            Barium   
9              Fe  Feature   Continuous        None              Iron   
10  Type_of_glass   Target  Categorical        None              None   

                                    units missing_values  
0                                    None             no  
1    

In [None]:
lr = LogisticRegression(random_state=24)

kfold = StratifiedKFold(n_splits=5, random_state=24, shuffle=True)


params = {
    "solver": [
        "lbfgs",
        "liblinear",
        "newton-cg",
        "sag",
        "saga"
        "newton-cholesky"
    ],
    "multi_class": [
        "ovr",
        "multinomial"
    ],
    "C" : np.linspace(0.001, 10, 20),
}

gcv = GridSearchCV(
    estimator=lr,
    param_grid=params,
    cv=kfold
)


In [80]:

gcv.fit(X, y['Type_of_glass'])

print(gcv.best_params_)
print(gcv.best_score_)
 

pd_cv = pd.DataFrame(gcv.cv_results_)
print(pd_cv.shape)
 

{'C': 4.211105263157895, 'multi_class': 'ovr', 'solver': 'newton-cg'}
0.6499446290143964
(200, 16)


In [81]:
# tst_df = pd.read_csv(r"C:\Users\DAI.STUDENTSDC\Desktop\Machine Learning\Data Sets\tst_Glass.csv")
tst_df = pd.DataFrame({
    'RI': {0: 1.5321, 1: 1.5212, 2: 1.5112, 3: 1.5, 4: 1.52, 5: 1.51},
    'Na': {0: 14.0, 1: 15.0, 2: 13.0, 3: 12.4, 4: 13.0, 5: 16.0},
    'Mg': {0: 0.0, 1: 3.0, 2: 3.5, 3: 1.23, 4: 2.4, 5: 2.7},
    'Al': {0: 0.34, 1: 1.23, 2: 2.3, 3: 3.22, 4: 0.34, 5: 4.0},
    'Si': {0: 70.23, 1: 75.9, 2: 73.0, 3: 74.22, 4: 71.22, 5: 70.0},
    'K': {0: 0.001, 1: 0.1, 2: 3.4, 3: 4.5, 4: 3.2, 5: 2.0},
    'Ca': {0: 6.7, 1: 7.0, 2: 14.0, 3: 10.0, 4: 9.0, 5: 6.0},
    'Ba': {0: 1.23, 1: 0.0, 2: 2.3, 3: 3.1, 4: 1.44, 5: 2.9},
    'Fe': {0: 0.0, 1: 0.44, 2: 0.22, 3: 0.1, 4: 0.001, 5: 0.89}
    }
)
tst_df.head(10)

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe
0,1.5321,14.0,0.0,0.34,70.23,0.001,6.7,1.23,0.0
1,1.5212,15.0,3.0,1.23,75.9,0.1,7.0,0.0,0.44
2,1.5112,13.0,3.5,2.3,73.0,3.4,14.0,2.3,0.22
3,1.5,12.4,1.23,3.22,74.22,4.5,10.0,3.1,0.1
4,1.52,13.0,2.4,0.34,71.22,3.2,9.0,1.44,0.001
5,1.51,16.0,2.7,4.0,70.0,2.0,6.0,2.9,0.89


Inferencing : Preding on the Unlabeled Data.

Unlabelled Data : The data that we have not labelled yet.

`.predict()` of any classisifcation model will return the class with the highest probability.

`.predict_proba()` will return the probability of each class.



In [107]:
lr_best = LogisticRegression(random_state=24, C=4.211105263157895, multi_class='ovr', solver='newton-cg')

lr_best.fit(X, y)
lr_best.score(X, y)
print(lr_best.coef_)
print(lr_best.intercept_)
print(lr_best.predict(tst_df))
print(lr_best.predict_proba(tst_df).shape)
# all the index with max probabilities
pd.DataFrame(
    lr_best.predict_proba(tst_df), 
    columns=['1', '2', '3','5', '6',' 7']
).idxmax(axis=1).values 


[[ 0.0123551   0.20842819  2.43011462 -1.90387155  1.34745638  0.54039402
   0.97597419  0.97464276 -1.03684327]
 [ 0.04334085 -1.29190458 -0.16990804  0.61149354 -0.82582306 -0.88670009
  -0.36218658 -1.60086424  1.30201044]
 [-0.04873692  0.12164573  0.77086466 -0.38798551 -0.36310504 -0.59247793
  -0.11149336 -0.79614014 -0.11833471]
 [-0.00687261 -1.13744967 -0.84023633  2.6318559  -0.45358029  1.08763935
   0.11127178 -1.81785069 -0.52340045]
 [-0.05969279  1.89047704  0.06615453  0.6393691   0.74084952 -3.14511428
   0.43921125 -3.88120924 -0.79494857]
 [ 0.03907187  0.67790933 -0.74742826  1.35922793  0.59348196 -0.20952724
  -0.37065465  1.84006822 -0.94786754]]
[-115.62128136   79.92116387   21.66987749   41.20593901  -86.90250019
  -52.56780406]
[2 1 1 7 1 7]
(6, 6)


array(['2', '1', '1', ' 7', '1', ' 7'], dtype=object)

or

In [97]:
lr_best = gcv.best_estimator_
print(lr_best.predict(tst_df))
print(lr_best.predict_proba(tst_df))


[2 1 1 7 1 7]
[[2.48844460e-05 6.15218132e-01 2.42757047e-02 1.09289469e-03
  5.80501619e-05 3.59330334e-01]
 [4.14677714e-01 1.09822046e-02 2.13503733e-02 7.26242320e-05
  3.95723808e-01 1.57193275e-01]
 [8.20931197e-01 3.49401009e-04 7.82255221e-04 5.52053068e-02
  1.28179599e-09 1.22731839e-01]
 [1.29408600e-01 1.64712946e-04 1.50056744e-05 3.80308284e-01
  2.55194112e-13 4.90103398e-01]
 [9.26919780e-01 2.46255976e-02 1.12969398e-02 9.80523286e-03
  1.32158405e-09 2.73524486e-02]
 [3.68655132e-05 1.82724983e-02 3.46333209e-03 3.10611262e-02
  1.83082656e-08 9.47166160e-01]]


#### Precision Recall F-1 Score

In [110]:
#Without pipeline
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=24, test_size=0.3, stratify=y['Type_of_glass'])
print(y_train['Type_of_glass'].value_counts(normalize=True)*100)
print(y_test['Type_of_glass'].value_counts(normalize=True)*100)

lr = LogisticRegression(solver='lbfgs', multi_class='ovr')

lr.fit(X_train, y_train['Type_of_glass'])

Type_of_glass
2    35.570470
1    32.885906
7    13.422819
3     8.053691
5     6.040268
6     4.026846
Name: proportion, dtype: float64
Type_of_glass
2    35.384615
1    32.307692
7    13.846154
3     7.692308
5     6.153846
6     4.615385
Name: proportion, dtype: float64


In [None]:
y_pred = lr.predict(X_test)
print(accuracy_score(y_test['Type_of_glass'], y_pred))
print(confusion_matrix(y_test['Type_of_glass'], y_pred))
print(classification_report(y_test['Type_of_glass'], y_pred))

0.5692307692307692
[[18  3  0  0  0  0]
 [13 10  0  0  0  0]
 [ 3  2  0  0  0  0]
 [ 0  2  0  1  0  1]
 [ 0  2  0  0  0  1]
 [ 0  1  0  0  0  8]]
              precision    recall  f1-score   support

           1       0.53      0.86      0.65        21
           2       0.50      0.43      0.47        23
           3       0.00      0.00      0.00         5
           5       1.00      0.25      0.40         4
           6       0.00      0.00      0.00         3
           7       0.80      0.89      0.84         9

    accuracy                           0.57        65
   macro avg       0.47      0.41      0.39        65
weighted avg       0.52      0.57      0.52        65



**Using F1 score as parameter scoring='f1_macro'**

In [116]:
gcv = GridSearchCV(
    estimator=lr,
    param_grid=params,
    cv=kfold, scoring="f1_macro"
)

In [117]:
gcv.fit(X, y['Type_of_glass'])

print(gcv.best_params_)
print(gcv.best_score_)

pd_cv = pd.DataFrame(gcv.cv_results_)
print(pd_cv.shape) 

{'C': 8.947473684210527, 'multi_class': 'ovr', 'solver': 'newton-cg'}
0.526898505456462
(200, 16)


In [118]:
lr_best = gcv.best_estimator_
print(lr_best.predict(tst_df))
print(lr_best.predict_proba(tst_df))


[2 6 1 1 1 7]
[[1.72290430e-07 6.73625582e-01 8.18956654e-03 8.53040880e-04
  6.40793377e-10 3.17331637e-01]
 [4.37183330e-01 2.10450337e-03 2.09082008e-02 1.01856432e-05
  4.65563901e-01 7.42298787e-02]
 [9.62227078e-01 2.05922284e-07 3.53053736e-04 3.11701969e-03
  4.48468439e-10 3.43026420e-02]
 [4.43744795e-01 4.13346672e-07 3.98199683e-06 1.24007518e-01
  1.37460048e-16 4.32243291e-01]
 [9.67362231e-01 8.90856905e-03 3.88821169e-03 2.48422064e-03
  4.65641349e-14 1.73567677e-02]
 [2.44105498e-04 1.89456572e-03 1.49755519e-03 8.59599540e-03
  1.37100418e-11 9.87767778e-01]]
