In [80]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import os


import warnings
warnings.filterwarnings('ignore')


In [81]:
pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.


In [82]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
glass_identification = fetch_ucirepo(id=42) 
  
# data (as pandas dataframes) 
X = glass_identification.data.features 
y = glass_identification.data.targets 
  
# metadata 
print(glass_identification.metadata) 
  
# variable information 
print(glass_identification.variables) 


{'uci_id': 42, 'name': 'Glass Identification', 'repository_url': 'https://archive.ics.uci.edu/dataset/42/glass+identification', 'data_url': 'https://archive.ics.uci.edu/static/public/42/data.csv', 'abstract': 'From USA Forensic Science Service; 6 types of glass; defined in terms of their oxide content (i.e. Na, Fe, K, etc)', 'area': 'Physics and Chemistry', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 214, 'num_features': 9, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['Type_of_glass'], 'index_col': ['Id_number'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1987, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5WW2P', 'creators': ['B. German'], 'intro_paper': None, 'additional_info': {'summary': 'Vina conducted a comparison test of her rule-based system, BEAGLE, the nearest-neighbor algorithm, and discriminant analysis.  BEAGLE is a product available through VRS Consulting, In

In [83]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RI      214 non-null    float64
 1   Na      214 non-null    float64
 2   Mg      214 non-null    float64
 3   Al      214 non-null    float64
 4   Si      214 non-null    float64
 5   K       214 non-null    float64
 6   Ca      214 non-null    float64
 7   Ba      214 non-null    float64
 8   Fe      214 non-null    float64
dtypes: float64(9)
memory usage: 15.2 KB


In [84]:
y['Type_of_glass'].value_counts()

Type_of_glass
2    76
1    70
7    29
3    17
5    13
6     9
Name: count, dtype: int64

In [85]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=24,test_size=0.3,stratify=y['Type_of_glass'])
print(y_train['Type_of_glass'].value_counts(normalize=True)*100)
print(y_test['Type_of_glass'].value_counts(normalize=True)*100)

Type_of_glass
2    35.570470
1    32.885906
7    13.422819
3     8.053691
5     6.040268
6     4.026846
Name: proportion, dtype: float64
Type_of_glass
2    35.384615
1    32.307692
7    13.846154
3     7.692308
5     6.153846
6     4.615385
Name: proportion, dtype: float64


In [86]:
lr = LogisticRegression(solver='lbfgs', multi_class='ovr')
lr.fit(X_train,y_train['Type_of_glass'])
y_pred = lr.predict(X_test)


In [112]:
lr = LogisticRegression(random_state=24)
kfold = StratifiedKFold(n_splits=5, random_state=24, shuffle=True)

params = {'solver':['lbfgs','liblinear','newton-cg','newton-cholesky','sag','saga'], 'multi_class':['ovr','multinomial'],'C':np.linspace(0.001, 10, 20)}

In [113]:
#gcv = GridSearchCV(lr, param_grid=params, cv=kfold)
gcv = GridSearchCV(lr, param_grid=params, cv=kfold, scoring='f1_macro')

In [114]:
gcv.fit(X,y['Type_of_glass'])

In [115]:
gcv.best_params_

{'C': 8.947473684210527, 'multi_class': 'ovr', 'solver': 'newton-cg'}

In [116]:
gcv.best_score_

0.526898505456462

In [117]:
pd_cv= pd.DataFrame(gcv.cv_results_)
print(pd_cv.shape)

(240, 16)


##

In [118]:
lr_best = LogisticRegression(random_state=24, C= 4.211105263157895, multi_class='ovr', solver= 'newton-cg')

In [119]:
lr_best.fit(X,y)

In [120]:
###Unlabelled Data
tst= pd.read_csv("tst_Glass.csv")
tst

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe
0,1.5321,14.0,0.0,0.34,70.23,0.001,6.7,1.23,0.0
1,1.5212,15.0,3.0,1.23,75.9,0.1,7.0,0.0,0.44
2,1.5112,13.0,3.5,2.3,73.0,3.4,14.0,2.3,0.22
3,1.5,12.4,1.23,3.22,74.22,4.5,10.0,3.1,0.1
4,1.52,13.0,2.4,0.34,71.22,3.2,9.0,1.44,0.001
5,1.51,16.0,2.7,4.0,70.0,2.0,6.0,2.9,0.89


In [121]:
y_pred_prob = lr_best.predict_proba(tst)
print(y_pred_prob.shape)

(6, 6)


In [122]:
y_pred_prob

array([[2.48844460e-05, 6.15218132e-01, 2.42757047e-02, 1.09289469e-03,
        5.80501619e-05, 3.59330334e-01],
       [4.14677714e-01, 1.09822046e-02, 2.13503733e-02, 7.26242320e-05,
        3.95723808e-01, 1.57193275e-01],
       [8.20931197e-01, 3.49401009e-04, 7.82255221e-04, 5.52053068e-02,
        1.28179599e-09, 1.22731839e-01],
       [1.29408600e-01, 1.64712946e-04, 1.50056744e-05, 3.80308284e-01,
        2.55194112e-13, 4.90103398e-01],
       [9.26919780e-01, 2.46255976e-02, 1.12969398e-02, 9.80523286e-03,
        1.32158405e-09, 2.73524486e-02],
       [3.68655132e-05, 1.82724983e-02, 3.46333209e-03, 3.10611262e-02,
        1.83082656e-08, 9.47166160e-01]])

In [123]:
tst.shape

(6, 9)

In [124]:
#y.unique

In [125]:
pd_probs =pd.DataFrame(y_pred_prob,columns=['1', '2','3','5','6','7'])
pd_probs

Unnamed: 0,1,2,3,5,6,7
0,2.5e-05,0.615218,0.024276,0.001093,5.805016e-05,0.35933
1,0.414678,0.010982,0.02135,7.3e-05,0.3957238,0.157193
2,0.820931,0.000349,0.000782,0.055205,1.281796e-09,0.122732
3,0.129409,0.000165,1.5e-05,0.380308,2.551941e-13,0.490103
4,0.92692,0.024626,0.011297,0.009805,1.321584e-09,0.027352
5,3.7e-05,0.018272,0.003463,0.031061,1.830827e-08,0.947166


In [126]:
predictions= lr_best.predict(tst)
predictions



array([2, 1, 1, 7, 1, 7], dtype=int64)

In [127]:
#If .predict() of any classification model gives us class of highest probability.
#.predict_proba() gives probabilities(only with classification)

In [128]:
tst.columns

Index(['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe'], dtype='object')

In [129]:
X.columns

Index(['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe'], dtype='object')

In [130]:
####### Simpler way of Inferencing

In [131]:
best_model= gcv.best_estimator_
print(best_model.predict(tst))
print(best_model.predict_proba(tst))

[2 6 1 1 1 7]
[[1.72290430e-07 6.73625582e-01 8.18956654e-03 8.53040880e-04
  6.40793377e-10 3.17331637e-01]
 [4.37183330e-01 2.10450337e-03 2.09082008e-02 1.01856432e-05
  4.65563901e-01 7.42298787e-02]
 [9.62227078e-01 2.05922284e-07 3.53053736e-04 3.11701969e-03
  4.48468439e-10 3.43026420e-02]
 [4.43744795e-01 4.13346672e-07 3.98199683e-06 1.24007518e-01
  1.37460048e-16 4.32243291e-01]
 [9.67362231e-01 8.90856905e-03 3.88821169e-03 2.48422064e-03
  4.65641349e-14 1.73567677e-02]
 [2.44105498e-04 1.89456572e-03 1.49755519e-03 8.59599540e-03
  1.37100418e-11 9.87767778e-01]]


In [132]:
print(accuracy_score(y_test['Type_of_glass'],y_pred))
print(confusion_matrix(y_test['Type_of_glass'],y_pred))
print(classification_report(y_test['Type_of_glass'],y_pred))     

0.5692307692307692
[[18  3  0  0  0  0]
 [13 10  0  0  0  0]
 [ 3  2  0  0  0  0]
 [ 0  2  0  1  0  1]
 [ 0  2  0  0  0  1]
 [ 0  1  0  0  0  8]]
              precision    recall  f1-score   support

           1       0.53      0.86      0.65        21
           2       0.50      0.43      0.47        23
           3       0.00      0.00      0.00         5
           5       1.00      0.25      0.40         4
           6       0.00      0.00      0.00         3
           7       0.80      0.89      0.84         9

    accuracy                           0.57        65
   macro avg       0.47      0.41      0.39        65
weighted avg       0.52      0.57      0.52        65

