In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from mlxtend.classifier import StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, classification_report, precision_recall_curve
from sklearn.model_selection import train_test_split
from imblearn.metrics import geometric_mean_score, sensitivity_score, specificity_score
from xgboost import XGBClassifier
from shaphypetune import BoostRFE
from sklearn.base import BaseEstimator
from sklearn.utils.metaestimators import if_delegate_has_method

import torch
import re, pickle, random, os
import warnings
warnings.filterwarnings('ignore')

from collections import Counter

In [2]:
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

RANDOM_STATE = 42
seed_everything(seed=RANDOM_STATE)

In [3]:
df = pd.read_csv("glass.csv")
df.shape

(214, 10)

In [4]:
df.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,target
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [5]:
df['target'].value_counts()

2    76
1    70
7    29
3    17
5    13
6     9
Name: target, dtype: int64

In [6]:
df.isnull().sum()

RI        0
Na        0
Mg        0
Al        0
Si        0
K         0
Ca        0
Ba        0
Fe        0
target    0
dtype: int64

In [7]:
df.describe()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,target
count,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0
mean,1.518365,13.40785,2.684533,1.444907,72.650935,0.497056,8.956963,0.175047,0.057009,2.780374
std,0.003037,0.816604,1.442408,0.49927,0.774546,0.652192,1.423153,0.497219,0.097439,2.103739
min,1.51115,10.73,0.0,0.29,69.81,0.0,5.43,0.0,0.0,1.0
25%,1.516523,12.9075,2.115,1.19,72.28,0.1225,8.24,0.0,0.0,1.0
50%,1.51768,13.3,3.48,1.36,72.79,0.555,8.6,0.0,0.0,2.0
75%,1.519157,13.825,3.6,1.63,73.0875,0.61,9.1725,0.0,0.1,3.0
max,1.53393,17.38,4.49,3.5,75.41,6.21,16.19,3.15,0.51,7.0


# Train and Test Split

In [8]:
x = df.drop(['target'], axis=1)
y = df['target']

X_train,X_test,y_train,y_test=train_test_split(x, y, train_size=0.8, stratify = y, random_state=100)

In [9]:
y_train.shape, y_test.shape

((171,), (43,))

In [10]:
y_train.value_counts()/len(y_train)

2    0.356725
1    0.327485
7    0.134503
3    0.081871
5    0.058480
6    0.040936
Name: target, dtype: float64

In [11]:
y_test.value_counts()/len(y_test)

2    0.348837
1    0.325581
7    0.139535
3    0.069767
5    0.069767
6    0.046512
Name: target, dtype: float64

In [12]:
from sklearn.preprocessing import StandardScaler
Scaler_X = StandardScaler()
X_train = Scaler_X.fit_transform(X_train)
X_test = Scaler_X.transform(X_test)

# Model Building

In [13]:
model = list()
precision = list()
recall = list()
F1score = list()
AUCROC = list()
balanced_acc = list()
GMean = list()
sensi = list()
speci = list()

In [14]:
def test_eval(clf_model, X_test, y_test, algo=None):

    y_prob=clf_model.predict_proba(X_test)
    y_pred=clf_model.predict(X_test)

    print('Confusion Matrix')
    print('='*60)
    print(confusion_matrix(y_test,y_pred),"\n")
    print('Classification Report')
    print('='*60)
    print(classification_report(y_test,y_pred),"\n")
    print('='*60)
    print('AUC-ROC')
    print(roc_auc_score(y_test, y_prob, multi_class='ovr'), "\n")
    print('Balanced Accuracy')
    print(balanced_accuracy_score(y_test, y_pred),"\n")
    print('Geometric Mean Score')
    print(geometric_mean_score(y_test, y_pred, average='macro'),"\n")
    print('Sensitivity')
    print(sensitivity_score(y_test, y_pred, average='macro'),"\n")
    print('Specificity')
    print(specificity_score(y_test, y_pred, average='macro'))
          
    model.append(algo)
    precision.append(precision_score(y_test,y_pred, average='macro'))
    recall.append(recall_score(y_test,y_pred, average='macro'))
    F1score.append(f1_score(y_test,y_pred, average='macro'))
    AUCROC.append(roc_auc_score(y_test, y_prob, multi_class='ovr', average='macro'))
    balanced_acc.append(balanced_accuracy_score(y_test, y_pred))
    GMean.append(geometric_mean_score(y_test, y_pred, average='macro'))
    sensi.append(sensitivity_score(y_test, y_pred, average='macro'))
    speci.append(specificity_score(y_test, y_pred, average='macro'))


## Model-1: Logistic Regression

In [15]:
clf_LR = LogisticRegression(multi_class='ovr', solver='liblinear')
clf_LR.fit(X_train, y_train)

LogisticRegression(multi_class='ovr', solver='liblinear')

In [16]:
test_eval(clf_LR, X_test, y_test, 'Logistic Regression')

Confusion Matrix
[[11  3  0  0  0  0]
 [ 2 12  0  0  0  1]
 [ 2  1  0  0  0  0]
 [ 0  3  0  0  0  0]
 [ 0  2  0  0  0  0]
 [ 1  0  0  0  0  5]] 

Classification Report
              precision    recall  f1-score   support

           1       0.69      0.79      0.73        14
           2       0.57      0.80      0.67        15
           3       0.00      0.00      0.00         3
           5       0.00      0.00      0.00         3
           6       0.00      0.00      0.00         2
           7       0.83      0.83      0.83         6

    accuracy                           0.65        43
   macro avg       0.35      0.40      0.37        43
weighted avg       0.54      0.65      0.59        43
 

AUC-ROC
0.8568903187700497 

Balanced Accuracy
0.40317460317460324 

Geometric Mean Score
0.6067737509154467 

Sensitivity
0.40317460317460324 

Specificity
0.9131884347401589


## Model-2: Decision Tree

In [17]:
clf_DT = DecisionTreeClassifier()
clf_DT.fit(X_train, y_train)

DecisionTreeClassifier()

In [18]:
test_eval(clf_DT, X_test, y_test, 'Decision Tree')

Confusion Matrix
[[ 6  6  1  0  1  0]
 [ 2 12  0  1  0  0]
 [ 2  1  0  0  0  0]
 [ 0  2  0  1  0  0]
 [ 0  0  0  0  2  0]
 [ 1  0  0  0  0  5]] 

Classification Report
              precision    recall  f1-score   support

           1       0.55      0.43      0.48        14
           2       0.57      0.80      0.67        15
           3       0.00      0.00      0.00         3
           5       0.50      0.33      0.40         3
           6       0.67      1.00      0.80         2
           7       1.00      0.83      0.91         6

    accuracy                           0.60        43
   macro avg       0.55      0.57      0.54        43
weighted avg       0.58      0.60      0.58        43
 

AUC-ROC
0.7355837905669698 

Balanced Accuracy
0.5658730158730159 

Geometric Mean Score
0.7157386156255995 

Sensitivity
0.5658730158730159 

Specificity
0.9052945652609236


## Model-3: Gaussian NB

In [19]:
clf_NB = GaussianNB()
clf_NB.fit(X_train, y_train)

GaussianNB()

In [20]:
test_eval(clf_NB, X_test, y_test, 'Gaussian NB')

Confusion Matrix
[[7 3 3 0 1 0]
 [5 3 6 1 0 0]
 [1 1 0 0 1 0]
 [0 2 0 1 0 0]
 [0 0 0 0 2 0]
 [1 0 0 0 0 5]] 

Classification Report
              precision    recall  f1-score   support

           1       0.50      0.50      0.50        14
           2       0.33      0.20      0.25        15
           3       0.00      0.00      0.00         3
           5       0.50      0.33      0.40         3
           6       0.50      1.00      0.67         2
           7       1.00      0.83      0.91         6

    accuracy                           0.42        43
   macro avg       0.47      0.48      0.45        43
weighted avg       0.48      0.42      0.44        43
 

AUC-ROC
0.7765466205121377 

Balanced Accuracy
0.4777777777777778 

Geometric Mean Score
0.6462983529661894 

Sensitivity
0.4777777777777778 

Specificity
0.8742590812607633


## Model-4: K-Nearest Neighbour

In [21]:
clf_KNN = KNeighborsClassifier()
clf_KNN.fit(X_train, y_train)

KNeighborsClassifier()

In [22]:
test_eval(clf_KNN, X_test, y_test, 'KNN')

Confusion Matrix
[[11  3  0  0  0  0]
 [ 4 10  0  1  0  0]
 [ 2  1  0  0  0  0]
 [ 0  1  0  2  0  0]
 [ 0  0  1  0  1  0]
 [ 1  0  0  0  0  5]] 

Classification Report
              precision    recall  f1-score   support

           1       0.61      0.79      0.69        14
           2       0.67      0.67      0.67        15
           3       0.00      0.00      0.00         3
           5       0.67      0.67      0.67         3
           6       1.00      0.50      0.67         2
           7       1.00      0.83      0.91         6

    accuracy                           0.67        43
   macro avg       0.66      0.58      0.60        43
weighted avg       0.66      0.67      0.66        43
 

AUC-ROC
0.8821674777625157 

Balanced Accuracy
0.5753968253968255 

Geometric Mean Score
0.728236773437006 

Sensitivity
0.5753968253968255 

Specificity
0.9216748768472907


## MODEL-5 Support Vector Classifier

In [23]:
clf_SVC = SVC(probability=True)
clf_SVC.fit(X_train, y_train)

SVC(probability=True)

In [24]:
test_eval(clf_SVC, X_test, y_test, 'SVC')

Confusion Matrix
[[12  2  0  0  0  0]
 [ 2 13  0  0  0  0]
 [ 2  1  0  0  0  0]
 [ 0  1  0  2  0  0]
 [ 0  0  0  0  2  0]
 [ 1  0  0  0  0  5]] 

Classification Report
              precision    recall  f1-score   support

           1       0.71      0.86      0.77        14
           2       0.76      0.87      0.81        15
           3       0.00      0.00      0.00         3
           5       1.00      0.67      0.80         3
           6       1.00      1.00      1.00         2
           7       1.00      0.83      0.91         6

    accuracy                           0.79        43
   macro avg       0.75      0.70      0.72        43
weighted avg       0.75      0.79      0.76        43
 

AUC-ROC
0.8931602168671134 

Balanced Accuracy
0.703968253968254 

Geometric Mean Score
0.8166872915927297 

Sensitivity
0.703968253968254 

Specificity
0.9474548440065682


# MODEL-6 XGBoost

In [25]:
class BoostRFEWrap(BaseEstimator, BoostRFE):

    @if_delegate_has_method(delegate='estimator')
    def predict_proba(self, X):
        return self.predict(X, method='predict_proba')


xgb_params = {'max_depth': (5,10),
          'learning_rate': (0.001, 0.3),
          'n_estimators': (25, 200),
          'reg_alpha' : (0.1, 1),
          'reg_lambda': (0.1, 1),
          'subsample': (0.5,  0.9),
          'colsample_bytree': (0.5,1),
          'min_child_weight': (0,10),
          'gamma': (0,1)}

clf_XGB = BoostRFEWrap(XGBClassifier(), param_grid=xgb_params, importance_type='shap_importances', train_importance=False, min_features_to_select=1, step=1)

In [26]:
xclf = clf_XGB.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=6, verbose=0)


512 trials detected for ('max_depth', 'learning_rate', 'n_estimators', 'reg_alpha', 'reg_lambda', 'subsample', 'colsample_bytree', 'min_child_weight', 'gamma')

trial: 0001 ### iterations: 00013 ### eval_score: 0.23256
trial: 0002 ### iterations: 00005 ### eval_score: 0.2093
trial: 0003 ### iterations: 00006 ### eval_score: 0.30233
trial: 0004 ### iterations: 00009 ### eval_score: 0.30233
trial: 0005 ### iterations: 00002 ### eval_score: 0.16279
trial: 0006 ### iterations: 00006 ### eval_score: 0.18605
trial: 0007 ### iterations: 00002 ### eval_score: 0.32558
trial: 0008 ### iterations: 00002 ### eval_score: 0.32558
trial: 0009 ### iterations: 00007 ### eval_score: 0.23256
trial: 0010 ### iterations: 00006 ### eval_score: 0.27907
trial: 0011 ### iterations: 00002 ### eval_score: 0.27907
trial: 0012 ### iterations: 00002 ### eval_score: 0.27907
trial: 0013 ### iterations: 00004 ### eval_score: 0.23256
trial: 0014 ### iterations: 00005 ### eval_score: 0.2093
trial: 0015 ### iterations: 

trial: 0140 ### iterations: 00006 ### eval_score: 0.27907
trial: 0141 ### iterations: 00006 ### eval_score: 0.18605
trial: 0142 ### iterations: 00005 ### eval_score: 0.18605
trial: 0143 ### iterations: 00001 ### eval_score: 0.32558
trial: 0144 ### iterations: 00001 ### eval_score: 0.32558
trial: 0145 ### iterations: 00005 ### eval_score: 0.2093
trial: 0146 ### iterations: 00006 ### eval_score: 0.2093
trial: 0147 ### iterations: 00003 ### eval_score: 0.32558
trial: 0148 ### iterations: 00001 ### eval_score: 0.34884
trial: 0149 ### iterations: 00001 ### eval_score: 0.2093
trial: 0150 ### iterations: 00001 ### eval_score: 0.18605
trial: 0151 ### iterations: 00003 ### eval_score: 0.30233
trial: 0152 ### iterations: 00003 ### eval_score: 0.30233
trial: 0153 ### iterations: 00007 ### eval_score: 0.23256
trial: 0154 ### iterations: 00003 ### eval_score: 0.23256
trial: 0155 ### iterations: 00005 ### eval_score: 0.30233
trial: 0156 ### iterations: 00008 ### eval_score: 0.30233
trial: 0157 ### i

trial: 0282 ### iterations: 00006 ### eval_score: 0.23256
trial: 0283 ### iterations: 00002 ### eval_score: 0.30233
trial: 0284 ### iterations: 00002 ### eval_score: 0.30233
trial: 0285 ### iterations: 00002 ### eval_score: 0.18605
trial: 0286 ### iterations: 00004 ### eval_score: 0.2093
trial: 0287 ### iterations: 00004 ### eval_score: 0.27907
trial: 0288 ### iterations: 00004 ### eval_score: 0.27907
trial: 0289 ### iterations: 00006 ### eval_score: 0.23256
trial: 0290 ### iterations: 00001 ### eval_score: 0.2093
trial: 0291 ### iterations: 00002 ### eval_score: 0.32558
trial: 0292 ### iterations: 00001 ### eval_score: 0.34884
trial: 0293 ### iterations: 00011 ### eval_score: 0.2093
trial: 0294 ### iterations: 00002 ### eval_score: 0.2093
trial: 0295 ### iterations: 00006 ### eval_score: 0.30233
trial: 0296 ### iterations: 00002 ### eval_score: 0.32558
trial: 0297 ### iterations: 00005 ### eval_score: 0.2093
trial: 0298 ### iterations: 00006 ### eval_score: 0.23256
trial: 0299 ### ite

trial: 0424 ### iterations: 00006 ### eval_score: 0.32558
trial: 0425 ### iterations: 00006 ### eval_score: 0.2093
trial: 0426 ### iterations: 00007 ### eval_score: 0.18605
trial: 0427 ### iterations: 00008 ### eval_score: 0.30233
trial: 0428 ### iterations: 00008 ### eval_score: 0.30233
trial: 0429 ### iterations: 00004 ### eval_score: 0.18605
trial: 0430 ### iterations: 00004 ### eval_score: 0.18605
trial: 0431 ### iterations: 00001 ### eval_score: 0.32558
trial: 0432 ### iterations: 00010 ### eval_score: 0.30233
trial: 0433 ### iterations: 00001 ### eval_score: 0.25581
trial: 0434 ### iterations: 00004 ### eval_score: 0.25581
trial: 0435 ### iterations: 00006 ### eval_score: 0.32558
trial: 0436 ### iterations: 00002 ### eval_score: 0.41861
trial: 0437 ### iterations: 00005 ### eval_score: 0.27907
trial: 0438 ### iterations: 00001 ### eval_score: 0.27907
trial: 0439 ### iterations: 00006 ### eval_score: 0.32558
trial: 0440 ### iterations: 00006 ### eval_score: 0.32558
trial: 0441 ###

In [27]:
test_eval(xclf, X_test, y_test, 'XGB')

Confusion Matrix
[[11  3  0  0  0  0]
 [ 0 15  0  0  0  0]
 [ 2  0  1  0  0  0]
 [ 0  0  0  2  0  1]
 [ 0  0  0  0  2  0]
 [ 1  0  0  0  0  5]] 

Classification Report
              precision    recall  f1-score   support

           1       0.79      0.79      0.79        14
           2       0.83      1.00      0.91        15
           3       1.00      0.33      0.50         3
           5       1.00      0.67      0.80         3
           6       1.00      1.00      1.00         2
           7       0.83      0.83      0.83         6

    accuracy                           0.84        43
   macro avg       0.91      0.77      0.80        43
weighted avg       0.85      0.84      0.83        43
 

AUC-ROC
0.8595036539002057 

Balanced Accuracy
0.7698412698412698 

Geometric Mean Score
0.859856514482821 

Sensitivity
0.7698412698412698 

Specificity
0.9603969733280078


# MODEL-7 Stacking

In [28]:
clf_DT = DecisionTreeClassifier()
clf_SVC = SVC(probability=True)
clf_KNN = KNeighborsClassifier()
clf_LR = LogisticRegression()

sclf = StackingClassifier(classifiers=[clf_DT, clf_SVC, clf_KNN], use_probas=True, average_probas=False, meta_classifier=clf_LR)

In [30]:
sclf.fit(X_train, y_train)
test_eval(sclf, X_test, y_test, 'Stacking')

Confusion Matrix
[[ 6  7  1  0  0  0]
 [ 2 10  2  0  0  1]
 [ 2  1  0  0  0  0]
 [ 0  2  0  1  0  0]
 [ 0  0  0  0  2  0]
 [ 1  0  0  0  0  5]] 

Classification Report
              precision    recall  f1-score   support

           1       0.55      0.43      0.48        14
           2       0.50      0.67      0.57        15
           3       0.00      0.00      0.00         3
           5       1.00      0.33      0.50         3
           6       1.00      1.00      1.00         2
           7       0.83      0.83      0.83         6

    accuracy                           0.56        43
   macro avg       0.65      0.54      0.56        43
weighted avg       0.58      0.56      0.55        43
 

AUC-ROC
0.832205173153449 

Balanced Accuracy
0.5436507936507936 

Geometric Mean Score
0.6974410123800467 

Sensitivity
0.5436507936507936 

Specificity
0.8947360537877779


# MODEL-8 Bagging

In [31]:
clf_bg = BaggingClassifier(clf_DT)

In [32]:
clf_bg.fit(X_train, y_train)
test_eval(clf_bg, X_test, y_test, 'Bagging')

Confusion Matrix
[[11  3  0  0  0  0]
 [ 1 13  0  0  0  1]
 [ 2  1  0  0  0  0]
 [ 0  2  0  1  0  0]
 [ 0  0  0  0  2  0]
 [ 1  0  0  0  0  5]] 

Classification Report
              precision    recall  f1-score   support

           1       0.73      0.79      0.76        14
           2       0.68      0.87      0.76        15
           3       0.00      0.00      0.00         3
           5       1.00      0.33      0.50         3
           6       1.00      1.00      1.00         2
           7       0.83      0.83      0.83         6

    accuracy                           0.74        43
   macro avg       0.71      0.64      0.64        43
weighted avg       0.71      0.74      0.71        43
 

AUC-ROC
0.8882024882024883 

Balanced Accuracy
0.6365079365079366 

Geometric Mean Score
0.7721890901718468 

Sensitivity
0.6365079365079366 

Specificity
0.9367927040340832


In [33]:
clf_eval_df = pd.DataFrame({'model':model,
                            'precision':precision,
                            'recall':recall,
                            'f1-score':F1score,
                            'AUC-ROC':AUCROC,
                            'balanced_acc':balanced_acc,
                            'GMean':GMean,
                            'sensitivity':sensi,
                            'specificity':speci})
clf_eval_df

Unnamed: 0,model,precision,recall,f1-score,AUC-ROC,balanced_acc,GMean,sensitivity,specificity
0,Logistic Regression,0.34871,0.403175,0.372222,0.85689,0.403175,0.606774,0.403175,0.913188
1,Decision Tree,0.547258,0.565873,0.542626,0.735584,0.565873,0.715739,0.565873,0.905295
2,Gaussian NB,0.472222,0.477778,0.454293,0.776547,0.477778,0.646298,0.477778,0.874259
3,KNN,0.657407,0.575397,0.599432,0.882167,0.575397,0.728237,0.575397,0.921675
4,SVC,0.745098,0.703968,0.715964,0.89316,0.703968,0.816687,0.703968,0.947455
5,XGB,0.90873,0.769841,0.80469,0.859504,0.769841,0.859857,0.769841,0.960397
6,Stacking,0.646465,0.543651,0.564127,0.832205,0.543651,0.697441,0.543651,0.894736
7,Bagging,0.70848,0.636508,0.642777,0.888202,0.636508,0.772189,0.636508,0.936793


In [34]:
clf_eval_df.to_excel("D:\Skripsi\Final\project_glass_final.xlsx")