In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from mlxtend.classifier import StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, classification_report, precision_recall_curve
from sklearn.model_selection import train_test_split
from imblearn.metrics import geometric_mean_score, sensitivity_score, specificity_score
from xgboost import XGBClassifier
from shaphypetune import BoostRFE
from sklearn.base import BaseEstimator
from sklearn.utils.metaestimators import if_delegate_has_method

import torch
import re, pickle, random, os
import warnings
warnings.filterwarnings('ignore')

from collections import Counter

In [2]:
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

RANDOM_STATE = 42
seed_everything(seed=RANDOM_STATE)

In [3]:
col_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'acceptability']
df = pd.read_csv('car.csv', names = col_names)
df.shape

(1728, 7)

In [4]:
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,acceptability
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [5]:
df['acceptability'].value_counts()

unacc    1210
acc       384
good       69
vgood      65
Name: acceptability, dtype: int64

In [6]:
df.isnull().sum()

buying           0
maint            0
doors            0
persons          0
lug_boot         0
safety           0
acceptability    0
dtype: int64

In [7]:
df.describe()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,acceptability
count,1728,1728,1728,1728,1728,1728,1728
unique,4,4,4,3,3,3,4
top,vhigh,vhigh,2,2,small,low,unacc
freq,432,432,432,576,576,576,1210


In [8]:
df['buying'] = df['buying'].replace('vhigh', 3)
df['buying'] = df['buying'].replace('high', 2)
df['buying'] = df['buying'].replace('med', 1)
df['buying'] = df['buying'].replace('low', 0)
df['maint'] = df['maint'].replace('vhigh', 3)
df['maint'] = df['maint'].replace('high', 2)
df['maint'] = df['maint'].replace('med', 1)
df['maint'] = df['maint'].replace('low', 0)
df['doors'] = df['doors'].replace('5more', 3)
df['doors'] = df['doors'].replace('4', 2)
df['doors'] = df['doors'].replace('3', 1)
df['doors'] = df['doors'].replace('2', 0)
df['persons'] = df['persons'].replace('more', 2)
df['persons'] = df['persons'].replace('4', 1)
df['persons'] = df['persons'].replace('2', 0)
df['lug_boot'] = df['lug_boot'].replace('big', 2)
df['lug_boot'] = df['lug_boot'].replace('med', 1)
df['lug_boot'] = df['lug_boot'].replace('small', 0)
df['safety'] = df['safety'].replace('high', 2)
df['safety'] = df['safety'].replace('med', 1)
df['safety'] = df['safety'].replace('low', 0)
df['acceptability'] = df['acceptability'].replace('vgood', 3)
df['acceptability'] = df['acceptability'].replace('good', 2)
df['acceptability'] = df['acceptability'].replace('acc', 1)
df['acceptability'] = df['acceptability'].replace('unacc', 0)

df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,acceptability
0,3,3,0,0,0,0,0
1,3,3,0,0,0,1,0
2,3,3,0,0,0,2,0
3,3,3,0,0,1,0,0
4,3,3,0,0,1,1,0


# Train and Test Split

In [9]:
x = df.drop(['acceptability'], axis=1)
y = df['acceptability']

X_train,X_test,y_train,y_test=train_test_split(x, y, train_size=0.8, stratify = y, random_state=100)

In [10]:
y_train.shape, y_test.shape

((1382,), (346,))

In [11]:
y_train.value_counts()/len(y_train)

0    0.700434
1    0.222142
2    0.039797
3    0.037627
Name: acceptability, dtype: float64

In [12]:
y_test.value_counts()/len(y_test)

0    0.699422
1    0.222543
2    0.040462
3    0.037572
Name: acceptability, dtype: float64

In [13]:
from sklearn.preprocessing import StandardScaler
Scaler_X = StandardScaler()
X_train = Scaler_X.fit_transform(X_train)
X_test = Scaler_X.transform(X_test)

# Model Building

In [14]:
model = list()
precision = list()
recall = list()
F1score = list()
AUCROC = list()
balanced_acc = list()
GMean = list()
sensi = list()
speci = list()

In [15]:
def test_eval(clf_model, X_test, y_test, algo=None):

    y_prob=clf_model.predict_proba(X_test)
    y_pred=clf_model.predict(X_test)

    print('Confusion Matrix')
    print('='*60)
    print(confusion_matrix(y_test,y_pred),"\n")
    print('Classification Report')
    print('='*60)
    print(classification_report(y_test,y_pred),"\n")
    print('='*60)
    print('AUC-ROC')
    print(roc_auc_score(y_test, y_prob, multi_class='ovr'), "\n")
    print('Balanced Accuracy')
    print(balanced_accuracy_score(y_test, y_pred),"\n")
    print('Geometric Mean Score')
    print(geometric_mean_score(y_test, y_pred, average='macro'),"\n")
    print('Sensitivity')
    print(sensitivity_score(y_test, y_pred, average='macro'),"\n")
    print('Specificity')
    print(specificity_score(y_test, y_pred, average='macro'))
          
    model.append(algo)
    precision.append(precision_score(y_test,y_pred, average='macro'))
    recall.append(recall_score(y_test,y_pred, average='macro'))
    F1score.append(f1_score(y_test,y_pred, average='macro'))
    AUCROC.append(roc_auc_score(y_test, y_prob, multi_class='ovr', average='macro'))
    balanced_acc.append(balanced_accuracy_score(y_test, y_pred))
    GMean.append(geometric_mean_score(y_test, y_pred, average='macro'))
    sensi.append(sensitivity_score(y_test, y_pred, average='macro'))
    speci.append(specificity_score(y_test, y_pred, average='macro'))


## Model-1: Logistic Regression

In [16]:
clf_LR = LogisticRegression(multi_class='ovr', solver='liblinear')
clf_LR.fit(X_train, y_train)

LogisticRegression(multi_class='ovr', solver='liblinear')

In [17]:
test_eval(clf_LR, X_test, y_test, 'Logistic Regression')

Confusion Matrix
[[224  18   0   0]
 [ 27  50   0   0]
 [  1  12   1   0]
 [  0  11   0   2]] 

Classification Report
              precision    recall  f1-score   support

           0       0.89      0.93      0.91       242
           1       0.55      0.65      0.60        77
           2       1.00      0.07      0.13        14
           3       1.00      0.15      0.27        13

    accuracy                           0.80       346
   macro avg       0.86      0.45      0.48       346
weighted avg       0.82      0.80      0.78       346
 

AUC-ROC
0.9414287702743995 

Balanced Accuracy
0.45006130233402963 

Geometric Mean Score
0.6345230796894841 

Sensitivity
0.45006130233402963 

Specificity
0.8945882184729768


## Model-2: Decision Tree

In [18]:
clf_DT = DecisionTreeClassifier()
clf_DT.fit(X_train, y_train)

DecisionTreeClassifier()

In [19]:
test_eval(clf_DT, X_test, y_test, 'Decision Tree')

Confusion Matrix
[[239   2   1   0]
 [  2  73   2   0]
 [  0   3  11   0]
 [  0   0   0  13]] 

Classification Report
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       242
           1       0.94      0.95      0.94        77
           2       0.79      0.79      0.79        14
           3       1.00      1.00      1.00        13

    accuracy                           0.97       346
   macro avg       0.93      0.93      0.93       346
weighted avg       0.97      0.97      0.97       346
 

AUC-ROC
0.9593144081434348 

Balanced Accuracy
0.9303423848878394 

Geometric Mean Score
0.9588768197949443 

Sensitivity
0.9303423848878394 

Specificity
0.9882864313990305


## Model-3: Gaussian NB

In [20]:
clf_NB = GaussianNB()
clf_NB.fit(X_train, y_train)

GaussianNB()

In [21]:
test_eval(clf_NB, X_test, y_test, 'Gaussian NB')

Confusion Matrix
[[202  10   1  29]
 [ 23  18   1  35]
 [  1   4   0   9]
 [  0   0   0  13]] 

Classification Report
              precision    recall  f1-score   support

           0       0.89      0.83      0.86       242
           1       0.56      0.23      0.33        77
           2       0.00      0.00      0.00        14
           3       0.15      1.00      0.26        13

    accuracy                           0.67       346
   macro avg       0.40      0.52      0.36       346
weighted avg       0.76      0.67      0.69       346
 

AUC-ROC
0.8703461421570892 

Balanced Accuracy
0.5171192443919717 

Geometric Mean Score
0.6718911453741685 

Sensitivity
0.5171192443919717 

Specificity
0.8729857109901451


## Model-4: K-Nearest Neighbour

In [22]:
clf_KNN = KNeighborsClassifier()
clf_KNN.fit(X_train, y_train)

KNeighborsClassifier()

In [23]:
test_eval(clf_KNN, X_test, y_test, 'KNN')

Confusion Matrix
[[238   4   0   0]
 [  3  74   0   0]
 [  0   4  10   0]
 [  0   2   0  11]] 

Classification Report
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       242
           1       0.88      0.96      0.92        77
           2       1.00      0.71      0.83        14
           3       1.00      0.85      0.92        13

    accuracy                           0.96       346
   macro avg       0.97      0.88      0.91       346
weighted avg       0.96      0.96      0.96       346
 

AUC-ROC
0.9959723067594207 

Balanced Accuracy
0.8762373989646718 

Geometric Mean Score
0.9283183231036639 

Sensitivity
0.8762373989646718 

Specificity
0.9834947812410637


## MODEL-5 Support Vector Classifier

In [24]:
clf_SVC = SVC(probability=True)
clf_SVC.fit(X_train, y_train)

SVC(probability=True)

In [25]:
test_eval(clf_SVC, X_test, y_test, 'SVC')

Confusion Matrix
[[236   6   0   0]
 [  1  76   0   0]
 [  0   3   9   2]
 [  0   1   0  12]] 

Classification Report
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       242
           1       0.88      0.99      0.93        77
           2       1.00      0.64      0.78        14
           3       0.86      0.92      0.89        13

    accuracy                           0.96       346
   macro avg       0.93      0.88      0.90       346
weighted avg       0.97      0.96      0.96       346
 

AUC-ROC
0.9978953917979918 

Balanced Accuracy
0.8820384161293253 

Geometric Mean Score
0.9329503558171994 

Sensitivity
0.8820384161293253 

Specificity
0.9868009720472546


# MODEL-6 XGBoost

In [26]:
class BoostRFEWrap(BaseEstimator, BoostRFE):

    @if_delegate_has_method(delegate='estimator')
    def predict_proba(self, X):
        return self.predict(X, method='predict_proba')


xgb_params = {'max_depth': (5,10),
          'learning_rate': (0.001, 0.3),
          'n_estimators': (25, 200),
          'reg_alpha' : (0.1, 1),
          'reg_lambda': (0.1, 1),
          'subsample': (0.5,  0.9),
          'colsample_bytree': (0.5,1),
          'min_child_weight': (0,10),
          'gamma': (0,1)}

clf_XGB = BoostRFEWrap(XGBClassifier(), param_grid=xgb_params, importance_type='shap_importances', train_importance=False, min_features_to_select=1, step=1)

In [27]:
xclf = clf_XGB.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=6, verbose=0)


512 trials detected for ('max_depth', 'learning_rate', 'n_estimators', 'reg_alpha', 'reg_lambda', 'subsample', 'colsample_bytree', 'min_child_weight', 'gamma')

trial: 0001 ### iterations: 00043 ### eval_score: 0.03468
trial: 0002 ### iterations: 00024 ### eval_score: 0.0578
trial: 0003 ### iterations: 00027 ### eval_score: 0.07803
trial: 0004 ### iterations: 00025 ### eval_score: 0.09249
trial: 0005 ### iterations: 00007 ### eval_score: 0.02601
trial: 0006 ### iterations: 00004 ### eval_score: 0.04046
trial: 0007 ### iterations: 00021 ### eval_score: 0.0578
trial: 0008 ### iterations: 00008 ### eval_score: 0.07225
trial: 0009 ### iterations: 00032 ### eval_score: 0.03179
trial: 0010 ### iterations: 00041 ### eval_score: 0.0289
trial: 0011 ### iterations: 00036 ### eval_score: 0.05202
trial: 0012 ### iterations: 00020 ### eval_score: 0.06358
trial: 0013 ### iterations: 00006 ### eval_score: 0.02312
trial: 0014 ### iterations: 00006 ### eval_score: 0.02023
trial: 0015 ### iterations: 0

trial: 0140 ### iterations: 00000 ### eval_score: 0.2052
trial: 0141 ### iterations: 00003 ### eval_score: 0.02312
trial: 0142 ### iterations: 00003 ### eval_score: 0.02023
trial: 0143 ### iterations: 00000 ### eval_score: 0.09249
trial: 0144 ### iterations: 00000 ### eval_score: 0.09249
trial: 0145 ### iterations: 00001 ### eval_score: 0.20231
trial: 0146 ### iterations: 00000 ### eval_score: 0.19075
trial: 0147 ### iterations: 00000 ### eval_score: 0.19653
trial: 0148 ### iterations: 00001 ### eval_score: 0.19075
trial: 0149 ### iterations: 00004 ### eval_score: 0.04335
trial: 0150 ### iterations: 00009 ### eval_score: 0.04046
trial: 0151 ### iterations: 00001 ### eval_score: 0.12428
trial: 0152 ### iterations: 00001 ### eval_score: 0.12428
trial: 0153 ### iterations: 00000 ### eval_score: 0.2052
trial: 0154 ### iterations: 00000 ### eval_score: 0.19653
trial: 0155 ### iterations: 00000 ### eval_score: 0.2052
trial: 0156 ### iterations: 00000 ### eval_score: 0.19653
trial: 0157 ### i

trial: 0282 ### iterations: 00037 ### eval_score: 0.03757
trial: 0283 ### iterations: 00033 ### eval_score: 0.05491
trial: 0284 ### iterations: 00020 ### eval_score: 0.06647
trial: 0285 ### iterations: 00023 ### eval_score: 0.0578
trial: 0286 ### iterations: 00025 ### eval_score: 0.0289
trial: 0287 ### iterations: 00022 ### eval_score: 0.04624
trial: 0288 ### iterations: 00018 ### eval_score: 0.05202
trial: 0289 ### iterations: 00036 ### eval_score: 0.04335
trial: 0290 ### iterations: 00041 ### eval_score: 0.06069
trial: 0291 ### iterations: 00025 ### eval_score: 0.08959
trial: 0292 ### iterations: 00025 ### eval_score: 0.09249
trial: 0293 ### iterations: 00036 ### eval_score: 0.02023
trial: 0294 ### iterations: 00023 ### eval_score: 0.06069
trial: 0295 ### iterations: 00021 ### eval_score: 0.06358
trial: 0296 ### iterations: 00003 ### eval_score: 0.09249
trial: 0297 ### iterations: 00036 ### eval_score: 0.03468
trial: 0298 ### iterations: 00024 ### eval_score: 0.0578
trial: 0299 ### i

trial: 0424 ### iterations: 00002 ### eval_score: 0.13295
trial: 0425 ### iterations: 00000 ### eval_score: 0.19075
trial: 0426 ### iterations: 00000 ### eval_score: 0.19075
trial: 0427 ### iterations: 00000 ### eval_score: 0.19075
trial: 0428 ### iterations: 00000 ### eval_score: 0.19075
trial: 0429 ### iterations: 00000 ### eval_score: 0.08959
trial: 0430 ### iterations: 00000 ### eval_score: 0.08671
trial: 0431 ### iterations: 00000 ### eval_score: 0.10405
trial: 0432 ### iterations: 00000 ### eval_score: 0.10116
trial: 0433 ### iterations: 00000 ### eval_score: 0.2052
trial: 0434 ### iterations: 00000 ### eval_score: 0.2052
trial: 0435 ### iterations: 00000 ### eval_score: 0.2052
trial: 0436 ### iterations: 00000 ### eval_score: 0.2052
trial: 0437 ### iterations: 00003 ### eval_score: 0.12717
trial: 0438 ### iterations: 00003 ### eval_score: 0.12717
trial: 0439 ### iterations: 00002 ### eval_score: 0.13295
trial: 0440 ### iterations: 00002 ### eval_score: 0.13295
trial: 0441 ### it

In [28]:
test_eval(xclf, X_test, y_test, 'XGB')

Confusion Matrix
[[240   2   0   0]
 [  0  77   0   0]
 [  0   0  13   1]
 [  0   0   0  13]] 

Classification Report
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       242
           1       0.97      1.00      0.99        77
           2       1.00      0.93      0.96        14
           3       0.93      1.00      0.96        13

    accuracy                           0.99       346
   macro avg       0.98      0.98      0.98       346
weighted avg       0.99      0.99      0.99       346
 

AUC-ROC
0.9994053235107178 

Balanced Accuracy
0.9800767414403778 

Geometric Mean Score
0.988695728781396 

Sensitivity
0.9800767414403778 

Specificity
0.9973905131897698


# MODEL-7 Stacking

In [29]:
clf_DT = DecisionTreeClassifier()
clf_SVC = SVC(probability=True)
clf_KNN = KNeighborsClassifier()
clf_LR = LogisticRegression()

sclf = StackingClassifier(classifiers=[clf_DT, clf_SVC, clf_KNN], use_probas=True, average_probas=False, meta_classifier=clf_LR)

In [30]:
sclf.fit(X_train, y_train)
test_eval(sclf, X_test, y_test, 'Stacking')

Confusion Matrix
[[239   2   1   0]
 [  2  74   1   0]
 [  0   3  11   0]
 [  0   0   0  13]] 

Classification Report
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       242
           1       0.94      0.96      0.95        77
           2       0.85      0.79      0.81        14
           3       1.00      1.00      1.00        13

    accuracy                           0.97       346
   macro avg       0.94      0.93      0.94       346
weighted avg       0.97      0.97      0.97       346
 

AUC-ROC
0.997710928758412 

Balanced Accuracy
0.9335891381345927 

Geometric Mean Score
0.960914398679201 

Sensitivity
0.9335891381345927 

Specificity
0.9890394434472233


# MODEL-8 Bagging

In [31]:
clf_bg = BaggingClassifier(clf_DT)

In [32]:
clf_bg.fit(X_train, y_train)
test_eval(clf_bg, X_test, y_test, 'Bagging')

Confusion Matrix
[[239   2   1   0]
 [  1  74   2   0]
 [  0   0  13   1]
 [  0   0   0  13]] 

Classification Report
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       242
           1       0.97      0.96      0.97        77
           2       0.81      0.93      0.87        14
           3       0.93      1.00      0.96        13

    accuracy                           0.98       346
   macro avg       0.93      0.97      0.95       346
weighted avg       0.98      0.98      0.98       346
 

AUC-ROC
0.9986689269863918 

Balanced Accuracy
0.9693034238488785 

Geometric Mean Score
0.980945610914472 

Sensitivity
0.9693034238488785 

Specificity
0.9927276308913453


In [33]:
clf_eval_df = pd.DataFrame({'model':model,
                            'precision':precision,
                            'recall':recall,
                            'f1-score':F1score,
                            'AUC-ROC':AUCROC,
                            'balanced_acc':balanced_acc,
                            'GMean':GMean,
                            'sensitivity':sensi,
                            'specificity':speci})
clf_eval_df

Unnamed: 0,model,precision,recall,f1-score,AUC-ROC,balanced_acc,GMean,sensitivity,specificity
0,Logistic Regression,0.859585,0.450061,0.47553,0.941429,0.450061,0.634523,0.450061,0.894588
1,Decision Tree,0.928328,0.930342,0.929324,0.959314,0.930342,0.958877,0.930342,0.988286
2,Gaussian NB,0.401867,0.517119,0.364037,0.870346,0.517119,0.671891,0.517119,0.872986
3,KNN,0.967126,0.876237,0.91369,0.995972,0.876237,0.928318,0.876237,0.983495
4,SVC,0.934161,0.882038,0.89735,0.997895,0.882038,0.93295,0.882038,0.986801
5,XGB,0.975814,0.980077,0.977239,0.999405,0.980077,0.988696,0.980077,0.997391
6,Stacking,0.943641,0.933589,0.938295,0.997711,0.933589,0.960914,0.933589,0.989039
7,Bagging,0.927647,0.969303,0.947163,0.998669,0.969303,0.980946,0.969303,0.992728


In [34]:
clf_eval_df.to_excel("D:\Skripsi\Final\project_car_final.xlsx")