In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from mlxtend.classifier import StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, classification_report, precision_recall_curve
from sklearn.model_selection import train_test_split
from imblearn.metrics import geometric_mean_score, sensitivity_score, specificity_score
from xgboost import XGBClassifier
from shaphypetune import BoostRFE
from sklearn.base import BaseEstimator
from sklearn.utils.metaestimators import if_delegate_has_method

import torch
import re, pickle, random, os
import warnings
warnings.filterwarnings('ignore')

from collections import Counter

In [2]:
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

RANDOM_STATE = 42
seed_everything(seed=RANDOM_STATE)

In [3]:
df = pd.read_csv("whitewine.csv",delimiter=';')
df.shape

(4898, 12)

In [4]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [5]:
df['quality'].value_counts()

6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: quality, dtype: int64

In [6]:
df.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [7]:
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,5.877909
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.885639
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,3.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,5.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,6.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,6.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,9.0


# Train and Test Split

In [8]:
x = df.drop(['quality'], axis=1)
y = df['quality']

X_train,X_test,y_train,y_test=train_test_split(x, y, train_size=0.8, stratify = y, random_state=100)

In [9]:
y_train.shape, y_test.shape

((3918,), (980,))

In [10]:
y_train.value_counts()/len(y_train)

6    0.448698
5    0.297601
7    0.179684
8    0.035733
4    0.033180
3    0.004084
9    0.001021
Name: quality, dtype: float64

In [11]:
y_test.value_counts()/len(y_test)

6    0.448980
5    0.296939
7    0.179592
8    0.035714
4    0.033673
3    0.004082
9    0.001020
Name: quality, dtype: float64

In [12]:
from sklearn.preprocessing import StandardScaler
Scaler_X = StandardScaler()
X_train = Scaler_X.fit_transform(X_train)
X_test = Scaler_X.transform(X_test)

# Model Building

In [13]:
model = list()
precision = list()
recall = list()
F1score = list()
AUCROC = list()
balanced_acc = list()
GMean = list()
sensi = list()
speci = list()

In [14]:
def test_eval(clf_model, X_test, y_test, algo=None):

    y_prob=clf_model.predict_proba(X_test)
    y_pred=clf_model.predict(X_test)

    print('Confusion Matrix')
    print('='*60)
    print(confusion_matrix(y_test,y_pred),"\n")
    print('Classification Report')
    print('='*60)
    print(classification_report(y_test,y_pred),"\n")
    print('='*60)
    print('AUC-ROC')
    print(roc_auc_score(y_test, y_prob, multi_class='ovr'), "\n")
    print('Balanced Accuracy')
    print(balanced_accuracy_score(y_test, y_pred),"\n")
    print('Geometric Mean Score')
    print(geometric_mean_score(y_test, y_pred, average='macro'),"\n")
    print('Sensitivity')
    print(sensitivity_score(y_test, y_pred, average='macro'),"\n")
    print('Specificity')
    print(specificity_score(y_test, y_pred, average='macro'))
          
    model.append(algo)
    precision.append(precision_score(y_test,y_pred, average='macro'))
    recall.append(recall_score(y_test,y_pred, average='macro'))
    F1score.append(f1_score(y_test,y_pred, average='macro'))
    AUCROC.append(roc_auc_score(y_test, y_prob, multi_class='ovr', average='macro'))
    balanced_acc.append(balanced_accuracy_score(y_test, y_pred))
    GMean.append(geometric_mean_score(y_test, y_pred, average='macro'))
    sensi.append(sensitivity_score(y_test, y_pred, average='macro'))
    speci.append(specificity_score(y_test, y_pred, average='macro'))


## Model-1: Logistic Regression

In [15]:
clf_LR = LogisticRegression(multi_class='ovr', solver='liblinear')
clf_LR.fit(X_train, y_train)

LogisticRegression(multi_class='ovr', solver='liblinear')

In [16]:
test_eval(clf_LR, X_test, y_test, 'Logistic Regression')

Confusion Matrix
[[  0   0   2   2   0   0   0]
 [  0   1  20  12   0   0   0]
 [  0   0 155 135   1   0   0]
 [  0   0 104 329   7   0   0]
 [  0   0   9 145  22   0   0]
 [  0   0   1  31   3   0   0]
 [  0   0   0   1   0   0   0]] 

Classification Report
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           4       1.00      0.03      0.06        33
           5       0.53      0.53      0.53       291
           6       0.50      0.75      0.60       440
           7       0.67      0.12      0.21       176
           8       0.00      0.00      0.00        35
           9       0.00      0.00      0.00         1

    accuracy                           0.52       980
   macro avg       0.39      0.21      0.20       980
weighted avg       0.54      0.52      0.47       980
 

AUC-ROC
0.7460896389274225 

Balanced Accuracy
0.20509662159146694 

Geometric Mean Score
0.4257043238895362 

Sensitivity
0.20509662159146694

## Model-2: Decision Tree

In [17]:
clf_DT = DecisionTreeClassifier()
clf_DT.fit(X_train, y_train)

DecisionTreeClassifier()

In [18]:
test_eval(clf_DT, X_test, y_test, 'Decision Tree')

Confusion Matrix
[[  0   1   1   1   1   0   0]
 [  0  11   9  11   2   0   0]
 [  0  14 162  91  21   3   0]
 [  0  10  80 277  63   9   1]
 [  0   2   5  60  99  10   0]
 [  0   0   4   9   8  14   0]
 [  0   0   0   1   0   0   0]] 

Classification Report
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           4       0.29      0.33      0.31        33
           5       0.62      0.56      0.59       291
           6       0.62      0.63      0.62       440
           7       0.51      0.56      0.54       176
           8       0.39      0.40      0.39        35
           9       0.00      0.00      0.00         1

    accuracy                           0.57       980
   macro avg       0.35      0.35      0.35       980
weighted avg       0.58      0.57      0.57       980
 

AUC-ROC
0.6319321986352724 

Balanced Accuracy
0.3545828312580889 

Geometric Mean Score
0.5678165479162134 

Sensitivity
0.3545828312580889 


## Model-3: Gaussian NB

In [19]:
clf_NB = GaussianNB()
clf_NB.fit(X_train, y_train)

GaussianNB()

In [20]:
test_eval(clf_NB, X_test, y_test, 'Gaussian NB')

Confusion Matrix
[[  1   0   0   1   2   0   0]
 [  0   4  14   9   6   0   0]
 [  3  15 162  77  34   0   0]
 [  3   8 133 135 159   2   0]
 [  0   0  19  42 113   2   0]
 [  1   0   2   9  22   1   0]
 [  0   0   0   0   1   0   0]] 

Classification Report
              precision    recall  f1-score   support

           3       0.12      0.25      0.17         4
           4       0.15      0.12      0.13        33
           5       0.49      0.56      0.52       291
           6       0.49      0.31      0.38       440
           7       0.34      0.64      0.44       176
           8       0.20      0.03      0.05        35
           9       0.00      0.00      0.00         1

    accuracy                           0.42       980
   macro avg       0.26      0.27      0.24       980
weighted avg       0.44      0.42      0.41       980
 

AUC-ROC
0.7565487359876609 

Balanced Accuracy
0.2721926024392887 

Geometric Mean Score
0.49046170963745095 

Sensitivity
0.2721926024392887 

## Model-4: K-Nearest Neighbour

In [21]:
clf_KNN = KNeighborsClassifier()
clf_KNN.fit(X_train, y_train)

KNeighborsClassifier()

In [22]:
test_eval(clf_KNN, X_test, y_test, 'KNN')

Confusion Matrix
[[  0   0   3   1   0   0   0]
 [  0   3  13  15   2   0   0]
 [  1   8 178  95   8   1   0]
 [  0   2 130 260  45   3   0]
 [  0   1  14  72  86   3   0]
 [  0   0   5   7  18   5   0]
 [  0   0   0   1   0   0   0]] 

Classification Report
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           4       0.21      0.09      0.13        33
           5       0.52      0.61      0.56       291
           6       0.58      0.59      0.58       440
           7       0.54      0.49      0.51       176
           8       0.42      0.14      0.21        35
           9       0.00      0.00      0.00         1

    accuracy                           0.54       980
   macro avg       0.32      0.27      0.29       980
weighted avg       0.53      0.54      0.53       980
 

AUC-ROC
0.6831905329090204 

Balanced Accuracy
0.27499936244413414 

Geometric Mean Score
0.497334200852124 

Sensitivity
0.27499936244413414 

## MODEL-5 Support Vector Classifier

In [23]:
clf_SVC = SVC(probability=True)
clf_SVC.fit(X_train, y_train)

SVC(probability=True)

In [24]:
test_eval(clf_SVC, X_test, y_test, 'SVC')

Confusion Matrix
[[  0   0   2   2   0   0   0]
 [  0   1  20  12   0   0   0]
 [  0   1 177 113   0   0   0]
 [  0   0  96 327  17   0   0]
 [  0   0   5 123  48   0   0]
 [  0   0   0  29   6   0   0]
 [  0   0   0   1   0   0   0]] 

Classification Report
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           4       0.50      0.03      0.06        33
           5       0.59      0.61      0.60       291
           6       0.54      0.74      0.62       440
           7       0.68      0.27      0.39       176
           8       0.00      0.00      0.00        35
           9       0.00      0.00      0.00         1

    accuracy                           0.56       980
   macro avg       0.33      0.24      0.24       980
weighted avg       0.56      0.56      0.53       980
 

AUC-ROC
0.8225047524369591 

Balanced Accuracy
0.23635136341321908 

Geometric Mean Score
0.4602332888971787 

Sensitivity
0.23635136341321908

# MODEL-6 XGBoost

In [25]:
class BoostRFEWrap(BaseEstimator, BoostRFE):

    @if_delegate_has_method(delegate='estimator')
    def predict_proba(self, X):
        return self.predict(X, method='predict_proba')


xgb_params = {'max_depth': (5,10),
          'learning_rate': (0.001, 0.3),
          'n_estimators': (25, 200),
          'reg_alpha' : (0.1, 1),
          'reg_lambda': (0.1, 1),
          'subsample': (0.5,  0.9),
          'colsample_bytree': (0.5,1),
          'min_child_weight': (0,10),
          'gamma': (0,1)}

clf_XGB = BoostRFEWrap(XGBClassifier(), param_grid=xgb_params, importance_type='shap_importances', train_importance=False, min_features_to_select=1, step=1)

In [26]:
xclf = clf_XGB.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=6, verbose=0)


512 trials detected for ('max_depth', 'learning_rate', 'n_estimators', 'reg_alpha', 'reg_lambda', 'subsample', 'colsample_bytree', 'min_child_weight', 'gamma')

trial: 0001 ### iterations: 00039 ### eval_score: 0.33367
trial: 0002 ### iterations: 00028 ### eval_score: 0.36224
trial: 0003 ### iterations: 00039 ### eval_score: 0.37551
trial: 0004 ### iterations: 00026 ### eval_score: 0.39694
trial: 0005 ### iterations: 00027 ### eval_score: 0.32041
trial: 0006 ### iterations: 00029 ### eval_score: 0.31837
trial: 0007 ### iterations: 00020 ### eval_score: 0.37551
trial: 0008 ### iterations: 00022 ### eval_score: 0.38367
trial: 0009 ### iterations: 00029 ### eval_score: 0.32959
trial: 0010 ### iterations: 00041 ### eval_score: 0.34286
trial: 0011 ### iterations: 00056 ### eval_score: 0.33265
trial: 0012 ### iterations: 00018 ### eval_score: 0.37653
trial: 0013 ### iterations: 00017 ### eval_score: 0.33469
trial: 0014 ### iterations: 00024 ### eval_score: 0.35
trial: 0015 ### iterations: 0

trial: 0141 ### iterations: 00015 ### eval_score: 0.38673
trial: 0142 ### iterations: 00004 ### eval_score: 0.39082
trial: 0143 ### iterations: 00002 ### eval_score: 0.44388
trial: 0144 ### iterations: 00002 ### eval_score: 0.44388
trial: 0145 ### iterations: 00017 ### eval_score: 0.37959
trial: 0146 ### iterations: 00012 ### eval_score: 0.39388
trial: 0147 ### iterations: 00004 ### eval_score: 0.44592
trial: 0148 ### iterations: 00004 ### eval_score: 0.45204
trial: 0149 ### iterations: 00012 ### eval_score: 0.39082
trial: 0150 ### iterations: 00007 ### eval_score: 0.3949
trial: 0151 ### iterations: 00012 ### eval_score: 0.43469
trial: 0152 ### iterations: 00012 ### eval_score: 0.43776
trial: 0153 ### iterations: 00007 ### eval_score: 0.37959
trial: 0154 ### iterations: 00021 ### eval_score: 0.37347
trial: 0155 ### iterations: 00024 ### eval_score: 0.42347
trial: 0156 ### iterations: 00016 ### eval_score: 0.42449
trial: 0157 ### iterations: 00004 ### eval_score: 0.39796
trial: 0158 ###

trial: 0283 ### iterations: 00026 ### eval_score: 0.4051
trial: 0284 ### iterations: 00023 ### eval_score: 0.41429
trial: 0285 ### iterations: 00049 ### eval_score: 0.36837
trial: 0286 ### iterations: 00041 ### eval_score: 0.37959
trial: 0287 ### iterations: 00063 ### eval_score: 0.37347
trial: 0288 ### iterations: 00022 ### eval_score: 0.41224
trial: 0289 ### iterations: 00026 ### eval_score: 0.39694
trial: 0290 ### iterations: 00013 ### eval_score: 0.41735
trial: 0291 ### iterations: 00041 ### eval_score: 0.39286
trial: 0292 ### iterations: 00026 ### eval_score: 0.41939
trial: 0293 ### iterations: 00050 ### eval_score: 0.34898
trial: 0294 ### iterations: 00026 ### eval_score: 0.3949
trial: 0295 ### iterations: 00015 ### eval_score: 0.41326
trial: 0296 ### iterations: 00023 ### eval_score: 0.40918
trial: 0297 ### iterations: 00062 ### eval_score: 0.35204
trial: 0298 ### iterations: 00046 ### eval_score: 0.38367
trial: 0299 ### iterations: 00040 ### eval_score: 0.3898
trial: 0300 ### i

trial: 0425 ### iterations: 00017 ### eval_score: 0.43469
trial: 0426 ### iterations: 00017 ### eval_score: 0.43469
trial: 0427 ### iterations: 00014 ### eval_score: 0.4449
trial: 0428 ### iterations: 00029 ### eval_score: 0.44184
trial: 0429 ### iterations: 00005 ### eval_score: 0.44082
trial: 0430 ### iterations: 00005 ### eval_score: 0.4398
trial: 0431 ### iterations: 00006 ### eval_score: 0.4449
trial: 0432 ### iterations: 00014 ### eval_score: 0.44286
trial: 0433 ### iterations: 00021 ### eval_score: 0.44694
trial: 0434 ### iterations: 00014 ### eval_score: 0.44592
trial: 0435 ### iterations: 00011 ### eval_score: 0.45408
trial: 0436 ### iterations: 00039 ### eval_score: 0.45408
trial: 0437 ### iterations: 00005 ### eval_score: 0.43163
trial: 0438 ### iterations: 00005 ### eval_score: 0.43061
trial: 0439 ### iterations: 00008 ### eval_score: 0.44898
trial: 0440 ### iterations: 00006 ### eval_score: 0.45
trial: 0441 ### iterations: 00008 ### eval_score: 0.4449
trial: 0442 ### itera

In [27]:
test_eval(xclf, X_test, y_test, 'XGB')

Confusion Matrix
[[  0   0   2   2   0   0   0]
 [  0  12  12   9   0   0   0]
 [  0   4 200  83   3   1   0]
 [  0   0  57 344  37   2   0]
 [  0   0   4  71  99   2   0]
 [  0   0   0  12  10  13   0]
 [  0   0   0   1   0   0   0]] 

Classification Report
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           4       0.75      0.36      0.49        33
           5       0.73      0.69      0.71       291
           6       0.66      0.78      0.72       440
           7       0.66      0.56      0.61       176
           8       0.72      0.37      0.49        35
           9       0.00      0.00      0.00         1

    accuracy                           0.68       980
   macro avg       0.50      0.40      0.43       980
weighted avg       0.68      0.68      0.67       980
 

AUC-ROC
0.8650982861038237 

Balanced Accuracy
0.3952383343215449 

Geometric Mean Score
0.6053361396687325 

Sensitivity
0.3952383343215449 


# MODEL-7 Stacking

In [28]:
clf_DT = DecisionTreeClassifier()
clf_SVC = SVC(probability=True)
clf_KNN = KNeighborsClassifier()
clf_LR = LogisticRegression()

sclf = StackingClassifier(classifiers=[clf_DT, clf_SVC, clf_KNN], use_probas=True, average_probas=False, meta_classifier=clf_LR)

In [29]:
sclf.fit(X_train, y_train)
test_eval(sclf, X_test, y_test, 'Stacking')

Confusion Matrix
[[  0   1   1   0   1   1   0]
 [  0  11   8  12   2   0   0]
 [  0  16 163  90  17   5   0]
 [  2  11  86 270  62   9   0]
 [  3   2   5  59  96  11   0]
 [  0   0   3  10   9  13   0]
 [  0   0   0   1   0   0   0]] 

Classification Report
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           4       0.27      0.33      0.30        33
           5       0.61      0.56      0.59       291
           6       0.61      0.61      0.61       440
           7       0.51      0.55      0.53       176
           8       0.33      0.37      0.35        35
           9       0.00      0.00      0.00         1

    accuracy                           0.56       980
   macro avg       0.33      0.35      0.34       980
weighted avg       0.57      0.56      0.57       980
 

AUC-ROC
0.7722694830717949 

Balanced Accuracy
0.34628432441392676 

Geometric Mean Score
0.5606705618884659 

Sensitivity
0.34628432441392676

# MODEL-8 Bagging

In [30]:
clf_bg = BaggingClassifier(clf_DT)

In [31]:
clf_bg.fit(X_train, y_train)
test_eval(clf_bg, X_test, y_test, 'Bagging')

Confusion Matrix
[[  0   0   2   2   0   0   0]
 [  0  12  12   8   1   0   0]
 [  0   5 200  83   3   0   0]
 [  0   3  95 308  30   4   0]
 [  0   0  11  68  91   6   0]
 [  0   0   3  10   8  14   0]
 [  0   0   0   1   0   0   0]] 

Classification Report
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           4       0.60      0.36      0.45        33
           5       0.62      0.69      0.65       291
           6       0.64      0.70      0.67       440
           7       0.68      0.52      0.59       176
           8       0.58      0.40      0.47        35
           9       0.00      0.00      0.00         1

    accuracy                           0.64       980
   macro avg       0.45      0.38      0.41       980
weighted avg       0.64      0.64      0.63       980
 

AUC-ROC
0.7435284534849114 

Balanced Accuracy
0.3811381487927879 

Geometric Mean Score
0.5917726027835611 

Sensitivity
0.3811381487927879 


In [32]:
clf_eval_df = pd.DataFrame({'model':model,
                            'precision':precision,
                            'recall':recall,
                            'f1-score':F1score,
                            'AUC-ROC':AUCROC,
                            'balanced_acc':balanced_acc,
                            'GMean':GMean,
                            'sensitivity':sensi,
                            'specificity':speci})
clf_eval_df

Unnamed: 0,model,precision,recall,f1-score,AUC-ROC,balanced_acc,GMean,sensitivity,specificity
0,Logistic Regression,0.385943,0.205097,0.200416,0.74609,0.205097,0.425704,0.205097,0.883604
1,Decision Tree,0.346417,0.354583,0.349827,0.631932,0.354583,0.567817,0.354583,0.909282
2,Gaussian NB,0.256268,0.272193,0.241567,0.756549,0.272193,0.490462,0.272193,0.883759
3,KNN,0.323897,0.274999,0.285569,0.683191,0.274999,0.497334,0.274999,0.899425
4,SVC,0.329253,0.236351,0.23849,0.822505,0.236351,0.460233,0.236351,0.896186
5,XGB,0.503275,0.395238,0.430212,0.865098,0.395238,0.605336,0.395238,0.927116
6,Stacking,0.334091,0.346284,0.3393,0.772269,0.346284,0.560671,0.346284,0.907784
7,Bagging,0.446915,0.381138,0.405348,0.743528,0.381138,0.591773,0.381138,0.918813


In [33]:
clf_eval_df.to_excel("D:\Skripsi\Final\project_whitewine_final.xlsx")