In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from mlxtend.classifier import StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, classification_report, precision_recall_curve
from sklearn.model_selection import train_test_split
from imblearn.metrics import geometric_mean_score, sensitivity_score, specificity_score
from xgboost import XGBClassifier
from shaphypetune import BoostRFE
from sklearn.base import BaseEstimator
from sklearn.utils.metaestimators import if_delegate_has_method

import torch
import re, pickle, random, os
import warnings
warnings.filterwarnings('ignore')

from collections import Counter

In [2]:
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

RANDOM_STATE = 42
seed_everything(seed=RANDOM_STATE)

In [3]:
df = pd.read_csv("redwine.csv",delimiter=';')
df.shape

(1599, 12)

In [4]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [5]:
df['quality'].value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

In [6]:
df.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [7]:
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


# Train and Test Split

In [8]:
x = df.drop(['quality'], axis=1)
y = df['quality']

X_train,X_test,y_train,y_test=train_test_split(x, y, train_size=0.8, stratify = y, random_state=100)

In [9]:
y_train.shape, y_test.shape

((1279,), (320,))

In [10]:
y_train.value_counts()/len(y_train)

5    0.426114
6    0.398749
7    0.124316
4    0.032838
8    0.011728
3    0.006255
Name: quality, dtype: float64

In [11]:
y_test.value_counts()/len(y_test)

5    0.425000
6    0.400000
7    0.125000
4    0.034375
8    0.009375
3    0.006250
Name: quality, dtype: float64

In [12]:
from sklearn.preprocessing import StandardScaler
Scaler_X = StandardScaler()
X_train = Scaler_X.fit_transform(X_train)
X_test = Scaler_X.transform(X_test)

# Model Building

In [13]:
model = list()
precision = list()
recall = list()
F1score = list()
AUCROC = list()
balanced_acc = list()
GMean = list()
sensi = list()
speci = list()

In [14]:
def test_eval(clf_model, X_test, y_test, algo=None):

    y_prob=clf_model.predict_proba(X_test)
    y_pred=clf_model.predict(X_test)

    print('Confusion Matrix')
    print('='*60)
    print(confusion_matrix(y_test,y_pred),"\n")
    print('Classification Report')
    print('='*60)
    print(classification_report(y_test,y_pred),"\n")
    print('='*60)
    print('AUC-ROC')
    print(roc_auc_score(y_test, y_prob, multi_class='ovr'), "\n")
    print('Balanced Accuracy')
    print(balanced_accuracy_score(y_test, y_pred),"\n")
    print('Geometric Mean Score')
    print(geometric_mean_score(y_test, y_pred, average='macro'),"\n")
    print('Sensitivity')
    print(sensitivity_score(y_test, y_pred, average='macro'),"\n")
    print('Specificity')
    print(specificity_score(y_test, y_pred, average='macro'))
          
    model.append(algo)
    precision.append(precision_score(y_test,y_pred, average='macro'))
    recall.append(recall_score(y_test,y_pred, average='macro'))
    F1score.append(f1_score(y_test,y_pred, average='macro'))
    AUCROC.append(roc_auc_score(y_test, y_prob, multi_class='ovr', average='macro'))
    balanced_acc.append(balanced_accuracy_score(y_test, y_pred))
    GMean.append(geometric_mean_score(y_test, y_pred, average='macro'))
    sensi.append(sensitivity_score(y_test, y_pred, average='macro'))
    speci.append(specificity_score(y_test, y_pred, average='macro'))


## Model-1: Logistic Regression

In [15]:
clf_LR = LogisticRegression(multi_class='ovr', solver='liblinear')
clf_LR.fit(X_train, y_train)

LogisticRegression(multi_class='ovr', solver='liblinear')

In [16]:
test_eval(clf_LR, X_test, y_test, 'Logistic Regression')

Confusion Matrix
[[  0   0   2   0   0   0]
 [  0   0   9   1   1   0]
 [  0   0 110  25   1   0]
 [  0   0  48  78   2   0]
 [  0   0   1  33   6   0]
 [  0   0   0   0   3   0]] 

Classification Report
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00        11
           5       0.65      0.81      0.72       136
           6       0.57      0.61      0.59       128
           7       0.46      0.15      0.23        40
           8       0.00      0.00      0.00         3

    accuracy                           0.61       320
   macro avg       0.28      0.26      0.26       320
weighted avg       0.56      0.61      0.57       320
 

AUC-ROC
0.8181440947737783 

Balanced Accuracy
0.26136642156862744 

Geometric Mean Score
0.48237614379785465 

Sensitivity
0.26136642156862744 

Specificity
0.890270229468599


## Model-2: Decision Tree

In [17]:
clf_DT = DecisionTreeClassifier()
clf_DT.fit(X_train, y_train)

DecisionTreeClassifier()

In [18]:
test_eval(clf_DT, X_test, y_test, 'Decision Tree')

Confusion Matrix
[[ 0  2  0  0  0  0]
 [ 0  2  6  2  1  0]
 [ 0  7 97 28  4  0]
 [ 0  7 32 72 17  0]
 [ 0  3  6 10 20  1]
 [ 0  0  0  3  0  0]] 

Classification Report
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.10      0.18      0.12        11
           5       0.69      0.71      0.70       136
           6       0.63      0.56      0.59       128
           7       0.48      0.50      0.49        40
           8       0.00      0.00      0.00         3

    accuracy                           0.60       320
   macro avg       0.31      0.33      0.32       320
weighted avg       0.61      0.60      0.60       320
 

AUC-ROC
0.6126041693314006 

Balanced Accuracy
0.32625891265597146 

Geometric Mean Score
0.5415627962311462 

Sensitivity
0.32625891265597146 

Specificity
0.8989494260068298


## Model-3: Gaussian NB

In [19]:
clf_NB = GaussianNB()
clf_NB.fit(X_train, y_train)

GaussianNB()

In [20]:
test_eval(clf_NB, X_test, y_test, 'Gaussian NB')

Confusion Matrix
[[ 0  0  2  0  0  0]
 [ 0  2  4  4  1  0]
 [ 1  5 89 36  5  0]
 [ 0  3 35 74 14  2]
 [ 0  0  1 14 19  6]
 [ 0  0  0  0  2  1]] 

Classification Report
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.20      0.18      0.19        11
           5       0.68      0.65      0.67       136
           6       0.58      0.58      0.58       128
           7       0.46      0.47      0.47        40
           8       0.11      0.33      0.17         3

    accuracy                           0.58       320
   macro avg       0.34      0.37      0.35       320
weighted avg       0.59      0.58      0.58       320
 

AUC-ROC
0.7025279754401752 

Balanced Accuracy
0.3704480466428996 

Geometric Mean Score
0.5751419644663356 

Sensitivity
0.3704480466428996 

Specificity
0.8929410811796378


## Model-4: K-Nearest Neighbour

In [21]:
clf_KNN = KNeighborsClassifier()
clf_KNN.fit(X_train, y_train)

KNeighborsClassifier()

In [22]:
test_eval(clf_KNN, X_test, y_test, 'KNN')

Confusion Matrix
[[ 0  0  1  1  0  0]
 [ 0  2  5  4  0  0]
 [ 1  1 90 43  1  0]
 [ 0  0 56 63  8  1]
 [ 0  0  5 16 19  0]
 [ 0  0  0  2  1  0]] 

Classification Report
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.67      0.18      0.29        11
           5       0.57      0.66      0.61       136
           6       0.49      0.49      0.49       128
           7       0.66      0.47      0.55        40
           8       0.00      0.00      0.00         3

    accuracy                           0.54       320
   macro avg       0.40      0.30      0.32       320
weighted avg       0.54      0.54      0.54       320
 

AUC-ROC
0.6317945795063217 

Balanced Accuracy
0.30179506461675576 

Geometric Mean Score
0.5137248634458315 

Sensitivity
0.30179506461675576 

Specificity
0.8744783008879784


## MODEL-5 Support Vector Classifier

In [23]:
clf_SVC = SVC(probability=True)
clf_SVC.fit(X_train, y_train)

SVC(probability=True)

In [24]:
test_eval(clf_SVC, X_test, y_test, 'SVC')

Confusion Matrix
[[  0   0   2   0   0   0]
 [  0   0   9   2   0   0]
 [  0   0 106  29   1   0]
 [  0   0  48  75   5   0]
 [  0   0   2  25  13   0]
 [  0   0   0   2   1   0]] 

Classification Report
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00        11
           5       0.63      0.78      0.70       136
           6       0.56      0.59      0.57       128
           7       0.65      0.33      0.43        40
           8       0.00      0.00      0.00         3

    accuracy                           0.61       320
   macro avg       0.31      0.28      0.28       320
weighted avg       0.58      0.61      0.58       320
 

AUC-ROC
0.8311091612733549 

Balanced Accuracy
0.28172487745098035 

Geometric Mean Score
0.5007999985652012 

Sensitivity
0.28172487745098035 

Specificity
0.8902324879227054


# MODEL-6 XGBoost

In [25]:
class BoostRFEWrap(BaseEstimator, BoostRFE):

    @if_delegate_has_method(delegate='estimator')
    def predict_proba(self, X):
        return self.predict(X, method='predict_proba')


xgb_params = {'max_depth': (5,10),
          'learning_rate': (0.001, 0.3),
          'n_estimators': (25, 200),
          'reg_alpha' : (0.1, 1),
          'reg_lambda': (0.1, 1),
          'subsample': (0.5,  0.9),
          'colsample_bytree': (0.5,1),
          'min_child_weight': (0,10),
          'gamma': (0,1)}

clf_XGB = BoostRFEWrap(XGBClassifier(), param_grid=xgb_params, importance_type='shap_importances', train_importance=False, min_features_to_select=1, step=1)

In [26]:
xclf = clf_XGB.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=6, verbose=0)


512 trials detected for ('max_depth', 'learning_rate', 'n_estimators', 'reg_alpha', 'reg_lambda', 'subsample', 'colsample_bytree', 'min_child_weight', 'gamma')

trial: 0001 ### iterations: 00022 ### eval_score: 0.3125
trial: 0002 ### iterations: 00017 ### eval_score: 0.325
trial: 0003 ### iterations: 00034 ### eval_score: 0.34063
trial: 0004 ### iterations: 00017 ### eval_score: 0.34063
trial: 0005 ### iterations: 00021 ### eval_score: 0.30625
trial: 0006 ### iterations: 00014 ### eval_score: 0.3125
trial: 0007 ### iterations: 00026 ### eval_score: 0.325
trial: 0008 ### iterations: 00016 ### eval_score: 0.34375
trial: 0009 ### iterations: 00021 ### eval_score: 0.31562
trial: 0010 ### iterations: 00014 ### eval_score: 0.31562
trial: 0011 ### iterations: 00030 ### eval_score: 0.31562
trial: 0012 ### iterations: 00016 ### eval_score: 0.31875
trial: 0013 ### iterations: 00019 ### eval_score: 0.30938
trial: 0014 ### iterations: 00016 ### eval_score: 0.3
trial: 0015 ### iterations: 00012 ##

trial: 0141 ### iterations: 00001 ### eval_score: 0.3125
trial: 0142 ### iterations: 00001 ### eval_score: 0.31562
trial: 0143 ### iterations: 00000 ### eval_score: 0.3875
trial: 0144 ### iterations: 00000 ### eval_score: 0.38125
trial: 0145 ### iterations: 00006 ### eval_score: 0.35625
trial: 0146 ### iterations: 00008 ### eval_score: 0.35625
trial: 0147 ### iterations: 00016 ### eval_score: 0.38438
trial: 0148 ### iterations: 00007 ### eval_score: 0.4
trial: 0149 ### iterations: 00002 ### eval_score: 0.35313
trial: 0150 ### iterations: 00002 ### eval_score: 0.35313
trial: 0151 ### iterations: 00002 ### eval_score: 0.36875
trial: 0152 ### iterations: 00002 ### eval_score: 0.3625
trial: 0153 ### iterations: 00001 ### eval_score: 0.35938
trial: 0154 ### iterations: 00035 ### eval_score: 0.32812
trial: 0155 ### iterations: 00011 ### eval_score: 0.38438
trial: 0156 ### iterations: 00009 ### eval_score: 0.38438
trial: 0157 ### iterations: 00003 ### eval_score: 0.35
trial: 0158 ### iteratio

trial: 0284 ### iterations: 00040 ### eval_score: 0.33125
trial: 0285 ### iterations: 00032 ### eval_score: 0.3125
trial: 0286 ### iterations: 00016 ### eval_score: 0.33437
trial: 0287 ### iterations: 00016 ### eval_score: 0.34375
trial: 0288 ### iterations: 00019 ### eval_score: 0.33437
trial: 0289 ### iterations: 00037 ### eval_score: 0.325
trial: 0290 ### iterations: 00014 ### eval_score: 0.34063
trial: 0291 ### iterations: 00032 ### eval_score: 0.36875
trial: 0292 ### iterations: 00026 ### eval_score: 0.34687
trial: 0293 ### iterations: 00015 ### eval_score: 0.35625
trial: 0294 ### iterations: 00014 ### eval_score: 0.34375
trial: 0295 ### iterations: 00010 ### eval_score: 0.36562
trial: 0296 ### iterations: 00028 ### eval_score: 0.34375
trial: 0297 ### iterations: 00042 ### eval_score: 0.30938
trial: 0298 ### iterations: 00015 ### eval_score: 0.325
trial: 0299 ### iterations: 00021 ### eval_score: 0.34375
trial: 0300 ### iterations: 00031 ### eval_score: 0.34063
trial: 0301 ### ite

trial: 0427 ### iterations: 00010 ### eval_score: 0.38438
trial: 0428 ### iterations: 00008 ### eval_score: 0.38438
trial: 0429 ### iterations: 00010 ### eval_score: 0.3875
trial: 0430 ### iterations: 00009 ### eval_score: 0.3875
trial: 0431 ### iterations: 00001 ### eval_score: 0.4
trial: 0432 ### iterations: 00001 ### eval_score: 0.4
trial: 0433 ### iterations: 00009 ### eval_score: 0.38438
trial: 0434 ### iterations: 00008 ### eval_score: 0.38438
trial: 0435 ### iterations: 00003 ### eval_score: 0.39375
trial: 0436 ### iterations: 00004 ### eval_score: 0.39687
trial: 0437 ### iterations: 00010 ### eval_score: 0.3625
trial: 0438 ### iterations: 00010 ### eval_score: 0.35938
trial: 0439 ### iterations: 00001 ### eval_score: 0.37812
trial: 0440 ### iterations: 00006 ### eval_score: 0.37812
trial: 0441 ### iterations: 00011 ### eval_score: 0.40313
trial: 0442 ### iterations: 00014 ### eval_score: 0.37812
trial: 0443 ### iterations: 00014 ### eval_score: 0.38438
trial: 0444 ### iteration

In [27]:
test_eval(xclf, X_test, y_test, 'XGB')

Confusion Matrix
[[  0   0   2   0   0   0]
 [  0   1   9   1   0   0]
 [  0   3 110  22   1   0]
 [  0   0  30  91   7   0]
 [  0   0   1  16  23   0]
 [  0   0   0   1   2   0]] 

Classification Report
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.25      0.09      0.13        11
           5       0.72      0.81      0.76       136
           6       0.69      0.71      0.70       128
           7       0.70      0.57      0.63        40
           8       0.00      0.00      0.00         3

    accuracy                           0.70       320
   macro avg       0.39      0.36      0.37       320
weighted avg       0.68      0.70      0.69       320
 

AUC-ROC
0.8078887198012291 

Balanced Accuracy
0.36427835338680925 

Geometric Mean Score
0.5788036049697353 

Sensitivity
0.36427835338680925 

Specificity
0.9196637955871809


# MODEL-7 Stacking

In [28]:
clf_DT = DecisionTreeClassifier()
clf_SVC = SVC(probability=True)
clf_KNN = KNeighborsClassifier()
clf_LR = LogisticRegression()

sclf = StackingClassifier(classifiers=[clf_DT, clf_SVC, clf_KNN], use_probas=True, average_probas=False, meta_classifier=clf_LR)

In [29]:
sclf.fit(X_train, y_train)
test_eval(sclf, X_test, y_test, 'Stacking')

Confusion Matrix
[[  0   1   1   0   0   0]
 [  0   2   6   2   1   0]
 [  0   6 100  28   2   0]
 [  0   8  30  74  16   0]
 [  0   3   7  12  17   1]
 [  0   0   0   2   1   0]] 

Classification Report
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.10      0.18      0.13        11
           5       0.69      0.74      0.71       136
           6       0.63      0.58      0.60       128
           7       0.46      0.42      0.44        40
           8       0.00      0.00      0.00         3

    accuracy                           0.60       320
   macro avg       0.31      0.32      0.31       320
weighted avg       0.61      0.60      0.60       320
 

AUC-ROC
0.79329669278576 

Balanced Accuracy
0.32003954991087347 

Geometric Mean Score
0.5366331877206278 

Sensitivity
0.32003954991087347 

Specificity
0.8998112209675325


# MODEL-8 Bagging

In [30]:
clf_bg = BaggingClassifier(clf_DT)

In [31]:
clf_bg.fit(X_train, y_train)
test_eval(clf_bg, X_test, y_test, 'Bagging')

Confusion Matrix
[[  0   0   2   0   0   0]
 [  0   0   8   3   0   0]
 [  0   1 107  27   1   0]
 [  0   2  41  78   7   0]
 [  0   1   3  16  19   1]
 [  0   0   0   1   2   0]] 

Classification Report
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00        11
           5       0.66      0.79      0.72       136
           6       0.62      0.61      0.62       128
           7       0.66      0.47      0.55        40
           8       0.00      0.00      0.00         3

    accuracy                           0.64       320
   macro avg       0.32      0.31      0.31       320
weighted avg       0.61      0.64      0.62       320
 

AUC-ROC
0.6963390346563427 

Balanced Accuracy
0.3118566176470588 

Geometric Mean Score
0.5302700847786508 

Sensitivity
0.3118566176470588 

Specificity
0.9016527047997034


In [32]:
clf_eval_df = pd.DataFrame({'model':model,
                            'precision':precision,
                            'recall':recall,
                            'f1-score':F1score,
                            'AUC-ROC':AUCROC,
                            'balanced_acc':balanced_acc,
                            'GMean':GMean,
                            'sensitivity':sensi,
                            'specificity':speci})
clf_eval_df

Unnamed: 0,model,precision,recall,f1-score,AUC-ROC,balanced_acc,GMean,sensitivity,specificity
0,Logistic Regression,0.279657,0.261366,0.255675,0.818144,0.261366,0.482376,0.261366,0.89027
1,Decision Tree,0.314243,0.326259,0.317626,0.612604,0.326259,0.541563,0.326259,0.898949
2,Gaussian NB,0.338673,0.370448,0.345178,0.702528,0.370448,0.575142,0.370448,0.892941
3,KNN,0.397243,0.301795,0.323508,0.631795,0.301795,0.513725,0.301795,0.874478
4,SVC,0.308107,0.281725,0.284619,0.831109,0.281725,0.5008,0.281725,0.890232
5,XGB,0.394218,0.364278,0.371677,0.807889,0.364278,0.578804,0.364278,0.919664
6,Stacking,0.313504,0.32004,0.314417,0.793297,0.32004,0.536633,0.32004,0.899811
7,Bagging,0.323961,0.311857,0.314644,0.696339,0.311857,0.53027,0.311857,0.901653


In [33]:
clf_eval_df.to_excel("D:\Skripsi\Final\project_redwine_final.xlsx")