In [139]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import warnings

In [140]:
df = pd.read_csv('winequality-red.csv')

In [141]:
num_cols = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol']

In [142]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [143]:
X = df.drop(['quality'], axis=1)
X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


In [144]:
y = df['quality']
y.head()

0    5
1    5
2    5
3    6
4    5
Name: quality, dtype: int64

In [145]:
y.unique()

array([5, 6, 7, 4, 8, 3], dtype=int64)

In [146]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

num_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    [
        ('StandardScaler', num_transformer, num_cols)
    ]
)

In [147]:
X = preprocessor.fit_transform(X)
X.shape

(1599, 11)

In [148]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_test.shape, y_test.shape

((320, 11), (320,))

In [149]:
y_train.value_counts()

quality
5    551
6    506
7    157
4     43
8     13
3      9
Name: count, dtype: int64

In [150]:
y_test.value_counts()

quality
6    132
5    130
7     42
4     10
8      5
3      1
Name: count, dtype: int64

In [151]:
def evaluate_model(true, predicted):
    cm = confusion_matrix(true, predicted)
    cr = classification_report(true, predicted)
    acc_score = accuracy_score(true, predicted)
    return cm, cr, acc_score


In [152]:
models = {
    'DecisionTree Classifier' : DecisionTreeClassifier(),
    'KNeighbor Classifier' : KNeighborsClassifier(),
    'Random Forest Classifier': RandomForestClassifier(),
    'Adaboost Classifier' : AdaBoostClassifier(),
    'Support Vector Classifier' : SVC(),
    'CatBoost Classifier' : CatBoostClassifier(allow_writing_files=False, silent=True)
    # 'XGB Classifier' : XGBClassifier()
    
}

In [153]:
model_list = []
accuracy_list = []

for i in range(len(models)):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_cm, train_cr, train_as = evaluate_model(y_train, y_train_pred)
    test_cm, test_cr, test_as = evaluate_model(y_test, y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model performance for training set:')
    print('Confusion Matrix: {}'.format(train_cm))
    print('Classification Report: {}'.format(train_cr))
    print('Accuracy Score: {}'.format(train_as))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print('Confusion Matrix: {}'.format(test_cm))
    print('Classification Report: {}'.format(test_cr))
    print('Accuracy Score: {}'.format(test_as))

    accuracy_list.append(test_as)
    
    print('='*35)
    print('\n')

DecisionTree Classifier
Model performance for training set:
Confusion Matrix: [[  9   0   0   0   0   0]
 [  0  43   0   0   0   0]
 [  0   0 551   0   0   0]
 [  0   0   0 506   0   0]
 [  0   0   0   0 157   0]
 [  0   0   0   0   0  13]]
Classification Report:               precision    recall  f1-score   support

           3       1.00      1.00      1.00         9
           4       1.00      1.00      1.00        43
           5       1.00      1.00      1.00       551
           6       1.00      1.00      1.00       506
           7       1.00      1.00      1.00       157
           8       1.00      1.00      1.00        13

    accuracy                           1.00      1279
   macro avg       1.00      1.00      1.00      1279
weighted avg       1.00      1.00      1.00      1279

Accuracy Score: 1.0
----------------------------------
Model performance for Test set
Confusion Matrix: [[ 0  0  0  1  0  0]
 [ 0  0  4  6  0  0]
 [ 1  4 86 35  4  0]
 [ 0  3 40 71 17  1]
 [ 0 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Random Forest Classifier
Model performance for training set:
Confusion Matrix: [[  9   0   0   0   0   0]
 [  0  43   0   0   0   0]
 [  0   0 551   0   0   0]
 [  0   0   0 506   0   0]
 [  0   0   0   0 157   0]
 [  0   0   0   0   0  13]]
Classification Report:               precision    recall  f1-score   support

           3       1.00      1.00      1.00         9
           4       1.00      1.00      1.00        43
           5       1.00      1.00      1.00       551
           6       1.00      1.00      1.00       506
           7       1.00      1.00      1.00       157
           8       1.00      1.00      1.00        13

    accuracy                           1.00      1279
   macro avg       1.00      1.00      1.00      1279
weighted avg       1.00      1.00      1.00      1279

Accuracy Score: 1.0
----------------------------------
Model performance for Test set
Confusion Matrix: [[  0   0   1   0   0   0]
 [  0   0   6   4   0   0]
 [  0   0 100  29   1   0]
 [  0  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Support Vector Classifier
Model performance for training set:
Confusion Matrix: [[  0   0   6   3   0   0]
 [  0   1  28  13   1   0]
 [  0   0 455  94   2   0]
 [  0   0 134 358  14   0]
 [  0   0   9  93  55   0]
 [  0   0   0   9   4   0]]
Classification Report:               precision    recall  f1-score   support

           3       0.00      0.00      0.00         9
           4       1.00      0.02      0.05        43
           5       0.72      0.83      0.77       551
           6       0.63      0.71      0.67       506
           7       0.72      0.35      0.47       157
           8       0.00      0.00      0.00        13

    accuracy                           0.68      1279
   macro avg       0.51      0.32      0.33      1279
weighted avg       0.68      0.68      0.65      1279

Accuracy Score: 0.6794370602032838
----------------------------------
Model performance for Test set
Confusion Matrix: [[ 0  0  1  0  0  0]
 [ 0  0  8  2  0  0]
 [ 0  0 99 31  0  0]
 [ 0  0 4

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [154]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)


xgb = XGBClassifier()
xgb.fit(X_train, y_train)

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

train_cm, train_cr, train_as = evaluate_model(y_train, y_train_pred)
test_cm, test_cr, test_as = evaluate_model(y_test, y_test_pred)

print('XGB Classifier')
model_list.append('XGB Classifier')

print('Model performance for training set:')
print('Confusion Matrix: {}'.format(train_cm))
print('Classification Report: {}'.format(train_cr))
print('Accuracy Score: {}'.format(train_as))

print('----------------------------------')

print('Model performance for Test set')
print('Confusion Matrix: {}'.format(test_cm))
print('Classification Report: {}'.format(test_cr))
print('Accuracy Score: {}'.format(test_as))

accuracy_list.append(test_as)

XGB Classifier
Model performance for training set:
Confusion Matrix: [[  0   0   0   9   0   0   0   0   0]
 [  0   0   0   0  43   0   0   0   0]
 [  0   0   0   0   0 551   0   0   0]
 [  0   0   0   0   0   0 506   0   0]
 [  0   0   0   0   0   0   0 157   0]
 [  0   0   0   0   0   0   0   0  13]
 [  0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0]]
Classification Report:               precision    recall  f1-score   support

           0       0.00      0.00      0.00       9.0
           1       0.00      0.00      0.00      43.0
           2       0.00      0.00      0.00     551.0
           3       0.00      0.00      0.00     506.0
           4       0.00      0.00      0.00     157.0
           5       0.00      0.00      0.00      13.0
           6       0.00      0.00      0.00       0.0
           7       0.00      0.00      0.00       0.0
           8       0.00      0.00      0.00       0.0

    accuracy  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [156]:
pd.DataFrame(list(zip(model_list, accuracy_list)), columns=['model name' , 'accuracy_score']).sort_values(by=["accuracy_score"],ascending=False)

Unnamed: 0,model name,accuracy_score
2,Random Forest Classifier,0.678125
5,CatBoost Classifier,0.66875
6,XGB Classifier,0.66875
4,Support Vector Classifier,0.603125
0,DecisionTree Classifier,0.5625
1,KNeighbor Classifier,0.553125
3,Adaboost Classifier,0.528125
