In [59]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Modelling
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

In [60]:
df = pd.read_csv('data/winequality.csv')
df.shape

(6497, 13)

In [61]:
df.columns

Index(['type', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality'],
      dtype='object')

In [62]:
# using median to fill missing values
for column in df.columns:
    if df[column].isnull().sum() > 0:
        df[column].fillna(df[column].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

# Feature Engineering

In [107]:
df['bound SO2'] = round(df['total sulfur dioxide'] - df['free sulfur dioxide'],2)

- This is the portion of sulfur dioxide that has already been used up to protect the wine by binding to spoilage compounds (like acetaldehyde, which smells nutty or like bruised apples) and other molecules
- This feature helps the model distinguish between wines that are stable because they were clean from the start, and wines that are stable because they were heavily treated.

In [108]:
df['acidity ph ratio'] = round(df['fixed acidity'] / df['pH'],2)

- The sensory perception of acidity. Fixed acidity is the quantity of acid, while pH is its strength.
- This feature describes the crucial concept of acidic balance, which is far more predictive than either fixed acidity or pH alone

In [109]:
df['total acidity'] = round(df['fixed acidity'] + df['volatile acidity'],2)

- The total acid "load" in the wine. It combines the desirable fruit acids (fixed) with the undesirable vinegar-like acid (volatile).
- It gives the model a single, powerful measure for overall sourness. A model can learn that extremely high or low values are indicative of poor quality.

In [110]:
df['sugar to acidity ratio'] = round(df['residual sugar'] / df['total acidity'],2)

-  The most critical measure of balance, especially in white wines. It describes the interplay between sweetness and sourness.
- This feature provides a direct numerical representation of balance. The model can learn that there's an optimal range for this ratio that is strongly associated with high quality scores.

In [111]:
df['quality category'] = df['quality'].apply(lambda x: 1 if x >=6 else 0)

- Instead of predicting the exact score (a regression or multi-class classification problem), it would be more practical to build a binary classifier for is_good = 1 (e.g., quality >= 6) vs. not_good = 0 (quality < 6). This simplifies the problem and mitigates the severe class imbalance issue.

In [112]:
df['quality category'].value_counts()

quality category
1    4113
0    2384
Name: count, dtype: int64

In [113]:
X = df.drop(columns=['type','quality','quality category'])

In [114]:
y = df['quality category']

In [115]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   fixed acidity           6497 non-null   float64
 1   volatile acidity        6497 non-null   float64
 2   citric acid             6497 non-null   float64
 3   residual sugar          6497 non-null   float64
 4   chlorides               6497 non-null   float64
 5   free sulfur dioxide     6497 non-null   float64
 6   total sulfur dioxide    6497 non-null   float64
 7   density                 6497 non-null   float64
 8   pH                      6497 non-null   float64
 9   sulphates               6497 non-null   float64
 10  alcohol                 6497 non-null   float64
 11  bound SO2               6497 non-null   float64
 12  acidity ph ratio        6497 non-null   float64
 13  total acidity           6497 non-null   float64
 14  sugar to acidity ratio  6497 non-null   

In [116]:
num_features = X.select_dtypes(exclude="object").columns

from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    [
         ("StandardScaler", numeric_transformer, num_features),        
    ]
    )

In [117]:
num_features

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'bound SO2', 'acidity ph ratio',
       'total acidity', 'sugar to acidity ratio'],
      dtype='object')

In [118]:
X = preprocessor.fit_transform(X)


In [119]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=56)

In [120]:
X_train

array([[-0.24407731,  0.36689339, -1.02408531, ..., -0.02841688,
        -0.19075415,  1.51714035],
       [ 0.52771852, -0.72703776,  0.90402115, ...,  0.79839327,
         0.42051324,  1.48740659],
       [-0.39843648, -0.90935962, -0.61091964, ..., -0.54237994,
        -0.49638785,  0.22372173],
       ...,
       [-1.01587314, -0.60548985, -1.36839003, ..., -1.07868923,
        -1.05547388, -0.77235928],
       [-0.62997523, -0.90935962,  0.69743832, ..., -0.69880349,
        -0.72002226, -0.46015479],
       [-0.39843648,  1.21772873, -0.33547586, ..., -0.29657152,
        -0.23548104,  0.34265677]], shape=(5197, 15))

## Logestic Regression

In [121]:
lr = LogisticRegression()

In [122]:
lr.fit(X_train, y_train)

In [123]:
lr_predictions = lr.predict(X_test)

In [124]:
labels = [0, 1]  # Adjust according to your classes
cr_lr = classification_report(y_test, lr_predictions)
cm_lr = confusion_matrix(y_test, lr_predictions)
auc_lr = roc_auc_score(y_test, lr_predictions)
print("AUC Score:", auc_lr)
print("Classification Report:\n", cr_lr)
print("Confusion Matrix:\n")
print(pd.DataFrame(cm_lr, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score: 0.6874647526275314
Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.53      0.59       470
           1       0.76      0.85      0.80       830

    accuracy                           0.73      1300
   macro avg       0.71      0.69      0.69      1300
weighted avg       0.73      0.73      0.72      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          247          223
Actual 1          125          705


## Decision Tree Classifier

In [125]:
dtree = DecisionTreeClassifier()

In [126]:
dtree.fit(X_train, y_train)

In [127]:
dtree_predictions = dtree.predict(X_test)

In [151]:
cr_dtree = classification_report(y_test, dtree_predictions)
cm_dtree = confusion_matrix(y_test, dtree_predictions)
auc_dtree = roc_auc_score(y_test, dtree_predictions)
print("AUC Score for Decision Tree:", auc_dtree)
print("Classification Report:\n", cr_dtree)
print("Confusion Matrix:\n")
print(pd.DataFrame(cm_dtree, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for Decision Tree: 0.749359138682389
Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.69      0.68       470
           1       0.82      0.81      0.81       830

    accuracy                           0.77      1300
   macro avg       0.75      0.75      0.75      1300
weighted avg       0.77      0.77      0.77      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          325          145
Actual 1          160          670


## Randon forest Classifier

In [129]:
rfc = RandomForestClassifier()

In [130]:
rfc.fit(X_train, y_train)

In [131]:
rfc_predictions = rfc.predict(X_test)

In [132]:
cr_rfc = classification_report(y_test, rfc_predictions)
cm_rfc = confusion_matrix(y_test, rfc_predictions)
auc_rfc = roc_auc_score(y_test, rfc_predictions)
print("AUC Score for Random Forest:", auc_rfc)
print("Classification Report:\n", cr_rfc)
print("Confusion Matrix:\n")
print(pd.DataFrame(cm_rfc, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for Random Forest: 0.8250704947449371
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.75      0.78       470
           1       0.86      0.90      0.88       830

    accuracy                           0.85      1300
   macro avg       0.84      0.83      0.83      1300
weighted avg       0.84      0.85      0.84      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          352          118
Actual 1           82          748


## Light Boost Classifier

In [133]:
lgbc = LGBMClassifier()
lgbc.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 3283, number of negative: 1914
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000615 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2508
[LightGBM] [Info] Number of data points in the train set: 5197, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.631711 -> initscore=0.539562
[LightGBM] [Info] Start training from score 0.539562


In [134]:
lgbc_predicitons = lgbc.predict(X_test)



In [135]:
lgbc_cr = classification_report(y_test, lgbc_predicitons)
lgbc_cm = confusion_matrix(y_test, lgbc_predicitons)
lgbc_auc = roc_auc_score(y_test, lgbc_predicitons)
print("AUC Score for LightGBM:", lgbc_auc)
print("Classification Report:\n", lgbc_cr)
print("Confusion Matrix:\n")
print(pd.DataFrame(lgbc_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for LightGBM: 0.7808254293770828
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.69      0.72       470
           1       0.83      0.87      0.85       830

    accuracy                           0.81      1300
   macro avg       0.79      0.78      0.79      1300
weighted avg       0.80      0.81      0.80      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          324          146
Actual 1          106          724


## ADA Boost Classifier

In [136]:
ada = AdaBoostClassifier()
ada.fit(X_train, y_train)

In [137]:
ada_predicitions = ada.predict(X_test)

In [138]:
ada_cr = classification_report(y_test, ada_predicitions)
ada_cm = confusion_matrix(y_test, ada_predicitions)
auc_ada = roc_auc_score(y_test, ada_predicitions)
print("AUC Score for AdaBoost:", auc_ada)
print("Classification Report:\n", ada_cr)
print("Confusion Matrix:\n")
print(pd.DataFrame(ada_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for AdaBoost: 0.7205460138426045
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.58      0.64       470
           1       0.78      0.86      0.82       830

    accuracy                           0.76      1300
   macro avg       0.74      0.72      0.73      1300
weighted avg       0.75      0.76      0.75      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          273          197
Actual 1          116          714


## cat boost classifier

In [139]:
cat = CatBoostClassifier()
cat.fit(X_train, y_train)

Learning rate set to 0.020824
0:	learn: 0.6852841	total: 12ms	remaining: 12s
1:	learn: 0.6783266	total: 19ms	remaining: 9.5s
2:	learn: 0.6722057	total: 27.5ms	remaining: 9.15s
3:	learn: 0.6659130	total: 34.8ms	remaining: 8.68s
4:	learn: 0.6596128	total: 41.3ms	remaining: 8.21s
5:	learn: 0.6538585	total: 47.6ms	remaining: 7.88s
6:	learn: 0.6484762	total: 54.4ms	remaining: 7.71s
7:	learn: 0.6423548	total: 61.9ms	remaining: 7.67s
8:	learn: 0.6372154	total: 68.5ms	remaining: 7.55s
9:	learn: 0.6320512	total: 75.1ms	remaining: 7.43s
10:	learn: 0.6273413	total: 81.2ms	remaining: 7.3s
11:	learn: 0.6229657	total: 88.6ms	remaining: 7.29s
12:	learn: 0.6186487	total: 95.4ms	remaining: 7.24s
13:	learn: 0.6149331	total: 102ms	remaining: 7.17s
14:	learn: 0.6111935	total: 110ms	remaining: 7.23s
15:	learn: 0.6074665	total: 116ms	remaining: 7.14s
16:	learn: 0.6037187	total: 122ms	remaining: 7.05s
17:	learn: 0.5999034	total: 128ms	remaining: 6.98s
18:	learn: 0.5962787	total: 134ms	remaining: 6.91s
19:	le

<catboost.core.CatBoostClassifier at 0x1f4555ff9e0>

In [140]:
cat_predicitions = cat.predict(X_test)

In [141]:
cat_cr = classification_report(y_test, cat_predicitions)
cat_cm = confusion_matrix(y_test, cat_predicitions)
cat_auc = roc_auc_score(y_test, cat_predicitions)
print("AUC Score for CatBoost:", cat_auc)
print("Classification Report:\n", cat_cr)
print("Confusion Matrix:\n")
print(pd.DataFrame(cat_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))  

AUC Score for CatBoost: 0.7761086900794668
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.68      0.71       470
           1       0.83      0.87      0.85       830

    accuracy                           0.80      1300
   macro avg       0.79      0.78      0.78      1300
weighted avg       0.80      0.80      0.80      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          319          151
Actual 1          105          725


## K Nearest Neighbour Classsification

In [142]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

In [143]:
knn_predicitions = knn.predict(X_test)

In [144]:
knn_cr = classification_report(y_test, knn_predicitions)
knn_cm = confusion_matrix(y_test, knn_predicitions)
knn_auc = roc_auc_score(y_test, knn_predicitions)
print("AUC Score for KNN:", knn_auc)
print("Classification Report:\n", knn_cr)
print("Confusion Matrix:\n")
print(pd.DataFrame(knn_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for KNN: 0.7292745449884646
Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.61      0.65       470
           1       0.79      0.85      0.82       830

    accuracy                           0.76      1300
   macro avg       0.74      0.73      0.73      1300
weighted avg       0.76      0.76      0.76      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          288          182
Actual 1          128          702


## Support Vector Classifier

In [145]:
svc = SVC()
svc.fit(X_train, y_train)

In [146]:
svc_predicitions = svc.predict(X_test)

In [147]:
svc_cr = classification_report(y_test, svc_predicitions)
svc_cm = confusion_matrix(y_test, svc_predicitions)
svc_auc = roc_auc_score(y_test, svc_predicitions)
print("AUC Score for SVC:", svc_auc)
print("Classification Report:\n", svc_cr)
print("Confusion Matrix:\n")
print(pd.DataFrame(svc_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for SVC: 0.7423481158677263
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.61      0.67       470
           1       0.80      0.88      0.84       830

    accuracy                           0.78      1300
   macro avg       0.77      0.74      0.75      1300
weighted avg       0.78      0.78      0.77      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          285          185
Actual 1          101          729


### Training and evaluating models after applying SMOTE on datasets

In [148]:
# using ADASYN(Adaptive Synthetic Sampling Approach)
from imblearn.over_sampling import ADASYN

# Apply ADASYN to balance the training data
adasyn = ADASYN(sampling_strategy='minority',random_state=56)
X_train_balanced, y_train_balanced = adasyn.fit_resample(X_train, y_train)

# Check new class distribution
print(pd.Series(y_train_balanced).value_counts())

quality category
1    3283
0    3220
Name: count, dtype: int64


In [149]:
df.columns

Index(['type', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality', 'quality category', 'bound SO2', 'acidity ph ratio',
       'total acidity', 'sugar to acidity ratio'],
      dtype='object')

In [150]:
# Understanding the distribution of wine by type
sns.axes_style('whitegrid')
plt.figure(figsize=(16, 8))
ax = sns.countplot(x='quality',hue='type')
plt.title('Wine Type Distribution')
plt.xlabel('Wine Type')
plt.ylabel('Count')


ValueError: Could not interpret value `quality` for `x`. Value is a string, but `data` was not passed.

<Figure size 1600x800 with 0 Axes>