In [104]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Modelling
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

In [2]:
df = pd.read_csv('data/winequality.csv')
df.shape

(6497, 13)

In [3]:
df.columns

Index(['type', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality'],
      dtype='object')

In [4]:
# using median to fill missing values
for column in df.columns:
    if df[column].isnull().sum() > 0:
        df[column].fillna(df[column].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

# Feature Engineering

In [5]:
df['bound SO2'] = round(df['total sulfur dioxide'] - df['free sulfur dioxide'],2)

- This is the portion of sulfur dioxide that has already been used up to protect the wine by binding to spoilage compounds (like acetaldehyde, which smells nutty or like bruised apples) and other molecules
- This feature helps the model distinguish between wines that are stable because they were clean from the start, and wines that are stable because they were heavily treated.

In [6]:
df['acidity ph ratio'] = round(df['fixed acidity'] / df['pH'],2)

- The sensory perception of acidity. Fixed acidity is the quantity of acid, while pH is its strength.
- This feature describes the crucial concept of acidic balance, which is far more predictive than either fixed acidity or pH alone

In [7]:
df['total acidity'] = round(df['fixed acidity'] + df['volatile acidity'],2)

- The total acid "load" in the wine. It combines the desirable fruit acids (fixed) with the undesirable vinegar-like acid (volatile).
- It gives the model a single, powerful measure for overall sourness. A model can learn that extremely high or low values are indicative of poor quality.

In [8]:
df['sugar to acidity ratio'] = round(df['residual sugar'] / df['total acidity'],2)

-  The most critical measure of balance, especially in white wines. It describes the interplay between sweetness and sourness.
- This feature provides a direct numerical representation of balance. The model can learn that there's an optimal range for this ratio that is strongly associated with high quality scores.

In [9]:
df['quality category'] = df['quality'].apply(lambda x: 1 if x >=6 else 0)

- Instead of predicting the exact score (a regression or multi-class classification problem), it would be more practical to build a binary classifier for is_good = 1 (e.g., quality >= 6) vs. not_good = 0 (quality < 6). This simplifies the problem and mitigates the severe class imbalance issue.

In [10]:
df['quality category'].value_counts()

quality category
1    4113
0    2384
Name: count, dtype: int64

In [94]:
X = df.drop(columns=['type','quality','quality category'])

In [23]:
y = df['quality category']

In [24]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   type                    6497 non-null   object 
 1   fixed acidity           6497 non-null   float64
 2   volatile acidity        6497 non-null   float64
 3   citric acid             6497 non-null   float64
 4   residual sugar          6497 non-null   float64
 5   chlorides               6497 non-null   float64
 6   free sulfur dioxide     6497 non-null   float64
 7   total sulfur dioxide    6497 non-null   float64
 8   density                 6497 non-null   float64
 9   pH                      6497 non-null   float64
 10  sulphates               6497 non-null   float64
 11  alcohol                 6497 non-null   float64
 12  bound SO2               6497 non-null   float64
 13  acidity ph ratio        6497 non-null   float64
 14  total acidity           6497 non-null   

In [None]:
num_features = X.select_dtypes(exclude="object").columns

from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    [
         ("StandardScaler", numeric_transformer, num_features),        
    ]
    )

In [None]:
num_features

(Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
        'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
        'pH', 'sulphates', 'alcohol', 'bound SO2', 'acidity ph ratio',
        'total acidity', 'sugar to acidity ratio'],
       dtype='object'),
 Index(['type'], dtype='object'))

In [97]:
X = preprocessor.fit_transform(X)


In [98]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=56)

In [99]:
X_train

array([[-0.24407731,  0.36689339, -1.02408531, ..., -0.02841688,
        -0.19075415,  1.51714035],
       [ 0.52771852, -0.72703776,  0.90402115, ...,  0.79839327,
         0.42051324,  1.48740659],
       [-0.39843648, -0.90935962, -0.61091964, ..., -0.54237994,
        -0.49638785,  0.22372173],
       ...,
       [-1.01587314, -0.60548985, -1.36839003, ..., -1.07868923,
        -1.05547388, -0.77235928],
       [-0.62997523, -0.90935962,  0.69743832, ..., -0.69880349,
        -0.72002226, -0.46015479],
       [-0.39843648,  1.21772873, -0.33547586, ..., -0.29657152,
        -0.23548104,  0.34265677]], shape=(5197, 15))

In [None]:
# # using ADASYN(Adaptive Synthetic Sampling Approach)
# from imblearn.over_sampling import ADASYN

# # Apply ADASYN to balance the training data
# adasyn = ADASYN(sampling_strategy='minority',random_state=56)
# X_train_balanced, y_train_balanced = adasyn.fit_resample(X_train, y_train)

# # Check new class distribution
# print(pd.Series(y_train_balanced).value_counts())

quality category
1    3283
0    3219
Name: count, dtype: int64


## Logestic Regression

In [100]:
lr = LogisticRegression()

In [101]:
lr.fit(X_train, y_train)

In [102]:
lr_predictions = lr.predict(X_test)

In [105]:
labels = [0, 1]  # Adjust according to your classes
cr_lr = classification_report(y_test, lr_predictions)
cm_lr = confusion_matrix(y_test, lr_predictions)
auc_lr = roc_auc_score(y_test, lr_predictions)
print("AUC Score:", auc_lr)
print("Classification Report:\n", cr_lr)
print("Confusion Matrix:\n")
print(pd.DataFrame(cm_lr, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score: 0.6874647526275314
Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.53      0.59       470
           1       0.76      0.85      0.80       830

    accuracy                           0.73      1300
   macro avg       0.71      0.69      0.69      1300
weighted avg       0.73      0.73      0.72      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          247          223
Actual 1          125          705


## Decision Tree Classifier

In [70]:
dtree = DecisionTreeClassifier()

In [71]:
dtree.fit(X_train, y_train)

In [72]:
dtree_predictions = dtree.predict(X_test)

In [106]:
cr_dtree = classification_report(y_test, dtree_predictions)
cm_dtree = confusion_matrix(y_test, dtree_predictions)
auc_dtree = roc_auc_score(y_test, dtree_predictions)
print("AUC Score for Decision Tree:", auc_dtree)
print("Classification Report:\n", cr_dtree)
print("Confusion Matrix:\n", cm_dtree)
print(pd.DataFrame(cm_dtree, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for Decision Tree: 0.7548192771084337
Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.70      0.69       470
           1       0.83      0.81      0.82       830

    accuracy                           0.77      1300
   macro avg       0.75      0.75      0.75      1300
weighted avg       0.77      0.77      0.77      1300

Confusion Matrix:
 [[329 141]
 [158 672]]
          Predicted 0  Predicted 1
Actual 0          329          141
Actual 1          158          672


## Randon forest Classifier

In [43]:
rfc = RandomForestClassifier()

In [74]:
rfc.fit(X_train, y_train)

In [75]:
rfc_predictions = rfc.predict(X_test)

In [107]:
cr_rfc = classification_report(y_test, rfc_predictions)
cm_rfc = confusion_matrix(y_test, rfc_predictions)
auc_rfc = roc_auc_score(y_test, rfc_predictions)
print("AUC Score for Random Forest:", auc_rfc)
print("Classification Report:\n", cr_rfc)
print("Confusion Matrix:\n")
print(pd.DataFrame(cm_rfc, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for Random Forest: 0.8172007177646756
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.74      0.77       470
           1       0.86      0.89      0.88       830

    accuracy                           0.84      1300
   macro avg       0.83      0.82      0.82      1300
weighted avg       0.84      0.84      0.84      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          348          122
Actual 1           88          742


## Light Boost Classifier

In [77]:
lgbc = LGBMClassifier()
lgbc.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 3283, number of negative: 1914
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000726 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2510
[LightGBM] [Info] Number of data points in the train set: 5197, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.631711 -> initscore=0.539562
[LightGBM] [Info] Start training from score 0.539562


In [78]:
lgbc_predicitons = lgbc.predict(X_test)



In [108]:
lgbc_cr = classification_report(y_test, lgbc_predicitons)
lgbc_cm = confusion_matrix(y_test, lgbc_predicitons)
lgbc_auc = roc_auc_score(y_test, lgbc_predicitons)
print("AUC Score for LightGBM:", lgbc_auc)
print("Classification Report:\n", lgbc_cr)
print("Confusion Matrix:\n")
print(pd.DataFrame(lgbc_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for LightGBM: 0.7807869776980261
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.68      0.72       470
           1       0.83      0.88      0.85       830

    accuracy                           0.81      1300
   macro avg       0.80      0.78      0.79      1300
weighted avg       0.81      0.81      0.81      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          320          150
Actual 1           99          731


## ADA Boost Classifier

In [80]:
ada = AdaBoostClassifier()
ada.fit(X_train, y_train)

In [81]:
ada_predicitions = ada.predict(X_test)

In [109]:
ada_cr = classification_report(y_test, ada_predicitions)
ada_cm = confusion_matrix(y_test, ada_predicitions)
auc_ada = roc_auc_score(y_test, ada_predicitions)
print("AUC Score for AdaBoost:", auc_ada)
print("Classification Report:\n", ada_cr)
print("Confusion Matrix:\n")
print(pd.DataFrame(ada_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for AdaBoost: 0.7205460138426045
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.58      0.64       470
           1       0.78      0.86      0.82       830

    accuracy                           0.76      1300
   macro avg       0.74      0.72      0.73      1300
weighted avg       0.75      0.76      0.75      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          273          197
Actual 1          116          714


## cat boost classifier

In [83]:
cat = CatBoostClassifier()
cat.fit(X_train, y_train)

Learning rate set to 0.020824
0:	learn: 0.6871846	total: 8.34ms	remaining: 8.33s
1:	learn: 0.6800010	total: 16.1ms	remaining: 8.05s
2:	learn: 0.6738621	total: 23.2ms	remaining: 7.72s
3:	learn: 0.6672652	total: 29.6ms	remaining: 7.36s
4:	learn: 0.6617989	total: 36ms	remaining: 7.17s
5:	learn: 0.6559175	total: 41.8ms	remaining: 6.92s
6:	learn: 0.6498660	total: 48.3ms	remaining: 6.85s
7:	learn: 0.6437077	total: 54.7ms	remaining: 6.78s
8:	learn: 0.6385967	total: 61ms	remaining: 6.71s
9:	learn: 0.6330183	total: 66.9ms	remaining: 6.63s
10:	learn: 0.6279768	total: 72.7ms	remaining: 6.53s
11:	learn: 0.6233417	total: 79.4ms	remaining: 6.54s
12:	learn: 0.6193214	total: 85.3ms	remaining: 6.48s
13:	learn: 0.6152256	total: 91.4ms	remaining: 6.43s
14:	learn: 0.6114354	total: 97.4ms	remaining: 6.4s
15:	learn: 0.6070585	total: 104ms	remaining: 6.38s
16:	learn: 0.6032728	total: 111ms	remaining: 6.42s
17:	learn: 0.5996930	total: 117ms	remaining: 6.38s
18:	learn: 0.5958206	total: 123ms	remaining: 6.35s
1

<catboost.core.CatBoostClassifier at 0x1c7c64b7ec0>

In [84]:
cat_predicitions = cat.predict(X_test)

In [110]:
cat_cr = classification_report(y_test, cat_predicitions)
cat_cm = confusion_matrix(y_test, cat_predicitions)
cat_auc = roc_auc_score(y_test, cat_predicitions)
print("AUC Score for CatBoost:", cat_auc)
print("Classification Report:\n", cat_cr)
print("Confusion Matrix:\n")
print(pd.DataFrame(cat_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))  

AUC Score for CatBoost: 0.7799025890797232
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.69      0.72       470
           1       0.83      0.87      0.85       830

    accuracy                           0.81      1300
   macro avg       0.79      0.78      0.79      1300
weighted avg       0.80      0.81      0.80      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          322          148
Actual 1          104          726


## K Nearest Neighbour Classsification

In [86]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

In [87]:
knn_predicitions = knn.predict(X_test)

In [111]:
knn_cr = classification_report(y_test, knn_predicitions)
knn_cm = confusion_matrix(y_test, knn_predicitions)
knn_auc = roc_auc_score(y_test, knn_predicitions)
print("AUC Score for KNN:", knn_auc)
print("Classification Report:\n", knn_cr)
print("Confusion Matrix:\n")
print(pd.DataFrame(knn_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for KNN: 0.7282107152012305
Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.61      0.65       470
           1       0.79      0.85      0.82       830

    accuracy                           0.76      1300
   macro avg       0.74      0.73      0.73      1300
weighted avg       0.76      0.76      0.76      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          287          183
Actual 1          128          702


## Support Vector Classifier

In [89]:
svc = SVC()
svc.fit(X_train, y_train)

In [90]:
svc_predicitions = svc.predict(X_test)

In [113]:
svc_cr = classification_report(y_test, svc_predicitions)
svc_cm = confusion_matrix(y_test, svc_predicitions)
svc_auc = roc_auc_score(y_test, svc_predicitions)
print("AUC Score for SVC:", svc_auc)
print("Classification Report:\n", svc_cr)
print("Confusion Matrix:\n")
print(pd.DataFrame(svc_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for SVC: 0.7446167649320687
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.61      0.67       470
           1       0.80      0.88      0.84       830

    accuracy                           0.78      1300
   macro avg       0.77      0.74      0.75      1300
weighted avg       0.78      0.78      0.78      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          286          184
Actual 1           99          731
