In [20]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Modelling
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [21]:
df = pd.read_csv('data/winequality.csv')
df.shape

(6497, 13)

In [22]:
df.columns


Index(['type', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality'],
      dtype='object')

In [42]:
# using median to fill missing values
for column in df.columns:
    if df[column].isnull().sum() > 0:
        df[column].fillna(df[column].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

# Feature Engineering

In [43]:
df['bound SO2'] = round(df['total sulfur dioxide'] - df['free sulfur dioxide'],2)

- This is the portion of sulfur dioxide that has already been used up to protect the wine by binding to spoilage compounds (like acetaldehyde, which smells nutty or like bruised apples) and other molecules
- This feature helps the model distinguish between wines that are stable because they were clean from the start, and wines that are stable because they were heavily treated.

In [44]:
df['acidity ph ratio'] = round(df['fixed acidity'] / df['pH'],2)

- The sensory perception of acidity. Fixed acidity is the quantity of acid, while pH is its strength.
- This feature describes the crucial concept of acidic balance, which is far more predictive than either fixed acidity or pH alone

In [45]:
df['total acidity'] = round(df['fixed acidity'] + df['volatile acidity'],2)

- The total acid "load" in the wine. It combines the desirable fruit acids (fixed) with the undesirable vinegar-like acid (volatile).
- It gives the model a single, powerful measure for overall sourness. A model can learn that extremely high or low values are indicative of poor quality.

In [46]:
df['sugar to acidity ratio'] = round(df['residual sugar'] / df['total acidity'],2)

-  The most critical measure of balance, especially in white wines. It describes the interplay between sweetness and sourness.
- This feature provides a direct numerical representation of balance. The model can learn that there's an optimal range for this ratio that is strongly associated with high quality scores.

In [47]:
df['quality category'] = df['quality'].apply(lambda x: 1 if x >=6 else 0)

- Instead of predicting the exact score (a regression or multi-class classification problem), it would be more practical to build a binary classifier for is_good = 1 (e.g., quality >= 6) vs. not_good = 0 (quality < 6). This simplifies the problem and mitigates the severe class imbalance issue.

In [48]:
df['quality category'].value_counts()

quality category
1    4113
0    2384
Name: count, dtype: int64

In [49]:
X = df.drop(columns=['type','quality','quality category'])

In [50]:
y = df['quality category']

In [51]:
num_features = X.select_dtypes(exclude="object").columns

from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    [
                 ("StandardScaler", numeric_transformer, num_features),        
    ]
)

In [52]:
X = preprocessor.fit_transform(X)


In [53]:
X.shape

(6497, 15)

In [54]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=56)

In [55]:
def evaluate_model(true,predicted):
    cm = confusion_matrix(true, predicted)
    cr = classification_report(true, predicted)
    return cm, cr

In [None]:
# models = {
#     "Logestic Regression": LogisticRegression(),
#     "KNN Classifier": KNeighborsClassifier(),
#     "Decision Tree Classifier": DecisionTreeClassifier(),
#     "Random Forest Classifier": RandomForestClassifier(),
#     "Support Vector Classifier": SVC(),
#     "Ada Boost Classifier": AdaBoostClassifier(),
#     "XG Boost Classifier": XGBClassifier(),
#     "Light GBM Classifier": LGBMClassifier(),
#     "Cat Boost Classifier": CatBoostClassifier()
# }
# model_list = []

# for i in range(len(list(models))):
#     model = list(models.values())[i]
#     model.fit(X_train, y_train)

#     y_train_pred = model.predict(X_train)
#     y_test_pred = model.predict(X_test)
    
#     cm_train, cr_train = evaluate_model(y_train, y_train_pred)
#     cm_test, cr_test = evaluate_model(y_test, y_test_pred) 

#     print(list(models.keys())[i])
#     model_list.append(list(models.keys())[i])

#     print("Model performance for training set")
#     print("- Classification Report: ".format(cr_train))
#     print("- Confusion Matrix: ".format(cm_train))

#     print("--------------------------------------------------------")

#     print("Model performance for test set")
#     print("- Classification Report: ".format(cr_test))
#     print("- Confusion Matrix: ".format(cm_test))

Logestic Regression
Model performance for training set
- Classification Report: 
- Confusion Matrix: 
--------------------------------------------------------
Model performance for test set
- Classification Report: 
- Confusion Matrix: 
KNN Classifier
Model performance for training set
- Classification Report: 
- Confusion Matrix: 
--------------------------------------------------------
Model performance for test set
- Classification Report: 
- Confusion Matrix: 
Decision Tree Classifier
Model performance for training set
- Classification Report: 
- Confusion Matrix: 
--------------------------------------------------------
Model performance for test set
- Classification Report: 
- Confusion Matrix: 
Random Forest Classifier
Model performance for training set
- Classification Report: 
- Confusion Matrix: 
--------------------------------------------------------
Model performance for test set
- Classification Report: 
- Confusion Matrix: 
Support Vector Classifier
Model performance for

  message = "The feature names should match those that were passed during fit.\n"
  message = "The feature names should match those that were passed during fit.\n"


Light GBM Classifier
Model performance for training set
- Classification Report: 
- Confusion Matrix: 
--------------------------------------------------------
Model performance for test set
- Classification Report: 
- Confusion Matrix: 
Learning rate set to 0.019669
0:	learn: 0.6854345	total: 174ms	remaining: 2m 53s
1:	learn: 0.6787849	total: 181ms	remaining: 1m 30s
2:	learn: 0.6726661	total: 188ms	remaining: 1m 2s
3:	learn: 0.6664798	total: 194ms	remaining: 48.3s
4:	learn: 0.6605277	total: 201ms	remaining: 39.9s
5:	learn: 0.6550816	total: 207ms	remaining: 34.3s
6:	learn: 0.6499015	total: 213ms	remaining: 30.3s
7:	learn: 0.6441749	total: 220ms	remaining: 27.3s
8:	learn: 0.6395071	total: 226ms	remaining: 24.9s
9:	learn: 0.6341633	total: 233ms	remaining: 23.1s
10:	learn: 0.6297197	total: 240ms	remaining: 21.6s
11:	learn: 0.6260609	total: 247ms	remaining: 20.3s
12:	learn: 0.6217130	total: 253ms	remaining: 19.2s
13:	learn: 0.6182363	total: 260ms	remaining: 18.3s
14:	learn: 0.6141048	total