In [2]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Modelling
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [3]:
df = pd.read_csv('data/winequality.csv')
df.shape

(6497, 13)

In [4]:
df.columns

Index(['type', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality'],
      dtype='object')

In [5]:
# using median to fill missing values
for column in df.columns:
    if df[column].isnull().sum() > 0:
        df[column].fillna(df[column].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

# Feature Engineering

In [6]:
df['bound SO2'] = round(df['total sulfur dioxide'] - df['free sulfur dioxide'],2)

- This is the portion of sulfur dioxide that has already been used up to protect the wine by binding to spoilage compounds (like acetaldehyde, which smells nutty or like bruised apples) and other molecules
- This feature helps the model distinguish between wines that are stable because they were clean from the start, and wines that are stable because they were heavily treated.

In [7]:
df['acidity ph ratio'] = round(df['fixed acidity'] / df['pH'],2)

- The sensory perception of acidity. Fixed acidity is the quantity of acid, while pH is its strength.
- This feature describes the crucial concept of acidic balance, which is far more predictive than either fixed acidity or pH alone

In [8]:
df['total acidity'] = round(df['fixed acidity'] + df['volatile acidity'],2)

- The total acid "load" in the wine. It combines the desirable fruit acids (fixed) with the undesirable vinegar-like acid (volatile).
- It gives the model a single, powerful measure for overall sourness. A model can learn that extremely high or low values are indicative of poor quality.

In [9]:
df['sugar to acidity ratio'] = round(df['residual sugar'] / df['total acidity'],2)

-  The most critical measure of balance, especially in white wines. It describes the interplay between sweetness and sourness.
- This feature provides a direct numerical representation of balance. The model can learn that there's an optimal range for this ratio that is strongly associated with high quality scores.

In [10]:
df['quality category'] = df['quality'].apply(lambda x: 1 if x >=6 else 0)

- Instead of predicting the exact score (a regression or multi-class classification problem), it would be more practical to build a binary classifier for is_good = 1 (e.g., quality >= 6) vs. not_good = 0 (quality < 6). This simplifies the problem and mitigates the severe class imbalance issue.

In [11]:
df['quality category'].value_counts()

quality category
1    4113
0    2384
Name: count, dtype: int64

In [12]:
X = df.drop(columns=['type','quality','quality category'])

In [13]:
y = df['quality category']

In [17]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   fixed acidity           6497 non-null   float64
 1   volatile acidity        6497 non-null   float64
 2   citric acid             6497 non-null   float64
 3   residual sugar          6497 non-null   float64
 4   chlorides               6497 non-null   float64
 5   free sulfur dioxide     6497 non-null   float64
 6   total sulfur dioxide    6497 non-null   float64
 7   density                 6497 non-null   float64
 8   pH                      6497 non-null   float64
 9   sulphates               6497 non-null   float64
 10  alcohol                 6497 non-null   float64
 11  bound SO2               6497 non-null   float64
 12  acidity ph ratio        6497 non-null   float64
 13  total acidity           6497 non-null   float64
 14  sugar to acidity ratio  6497 non-null   

In [19]:
num_features = X.select_dtypes(exclude="object").columns

from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    [
                 ("StandardScaler", numeric_transformer, num_features),        
    ]
)

In [20]:
X = preprocessor.fit_transform(X)


In [23]:
X

array([[-0.16689773, -0.42316799,  0.28427265, ...,  0.1726991 ,
        -0.21311759,  3.12276347],
       [-0.70715481, -0.24084614,  0.14655076, ..., -0.76584215,
        -0.71256778, -0.7574924 ],
       [ 0.68207768, -0.36239404,  0.55971643, ...,  0.5078924 ,
         0.61432973,  0.10478668],
       ...,
       [-0.70715481,  1.03540687, -1.29952909, ..., -0.92226569,
        -0.55602369, -0.60882359],
       [-1.01587314,  1.85585523, -1.36839003, ..., -1.34684387,
        -0.75729466, -0.65342424],
       [-0.93869356, -0.18007218,  1.04174304, ..., -1.07868923,
        -0.92874771, -0.26688534]], shape=(6497, 15))

In [34]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=56)

In [None]:
# using ADASYN(Adaptive Synthetic Sampling Approach)
from imblearn.over_sampling import ADASYN

# Apply ADASYN to balance the training data
adasyn = ADASYN(sampling_strategy='minority',random_state=56)
X_train_balanced, y_train_balanced = adasyn.fit_resample(X_train, y_train)

# Check new class distribution
print(pd.Series(y_train_balanced).value_counts())

quality category
1    3283
0    3220
Name: count, dtype: int64


## Logestic Regression

In [52]:
lr = LogisticRegression()

In [53]:
lr.fit(X_train_balanced, y_train_balanced)

In [54]:
lr_predictions = lr.predict(X_test)

In [68]:
labels = [0, 1]  # Adjust according to your classes
cr_lr = classification_report(y_test, lr_predictions)
cm_lr = confusion_matrix(y_test, lr_predictions)
print("Classification Report:\n", cr_lr)
print("Confusion Matrix:\n")
print(pd.DataFrame(cm_lr, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

Classification Report:
               precision    recall  f1-score   support

           0       0.61      0.76      0.68       470
           1       0.84      0.73      0.78       830

    accuracy                           0.74      1300
   macro avg       0.73      0.74      0.73      1300
weighted avg       0.76      0.74      0.74      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          359          111
Actual 1          228          602


## Decision Tree Classifier

In [56]:
dtree = DecisionTreeClassifier()

In [57]:
dtree.fit(X_train_balanced, y_train_balanced)

In [60]:
dtree_predictions = dtree.predict(X_test)

In [69]:
cr_dtree = classification_report(y_test, dtree_predictions)
cm_dtree = confusion_matrix(y_test, dtree_predictions)
print("Classification Report:\n", cr_dtree)
print("Confusion Matrix:\n", cm_dtree)
print(pd.DataFrame(cm_dtree, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.76      0.72       470
           1       0.86      0.80      0.83       830

    accuracy                           0.79      1300
   macro avg       0.77      0.78      0.77      1300
weighted avg       0.79      0.79      0.79      1300

Confusion Matrix:
 [[358 112]
 [166 664]]
          Predicted 0  Predicted 1
Actual 0          358          112
Actual 1          166          664


## Randon forest Classifier

In [62]:
rfc = RandomForestClassifier()

In [63]:
rfc.fit(X_train_balanced, y_train_balanced)

In [64]:
rfc_predictions = rfc.predict(X_test)

In [70]:
cr_rfc = classification_report(y_test, rfc_predictions)
cm_rfc = confusion_matrix(y_test, rfc_predictions)
print("Classification Report:\n", cr_rfc)
print("Confusion Matrix:\n")
print(pd.DataFrame(cm_rfc, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.79      0.77       470
           1       0.88      0.84      0.86       830

    accuracy                           0.83      1300
   macro avg       0.81      0.82      0.81      1300
weighted avg       0.83      0.83      0.83      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          372           98
Actual 1          129          701


## Light Boost Classifier

In [71]:
lgbc = LGBMClassifier()
lgbc.fit(X_train_balanced, y_train_balanced)

[LightGBM] [Info] Number of positive: 3283, number of negative: 3220
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001312 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3813
[LightGBM] [Info] Number of data points in the train set: 6503, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.504844 -> initscore=0.019376
[LightGBM] [Info] Start training from score 0.019376


In [72]:
lgbc_predicitons = lgbc.predict(X_test)



In [73]:
lgbc_cr = classification_report(y_test, lgbc_predicitons)
lgbc_cm = confusion_matrix(y_test, lgbc_predicitons)
print("Classification Report:\n", lgbc_cr)
print("Confusion Matrix:\n")
print(pd.DataFrame(lgbc_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.78      0.74       470
           1       0.87      0.82      0.84       830

    accuracy                           0.80      1300
   macro avg       0.79      0.80      0.79      1300
weighted avg       0.81      0.80      0.80      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          366          104
Actual 1          152          678


In [None]:
def evaluate_model(true,predicted):
    cm = confusion_matrix(true, predicted)
    cr = classification_report(true, predicted)
    return cm, cr

In [None]:
# models = {
#     "Logestic Regression": LogisticRegression(),
#     "KNN Classifier": KNeighborsClassifier(),
#     "Decision Tree Classifier": DecisionTreeClassifier(),
#     "Random Forest Classifier": RandomForestClassifier(),
#     "Support Vector Classifier": SVC(),
#     "Ada Boost Classifier": AdaBoostClassifier(),
#     "XG Boost Classifier": XGBClassifier(),
#     "Light GBM Classifier": LGBMClassifier(),
#     "Cat Boost Classifier": CatBoostClassifier()
# }
# model_list = []

# for i in range(len(list(models))):
#     model = list(models.values())[i]
#     model.fit(X_train, y_train)

#     y_train_pred = model.predict(X_train)
#     y_test_pred = model.predict(X_test)
    
#     cm_train, cr_train = evaluate_model(y_train, y_train_pred)
#     cm_test, cr_test = evaluate_model(y_test, y_test_pred) 

#     print(list(models.keys())[i])
#     model_list.append(list(models.keys())[i])

#     print("Model performance for training set")
#     print("- Classification Report: ".format(cr_train))
#     print("- Confusion Matrix: ".format(cm_train))

#     print("--------------------------------------------------------")

#     print("Model performance for test set")
#     print("- Classification Report: ".format(cr_test))
#     print("- Confusion Matrix: ".format(cm_test))

In [None]:
# 