In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Modelling
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [2]:
df = pd.read_csv('data/winequality.csv')
df.shape

(6497, 13)

In [3]:
df.columns

Index(['type', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality'],
      dtype='object')

In [4]:
# using median to fill missing values
for column in df.columns:
    if df[column].isnull().sum() > 0:
        df[column].fillna(df[column].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

# Feature Engineering

In [5]:
df['quality category'] = df['quality'].apply(lambda x: 1 if x >=7 else 0)

- Instead of predicting the exact score (a regression or multi-class classification problem), it would be more practical to build a binary classifier for is_good = 1 (e.g., quality >= 6) vs. not_good = 0 (quality < 6). This simplifies the problem and mitigates the severe class imbalance issue.

In [6]:
df['quality category'].value_counts()

quality category
0    5220
1    1277
Name: count, dtype: int64

In [7]:
X = df.drop(columns=['type','quality','quality category'])

In [8]:
y = df['quality category']

In [9]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         6497 non-null   float64
 1   volatile acidity      6497 non-null   float64
 2   citric acid           6497 non-null   float64
 3   residual sugar        6497 non-null   float64
 4   chlorides             6497 non-null   float64
 5   free sulfur dioxide   6497 non-null   float64
 6   total sulfur dioxide  6497 non-null   float64
 7   density               6497 non-null   float64
 8   pH                    6497 non-null   float64
 9   sulphates             6497 non-null   float64
 10  alcohol               6497 non-null   float64
dtypes: float64(11)
memory usage: 558.5 KB


In [10]:
num_features = X.select_dtypes(exclude="object").columns

from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    [
         ("StandardScaler", numeric_transformer, num_features),        
    ]
    )

In [11]:
num_features

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol'],
      dtype='object')

In [12]:
X = preprocessor.fit_transform(X)


In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=56)

In [14]:
X_train

array([[-0.24407731,  0.36689339, -1.02408531, ..., -0.86153461,
        -0.27697438, -0.91546416],
       [ 0.52771852, -0.72703776,  0.90402115, ..., -1.17281894,
        -0.47864451, -1.08316218],
       [-0.39843648, -0.90935962, -0.61091964, ...,  0.75714392,
        -0.68031463, -0.74776615],
       ...,
       [-1.01587314, -0.60548985, -1.36839003, ...,  0.75714392,
        -0.61309126, -0.74776615],
       [-0.62997523, -0.90935962,  0.69743832, ...,  0.50811645,
        -0.81476138,  1.01306302],
       [-0.39843648,  1.21772873, -0.33547586, ..., -0.36347968,
        -1.01643151, -0.91546416]], shape=(5197, 11))

## Logestic Regression

In [15]:
lr = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=56)

In [16]:
lr.fit(X_train, y_train)

In [17]:
lr_predictions = lr.predict(X_test)

In [18]:
labels = [0, 1]  # Adjust according to your classes
cr_lr = classification_report(y_test, lr_predictions)
cm_lr = confusion_matrix(y_test, lr_predictions)
auc_lr = roc_auc_score(y_test, lr_predictions)
print("AUC Score:", auc_lr)
print("Classification Report:\n", cr_lr)
print("Confusion Matrix:\n")
print(pd.DataFrame(cm_lr, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score: 0.7423960971948589
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.70      0.80      1045
           1       0.39      0.78      0.52       255

    accuracy                           0.72      1300
   macro avg       0.66      0.74      0.66      1300
weighted avg       0.82      0.72      0.74      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          732          313
Actual 1           55          200


## Decision Tree Classifier

In [19]:
dtree = DecisionTreeClassifier(class_weight='balanced', random_state=56)

In [20]:
dtree.fit(X_train, y_train)

In [21]:
dtree_predictions = dtree.predict(X_test)

In [22]:
cr_dtree = classification_report(y_test, dtree_predictions)
cm_dtree = confusion_matrix(y_test, dtree_predictions)
auc_dtree = roc_auc_score(y_test, dtree_predictions)
print("AUC Score for Decision Tree:", auc_dtree)
print("Classification Report:\n", cr_dtree)
print("Confusion Matrix:\n")
print(pd.DataFrame(cm_dtree, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for Decision Tree: 0.7468430434374707
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.90      0.90      1045
           1       0.59      0.60      0.59       255

    accuracy                           0.84      1300
   macro avg       0.74      0.75      0.75      1300
weighted avg       0.84      0.84      0.84      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          938          107
Actual 1          103          152


## Randon forest Classifier

In [23]:
rfc = RandomForestClassifier(class_weight='balanced')

In [24]:
rfc.fit(X_train, y_train)

In [25]:
rfc_predictions = rfc.predict(X_test)

In [26]:
cr_rfc = classification_report(y_test, rfc_predictions)
cm_rfc = confusion_matrix(y_test, rfc_predictions)
auc_rfc = roc_auc_score(y_test, rfc_predictions)
print("AUC Score for Random Forest:", auc_rfc)
print("Classification Report:\n", cr_rfc)
print("Confusion Matrix:\n")
print(pd.DataFrame(cm_rfc, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for Random Forest: 0.7549394877568253
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.96      0.93      1045
           1       0.76      0.55      0.64       255

    accuracy                           0.88      1300
   macro avg       0.83      0.75      0.78      1300
weighted avg       0.87      0.88      0.87      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0         1000           45
Actual 1          114          141


## Light Boost Classifier

In [27]:
lgbc = LGBMClassifier()
lgbc.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 1022, number of negative: 4175
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000345 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1548
[LightGBM] [Info] Number of data points in the train set: 5197, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.196652 -> initscore=-1.407353
[LightGBM] [Info] Start training from score -1.407353


In [28]:
lgbc_predicitons = lgbc.predict(X_test)



In [29]:
lgbc_cr = classification_report(y_test, lgbc_predicitons)
lgbc_cm = confusion_matrix(y_test, lgbc_predicitons)
lgbc_auc = roc_auc_score(y_test, lgbc_predicitons)
print("AUC Score for LightGBM:", lgbc_auc)
print("Classification Report:\n", lgbc_cr)
print("Confusion Matrix:\n")
print(pd.DataFrame(lgbc_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for LightGBM: 0.7305938643399944
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.94      0.92      1045
           1       0.69      0.52      0.59       255

    accuracy                           0.86      1300
   macro avg       0.79      0.73      0.75      1300
weighted avg       0.85      0.86      0.85      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          986           59
Actual 1          123          132


### XGB Classifier

In [30]:
xbg = XGBClassifier()
xbg.fit(X_train, y_train)

In [31]:
xgb_predictions = xbg.predict(X_test)

In [32]:
xgb_cr = classification_report(y_test, xgb_predictions)
xgb_cm = confusion_matrix(y_test, xgb_predictions)
xgb_auc = roc_auc_score(y_test, xgb_predictions)
print("AUC Score for XGBoost:", xgb_auc)
print("Classification Report:\n", xgb_cr)
print("Confusion Matrix:\n")
print(pd.DataFrame(xgb_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for XGBoost: 0.7752603433717985
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.94      0.92      1045
           1       0.71      0.61      0.66       255

    accuracy                           0.87      1300
   macro avg       0.81      0.78      0.79      1300
weighted avg       0.87      0.87      0.87      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          981           64
Actual 1           99          156


## K Nearest Neighbour Classsification

In [33]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

In [34]:
knn_predicitions = knn.predict(X_test)

In [35]:
knn_cr = classification_report(y_test, knn_predicitions)
knn_cm = confusion_matrix(y_test, knn_predicitions)
knn_auc = roc_auc_score(y_test, knn_predicitions)
print("AUC Score for KNN:", knn_auc)
print("Classification Report:\n", knn_cr)
print("Confusion Matrix:\n")
print(pd.DataFrame(knn_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for KNN: 0.7114551083591331
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.91      0.89      1045
           1       0.57      0.52      0.54       255

    accuracy                           0.83      1300
   macro avg       0.73      0.71      0.72      1300
weighted avg       0.82      0.83      0.83      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          946           99
Actual 1          123          132


## Support Vector Classifier

In [36]:
svc = SVC()
svc.fit(X_train, y_train)

In [37]:
svc_predicitions = svc.predict(X_test)

In [38]:
svc_cr = classification_report(y_test, svc_predicitions)
svc_cm = confusion_matrix(y_test, svc_predicitions)
svc_auc = roc_auc_score(y_test, svc_predicitions)
print("AUC Score for SVC:", svc_auc)
print("Classification Report:\n", svc_cr)
print("Confusion Matrix:\n")
print(pd.DataFrame(svc_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for SVC: 0.6328923914063233
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.96      0.90      1045
           1       0.63      0.31      0.42       255

    accuracy                           0.83      1300
   macro avg       0.74      0.63      0.66      1300
weighted avg       0.81      0.83      0.81      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          999           46
Actual 1          176           79


### Training and evaluating models after applying SMOTE on datasets

In [39]:
from imblearn.combine import SMOTETomek

# Apply SMOTE + Tomek Links to balance the training data
smote_tomek = SMOTETomek(random_state=56)
X_train_balanced, y_train_balanced = smote_tomek.fit_resample(X_train, y_train)

# Check new class distribution
print(pd.Series(y_train_balanced).value_counts())

quality category
0    4156
1    4156
Name: count, dtype: int64


In [40]:
df.columns

Index(['type', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality', 'quality category'],
      dtype='object')

### Logestic Regression

In [41]:
lr1 = LogisticRegression()

In [42]:
lr1.fit(X_train_balanced, y_train_balanced)

In [43]:
lr1_predictions = lr1.predict(X_test)

In [44]:
lr1_cr = classification_report(y_test, lr1_predictions)
lr1_cm = confusion_matrix(y_test, lr1_predictions)
lr1_auc = roc_auc_score(y_test, lr1_predictions)
print("AUC Score for logestic regression:", lr1_auc)
print("Classification Report:\n", lr1_cr)
print("Confusion Matrix:\n")
print(pd.DataFrame(lr1_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for logestic regression: 0.7341213997560746
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.70      0.79      1045
           1       0.38      0.77      0.51       255

    accuracy                           0.71      1300
   macro avg       0.65      0.73      0.65      1300
weighted avg       0.82      0.71      0.74      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          727          318
Actual 1           58          197


### Decision Tree 

In [45]:
dtree1 = DecisionTreeClassifier()

In [46]:
dtree1.fit(X_train_balanced, y_train_balanced)

In [47]:
dtree1_predictions = dtree1.predict(X_test)

In [48]:
dtree1_cr = classification_report(y_test, dtree1_predictions)
dtree1_cm = confusion_matrix(y_test, dtree1_predictions)
dtree1_auc = roc_auc_score(y_test, dtree1_predictions)
print("AUC Score for Decision Tree:", dtree1_auc)
print("Classification Report:\n", dtree1_cr)
print("Confusion Matrix:\n")
print(pd.DataFrame(dtree1_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for Decision Tree: 0.7528567407824374
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.85      0.88      1045
           1       0.51      0.66      0.58       255

    accuracy                           0.81      1300
   macro avg       0.71      0.75      0.73      1300
weighted avg       0.83      0.81      0.82      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          885          160
Actual 1           87          168


### Randon Forest 

In [49]:
rfc1 = RandomForestClassifier()
rfc1.fit(X_train_balanced, y_train_balanced)

In [50]:
rfc1_predictions = rfc1.predict(X_test)

In [51]:
rfc1_cr = classification_report(y_test, rfc1_predictions)
rfc1_cm = confusion_matrix(y_test, rfc1_predictions)
rfc1_auc = roc_auc_score(y_test, rfc1_predictions)
print("AUC Score for Random Forest:", rfc1_auc)
print("Classification Report:\n", rfc1_cr)
print("Confusion Matrix:\n")
print(pd.DataFrame(rfc1_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for Random Forest: 0.8212590299277607
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.91      0.92      1045
           1       0.66      0.74      0.69       255

    accuracy                           0.87      1300
   macro avg       0.79      0.82      0.81      1300
weighted avg       0.88      0.87      0.88      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          946           99
Actual 1           67          188


### Light GB 

In [52]:
lgb1 = LGBMClassifier()
lgb1.fit(X_train_balanced, y_train_balanced)

[LightGBM] [Info] Number of positive: 4156, number of negative: 4156
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001234 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2805
[LightGBM] [Info] Number of data points in the train set: 8312, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [53]:
lgb1_predictions = lgb1.predict(X_test)



In [54]:
lgb1_cr = classification_report(y_test, lgb1_predictions)
lgb1_cm = confusion_matrix(y_test, lgb1_predictions)
lgb1_auc = roc_auc_score(y_test, lgb1_predictions)
print("AUC Score for light gb:", lgb1_auc)
print("Classification Report:\n", lgb1_cr)
print("Confusion Matrix:\n")    
print(pd.DataFrame(lgb1_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for light gb: 0.7800731775963974
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.87      0.89      1045
           1       0.56      0.69      0.62       255

    accuracy                           0.83      1300
   macro avg       0.74      0.78      0.76      1300
weighted avg       0.85      0.83      0.84      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          905          140
Actual 1           78          177


## KNN Classifier

In [55]:
knn1 = KNeighborsClassifier()
knn1.fit(X_train_balanced, y_train_balanced)

In [56]:
knn1_predictions = knn1.predict(X_test)

In [57]:
knn1_cr = classification_report(y_test, knn1_predictions)
knn1_cm = confusion_matrix(y_test, knn1_predictions)
knn1_auc = roc_auc_score(y_test, knn1_predictions)
print("AUC Score for KNN:", knn1_auc)
print("Classification Report:\n", knn1_cr)
print("Confusion Matrix:\n")
print(pd.DataFrame(knn1_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for KNN: 0.7819589079650999
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.75      0.84      1045
           1       0.44      0.81      0.57       255

    accuracy                           0.76      1300
   macro avg       0.69      0.78      0.71      1300
weighted avg       0.84      0.76      0.79      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          786          259
Actual 1           48          207


### SVC

In [58]:
svc1 = SVC()
svc1.fit(X_train_balanced, y_train_balanced)

In [59]:
svc1_predicitions = svc1.predict(X_test)

In [60]:
svc1_cr = classification_report(y_test, svc1_predicitions)
svc1_cm = confusion_matrix(y_test, svc1_predicitions)
svc1_auc = roc_auc_score(y_test, svc1_predicitions)
print("AUC Score for SVC:", svc1_auc)
print("Classification Report:\n", svc1_cr)
print("Confusion Matrix:\n")
print(pd.DataFrame(svc1_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for SVC: 0.7879819870531946
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.74      0.83      1045
           1       0.44      0.84      0.58       255

    accuracy                           0.76      1300
   macro avg       0.69      0.79      0.70      1300
weighted avg       0.85      0.76      0.78      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          774          271
Actual 1           42          213


### XGB Classifier

In [61]:
xgb1 = XGBClassifier()
xgb1.fit(X_train_balanced, y_train_balanced)

In [62]:
xgb1_predictions = xgb1.predict(X_test)

In [63]:
xgb1_cr = classification_report(y_test, xgb1_predictions)
xgb1_cm = confusion_matrix(y_test, xgb1_predictions)
xgb1_auc = roc_auc_score(y_test, xgb1_predictions)
print("AUC Score for XGBoost:", xgb1_auc)
print("Classification Report:\n", xgb1_cr)
print("Confusion Matrix:\n")
print(pd.DataFrame(xgb1_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for XGBoost: 0.7909372361384746
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.90      0.91      1045
           1       0.62      0.68      0.65       255

    accuracy                           0.86      1300
   macro avg       0.77      0.79      0.78      1300
weighted avg       0.86      0.86      0.86      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          940          105
Actual 1           81          174


In [64]:
import pandas as pd

def create_derived_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Takes the raw wine quality DataFrame and adds derived features.
    
    Args:
        df (pd.DataFrame): The input DataFrame.

    Returns:
        pd.DataFrame: The DataFrame with new, engineered features.
    """
    # Create a copy to avoid modifying the original DataFrame in place
    df_transformed = df.copy()

    df_transformed['quality category'] = df_transformed['quality'].apply(lambda x: 1 if x >= 7 else 0)

    print("Successfully added 1 derived feature.")
    return df_transformed

In [65]:
df_w_qualitycat = create_derived_features(df)

Successfully added 1 derived feature.


In [66]:
df_w_qualitycat.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,quality category
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,0
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6,0
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6,0
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,0
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,0
