In [293]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Modelling
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [None]:
df = pd.read_csv('data/winequality.csv')
df.shape

(6497, 13)

In [None]:
df.columns

Index(['type', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality'],
      dtype='object')

In [None]:
# using median to fill missing values
for column in df.columns:
    if df[column].isnull().sum() > 0:
        df[column].fillna(df[column].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

# Feature Engineering

In [248]:
df['quality category'] = df['quality'].apply(lambda x: 1 if x >=6 else 0)

- Instead of predicting the exact score (a regression or multi-class classification problem), it would be more practical to build a binary classifier for is_good = 1 (e.g., quality >= 6) vs. not_good = 0 (quality < 6). This simplifies the problem and mitigates the severe class imbalance issue.

In [249]:
df['quality category'].value_counts()

quality category
1    4113
0    2384
Name: count, dtype: int64

In [207]:
X = df.drop(columns=['type','quality','quality category'])

In [208]:
y = df['quality category']

In [209]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         6497 non-null   float64
 1   volatile acidity      6497 non-null   float64
 2   citric acid           6497 non-null   float64
 3   residual sugar        6497 non-null   float64
 4   chlorides             6497 non-null   float64
 5   free sulfur dioxide   6497 non-null   float64
 6   total sulfur dioxide  6497 non-null   float64
 7   density               6497 non-null   float64
 8   pH                    6497 non-null   float64
 9   sulphates             6497 non-null   float64
 10  alcohol               6497 non-null   float64
dtypes: float64(11)
memory usage: 558.5 KB


In [210]:
num_features = X.select_dtypes(exclude="object").columns

from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    [
         ("StandardScaler", numeric_transformer, num_features),        
    ]
    )

In [211]:
num_features

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol'],
      dtype='object')

In [212]:
X = preprocessor.fit_transform(X)


In [213]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=56)

In [214]:
X_train

array([[-0.24407731,  0.36689339, -1.02408531, ..., -0.86153461,
        -0.27697438, -0.91546416],
       [ 0.52771852, -0.72703776,  0.90402115, ..., -1.17281894,
        -0.47864451, -1.08316218],
       [-0.39843648, -0.90935962, -0.61091964, ...,  0.75714392,
        -0.68031463, -0.74776615],
       ...,
       [-1.01587314, -0.60548985, -1.36839003, ...,  0.75714392,
        -0.61309126, -0.74776615],
       [-0.62997523, -0.90935962,  0.69743832, ...,  0.50811645,
        -0.81476138,  1.01306302],
       [-0.39843648,  1.21772873, -0.33547586, ..., -0.36347968,
        -1.01643151, -0.91546416]], shape=(5197, 11))

## Logestic Regression

In [250]:
lr = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=56)

In [251]:
lr.fit(X_train, y_train)

In [252]:
lr_predictions = lr.predict(X_test)

In [253]:
labels = [0, 1]  # Adjust according to your classes
cr_lr = classification_report(y_test, lr_predictions)
cm_lr = confusion_matrix(y_test, lr_predictions)
auc_lr = roc_auc_score(y_test, lr_predictions)
print("AUC Score:", auc_lr)
print("Classification Report:\n", cr_lr)
print("Confusion Matrix:\n")
print(pd.DataFrame(cm_lr, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score: 0.7305178159446295
Classification Report:
               precision    recall  f1-score   support

           0       0.61      0.72      0.66       470
           1       0.82      0.74      0.78       830

    accuracy                           0.73      1300
   macro avg       0.72      0.73      0.72      1300
weighted avg       0.75      0.73      0.74      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          339          131
Actual 1          216          614


## Decision Tree Classifier

In [254]:
dtree = DecisionTreeClassifier(class_weight='balanced', random_state=56)

In [255]:
dtree.fit(X_train, y_train)

In [256]:
dtree_predictions = dtree.predict(X_test)

In [257]:
cr_dtree = classification_report(y_test, dtree_predictions)
cm_dtree = confusion_matrix(y_test, dtree_predictions)
auc_dtree = roc_auc_score(y_test, dtree_predictions)
print("AUC Score for Decision Tree:", auc_dtree)
print("Classification Report:\n", cr_dtree)
print("Confusion Matrix:\n")
print(pd.DataFrame(cm_dtree, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for Decision Tree: 0.7345296077928735
Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.64      0.66       470
           1       0.80      0.83      0.81       830

    accuracy                           0.76      1300
   macro avg       0.74      0.73      0.74      1300
weighted avg       0.76      0.76      0.76      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          302          168
Actual 1          144          686


## Randon forest Classifier

In [262]:
rfc = RandomForestClassifier(class_weight='balanced')

In [263]:
rfc.fit(X_train, y_train)

In [264]:
rfc_predictions = rfc.predict(X_test)

In [265]:
cr_rfc = classification_report(y_test, rfc_predictions)
cm_rfc = confusion_matrix(y_test, rfc_predictions)
auc_rfc = roc_auc_score(y_test, rfc_predictions)
print("AUC Score for Random Forest:", auc_rfc)
print("Classification Report:\n", cr_rfc)
print("Confusion Matrix:\n")
print(pd.DataFrame(cm_rfc, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for Random Forest: 0.8089720584465522
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.72      0.76       470
           1       0.85      0.90      0.87       830

    accuracy                           0.83      1300
   macro avg       0.83      0.81      0.82      1300
weighted avg       0.83      0.83      0.83      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          338          132
Actual 1           84          746


## Light Boost Classifier

In [227]:
lgbc = LGBMClassifier()
lgbc.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 3283, number of negative: 1914
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000525 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1548
[LightGBM] [Info] Number of data points in the train set: 5197, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.631711 -> initscore=0.539562
[LightGBM] [Info] Start training from score 0.539562


In [228]:
lgbc_predicitons = lgbc.predict(X_test)



In [229]:
lgbc_cr = classification_report(y_test, lgbc_predicitons)
lgbc_cm = confusion_matrix(y_test, lgbc_predicitons)
lgbc_auc = roc_auc_score(y_test, lgbc_predicitons)
print("AUC Score for LightGBM:", lgbc_auc)
print("Classification Report:\n", lgbc_cr)
print("Confusion Matrix:\n")
print(pd.DataFrame(lgbc_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for LightGBM: 0.7796590617790311
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.70      0.72       470
           1       0.83      0.86      0.85       830

    accuracy                           0.80      1300
   macro avg       0.79      0.78      0.78      1300
weighted avg       0.80      0.80      0.80      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          328          142
Actual 1          115          715


### XGB Classifier

In [296]:
xbg = XGBClassifier()
xbg.fit(X_train, y_train)

In [297]:
xgb_predictions = xbg.predict(X_test)

In [298]:
xgb_cr = classification_report(y_test, xgb_predictions)
xgb_cm = confusion_matrix(y_test, xgb_predictions)
xgb_auc = roc_auc_score(y_test, xgb_predictions)
print("AUC Score for XGBoost:", xgb_auc)
print("Classification Report:\n", xgb_cr)
print("Confusion Matrix:\n")
print(pd.DataFrame(xgb_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for XGBoost: 0.7973852858241477
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.72      0.74       470
           1       0.85      0.87      0.86       830

    accuracy                           0.82      1300
   macro avg       0.81      0.80      0.80      1300
weighted avg       0.82      0.82      0.82      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          339          131
Actual 1          105          725


## K Nearest Neighbour Classsification

In [236]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

In [237]:
knn_predicitions = knn.predict(X_test)

In [238]:
knn_cr = classification_report(y_test, knn_predicitions)
knn_cm = confusion_matrix(y_test, knn_predicitions)
knn_auc = roc_auc_score(y_test, knn_predicitions)
print("AUC Score for KNN:", knn_auc)
print("Classification Report:\n", knn_cr)
print("Confusion Matrix:\n")
print(pd.DataFrame(knn_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for KNN: 0.7194180979236093
Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.60      0.64       470
           1       0.79      0.84      0.81       830

    accuracy                           0.75      1300
   macro avg       0.73      0.72      0.72      1300
weighted avg       0.75      0.75      0.75      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          281          189
Actual 1          132          698


## Support Vector Classifier

In [239]:
svc = SVC()
svc.fit(X_train, y_train)

In [240]:
svc_predicitions = svc.predict(X_test)

In [241]:
svc_cr = classification_report(y_test, svc_predicitions)
svc_cm = confusion_matrix(y_test, svc_predicitions)
svc_auc = roc_auc_score(y_test, svc_predicitions)
print("AUC Score for SVC:", svc_auc)
print("Classification Report:\n", svc_cr)
print("Confusion Matrix:\n")
print(pd.DataFrame(svc_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for SVC: 0.7408613176108689
Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.61      0.66       470
           1       0.80      0.87      0.83       830

    accuracy                           0.78      1300
   macro avg       0.76      0.74      0.75      1300
weighted avg       0.77      0.78      0.77      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          287          183
Actual 1          107          723


### Training and evaluating models after applying SMOTE on datasets

In [266]:
from imblearn.combine import SMOTETomek

# Apply SMOTE + Tomek Links to balance the training data
smote_tomek = SMOTETomek(random_state=56)
X_train_balanced, y_train_balanced = smote_tomek.fit_resample(X_train, y_train)

# Check new class distribution
print(pd.Series(y_train_balanced).value_counts())

quality category
0    3183
1    3183
Name: count, dtype: int64


In [245]:
df.columns

Index(['type', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality', 'quality category'],
      dtype='object')

## Model Training after applying Smotetomek

### Logestic Regression

In [267]:
lr1 = LogisticRegression()

In [268]:
lr1.fit(X_train_balanced, y_train_balanced)

In [269]:
lr1_predictions = lr1.predict(X_test)

In [282]:
lr1_cr = classification_report(y_test, lr1_predictions)
lr1_cm = confusion_matrix(y_test, lr1_predictions)
lr1_auc = roc_auc_score(y_test, lr1_predictions)
print("AUC Score for logestic regression:", lr1_auc)
print("Classification Report:\n", lr1_cr)
print("Confusion Matrix:\n")
print(pd.DataFrame(lr1_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for logestic regression: 0.7281081773904127
Classification Report:
               precision    recall  f1-score   support

           0       0.61      0.72      0.66       470
           1       0.82      0.73      0.78       830

    accuracy                           0.73      1300
   macro avg       0.71      0.73      0.72      1300
weighted avg       0.74      0.73      0.73      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          339          131
Actual 1          220          610


### Decision Tree 

In [271]:
dtree1 = DecisionTreeClassifier()

In [272]:
dtree1.fit(X_train_balanced, y_train_balanced)

In [273]:
dtree1_predictions = dtree1.predict(X_test)

In [283]:
dtree1_cr = classification_report(y_test, dtree1_predictions)
dtree1_cm = confusion_matrix(y_test, dtree1_predictions)
dtree1_auc = roc_auc_score(y_test, dtree1_predictions)
print("AUC Score for Decision Tree:", dtree1_auc)
print("Classification Report:\n", dtree1_cr)
print("Confusion Matrix:\n")
print(pd.DataFrame(dtree1_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for Decision Tree: 0.7552166111253524
Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.72      0.69       470
           1       0.83      0.79      0.81       830

    accuracy                           0.76      1300
   macro avg       0.75      0.76      0.75      1300
weighted avg       0.77      0.76      0.77      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          339          131
Actual 1          175          655


### Randon Forest 

In [275]:
rfc1 = RandomForestClassifier()
rfc1.fit(X_train_balanced, y_train_balanced)

In [276]:
rfc1_predictions = rfc1.predict(X_test)

In [284]:
rfc1_cr = classification_report(y_test, rfc1_predictions)
rfc1_cm = confusion_matrix(y_test, rfc1_predictions)
rfc1_auc = roc_auc_score(y_test, rfc1_predictions)
print("AUC Score for Random Forest:", rfc1_auc)
print("Classification Report:\n", rfc1_cr)
print("Confusion Matrix:\n")
print(pd.DataFrame(rfc1_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for Random Forest: 0.8100871571391951
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.77      0.76       470
           1       0.87      0.85      0.86       830

    accuracy                           0.82      1300
   macro avg       0.81      0.81      0.81      1300
weighted avg       0.82      0.82      0.82      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          360          110
Actual 1          121          709


### Light GB 

In [278]:
lgb1 = LGBMClassifier()
lgb1.fit(X_train_balanced, y_train_balanced)

[LightGBM] [Info] Number of positive: 3183, number of negative: 3183
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000278 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2786
[LightGBM] [Info] Number of data points in the train set: 6366, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [279]:
lgb1_predictions = lgb1.predict(X_test)



In [285]:
lgb1_cr = classification_report(y_test, lgb1_predictions)
lgb1_cm = confusion_matrix(y_test, lgb1_predictions)
lgb1_auc = roc_auc_score(y_test, lgb1_predictions)
print("AUC Score for light gb:", lgb1_auc)
print("Classification Report:\n", lgb1_cr)
print("Confusion Matrix:\n")    
print(pd.DataFrame(lgb1_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for light gb: 0.7895283260702383
Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.75      0.73       470
           1       0.85      0.83      0.84       830

    accuracy                           0.80      1300
   macro avg       0.78      0.79      0.79      1300
weighted avg       0.80      0.80      0.80      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          352          118
Actual 1          141          689


## KNN Classifier

In [286]:
knn1 = KNeighborsClassifier()
knn1.fit(X_train_balanced, y_train_balanced)

In [287]:
knn1_predictions = knn1.predict(X_test)

In [288]:
knn1_cr = classification_report(y_test, knn1_predictions)
knn1_cm = confusion_matrix(y_test, knn1_predictions)
knn1_auc = roc_auc_score(y_test, knn1_predictions)
print("AUC Score for KNN:", knn1_auc)
print("Classification Report:\n", knn1_cr)
print("Confusion Matrix:\n")
print(pd.DataFrame(knn1_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for KNN: 0.7388105613945142
Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.72      0.67       470
           1       0.83      0.75      0.79       830

    accuracy                           0.74      1300
   macro avg       0.73      0.74      0.73      1300
weighted avg       0.75      0.74      0.75      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          340          130
Actual 1          204          626


### SVC

In [289]:
svc1 = SVC()
svc1.fit(X_train_balanced, y_train_balanced)

In [290]:
svc1_predicitions = svc1.predict(X_test)

In [291]:
svc1_cr = classification_report(y_test, svc1_predicitions)
svc1_cm = confusion_matrix(y_test, svc1_predicitions)
svc1_auc = roc_auc_score(y_test, svc1_predicitions)
print("AUC Score for SVC:", svc1_auc)
print("Classification Report:\n", svc1_cr)
print("Confusion Matrix:\n")
print(pd.DataFrame(svc1_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for SVC: 0.7695462701871315
Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.76      0.71       470
           1       0.85      0.78      0.81       830

    accuracy                           0.77      1300
   macro avg       0.76      0.77      0.76      1300
weighted avg       0.78      0.77      0.78      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          357          113
Actual 1          183          647


### XGB Classifier

In [299]:
xgb1 = XGBClassifier()
xgb1.fit(X_train_balanced, y_train_balanced)

In [300]:
xgb1_predictions = xgb1.predict(X_test)

In [301]:
xgb1_cr = classification_report(y_test, xgb1_predictions)
xgb1_cm = confusion_matrix(y_test, xgb1_predictions)
xgb1_auc = roc_auc_score(y_test, xgb1_predictions)
print("AUC Score for XGBoost:", xgb1_auc)
print("Classification Report:\n", xgb1_cr)
print("Confusion Matrix:\n")
print(pd.DataFrame(xgb1_cm, index=[f"Actual {l}" for l in labels], columns=[f"Predicted {l}" for l in labels]))

AUC Score for XGBoost: 0.7991284286080492
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.74      0.74       470
           1       0.85      0.86      0.86       830

    accuracy                           0.82      1300
   macro avg       0.80      0.80      0.80      1300
weighted avg       0.82      0.82      0.82      1300

Confusion Matrix:

          Predicted 0  Predicted 1
Actual 0          348          122
Actual 1          118          712
