In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import pandas as pd
import warnings
from sklearn.exceptions import UndefinedMetricWarning

warnings.filterwarnings('ignore', category=UndefinedMetricWarning)

In [3]:
df = pd.read_csv("../data/processed/WineQT_after_EDA.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,quality_encoded
0,0,0.746111,-0.534476,-0.349212,-1.531789,-1.011514,-1.549287,-1.818415,-0.763562,0.539409,-1.104678,-1.938349,3,0
1,1,0.24853,-0.452275,-0.79126,-1.537051,-0.002793,-1.614104,-2.127956,-0.879958,0.010934,-0.957717,-2.445859,3,0
2,2,-0.995421,1.123245,-2.0654,-1.423926,-0.827382,-1.614104,-2.147926,-1.112752,2.770749,-1.178159,-0.500404,3,0
3,3,0.24853,-0.918081,-0.97328,-1.56862,-0.443108,-1.238168,-1.808429,-0.96975,1.302762,-0.394364,-1.177084,3,0
4,4,-0.622235,0.671139,-2.013394,-1.468649,-0.931457,-1.60114,-2.177882,-0.919866,1.889957,-1.300627,-0.246649,3,0


In [4]:
df.drop(["Unnamed: 0"], axis=1, inplace=True)

In [5]:
lr = LogisticRegression(class_weight='balanced')
dt = DecisionTreeClassifier(class_weight='balanced')
rf = RandomForestClassifier(class_weight='balanced')

In [14]:
from sklearn.model_selection import train_test_split

X = df.drop(["quality", "quality_encoded"], axis=1)
y = df["quality_encoded"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [15]:
lr.fit(X_train, y_train)
dt.fit(X_train, y_train)
rf.fit(X_train, y_train)

In [16]:
from sklearn.metrics import confusion_matrix, classification_report, balanced_accuracy_score, accuracy_score, cohen_kappa_score

def print_detailed_metrics(y_true, y_pred, model_name):
    print(f"\n{'='*60}")
    print(f"{model_name}")
    print(f"{'='*60}")
    print(f"\nAccuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"Balanced Accuracy: {balanced_accuracy_score(y_true, y_pred):.4f}")
    print(f"Cohen's Kappa: {cohen_kappa_score(y_true, y_pred):.4f}")
    print(f"\nClassification Report:")
    print(classification_report(y_test, pred_lr, zero_division=0))
    print(f"\nConfusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    


In [17]:
pred_lr = lr.predict(X_test)
pred_dt = dt.predict(X_test)
pred_rf = rf.predict(X_test)

print_detailed_metrics(y_test, pred_lr, "Logistic Regression")
print_detailed_metrics(y_test, pred_dt, "Decision Tree")
print_detailed_metrics(y_test, pred_rf, "Random Forest")


Logistic Regression

Accuracy: 0.3005
Balanced Accuracy: 0.2995
Cohen's Kappa: 0.1843

Classification Report:
              precision    recall  f1-score   support

           0       0.20      0.31      0.24       971
           1       0.20      0.15      0.17       979
           2       0.54      0.41      0.47      1050
           3       0.50      0.74      0.60       979
           4       0.16      0.09      0.11       969
           5       0.21      0.17      0.19      1003
           6       0.21      0.24      0.22       974

    accuracy                           0.30      6925
   macro avg       0.29      0.30      0.29      6925
weighted avg       0.29      0.30      0.29      6925


Confusion Matrix:
[[297 155  19   2 112 155 231]
 [302 143  57  32  99 149 197]
 [110  36 431 315  36  43  79]
 [ 12   7 206 720  14  14   6]
 [179  95  47 301  85 117 145]
 [289 124  22  68  89 171 240]
 [312 144  18   2 106 158 234]]

Decision Tree

Accuracy: 0.5560
Balanced Accuracy: 0.5

In [18]:
from sklearn.ensemble import VotingClassifier, BaggingClassifier

voting_clf = VotingClassifier(
    estimators=[('lr', LogisticRegression()), ('dt', DecisionTreeClassifier()), ('rfc', RandomForestClassifier())],
    voting='hard'
)
bagging_lr_clf = BaggingClassifier(
    estimator=LogisticRegression(),
    max_samples=0.8,
    max_features=0.8,
    bootstrap=True,
    bootstrap_features=False,
)
bagging_dt_clf = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=0.8,
    max_features=0.8,
    bootstrap=True,
    bootstrap_features=False,
)
bagging_rf_clf = BaggingClassifier(
    estimator=RandomForestClassifier(),
    n_estimators=10,
    max_samples=0.8,
    max_features=0.8,
    bootstrap=True,
    bootstrap_features=False,
)

In [19]:
voting_clf.fit(X_train, y_train)
bagging_lr_clf.fit(X_train, y_train)
bagging_dt_clf.fit(X_train, y_train)
bagging_rf_clf.fit(X_train, y_train)

In [20]:
pred_voting_clf = voting_clf.predict(X_test)
pred_bagging_lr_clf = bagging_lr_clf.predict(X_test)
pred_bagging_dt_clf = bagging_dt_clf.predict(X_test)
pred_bagging_rf_clf = bagging_rf_clf.predict(X_test)

print_detailed_metrics(y_test, pred_bagging_lr_clf, "Bagging Logistic Regression")
print_detailed_metrics(y_test, pred_bagging_dt_clf, "Bagging Decision Tree")


Bagging Logistic Regression

Accuracy: 0.2946
Balanced Accuracy: 0.2939
Cohen's Kappa: 0.1776

Classification Report:
              precision    recall  f1-score   support

           0       0.20      0.31      0.24       971
           1       0.20      0.15      0.17       979
           2       0.54      0.41      0.47      1050
           3       0.50      0.74      0.60       979
           4       0.16      0.09      0.11       969
           5       0.21      0.17      0.19      1003
           6       0.21      0.24      0.22       974

    accuracy                           0.30      6925
   macro avg       0.29      0.30      0.29      6925
weighted avg       0.29      0.30      0.29      6925


Confusion Matrix:
[[263 163  20   1 113 115 296]
 [272 140  50  34 107  94 282]
 [ 88  41 407 336  38  27 113]
 [  9   8 197 729  14   9  13]
 [167  90  43 300  84  85 200]
 [269 120  17  70  96 111 320]
 [288 149  15   2 107 107 306]]

Bagging Decision Tree

Accuracy: 0.5877
Balanc

In [21]:
print_detailed_metrics(y_test, pred_voting_clf, "Voting Classifier")


Voting Classifier

Accuracy: 0.5905
Balanced Accuracy: 0.5910
Cohen's Kappa: 0.5224

Classification Report:
              precision    recall  f1-score   support

           0       0.20      0.31      0.24       971
           1       0.20      0.15      0.17       979
           2       0.54      0.41      0.47      1050
           3       0.50      0.74      0.60       979
           4       0.16      0.09      0.11       969
           5       0.21      0.17      0.19      1003
           6       0.21      0.24      0.22       974

    accuracy                           0.30      6925
   macro avg       0.29      0.30      0.29      6925
weighted avg       0.29      0.30      0.29      6925


Confusion Matrix:
[[698 114  18   1  31  48  61]
 [234 526  42  28  27  43  79]
 [135  62 579 200  19  18  37]
 [ 15  12 125 783  33   9   2]
 [184  65  24 185 419  37  55]
 [227  87   9  37  40 533  70]
 [236 102  12   2  23  48 551]]


In [80]:
print_detailed_metrics(y_test, pred_bagging_rf_clf, "Bagging Random Forest")


Bagging Random Forest

Accuracy: 0.5988
Balanced Accuracy: 0.3594
Cohen's Kappa: 0.3816

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.05      0.40      0.10        10
           2       0.70      0.49      0.57       142
           3       0.57      0.29      0.39       140
           4       0.36      0.49      0.41        41
           5       0.06      0.40      0.11         5

    accuracy                           0.40       339
   macro avg       0.29      0.34      0.26       339
weighted avg       0.58      0.40      0.45       339


Confusion Matrix:
[[  0   0   0   1   0   0]
 [  0   1   7   2   0   0]
 [  0   4 104  28   6   0]
 [  0   4  40  72  23   1]
 [  0   0   0  13  25   3]
 [  0   0   0   2   2   1]]


In [22]:
# Summary comparison of all models
import numpy as np

models = ['LR', 'DT', 'RF', 'Voting', 'Bagging LR', 'Bagging DT', 'Bagging RF']
predictions = [pred_lr, pred_dt, pred_rf, pred_voting_clf, pred_bagging_lr_clf, pred_bagging_dt_clf, pred_bagging_rf_clf]

summary_data = []
for model, pred in zip(models, predictions):
    summary_data.append({
        'Model': model,
        'Accuracy': f"{accuracy_score(y_test, pred):.4f}",
        'Balanced Accuracy': f"{balanced_accuracy_score(y_test, pred):.4f}",
        'Cohen\'s Kappa': f"{cohen_kappa_score(y_test, pred):.4f}"
    })

summary_df = pd.DataFrame(summary_data)
print("\n" + "="*70)
print("MODEL COMPARISON SUMMARY")
print("="*70)
print(summary_df.to_string(index=False))


MODEL COMPARISON SUMMARY
     Model Accuracy Balanced Accuracy Cohen's Kappa
        LR   0.3005            0.2995        0.1843
        DT   0.5560            0.5560        0.4819
        RF   0.5900            0.5905        0.5219
    Voting   0.5905            0.5910        0.5224
Bagging LR   0.2946            0.2939        0.1776
Bagging DT   0.5877            0.5880        0.5192
Bagging RF   0.5867            0.5872        0.5180
