In [67]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import pandas as pd
import warnings
from sklearn.exceptions import UndefinedMetricWarning

warnings.filterwarnings('ignore', category=UndefinedMetricWarning)

In [68]:
df = pd.read_csv("../data/processed/WineQT_after_EDA.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id,quality_encoded
0,0,-0.502843,0.962178,-1.343038,-0.618825,-0.183097,-0.457404,-0.3478,0.66805,1.237452,-0.608962,-0.987243,5,0,2
1,1,-0.267548,1.978506,-1.343038,0.571862,1.154826,0.996522,0.75409,0.119775,-0.792074,0.309972,-0.611947,5,1,2
2,2,-0.267548,1.300954,-1.132273,0.061567,0.789938,-0.041997,0.320012,0.22943,-0.399263,0.080239,-0.611947,5,2,2
3,3,1.732463,-1.409252,1.607677,-0.618825,-0.243911,0.165707,0.520356,0.777704,-1.053949,-0.455806,-0.611947,6,3,3
4,4,-0.502843,0.962178,-1.343038,-0.618825,-0.183097,-0.457404,-0.3478,0.66805,1.237452,-0.608962,-0.987243,5,4,2


In [69]:
df.drop(["Unnamed: 0", "Id"], axis=1, inplace=True)

In [70]:
lr = LogisticRegression(class_weight='balanced')
dt = DecisionTreeClassifier(class_weight='balanced')
rf = RandomForestClassifier(class_weight='balanced')

In [71]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

X = df.drop(["quality", "quality_encoded"], axis=1)
y = df["quality_encoded"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

# Check minimum class size and adjust k_neighbors accordingly
min_class_size = y_train.value_counts().min()
k_neighbors = min(5, min_class_size - 1) if min_class_size > 1 else 1

print(f"Original training set class distribution:\n{y_train.value_counts().sort_index()}")
print(f"\nMinimum class size: {min_class_size}, using k_neighbors={k_neighbors}")

# Apply SMOTE to balance classes in training data
smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print(f"\nBalanced training set class distribution:\n{y_train_balanced.value_counts().sort_index()}")

Original training set class distribution:
0      2
1     20
2    289
3    283
4     83
5      9
Name: quality_encoded, dtype: int64

Minimum class size: 2, using k_neighbors=1

Balanced training set class distribution:
0    289
1    289
2    289
3    289
4    289
5    289
Name: quality_encoded, dtype: int64


In [72]:
lr.fit(X_train_balanced, y_train_balanced)
dt.fit(X_train_balanced, y_train_balanced)
rf.fit(X_train_balanced, y_train_balanced)

In [74]:
from sklearn.metrics import confusion_matrix, classification_report, balanced_accuracy_score, accuracy_score, cohen_kappa_score

def print_detailed_metrics(y_true, y_pred, model_name):
    print(f"\n{'='*60}")
    print(f"{model_name}")
    print(f"{'='*60}")
    print(f"\nAccuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"Balanced Accuracy: {balanced_accuracy_score(y_true, y_pred):.4f}")
    print(f"Cohen's Kappa: {cohen_kappa_score(y_true, y_pred):.4f}")
    print(f"\nClassification Report:")
    print(classification_report(y_test, pred_lr, zero_division=0))
    print(f"\nConfusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    


In [75]:
pred_lr = lr.predict(X_test)
pred_dt = dt.predict(X_test)
pred_rf = rf.predict(X_test)

print_detailed_metrics(y_test, pred_lr, "Logistic Regression")
print_detailed_metrics(y_test, pred_dt, "Decision Tree")
print_detailed_metrics(y_test, pred_rf, "Random Forest")


Logistic Regression

Accuracy: 0.4012
Balanced Accuracy: 0.3444
Cohen's Kappa: 0.2155

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.05      0.40      0.10        10
           2       0.70      0.49      0.57       142
           3       0.57      0.29      0.39       140
           4       0.36      0.49      0.41        41
           5       0.06      0.40      0.11         5

    accuracy                           0.40       339
   macro avg       0.29      0.34      0.26       339
weighted avg       0.58      0.40      0.45       339


Confusion Matrix:
[[ 0  1  0  0  0  0]
 [ 1  4  2  3  0  0]
 [ 5 41 69 21  4  2]
 [ 1 26 26 41 29 17]
 [ 0  2  1  7 20 11]
 [ 0  0  0  0  3  2]]

Decision Tree

Accuracy: 0.5428
Balanced Accuracy: 0.3650
Cohen's Kappa: 0.3134

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0

In [76]:
from sklearn.ensemble import VotingClassifier, BaggingClassifier

voting_clf = VotingClassifier(
    estimators=[('lr', LogisticRegression()), ('dt', DecisionTreeClassifier()), ('rfc', RandomForestClassifier())],
    voting='hard'
)
bagging_lr_clf = BaggingClassifier(
    estimator=LogisticRegression(),
    max_samples=0.8,
    max_features=0.8,
    bootstrap=True,
    bootstrap_features=False,
)
bagging_dt_clf = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=0.8,
    max_features=0.8,
    bootstrap=True,
    bootstrap_features=False,
)
bagging_rf_clf = BaggingClassifier(
    estimator=RandomForestClassifier(),
    n_estimators=10,
    max_samples=0.8,
    max_features=0.8,
    bootstrap=True,
    bootstrap_features=False,
)

In [77]:
voting_clf.fit(X_train_balanced, y_train_balanced)
bagging_lr_clf.fit(X_train_balanced, y_train_balanced)
bagging_dt_clf.fit(X_train_balanced, y_train_balanced)
bagging_rf_clf.fit(X_train_balanced, y_train_balanced)

In [78]:
pred_voting_clf = voting_clf.predict(X_test)
pred_bagging_lr_clf = bagging_lr_clf.predict(X_test)
pred_bagging_dt_clf = bagging_dt_clf.predict(X_test)
pred_bagging_rf_clf = bagging_rf_clf.predict(X_test)

print_detailed_metrics(y_test, pred_bagging_lr_clf, "Bagging Logistic Regression")
print_detailed_metrics(y_test, pred_bagging_dt_clf, "Bagging Decision Tree")


Bagging Logistic Regression

Accuracy: 0.3982
Balanced Accuracy: 0.3502
Cohen's Kappa: 0.2165

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.05      0.40      0.10        10
           2       0.70      0.49      0.57       142
           3       0.57      0.29      0.39       140
           4       0.36      0.49      0.41        41
           5       0.06      0.40      0.11         5

    accuracy                           0.40       339
   macro avg       0.29      0.34      0.26       339
weighted avg       0.58      0.40      0.45       339


Confusion Matrix:
[[ 0  1  0  0  0  0]
 [ 1  5  2  2  0  0]
 [ 6 46 65 18  3  4]
 [ 3 20 27 46 26 18]
 [ 0  2  1  7 17 14]
 [ 0  0  0  0  3  2]]

Bagging Decision Tree

Accuracy: 0.6018
Balanced Accuracy: 0.3606
Cohen's Kappa: 0.3842

Classification Report:
              precision    recall  f1-score   support

           0       0.00 

In [79]:
print_detailed_metrics(y_test, pred_voting_clf, "Voting Classifier")


Voting Classifier

Accuracy: 0.5634
Balanced Accuracy: 0.3521
Cohen's Kappa: 0.3419

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.05      0.40      0.10        10
           2       0.70      0.49      0.57       142
           3       0.57      0.29      0.39       140
           4       0.36      0.49      0.41        41
           5       0.06      0.40      0.11         5

    accuracy                           0.40       339
   macro avg       0.29      0.34      0.26       339
weighted avg       0.58      0.40      0.45       339


Confusion Matrix:
[[ 0  1  0  0  0  0]
 [ 1  2  7  0  0  0]
 [ 2  9 99 29  3  0]
 [ 0 12 40 67 18  3]
 [ 0  1  1 14 22  3]
 [ 0  0  1  1  2  1]]


In [80]:
print_detailed_metrics(y_test, pred_bagging_rf_clf, "Bagging Random Forest")


Bagging Random Forest

Accuracy: 0.5988
Balanced Accuracy: 0.3594
Cohen's Kappa: 0.3816

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.05      0.40      0.10        10
           2       0.70      0.49      0.57       142
           3       0.57      0.29      0.39       140
           4       0.36      0.49      0.41        41
           5       0.06      0.40      0.11         5

    accuracy                           0.40       339
   macro avg       0.29      0.34      0.26       339
weighted avg       0.58      0.40      0.45       339


Confusion Matrix:
[[  0   0   0   1   0   0]
 [  0   1   7   2   0   0]
 [  0   4 104  28   6   0]
 [  0   4  40  72  23   1]
 [  0   0   0  13  25   3]
 [  0   0   0   2   2   1]]


In [81]:
# Summary comparison of all models
import numpy as np

models = ['LR', 'DT', 'RF', 'Voting', 'Bagging LR', 'Bagging DT', 'Bagging RF']
predictions = [pred_lr, pred_dt, pred_rf, pred_voting_clf, pred_bagging_lr_clf, pred_bagging_dt_clf, pred_bagging_rf_clf]

summary_data = []
for model, pred in zip(models, predictions):
    summary_data.append({
        'Model': model,
        'Accuracy': f"{accuracy_score(y_test, pred):.4f}",
        'Balanced Accuracy': f"{balanced_accuracy_score(y_test, pred):.4f}",
        'Cohen\'s Kappa': f"{cohen_kappa_score(y_test, pred):.4f}"
    })

summary_df = pd.DataFrame(summary_data)
print("\n" + "="*70)
print("MODEL COMPARISON SUMMARY")
print("="*70)
print(summary_df.to_string(index=False))


MODEL COMPARISON SUMMARY
     Model Accuracy Balanced Accuracy Cohen's Kappa
        LR   0.4012            0.3444        0.2155
        DT   0.5428            0.3650        0.3134
        RF   0.6165            0.3607        0.4013
    Voting   0.5634            0.3521        0.3419
Bagging LR   0.3982            0.3502        0.2165
Bagging DT   0.6018            0.3606        0.3842
Bagging RF   0.5988            0.3594        0.3816
