In [7]:
'''Code Written by Sai Sukheshwar Boganadula and Bala Subramanyam Pavan Kumar Kasturi. '''
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_validate

# Load red wine data
df = pd.read_csv('winequality-red.csv', sep=';')

# Explore data
print(df.shape)
print(df['quality'].value_counts())

# Split data
X = df.drop('quality', axis=1)
y = df['quality']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# Define classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

clf1 = RandomForestClassifier()
clf2 = SVC()

# CV evaluation
cv = RepeatedKFold(n_splits=3, n_repeats=10, random_state=42)

# Fit classifiers with CV
clf1_scores = cross_validate(clf1, X_train_scaled, y_train, cv=cv, scoring='accuracy')
clf2_scores = cross_validate(clf2, X_train_scaled, y_train, cv=cv, scoring='accuracy')

# Identify best classifier
print('AVG Scores:')
print(clf1_scores['test_score'].mean(), clf2_scores['test_score'].mean())

best_clf = clf1

# Train final model
best_clf.fit(X_train_scaled, y_train)

# Evaluate on test set
y_pred = best_clf.predict(X_test_scaled)
print('Test Accuracy:', accuracy_score(y_test, y_pred))

# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

# Split the balanced data
X_train_balanced, X_test_balanced, y_train_balanced, y_test_balanced = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42)

# Scale balanced data
X_train_scaled_balanced = scaler.fit_transform(X_train_balanced)
X_test_scaled_balanced = scaler.transform(X_test_balanced)

# Fit classifiers with CV on balanced data
clf1_scores_balanced = cross_validate(clf1, X_train_scaled_balanced, y_train_balanced, cv=cv, scoring='accuracy')
clf2_scores_balanced = cross_validate(clf2, X_train_scaled_balanced, y_train_balanced, cv=cv, scoring='accuracy')

# Identify best classifier on balanced data
print('AVG Scores on Balanced Data:')
print(clf1_scores_balanced['test_score'].mean(), clf2_scores_balanced['test_score'].mean())

best_clf_balanced = clf1

# Train final model on balanced data
best_clf_balanced.fit(X_train_scaled_balanced, y_train_balanced)

# Evaluate on test set
y_pred_balanced = best_clf_balanced.predict(X_test_scaled_balanced)
print('Balanced Test Accuracy:', accuracy_score(y_test_balanced, y_pred_balanced))


(1599, 12)
5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64
AVG Scores:
0.6653641338009845 0.6189202610929695
Test Accuracy: 0.659375
AVG Scores on Balanced Data:
0.8468259544958101 0.7463711957225821
Balanced Test Accuracy: 0.8731117824773413
