In [None]:
import warnings
from sklearn.exceptions import ConvergenceWarning, UndefinedMetricWarning
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the red-wine quality table; the file is semicolon-delimited and its first row holds the column names.
wine_data = pd.read_csv('winequality-red.csv', delimiter=';', header=0)

In [None]:
# Preview the first five rows to sanity-check the load.
wine_data.head(n=5)

In [None]:
# Distribution of the target: number of wines per quality score.
sns.set_style("whitegrid")

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=wine_data, x='quality', palette="Set2", ax=ax)

ax.set_ylabel('Count', fontsize=12)
ax.set_xlabel('Quality', fontsize=12)

# `fig` is the current figure, so this lays out the same canvas plt.tight_layout() would.
fig.tight_layout()
plt.show()


In [None]:
# Box plots of every feature, grouped by quality score.

# Expected column layout of the dataset; 'quality' (the target) is the last entry.
column_names = [
    'fixed acidity', 'volatile acidity', 'citric acid',
    'residual sugar', 'chlorides', 'free sulfur dioxide',
    'total sulfur dioxide', 'density', 'pH', 'sulphates',
    'alcohol', 'quality'
]

# Plot every column except the target.
selected_columns = [name for name in column_names if name != 'quality']

sns.set_style("whitegrid")

# Three panels per row; ceiling division so a partly filled last row still gets axes.
n_cols = 3
n_rows = -(-len(selected_columns) // n_cols)

fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, 10))
axes = axes.flatten()  # 1-D view so panels can be paired with columns directly

for axis, col in zip(axes, selected_columns):
    sns.boxplot(data=wine_data, x='quality', y=col, palette="Set2", ax=axis)
    # NOTE(review): str.title() lower-cases the letters after the first, so 'pH' is labelled 'Ph'.
    axis.set_ylabel(col.replace('_', ' ').title(), fontsize=12)
    axis.set_xlabel('Quality', fontsize=12)

# Remove the grid cells that did not receive a plot.
for unused_axis in axes[len(selected_columns):]:
    fig.delaxes(unused_axis)

plt.tight_layout()
plt.show()





In [None]:
# Separate the target from the predictors, then standardise the predictors.
y = wine_data['quality']  # target
X = wine_data.drop(columns='quality')  # features
# NOTE(review): the scaler is fitted on every row, i.e. before any train/test split is made.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


Feature selection with resampling


Precision, Recall, F1-Score: The model performs better than any previous model, especially on classes 7 and 8, which may be attributable to the use of interaction terms and resampling.

In [None]:
# Create pairwise interaction terms.
# With interaction_only=True and include_bias=False, PolynomialFeatures returns the
# original (degree-1) columns followed by every pairwise product, so its output is
# already "original features + interactions". Stacking X_scaled in front of it again
# would give every original feature twice, so the transformer output is used as-is.
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_interactions = poly.fit_transform(X_scaled)
X_enhanced = X_interactions

In [None]:
# Resampling.
# The hold-out split is made FIRST and only the training partition is resampled.
# Resampling with replacement before splitting would place copies of the same row
# in both the training and the test set, which inflates every test metric.
# Assigning the target by position keeps features and labels aligned regardless of y's index.
data_enhanced = pd.DataFrame(X_enhanced).assign(quality=np.asarray(y))
data_train, data_test = train_test_split(data_enhanced, test_size=0.2, random_state=42)

# Same scheme as before, applied to training rows only: quality == 6 is kept as-is,
# and all remaining rows are drawn with replacement until they match its size.
# NOTE(review): all non-6 classes are pooled here rather than resampled per class - confirm this is intended.
data_majority = data_train[data_train.quality == 6]
data_minority = data_train[data_train.quality != 6]
data_minority_upsampled = resample(data_minority, replace=True, n_samples=len(data_majority), random_state=42)
# Shuffle so the training rows are not ordered by class group.
data_balanced = pd.concat([data_majority, data_minority_upsampled]).sample(frac=1, random_state=42)

X_train_balanced = data_balanced.drop('quality', axis=1).values
y_train_balanced = data_balanced['quality'].values
# The test partition keeps its original class distribution; the "_balanced" names are
# retained so the cells that consume these variables keep working.
X_test_balanced = data_test.drop('quality', axis=1).values
y_test_balanced = data_test['quality'].values


In [None]:
# Hyper-parameter search for the MLP.
# random_state is fixed so that weight initialisation - and therefore the reported
# scores and the selected parameters - are repeatable between runs, in line with
# the seeded models used elsewhere in this notebook.
mlp = MLPClassifier(max_iter=1000, random_state=42)
parameter_space = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant', 'adaptive'],
}

# Exhaustive grid search with 3-fold cross-validation, using all available cores.
clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3)
clf.fit(X_train_balanced, y_train_balanced)

print('Best parameters found:\n', clf.best_params_)

# Display every candidate's mean score (+/- two standard deviations) for comparison.
cv_results = clf.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
for mean, std, params in zip(means, stds, cv_results['params']):
    print(f"{mean:.3f} (+/-{std * 2:.3f}) for {params}")


In [None]:
# Hold-out split of the scaled (non-enhanced, non-resampled) features.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42)

# MLP hyper-parameters, written out by hand.
# NOTE(review): presumably copied from an earlier grid-search run; they are not read from clf.best_params_.
best_params = dict(
    activation='relu',
    alpha=0.05,
    hidden_layer_sizes=(50,),
    learning_rate='constant',
    solver='adam',
)

In [None]:
# Baseline ANN: train on the plain scaled split and report per-class metrics on its test part.
ann_model = MLPClassifier(max_iter=1000, random_state=42, **best_params).fit(X_train, y_train)
ann_pd = ann_model.predict(X_test)

print("ANN Classification Report:")
print(classification_report(y_test, ann_pd))

In [None]:
# Same MLP configuration, trained on the resampled data with interaction features.
mlp_balanced = MLPClassifier(max_iter=1000, random_state=42, **best_params).fit(
    X_train_balanced, y_train_balanced)

# Score against the matching hold-out partition.
predictions_balanced = mlp_balanced.predict(X_test_balanced)
report_balanced = classification_report(y_test_balanced, predictions_balanced)
accuracy_balanced = accuracy_score(y_test_balanced, predictions_balanced)

print("Accuracy Balanced with Feature Interactions:", accuracy_balanced)
print("Classification Report Balanced with Feature Interactions:\n", report_balanced)

Using bagging (bootstrap aggregation)

In [None]:
# Bagging ensemble of ten MLPs that share the hyper-parameters in best_params.
mlp_base = MLPClassifier(**best_params, max_iter=1000, random_state=42)
bagging_ensemble = BaggingClassifier(estimator=mlp_base, n_estimators=10, random_state=42).fit(
    X_train_balanced, y_train_balanced)

# Evaluate on the hold-out partition of the resampled data.
y_pred = bagging_ensemble.predict(X_test_balanced)
report = classification_report(y_test_balanced, y_pred)
accuracy = accuracy_score(y_test_balanced, y_pred)

print("Bagging model accuracy:", accuracy)
print("Bagging model report:\n", report)

using K-fold Cross-Validation

In [None]:
# 5-fold cross-validation of the bagged MLP on the k best enhanced features.
# Feature selection is placed inside the pipeline so that, in every fold, the k best
# features are chosen from that fold's training rows only. Selecting on the full
# data set beforehand would let the held-out rows influence which features are kept.
k = 10
selector = SelectKBest(f_classif, k=k)
cv_model = make_pipeline(selector, bagging_ensemble)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

# cross_val_score fits clones of the pipeline, so `selector` and `bagging_ensemble`
# themselves are left untouched by the scoring.
cross_val_scores = cross_val_score(cv_model, X_enhanced, y, cv=kf)
average_score = np.mean(cross_val_scores)

# Full-data selection, kept only so X_selected remains available for inspection;
# it plays no part in the scores above.
X_selected = selector.fit_transform(X_enhanced, y)

# Per-fold accuracy with the mean drawn as a reference line.
folds = range(1, len(cross_val_scores) + 1)
plt.figure(figsize=(10, 6))
plt.plot(folds, cross_val_scores, marker='o', linestyle='-', color='b', label='Cross-Validation Score per Fold')
plt.axhline(y=average_score, color='r', linestyle='--', label=f'Average Score: {average_score:.2%}')
plt.xlabel('Fold Number')
plt.ylabel('Accuracy')
plt.title('ANN Model Cross-Validation Scores')
plt.xticks(folds)
plt.legend()
plt.grid(True)
plt.show()


print("Cross-Validated Scores:", cross_val_scores)
print("Average Score:", average_score)

In [None]:
# Decision tree: accuracy on the resampled/enhanced features versus the raw features.
# random_state is fixed because the tree draws on randomness when choosing splits;
# without a seed the two accuracies change from run to run.
clf = DecisionTreeClassifier(criterion='entropy', random_state=42)
clf.fit(X_train_balanced, y_train_balanced)
accuracy_optimized = clf.score(X_test_balanced, y_test_balanced)
print('Optimized Accuracy: ', accuracy_optimized)

# NOTE: this re-binds X_train/X_test/y_train/y_test to a split of the raw, unscaled columns.
X_train, X_test, y_train, y_test = train_test_split(
    wine_data.drop('quality', axis=1).values, wine_data['quality'].values, test_size=0.2, random_state=42)
# Re-fit the same estimator on the raw split for the comparison figure.
clf.fit(X_train, y_train)
accuracy_initial = clf.score(X_test, y_test)
print('Initial Accuracy: ', accuracy_initial)

In [None]:
# Effect of max_depth on 3-fold cross-validated accuracy, one panel per depth.
# An explicit figure size is used because twelve panels do not fit legibly on the default canvas.
plt.figure(figsize=(16, 10))
for max_depth in range(1, 12):
    # Seeded so each panel is repeatable between runs.
    clf = DecisionTreeClassifier(criterion='entropy', max_depth=max_depth, random_state=42)
    accuracy = cross_val_score(clf, X_train_balanced, y_train_balanced, cv=3)
    # Number the folds 1..n from the scores actually returned (previously drawn at x = 2..4).
    fold_ids = range(1, len(accuracy) + 1)
    plt.subplot(3, 4, max_depth)
    plt.plot(fold_ids, accuracy, marker='o')
    # Horizontal red line marks the mean accuracy across folds.
    plt.plot([fold_ids[0], fold_ids[-1]], [accuracy.mean(), accuracy.mean()], color='red')
    plt.xlabel('k')
    plt.ylabel('Accuracy')
    plt.title('Max Depth = {}'.format(max_depth))
plt.tight_layout()
plt.show()

In [None]:
# Effect of min_samples_split on 3-fold cross-validated accuracy, one panel per value.
# An explicit figure size is used because the panel grid does not fit legibly on the default canvas.
plt.figure(figsize=(16, 10))
i = 0
for i, min_samples_split in enumerate(range(2, 42, 4), start=1):
    # Seeded so each panel is repeatable between runs.
    clf = DecisionTreeClassifier(criterion='entropy', min_samples_split=min_samples_split, random_state=42)
    accuracy = cross_val_score(clf, X_train_balanced, y_train_balanced, cv=3)
    # Number the folds 1..n from the scores actually returned (previously drawn at x = 2..4).
    fold_ids = range(1, len(accuracy) + 1)
    plt.subplot(3, 4, i)
    plt.plot(fold_ids, accuracy, marker='o')
    # Horizontal red line marks the mean accuracy across folds.
    plt.plot([fold_ids[0], fold_ids[-1]], [accuracy.mean(), accuracy.mean()], color='red')
    plt.xlabel('k')
    plt.ylabel('Accuracy')
    plt.title('MSS = {}'.format(min_samples_split))
plt.tight_layout()
plt.show()


In [None]:
# Effect of min_samples_leaf on 3-fold cross-validated accuracy, one panel per value.
# An explicit figure size is used because the panel grid does not fit legibly on the default canvas.
plt.figure(figsize=(16, 10))
i = 0
for i, min_samples_leaf in enumerate(range(1, 41, 4), start=1):
    # Seeded so each panel is repeatable between runs.
    clf = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=min_samples_leaf, random_state=42)
    accuracy = cross_val_score(clf, X_train_balanced, y_train_balanced, cv=3)
    # Number the folds 1..n from the scores actually returned (previously drawn at x = 2..4).
    fold_ids = range(1, len(accuracy) + 1)
    plt.subplot(3, 4, i)
    plt.plot(fold_ids, accuracy, marker='o')
    # Horizontal red line marks the mean accuracy across folds.
    plt.plot([fold_ids[0], fold_ids[-1]], [accuracy.mean(), accuracy.mean()], color='red')
    plt.xlabel('k')
    plt.ylabel('Accuracy')
    plt.title('MSL = {}'.format(min_samples_leaf))
plt.tight_layout()
plt.show()


In [None]:
# Effect of max_features on 3-fold cross-validated accuracy, one panel per value.
# An explicit figure size is used because the panel grid does not fit legibly on the default canvas.
plt.figure(figsize=(16, 10))
i = 0
for i, max_features in enumerate(range(1, 21, 2), start=1):
    # Seeded: with max_features set, the candidate features at each split are drawn at random,
    # so an unseeded tree gives different curves on every run.
    clf = DecisionTreeClassifier(criterion='entropy', max_features=max_features, random_state=42)
    accuracy = cross_val_score(clf, X_train_balanced, y_train_balanced, cv=3)
    # Number the folds 1..n from the scores actually returned (previously drawn at x = 2..4).
    fold_ids = range(1, len(accuracy) + 1)
    plt.subplot(3, 4, i)
    plt.plot(fold_ids, accuracy, marker='o')
    # Horizontal red line marks the mean accuracy across folds.
    plt.plot([fold_ids[0], fold_ids[-1]], [accuracy.mean(), accuracy.mean()], color='red')
    plt.xlabel('k')
    plt.ylabel('Accuracy')
    plt.title('Max Features = {}'.format(max_features))
plt.tight_layout()
plt.show()


In [None]:
# Random forest with default settings, scored on the hold-out partition.
# Seeded like the sweep below so this reference accuracy is repeatable between runs.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_balanced, y_train_balanced)

y_pred = rf_classifier.predict(X_test_balanced)

accuracy = accuracy_score(y_test_balanced, y_pred)
print("Accuracy on test set:", accuracy)


# Forest sizes to compare.
n_estimators_values = [10, 50, 100, 200, 300, 400, 500]

# Hold-out accuracy for each forest size, in the same order as n_estimators_values.
accuracies = []

for n in n_estimators_values:
    # Train a forest with n trees and score it on the same hold-out partition.
    rf = RandomForestClassifier(n_estimators=n, random_state=42)
    rf.fit(X_train_balanced, y_train_balanced)
    y_pred = rf.predict(X_test_balanced)

    accuracy = accuracy_score(y_test_balanced, y_pred)
    accuracies.append(accuracy)

    print(f"Accuracy with {n} estimators: {accuracy:.2%}")

# Plot accuracy against the number of estimators.
plt.figure(figsize=(10, 6))
plt.plot(n_estimators_values, accuracies, marker='o', linestyle='-', color='b')
plt.title('Impact of n_estimators on Accuracy')
plt.xlabel('Number of Estimators')
plt.ylabel('Accuracy')
plt.grid(True)
plt.show()