# 1. Import Libraries

In [None]:

# Data manipulation and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Scikit-learn classifiers
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
    BaggingClassifier,
    VotingClassifier,
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# Preprocessing and evaluation tools
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)


# 2. Load & Inspect Data

In [2]:
# ------------------------------------------------
# Load the Wine Quality Dataset
# ------------------------------------------------

import os
from pathlib import Path

import pandas as pd

# Locate the CSV without relying on a single machine-specific path.
# Candidates are tried in order and the first existing file wins:
#   1. the path in the WINE_QUALITY_CSV environment variable (if set),
#   2. "winequality-red.csv" in the current working directory,
#   3. the original absolute Windows path (kept so the notebook still runs
#      unchanged on the machine it was written on).
WINE_CSV_CANDIDATES = [
    Path("winequality-red.csv"),
    Path(r"C:\RaginiWorkArea\applied-ml-rsin\applied-ml-rsin\lab05\winequality-red.csv"),
]
if os.environ.get("WINE_QUALITY_CSV"):
    WINE_CSV_CANDIDATES.insert(0, Path(os.environ["WINE_QUALITY_CSV"]))

wine_csv_path = next((path for path in WINE_CSV_CANDIDATES if path.is_file()), None)
if wine_csv_path is None:
    tried = ", ".join(str(path) for path in WINE_CSV_CANDIDATES)
    raise FileNotFoundError(f"winequality-red.csv not found; tried: {tried}")

# The file is read with ';' as the field separator.
df = pd.read_csv(wine_csv_path, sep=";")

# Fail fast if the file did not parse into the expected columns
# (for example, when the separator does not match the file).
assert "quality" in df.columns, "expected a 'quality' column in the dataset"

# Display structure and the first few rows
df.info()
print(df.head())

# ------------------------------------------------
# Dataset Description:
# ------------------------------------------------
# The dataset includes 11 physicochemical input variables (features):
# - fixed acidity          mostly tartaric acid
# - volatile acidity       mostly acetic acid (vinegar)
# - citric acid            can add freshness and flavor
# - residual sugar         remaining sugar after fermentation
# - chlorides              salt content
# - free sulfur dioxide    protects wine from microbes
# - total sulfur dioxide   sum of free and bound forms
# - density                related to sugar content
# - pH                     acidity level (lower = more acidic)
# - sulphates              antioxidant and microbial stabilizer
# - alcohol                % alcohol by volume

# The target variable is:
# - quality (integer score from 0 to 10, rated by wine tasters)

# Plan:
# - We'll simplify 'quality' into three categories:
#     - low (3–4), medium (5–6), high (7–8)
# - This turns the task into a three-class classification problem.
# - Dataset contains 1599 samples and 12 columns (11 features + 1 target)

# TODO(review): a "Load Spiral Dataset" section header was stubbed here with
# no code underneath — either implement it or remove this note.



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00 

# 3. Prepare the data

In [4]:
# ------------------------------------------------
# Section 3: Prepare the Data
# ------------------------------------------------

import pandas as pd

# Assuming the dataframe 'df' is already loaded with the winequality dataset

# ------------------------------------------------
# Helper function for converting quality to a label
# ------------------------------------------------

# Takes one input, the quality (which we will temporarily name q while in the function)
# And returns a string of the quality label (low, medium, high)
# This function will be used to create the quality_label column
def quality_to_label(q):
    """Map a wine quality score to its category name.

    Scores of 4 or less are "low", scores above 4 and up to 6 are "medium",
    and everything else is "high".
    """
    # Walk the upper bounds in ascending order; the first one that q does
    # not exceed decides the label.
    for upper_bound, label in ((4, "low"), (6, "medium")):
        if q <= upper_bound:
            return label
    return "high"

# Derive the text category for every row and store it as 'quality_label'
df["quality_label"] = df["quality"].map(quality_to_label)

# ------------------------------------------------
# Helper function for converting quality to a numeric value
# ------------------------------------------------

# Create a numeric column for modeling: 0 = low, 1 = medium, 2 = high
def quality_to_number(q):
    """Map a wine quality score to a class index.

    Returns 0 for scores of 4 or less, 1 for scores above 4 and up to 6,
    and 2 otherwise.
    """
    if q <= 4:
        return 0
    return 1 if q <= 6 else 2

# Derive the class index for every row and store it as 'quality_numeric'
df["quality_numeric"] = df["quality"].map(quality_to_number)

# ------------------------------------------------
# Preview the first five rows to confirm both new columns exist
# ------------------------------------------------
print(df.head(5))


   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality quality_label  quality_numeric  
0      9.4        5        medium

# 4. Feature Selection & Justification

In [5]:
# ------------------------------------------------
# Section 4: Feature Selection and Justification
# ------------------------------------------------
#
# Target (y):
#   'quality_numeric' — the class index derived from 'quality'.
#
# Features (X):
#   every remaining column once the three target-related columns are removed:
#   - 'quality'         : the raw score the target is derived from
#   - 'quality_label'   : the text form of the same target
#   - 'quality_numeric' : the target itself
#   Keeping any of them in X would hand the answer to the model.
#
# Justification:
#   What is left are the physicochemical measurements of each wine
#   (acidity, sulphur dioxide, density, pH, sulphates, alcohol, ...), which
#   are the predictors we want the models to learn wine quality from.

# Target
y = df["quality_numeric"]

# Features
X = df.drop(["quality", "quality_label", "quality_numeric"], axis="columns")

# Show the first rows of both to confirm the selection
print("Input Features (X):\n", X.head())
print("\nTarget Variable (y):\n", y.head())


Input Features (X):
    fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  
0      9.4  
1      9.8  
2      9.8  
3      9.8  


# 5. Split the data into Train & Test

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split

# Target is the class index; features are everything except the three
# target-related columns.
y = df["quality_numeric"]
X = df.drop(["quality", "quality_label", "quality_numeric"], axis="columns")

# Hold out 20% of the rows for testing; stratify on y so both partitions
# keep the same class proportions, and fix the seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# ------------------------------------------------
# Model 1: Random Forest (100 trees)
# ------------------------------------------------
# fit() returns the estimator itself, so construction and training chain.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict on both partitions so train and test scores can be compared
y_train_pred, y_test_pred = rf_model.predict(X_train), rf_model.predict(X_test)

# Accuracy and weighted F1, each computed for (train, test) in that order
train_acc, test_acc = (
    accuracy_score(truth, predicted)
    for truth, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
)
train_f1, test_f1 = (
    f1_score(truth, predicted, average="weighted")
    for truth, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
)

print("Random Forest (100 trees) Results:")
print("Confusion Matrix (Test):")
print(confusion_matrix(y_test, y_test_pred))
print(f"Train Accuracy: {train_acc:.4f}, Test Accuracy: {test_acc:.4f}")
print(f"Train F1 Score: {train_f1:.4f}, Test F1 Score: {test_f1:.4f}")


Random Forest (100 trees) Results:
Confusion Matrix (Test):
[[  0  13   0]
 [  0 256   8]
 [  0  15  28]]
Train Accuracy: 1.0000, Test Accuracy: 0.8875
Train F1 Score: 1.0000, Test F1 Score: 0.8661


In [6]:
# ------------------------------------------------
# Feature Selection and Justification (repeat)
# ------------------------------------------------
# NOTE(review): this cell repeats the feature-selection cell that appears
# earlier in the notebook; X and y are rebuilt from df in exactly the same way.
#
# Target (y):
#   'quality_numeric' — the class index derived from 'quality'.
#
# Features (X):
#   all columns except the three target-related ones ('quality',
#   'quality_label', 'quality_numeric'); none of those may be used as an
#   input because each one encodes the value being predicted.
#
# Justification:
#   The remaining columns are the physicochemical properties of the wine
#   (acidity, alcohol content, pH, ...), i.e. the predictors for quality.

# Target
y = df["quality_numeric"]

# Features
X = df.drop(["quality", "quality_label", "quality_numeric"], axis="columns")

# Show the first rows of both to confirm the selection
print("Input Features (X):\n", X.head())
print("\nTarget Variable (y):\n", y.head())


Input Features (X):
    fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  
0      9.4  
1      9.8  
2      9.8  
3      9.8  


# 6. A. Evaluate Model Performance (Random Forest) 

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split

# Target is the class index; features are everything except the three
# target-related columns.
y = df["quality_numeric"]
X = df.drop(["quality", "quality_label", "quality_numeric"], axis="columns")

# 80/20 stratified split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Start a fresh list of per-model result rows; later model cells append to it
results = []

# ------------------------------------------------
# Model 1: Random Forest (100 trees)
# ------------------------------------------------
# fit() returns the estimator itself, so construction and training chain.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict on both partitions so train and test scores can be compared
y_train_pred, y_test_pred = rf_model.predict(X_train), rf_model.predict(X_test)

# Accuracy and weighted F1, each computed for (train, test) in that order
train_acc, test_acc = (
    accuracy_score(truth, predicted)
    for truth, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
)
train_f1, test_f1 = (
    f1_score(truth, predicted, average="weighted")
    for truth, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
)

print("Random Forest (100 trees) Results:")
print("Confusion Matrix (Test):")
print(confusion_matrix(y_test, y_test_pred))
print(f"Train Accuracy: {train_acc:.4f}, Test Accuracy: {test_acc:.4f}")
print(f"Train F1 Score: {train_f1:.4f}, Test F1 Score: {test_f1:.4f}")

# Record this model's scores as one row of the comparison table
results.append(
    {
        "Model": "Random Forest (100)",
        "Train Accuracy": train_acc,
        "Test Accuracy": test_acc,
        "Train F1": train_f1,
        "Test F1": test_f1,
    }
)

# Echo every row collected so far
print("\nResults:")
for result in results:
    print(result)


Random Forest (100 trees) Results:
Confusion Matrix (Test):
[[  0  13   0]
 [  0 256   8]
 [  0  15  28]]
Train Accuracy: 1.0000, Test Accuracy: 0.8875
Train F1 Score: 1.0000, Test F1 Score: 0.8661

Results:
{'Model': 'Random Forest (100)', 'Train Accuracy': 1.0, 'Test Accuracy': 0.8875, 'Train F1': 1.0, 'Test F1': 0.8660560842649911}


# 6. B. Evaluate Model Performance (AdaBoost, 100 estimators)

In [21]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# ------------------------------------------------
# Model 2: AdaBoost (100 estimators)
# ------------------------------------------------
# This cell reuses X_train, X_test, y_train, y_test and the `results` list
# defined by the Random Forest evaluation cell; run that cell first.
adaboost_label = "AdaBoost (100)"
adaboost_model = AdaBoostClassifier(n_estimators=100, random_state=42)

# Train the AdaBoost (100 estimators) model
adaboost_model.fit(X_train, y_train)

# Predictions on both partitions so train and test scores can be compared
y_train_pred_adaboost = adaboost_model.predict(X_train)
y_test_pred_adaboost = adaboost_model.predict(X_test)

# Evaluation: accuracy and class-frequency-weighted F1
train_acc_adaboost = accuracy_score(y_train, y_train_pred_adaboost)
test_acc_adaboost = accuracy_score(y_test, y_test_pred_adaboost)
train_f1_adaboost = f1_score(y_train, y_train_pred_adaboost, average="weighted")
test_f1_adaboost = f1_score(y_test, y_test_pred_adaboost, average="weighted")

print("AdaBoost (100 estimators) Results:")
print("Confusion Matrix (Test):")
print(confusion_matrix(y_test, y_test_pred_adaboost))
print(f"Train Accuracy: {train_acc_adaboost:.4f}, Test Accuracy: {test_acc_adaboost:.4f}")
print(f"Train F1 Score: {train_f1_adaboost:.4f}, Test F1 Score: {test_f1_adaboost:.4f}")

# Save Results.
# Drop any row this cell added on a previous run before appending, so that
# re-running the cell replaces the AdaBoost row instead of duplicating it.
# Slice assignment keeps the same list object that other cells refer to.
results[:] = [row for row in results if row.get("Model") != adaboost_label]
results.append(
    {
        "Model": adaboost_label,
        "Train Accuracy": train_acc_adaboost,
        "Test Accuracy": test_acc_adaboost,
        "Train F1": train_f1_adaboost,
        "Test F1": test_f1_adaboost,
    }
)

# Print the results list
print("\nResults:")
for result in results:
    print(result)


AdaBoost (100 estimators) Results:
Confusion Matrix (Test):
[[  1  12   0]
 [  5 240  19]
 [  0  20  23]]
Train Accuracy: 0.8342, Test Accuracy: 0.8250
Train F1 Score: 0.8209, Test F1 Score: 0.8158

Results:
{'Model': 'Random Forest (100)', 'Train Accuracy': 1.0, 'Test Accuracy': 0.8875, 'Train F1': 1.0, 'Test F1': 0.8660560842649911}
{'Model': 'AdaBoost (100)', 'Train Accuracy': 0.8342455043002346, 'Test Accuracy': 0.825, 'Train F1': 0.8208625713138017, 'Test F1': 0.8158028741740215}


# 7. Compare Results

In [22]:
import pandas as pd

# `results` is the list of per-model score dicts built by the model cells.
# Build the table first and derive the gap on the DataFrame, rather than
# writing a new key into each dict: the shared `results` list stays exactly
# as the model cells left it, so re-running any cell gives consistent rows.
results_df = pd.DataFrame(results)

# Accuracy Gap = Train Accuracy - Test Accuracy (larger gap = more overfitting)
results_df["Accuracy Gap"] = results_df["Train Accuracy"] - results_df["Test Accuracy"]

# Display the comparison summary
print("\nSummary of All Models:")
display(results_df)

# Sort by Test Accuracy (highest first) to find the best models
sorted_results = results_df.sort_values(by="Test Accuracy", ascending=False)
print("\nSorted Results by Test Accuracy:")
display(sorted_results)



Summary of All Models:


Unnamed: 0,Model,Train Accuracy,Test Accuracy,Train F1,Test F1,Accuracy Gap
0,Random Forest (100),1.0,0.8875,1.0,0.866056,0.1125
1,AdaBoost (100),0.834246,0.825,0.820863,0.815803,0.009246



Sorted Results by Test Accuracy:


Unnamed: 0,Model,Train Accuracy,Test Accuracy,Train F1,Test F1,Accuracy Gap
0,Random Forest (100),1.0,0.8875,1.0,0.866056,0.1125
1,AdaBoost (100),0.834246,0.825,0.820863,0.815803,0.009246
