*AI Aid: I used ChatGPT for assistance with this homework.*

In [None]:
import warnings, os, math
warnings.filterwarnings("ignore")
import numpy as np, pandas as pd, matplotlib.pyplot as plt
import seaborn as sns
np.random.seed(42)

In [None]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, ConfusionMatrixDisplay
from sklearn.base import clone
from scipy.stats import pearsonr
from sklearn.feature_selection import f_classif

In [None]:
# Read the Dry Bean spreadsheet into a DataFrame and preview the first rows.
# NOTE(review): the path looks like a Colab Google Drive mount — confirm it exists in your runtime.
df = pd.read_excel(io='/content/drive/MyDrive/Dry_Bean_Dataset.xlsx')
df.head()

# Q1
1. The statistical measures for the attributes are displayed below, along with histogram plots.
2. Most of the attributes show no clear evidence of skewness or any irregularities in the data. There are no null values in the dataset either.
3. Since the target labels are text, I've label-encoded them to numerical values.
4. I've split the dataset into Train (60%), Val (20%) and Test (20%).

In [None]:
# Summary statistics, transposed so each described column becomes a row.
df.describe().transpose()

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# One 20-bin histogram per column that DataFrame.hist can plot.
df.hist(figsize=(15, 10), bins=20)
plt.show()

In [None]:
# Separate the target column ('Class') from the predictor columns.
y = df['Class']
X = df.drop(columns='Class')

In [None]:
from sklearn.model_selection import train_test_split

# First split: 60% train, 40% held out (test_size=0.4), stratified by class
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y
)

# Second split: halve the held-out 40% into validation and test
# (test_size=0.5, i.e. roughly 20% of the full data each), again stratified
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Sanity check: the three splits together account for every input row
assert len(X_train) + len(X_val) + len(X_test) == len(X)

print("Train shape:", X_train.shape, y_train.shape)
print("Validation shape:", X_val.shape, y_val.shape)
print("Test shape:", X_test.shape, y_test.shape)

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to every split so validation/test never influence it.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the class-name -> integer mapping from the training labels only.
encoder = LabelEncoder().fit(y_train)

# Apply the same mapping to every split.
y_train_enc, y_val_enc, y_test_enc = (
    encoder.transform(labels) for labels in (y_train, y_val, y_test)
)

# Show which integer code each class name received.
print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))

# Q2
1. Below I've plotted the Pearson correlation matrix between features. Some sets of features are highly correlated with each other and negatively correlated with other sets of features, which suggests they carry clear identifying characteristics for the classes.
2. For the feature–label relationship, I've performed an ANOVA F-test, which gave high F-scores for many features; this also indicates strong class-identifying power. I've also made scatter plots between each feature and the label.
3. The feature importance plot also shows the top features contributing to identifying the classes, such as area, perimeter, etc.

In [None]:
# Exploratory analysis on the TRAINING split only.
# X_train / y_train_enc are used directly rather than rebinding the
# notebook-wide X / y, so re-running the split cell later still operates on
# the full dataset.

# Feature-Feature Correlation Heatmap
corr_matrix = X_train.corr(method='pearson')
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
            center=0, square=True, linewidths=0.5)
plt.title('Pearson Correlation Matrix - Bean', fontsize=16, pad=20)
plt.tight_layout()
plt.savefig('correlation_heatmap.png', dpi=300)
plt.show()

# Feature-Label Relationship (ANOVA F-test)
f_scores, p_values = f_classif(X_train, y_train_enc)

feature_label_df = pd.DataFrame({
    'Feature': X_train.columns,
    'ANOVA_F': f_scores,
    'P-value': p_values
})
# Flag features whose p-value falls below the 0.05 threshold
feature_label_df['Significant'] = np.where(feature_label_df['P-value'] < 0.05, 'Yes', 'No')
feature_label_df = feature_label_df.sort_values('ANOVA_F', ascending=False)

print("\n=== Feature-Label Association (ANOVA F-test) ===")
print(feature_label_df[['Feature', 'ANOVA_F', 'P-value', 'Significant']].to_string(index=False))

# Feature-Label Bar Plot (F-scores)
plt.figure(figsize=(10, 6))
plt.barh(feature_label_df['Feature'], feature_label_df['ANOVA_F'], color='skyblue', alpha=0.8)
plt.xlabel('ANOVA F-score')
plt.title('Feature Importance based on F-test')
plt.gca().invert_yaxis()  # highest F-score at the top
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.savefig('feature_label_importance.png', dpi=300)
plt.show()

# Features vs Label Scatter Plots (Class-colored)
# Size the grid from the number of features so that none is silently dropped
# (a fixed 3x4 grid only has room for 12 features).
n_features = X_train.shape[1]
n_cols = 4
n_rows = max(1, math.ceil(n_features / n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 4 * n_rows), squeeze=False)
axes = axes.flatten()

for ax, feature in zip(axes, X_train.columns):
    ax.scatter(X_train[feature], y_train_enc, c=y_train_enc, cmap=plt.cm.Set1, alpha=0.6)
    ax.set_xlabel(feature, fontsize=9)
    ax.set_ylabel('Label', fontsize=9)
    ax.set_title(f'{feature}', fontsize=9)
    ax.grid(alpha=0.3)

# Remove any unused axes left over in the last row
for unused_ax in axes[n_features:]:
    fig.delaxes(unused_ax)

plt.suptitle('Features vs Label (Class-colored)', fontsize=14)
plt.tight_layout()
plt.savefig('scatter_features_vs_label.png', dpi=300)
plt.show()

print("\n✓ Analysis complete! Check the saved PNG files.")

In [None]:
def show_confusion(y_true, y_pred, title):
    """Draw the confusion matrix of `y_pred` against `y_true` and show it.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with `y_true`.
        title: Text placed as the title of the plot's axes.
    """
    matrix_display = ConfusionMatrixDisplay.from_predictions(y_true, y_pred)
    axis = matrix_display.ax_
    axis.set_title(title)
    plt.show()

# Q3
It was valid to split the dataset in Question 1. The split used for this question has already been described there.

# Q4
1. I've trained the classifiers: Softmax, SVM and RandomForest and tweaked their hyperparameters for the best fit model. Results are down the line.
2. I've reported classification metrics under each model, as well as confusion matrix and classification report of 3 best classifiers down the line.
3. Impact of hyperparameters has been discussed under each model below.

# Softmax Regression

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import ParameterGrid

In [None]:
# Define parameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['lbfgs', 'saga'],
    'max_iter': [100, 200, 500, 700, 1000]
}

best_f1_lr = -1
best_params_lr = None
best_model_lr = None

# Manual search — train on training set, evaluate on validation set
for params in ParameterGrid(param_grid):
    # `multi_class` is deprecated in recent scikit-learn releases; with the
    # lbfgs/saga solvers the default already fits a multinomial (softmax)
    # model for multiclass targets, so the argument is omitted.
    model = LogisticRegression(**params)
    model.fit(X_train_scaled, y_train_enc)

    y_val_pred = model.predict(X_val_scaled)
    # Support-weighted average of the per-class F1 scores
    f1 = f1_score(y_val_enc, y_val_pred, average='weighted')

    # Strict '>' keeps the first configuration encountered on ties
    if f1 > best_f1_lr:
        best_f1_lr = f1
        best_params_lr = params
        best_model_lr = model

print("Best Logistic Regression Params:", best_params_lr)
print("Best Validation F1-Score:", best_f1_lr)

# Impact of hyperparameters
1. C: higher C values (like 1 or 10) likely gave higher F1-scores — the model became more flexible and better fit the training data, while smaller C (e.g., 0.01) underfit and produced lower validation F1.
2. solver: lbfgs probably converged faster and gave stable scores, while saga might have been slower or slightly noisier in F1 due to stochastic updates.
3. max_iter: The model didn't show any better or worse results when it was increased more than 100
4. the best-performing combination balanced a moderate–high C, used lbfgs, and had enough max_iter to fully converge, resulting in the highest validation F1-score printed at the end.

In [None]:
def evaluate_model(model, X_train, y_train, X_val, y_val, X_test, y_test):
    """Score a fitted classifier on the train, validation and test splits.

    Args:
        model: Fitted estimator exposing `predict`.
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels.
        X_test, y_test: Test features and labels.

    Returns:
        DataFrame indexed by 'Train', 'Validation', 'Test' with columns
        'Accuracy', 'Precision', 'Recall', 'F1' (the last three are
        weighted averages across classes).
    """
    splits = {
        'Train': (X_train, y_train),
        'Validation': (X_val, y_val),
        'Test': (X_test, y_test),
    }
    results = {}
    for split_name, (features, targets) in splits.items():
        predicted = model.predict(features)
        results[split_name] = {
            'Accuracy': accuracy_score(targets, predicted),
            'Precision': precision_score(targets, predicted, average='weighted'),
            'Recall': recall_score(targets, predicted, average='weighted'),
            'F1': f1_score(targets, predicted, average='weighted'),
        }
    # The outer keys become columns; transpose so each split is a row.
    return pd.DataFrame(results).T

In [None]:
# Score the best softmax-regression model on all three splits.
lr_results = evaluate_model(
    best_model_lr,
    X_train_scaled, y_train_enc,
    X_val_scaled, y_val_enc,
    X_test_scaled, y_test_enc,
)
print(lr_results)

# Support Vector Machine

In [None]:
# Define parameter grid
param_grid = {
    'C': [0.01,0.1, 1, 10],
    'kernel': ['linear', 'poly', 'rbf'],
    'degree': [2, 3],
    'gamma': ['scale', 'auto']
}

best_f1_svm = -1
best_params_svm = None
best_model_svm = None

# Manual search — train on training set, evaluate on validation set
for params in ParameterGrid(param_grid):
    # probability=True so the fitted model exposes predict_proba
    # (the ensemble cell further down uses soft voting)
    model = SVC(**params,probability=True)
    model.fit(X_train_scaled, y_train_enc)

    y_val_pred = model.predict(X_val_scaled)
    # Support-weighted average of the per-class F1 scores
    f1 = f1_score(y_val_enc, y_val_pred, average='weighted')

    # Strict '>' keeps the first configuration encountered on ties
    if f1 > best_f1_svm:
        best_f1_svm = f1
        best_params_svm = params
        best_model_svm = model

# Label corrected: these are the SVM's parameters, not Logistic Regression's
print("Best SVM Params:", best_params_svm)
print("Best Validation F1-Score:", best_f1_svm)

# Impact of Hyperparameters
1. C - Higher C (like 1 or 10) likely improved F1 on the validation set by letting the model fit harder boundaries, but slightly risking overfitting. Lower C (like 0.01) underfit and gave weaker separation between classes.
2. kernel - The rbf kernel probably yielded the best F1 since it captures non-linear relationships
3. degree - This mattered only for the poly kernel; Hence for rbf and linear it had no effect.
4. gamma - With the rbf kernel, 'scale' generally gave better F1 than 'auto', since it adapts gamma based on feature variance.
5. The best SVM likely used a moderate–high C and the rbf kernel with gamma='scale' (the iteration limit was not tuned and was left at the SVC default), giving the balance between fitting complex patterns and keeping validation F1 stable.

In [None]:
# Score the best SVM on all three splits.
svm_results = evaluate_model(
    best_model_svm,
    X_train_scaled, y_train_enc,
    X_val_scaled, y_val_enc,
    X_test_scaled, y_test_enc,
)
print("\n=== SVM Performance ===")
print(svm_results)

# Random Forest Classifier

In [None]:
# Define parameter grid
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

best_f1_rf = -1
best_params_rf = None
best_model_rf = None

# Manual search — train on training set, evaluate on validation set
for params in ParameterGrid(param_grid):
    # Fixed random_state so every configuration is fitted reproducibly
    model = RandomForestClassifier(**params,random_state=42)
    model.fit(X_train_scaled, y_train_enc)

    y_val_pred = model.predict(X_val_scaled)
    # Support-weighted average of the per-class F1 scores
    f1 = f1_score(y_val_enc, y_val_pred, average='weighted')

    # Strict '>' keeps the first configuration encountered on ties
    if f1 > best_f1_rf:
        best_f1_rf = f1
        best_params_rf = params
        best_model_rf = model

# Label corrected: these are the Random Forest's parameters
print("Best Random Forest Params:", best_params_rf)
print("Best Validation F1-Score:", best_f1_rf)

# Impact of hyperparameters
1. n_estimators: More trees (200 vs 100) slightly improved validation F1 by stabilizing predictions.
2. max_depth: Although max_depth was not limited, restricting other hyperparameters ensured best results while not overfitting much.
3. min_samples_split and min_samples_leaf: In spite of the low values, the validation F1 was not significantly lower than the train F1, so overfitting did not seem to be a concern

In [None]:
# Score the best Random Forest on all three splits and print ITS results
# (the header and table previously shown here were the SVM's).
rf_results = evaluate_model(best_model_rf, X_train_scaled, y_train_enc, X_val_scaled, y_val_enc, X_test_scaled, y_test_enc)
print("\n=== Random Forest Performance ===")
print(rf_results)

In [None]:
# RandomForest feature importance
# Index by X_train.columns: the model was fitted on the scaled version of
# X_train, so its importances follow that frame's column order. This avoids
# depending on whatever the notebook-wide `X` currently points to.
imp = pd.Series(best_model_rf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
display(imp.to_frame("RF_feature_importance"))
imp.plot(kind="bar", figsize=(10,4), title="RandomForest Feature Importance")
plt.show()

# Classification report and confusion matrix for Train, Val and Test of best models from each classifier

In [None]:
# Evaluate best models on train/val/test
def eval_and_show(model, name):
    """Print a classification report and plot a confusion matrix per split.

    Uses the notebook-level scaled features and encoded labels for the
    train, validation and test splits, in that order.

    Args:
        model: Fitted classifier exposing `predict`.
        name: Label used in the printed header and the plot titles.
    """
    print(f"\n=== {name}===")
    splits = (
        ("TRAIN", X_train_scaled, y_train_enc),
        ("VAL", X_val_scaled, y_val_enc),
        ("TEST", X_test_scaled, y_test_enc),
    )
    for split_name, features, targets in splits:
        predictions = model.predict(features)
        print(f"\n{split_name} report")
        print(classification_report(targets, predictions, digits=3))
        show_confusion(targets, predictions, f"{name} — {split_name}")

# Report every tuned model in turn: softmax regression, SVM, random forest.
for fitted_model, model_label in (
    (best_model_lr, "Best LogisticRegression"),
    (best_model_svm, "Best SVM"),
    (best_model_rf, "Best RandomForest"),
):
    eval_and_show(fitted_model, model_label)


# Q5
1. I've done an ensemble of the 3 observed best classifiers and displayed the results below
2. The individual models and the ensemble gave very similar macro and weighted average F1 scores on the test set, which affirms that the ensemble is the best and a reasonable representation of the individual models for predicting this dataset's labels
   RF avg F-1: 93%, 92%
   SVM avg F-1: 94%, 92%
   Softmax avg F-1: 94%, 92%
   Ensemble avg F-1: 94%,92%

# Ensemble

In [None]:
# Soft-voting ensemble built from the three tuned classifiers.
base_estimators = [
    ('logreg', best_model_lr),
    ('svm', best_model_svm),
    ('rf', best_model_rf),
]
voting_clf = VotingClassifier(estimators=base_estimators, voting='soft', n_jobs=-1)

# Fit ensemble on the same training data
voting_clf.fit(X_train_scaled, y_train_enc)

print("Ensemble on validation:")
y_val_pred = voting_clf.predict(X_val_scaled)
print(classification_report(y_val_enc, y_val_pred, digits=3))
show_confusion(y_val_enc, y_val_pred, "Ensemble — VAL")

# Refit on TRAIN+VAL before scoring on TEST. Note this refit runs
# unconditionally; nothing here checks whether the ensemble won on VAL.
X_trval = np.vstack([X_train_scaled, X_val_scaled])
y_trval = np.concatenate([y_train_enc, y_val_enc])
voting_clf.fit(X_trval, y_trval)

print("Ensemble on test (after refit on train+val):")
y_test_pred = voting_clf.predict(X_test_scaled)
print(classification_report(y_test_enc, y_test_pred, digits=3))
show_confusion(y_test_enc, y_test_pred, "Ensemble — TEST")