In [None]:
# === SECTION 1: Training 5 Base Models Without Balancing ===

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Models
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# 1. Load Data (heart.csv is read from the current working directory)
seed = 369
df = pd.read_csv('heart.csv')

# 2. Preprocessing
# Integer-encode each text-valued column. A single LabelEncoder is re-fitted
# per column, so afterwards `le` only holds the classes of the last column.
le = LabelEncoder()
categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
df = df.assign(**{col: le.fit_transform(df[col]) for col in categorical_cols})

# Target column vs. everything else
y = df['HeartDisease']
X = df.drop(columns='HeartDisease')

# 3. Split (SEED 369): 80% train / 20% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed
)

# 4. Scale: the scaler is fitted on the training rows only, then applied to both sets
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 5. Define Models (ALL SEED 369); insertion order is the order of the report rows
models = {
    "Random Forest": RandomForestClassifier(random_state=seed),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=seed),
    "LightGBM": LGBMClassifier(random_state=seed, verbose=-1),
    "MLP": MLPClassifier(max_iter=1000, random_state=seed),
    "SVC": SVC(probability=True, random_state=seed),
}

# 6. Run & Print
# Fit every baseline model on the scaled training set and report its test-set
# accuracy, precision, recall and F1 as one table row per model.
rule = "-" * 65
header_cells = [f"{'Model':<15}"] + [
    f"{title:<10}" for title in ('Accuracy', 'Precision', 'Recall', 'F1 Score')
]
print(" | ".join(header_cells))
print(rule)

metric_fns = (accuracy_score, precision_score, recall_score, f1_score)

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    # Same four metrics, same order as the header columns
    acc, prec, rec, f1 = (metric(y_test, y_pred) for metric in metric_fns)

    row_cells = [f"{name:<15} | {acc:.4%}", f"{prec:.4%}", f"{rec:.4%}", f"{f1:.4%}"]
    print("   | ".join(row_cells))

print(rule)

In [None]:
# === SECTION 2: TRAIN SINGLE MODELS WITH OPTIMAL BALANCING ===

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Models
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# Balancing Libraries
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.under_sampling import TomekLinks

# 1. Load Data (heart.csv is read from the current working directory)
seed = 369
df = pd.read_csv('heart.csv')

# 2. Preprocessing
# Integer-encode each text-valued column with one LabelEncoder that is
# re-fitted column by column.
le = LabelEncoder()
categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
df = df.assign(**{col: le.fit_transform(df[col]) for col in categorical_cols})

# Target column vs. everything else
y = df['HeartDisease']
X = df.drop(columns='HeartDisease')

# 3. Split (SEED 369): 80% train / 20% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed
)

# 4. Scale: the scaler is fitted on the training rows only, then applied to both sets
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 5. Define Models (ALL SEED 369)
models = {
    "Random Forest": RandomForestClassifier(random_state=seed),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=seed),
    "LightGBM": LGBMClassifier(random_state=seed, verbose=-1),
    "MLP": MLPClassifier(max_iter=1000, random_state=seed),
    "SVC": SVC(probability=True, random_state=seed),
}

# 6. Define Techniques (ALL SEED 369 where applicable)
# TomekLinks is constructed without a random_state; every other sampler is seeded.
samplers = {
    "SMOTE": SMOTE(random_state=seed),
    "SMOTETomek": SMOTETomek(random_state=seed),
    "ADASYN": ADASYN(random_state=seed),
    "Tomek Links": TomekLinks(),
    "SMOTEENN": SMOTEENN(random_state=seed),
}

# 7. Run Experiment
# For every (model, balancing technique) pair: train on the resampled training
# set, predict on the untouched scaled test set, and print one metrics row.
#
# clone() returns an unfitted estimator with the same hyper-parameters, so each
# pair genuinely trains from scratch instead of refitting a shared instance.
from sklearn.base import clone

print(f"{'Model':<15} | {'Technique':<12} | {'Accuracy':<9} | {'Precision':<9} | {'Recall':<9} | {'F1 Score':<9}")
print("-" * 75)

# A. Balance Data (Training Only)
# Resampling depends on the sampler and the training data, not on the model,
# so it is done once per technique and reused for all models. If a sampler
# fails, the exception is stored and reported on each row that needed it.
resampled = {}
for tech_name, sampler in samplers.items():
    try:
        resampled[tech_name] = sampler.fit_resample(X_train_scaled, y_train)
    except Exception as e:
        resampled[tech_name] = e

for model_name, model in models.items():
    for tech_name in samplers:
        outcome = resampled[tech_name]
        if isinstance(outcome, Exception):
            print(f"{model_name:<15} | {tech_name:<12} | FAILED: {outcome}")
            continue
        X_res, y_res = outcome

        try:
            # B. Train a fresh copy of the configured model
            fresh_model = clone(model)
            fresh_model.fit(X_res, y_res)

            # C. Predict on the original (never resampled) test set
            y_pred = fresh_model.predict(X_test_scaled)

            # D. Metrics
            acc = accuracy_score(y_test, y_pred)
            prec = precision_score(y_test, y_pred)
            rec = recall_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)

            print(f"{model_name:<15} | {tech_name:<12} | {acc:.4f}    | {prec:.4f}    | {rec:.4f}    | {f1:.4f}")

        except Exception as e:
            # Keep the table going; a single failing pair should not abort the run.
            print(f"{model_name:<15} | {tech_name:<12} | FAILED: {e}")

    print("-" * 75) # Separator between models


In [None]:
# === SECTION 3: SINGLE MODEL PERFORMANCE METRICS ===
# Confusion Matrix & ROC Curves for all models

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
from matplotlib.colors import LinearSegmentedColormap

# Models
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# Balancing
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.under_sampling import TomekLinks

# ---------------------------------------------------------
# 1. SETUP DATA (Seed 369)
# ---------------------------------------------------------
seed = 369
df = pd.read_csv('heart.csv')

# Integer-encode the text-valued columns with one re-fitted LabelEncoder
le = LabelEncoder()
df = df.assign(**{
    col: le.fit_transform(df[col])
    for col in ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
})

y = df['HeartDisease']
X = df.drop(columns='HeartDisease')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed
)

# Scaling statistics come from the training rows only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ---------------------------------------------------------
# 2. DEFINE CONFIGURATIONS
# ---------------------------------------------------------
# Candidate classifiers, keyed by the name used in plot titles and legends
models_dict = {
    "Random Forest": RandomForestClassifier(random_state=seed),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=seed),
    "LightGBM": LGBMClassifier(random_state=seed, verbose=-1),
    "MLP": MLPClassifier(max_iter=1000, random_state=seed),
    "SVM": SVC(probability=True, random_state=seed),
}

# Candidate balancing techniques tried for every classifier
balancers = {
    "SMOTE": SMOTE(random_state=seed),
    "SMOTETomek": SMOTETomek(random_state=seed),
    "ADASYN": ADASYN(random_state=seed),
    "Tomek Links": TomekLinks(),
    "SMOTEENN": SMOTEENN(random_state=seed),
}

# Custom Colors: light-to-dark teal ramp for heatmaps, coral for single ROC curves
colors_teal = ["#F2FBF9", "#48C9B0", "#00796B"]
cmap_custom = LinearSegmentedColormap.from_list("CustomTeal", colors_teal)
roc_color = '#FF6F61' # Coral

# Storage for the Combined ROC Plot: model name -> (y_true, probabilities, accuracy)
roc_data = dict()

# ---------------------------------------------------------
# 3. LOOP, TRAIN, & PLOT
# ---------------------------------------------------------
# For each classifier: try every balancing technique, keep the run with the
# highest test accuracy, then draw its confusion matrix and ROC curve.
#
# NOTE(review): the "best" technique is chosen by accuracy on the test set, so
# the reported best accuracy is an optimistic estimate. Selecting on a
# validation split or via cross-validation would avoid that; confirm whether
# this is acceptable for the write-up.

# clone() yields a fresh, unfitted copy of each configured estimator, so the
# hyper-parameters live only in `models_dict`.
from sklearn.base import clone

# Captions for the four cells in the order `cm.flatten()` yields them for a
# binary problem (row = actual class, column = predicted class).
group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']

for model_name, model in models_dict.items():
    print(f"\nProcessing {model_name}...")

    # --- A. Find the Best Balancer for this Model ---
    best_acc = 0
    best_pred = None
    best_probs = None
    best_tech_name = ""

    for tech_name, sampler in balancers.items():
        try:
            # Resample Training Data Only
            X_res, y_res = sampler.fit_resample(X_train_scaled, y_train)

            # Train a fresh copy so no state carries over between techniques
            clf = clone(model)
            clf.fit(X_res, y_res)
            preds = clf.predict(X_test_scaled)
            probs = clf.predict_proba(X_test_scaled)[:, 1]
            acc = accuracy_score(y_test, preds)
        except Exception as exc:
            # Report the failure instead of hiding it, then try the next technique.
            print(f"   ! {tech_name} failed: {exc}")
            continue

        # The first successful run always becomes the incumbent; later runs
        # must strictly beat it (ties keep the earlier technique).
        if best_pred is None or acc > best_acc:
            best_acc = acc
            best_pred = preds
            best_probs = probs
            best_tech_name = tech_name

    if best_pred is None:
        # Nothing to plot and nothing meaningful to store for the combined ROC.
        print(f"   > No balancing technique succeeded for {model_name}; skipping plots.")
        continue

    print(f"   > Best Result: {best_acc:.4%} using {best_tech_name}")

    # Store for Combined Plot
    roc_data[model_name] = (y_test, best_probs, best_acc)

    # --- B. Generate Confusion Matrix Plot ---
    plt.figure(figsize=(6, 5))
    cm = confusion_matrix(y_test, best_pred)

    # Labels with name, count and share of all test samples for each cell
    group_counts = ["{0:0.0f}".format(value) for value in cm.flatten()]
    group_percentages = ["{0:.2%}".format(value) for value in cm.flatten()/np.sum(cm)]
    labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names, group_counts, group_percentages)]
    labels = np.asarray(labels).reshape(2,2)

    # fmt='' because the annotations are pre-formatted strings, not numbers
    sns.heatmap(cm, annot=labels, fmt='', cmap=cmap_custom, cbar=False,
                annot_kws={"fontsize":12, "fontweight":"bold"})

    plt.title(f'Confusion Matrix: {model_name}', fontsize=14, fontweight='bold', pad=15)
    plt.xlabel('Predicted Label', fontsize=12)
    plt.ylabel('Actual Label', fontsize=12)
    plt.xticks([0.5, 1.5], ['Normal', 'Heart Disease'])
    plt.yticks([0.5, 1.5], ['Normal', 'Heart Disease'])
    plt.tight_layout()
    plt.show()

    # --- C. Generate Individual ROC Plot ---
    plt.figure(figsize=(6, 5))
    fpr, tpr, _ = roc_curve(y_test, best_probs)
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr, tpr, color=roc_color, lw=3, label=f'AUC = {roc_auc:.4f}')
    # Diagonal reference line
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate', fontsize=12)
    plt.ylabel('True Positive Rate', fontsize=12)
    plt.title(f'ROC Curve: {model_name}', fontsize=14, fontweight='bold', pad=15)
    plt.legend(loc="lower right", fontsize=11)
    plt.grid(alpha=0.3, linestyle=':')
    plt.tight_layout()
    plt.show()

# ---------------------------------------------------------
# 4. GENERATE COMBINED ROC PLOT (The 6th ROC Graph)
# ---------------------------------------------------------
# Overlay the ROC curve of every entry in `roc_data` on one set of axes.
plt.figure(figsize=(10, 8))

# Distinct line styles/colors/markers so curves stay distinguishable even in
# grayscale. The first and last styles are both solid, so the last one carries
# a marker to tell them apart.
styles = ['-', '--', '-.', ':', '-']
markers = [None, None, None, None, 'o']
colors_combined = ['#FF6F61', '#48C9B0', '#5DADE2', '#F4D03F', '#AF7AC5'] # Coral, Teal, Blue, Gold, Purple

for i, (name, (y_true, y_prob, acc)) in enumerate(roc_data.items()):
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    roc_auc = auc(fpr, tpr)
    # Modulo indexing cycles through the palettes instead of raising
    # IndexError if more curves than styles are ever stored.
    plt.plot(fpr, tpr,
             label=f'{name} (AUC={roc_auc:.3f})',
             color=colors_combined[i % len(colors_combined)],
             linestyle=styles[i % len(styles)],
             marker=markers[i % len(markers)],
             markevery=0.1,   # sparse markers; a marker on every ROC point would hide the line
             markersize=6,
             lw=2.5)

# Diagonal reference line
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Guess')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title('Combined ROC Analysis: All Models', fontsize=16, fontweight='bold', pad=20)
plt.legend(loc="lower right", fontsize=11, frameon=True, fancybox=True, framealpha=0.9)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# === SECTION 4: Comparison bar plot of test accuracies of the 5 models with their best balancing results ===

# Import what this cell plots with, so it also runs on its own in a fresh
# kernel instead of depending on names left behind by an earlier cell.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Model name -> test accuracy in percent.
#    NOTE: these values are hard-coded (copied from a Kaggle run); they are not
#    recomputed by this notebook, so update them by hand if the results change.
model_accuracies = {
    'Random Forest': 90.22,
    'XGBoost': 92.93,
    'LightGBM': 92.93,
    'MLP Classifier': 92.38,
    'SVC': 92.38,
}

# 2. Convert the dictionary to a two-column DataFrame for plotting
scores_df = pd.DataFrame(list(model_accuracies.items()), columns=['Model', 'Test Accuracy'])

# 3. Sort by accuracy, best model first (it becomes the top bar)
scores_df = scores_df.sort_values(by='Test Accuracy', ascending=False)

# 4. Create the bar plot
plt.figure(figsize=(5, 7)) # Adjust size as needed
# Horizontal bars (y='Model') keep the model names readable
sns.barplot(x='Test Accuracy', y='Model', data=scores_df, palette='crest')

# 5. Add titles and labels
plt.title('Comparison of All 5 Models (Test Accuracy)', fontsize=16)
plt.xlabel('Test Accuracy (%)', fontsize=12)
plt.ylabel('Model', fontsize=12)

# 6. Zoom the x-axis in on the differences. The axis does not start at zero,
#    so bar lengths exaggerate the gaps between models.
plt.xlim(80, 94)

# 7. Show the plot
plt.show()