In [None]:
# === SECTION 1: 2 BEST MODELS ENSEMBLE EVALUATION  ===

# Weighted
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import ADASYN

# --- SETUP: load the CSV, integer-encode text columns, split, scale ---
df = pd.read_csv('heart.csv')

# A single encoder instance is refit on each text column in turn, so after
# this statement it only holds the mapping learned from the last column.
le = LabelEncoder()
df = df.assign(**{
    name: le.fit_transform(df[name])
    for name in ('Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope')
})

# 'HeartDisease' is the target; every remaining column is a feature.
y = df['HeartDisease']
X = df.drop(columns='HeartDisease')

# Hold out 20% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=369
)

# Scaling statistics come from the training rows only and are then applied
# unchanged to the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- TRAIN MODELS ON THEIR SPECIFIC DATA ---
# Each model is fit on its own resampled copy of the scaled training split;
# the test split is never passed to a resampler.

# 1. LightGBM (SMOTETomek)
smt = SMOTETomek(random_state=369)
X_train_lgbm, y_train_lgbm = smt.fit_resample(X_train_scaled, y_train)
model_lgbm = LGBMClassifier(random_state=369, verbose=-1)
model_lgbm.fit(X_train_lgbm, y_train_lgbm)

# 2. XGBoost (ADASYN)
# `use_label_encoder` is intentionally not passed: XGBoost has deprecated the
# argument and current releases only emit a "parameter not used" warning for it.
ada = ADASYN(random_state=369)
X_train_xgb, y_train_xgb = ada.fit_resample(X_train_scaled, y_train)
model_xgb = XGBClassifier(eval_metric='logloss', random_state=369)
model_xgb.fit(X_train_xgb, y_train_xgb)

# --- WEIGHTED VOTING LOOP ---
# Soft-voting sweep: blend the two models' class probabilities with
# complementary weights and print the test accuracy of every weight pair.
#
# NOTE(review): accuracy is measured on the test split, so the "best" weights
# are tuned on the same rows they are scored on and the winning figure is
# optimistic. Prefer selecting weights on a validation split taken from the
# training data and reporting test accuracy once, afterwards.
print(f"{'LGBM Weight':<12} | {'XGB Weight':<12} | {'Accuracy':<10}")
print("-" * 40)

probs_lgbm = model_lgbm.predict_proba(X_test_scaled)
probs_xgb = model_xgb.predict_proba(X_test_scaled)

best_acc = 0
best_w = (0, 0)

# Try LGBM weights 0.0, 0.1, ..., 1.0 (XGBoost gets the remainder).
# linspace includes both endpoints, so each model on its own (weight 1.0) is
# part of the comparison, and it avoids the rounding drift of a float-step arange.
for w_lgbm in np.linspace(0.0, 1.0, 11):
    w_xgb = 1.0 - w_lgbm

    # Weighted average of the per-class probabilities, then pick the top class
    avg_probs = (probs_lgbm * w_lgbm) + (probs_xgb * w_xgb)
    y_pred = np.argmax(avg_probs, axis=1)
    acc = accuracy_score(y_test, y_pred)

    # Column widths mirror the header row above
    print(f"{w_lgbm:<12.1f} | {w_xgb:<12.1f} | {acc:.4%}")

    # Strict '>' keeps the first pair (lowest LGBM weight) when accuracies tie
    if acc > best_acc:
        best_acc = acc
        best_w = (w_lgbm, w_xgb)

print("-" * 40)
print(f"🏆 BEST WEIGHTS: {best_w[0]:.1f} LGBM / {best_w[1]:.1f} XGB -> {best_acc:.4%}")

In [None]:
# === SECTION 2: PROPOSED METHOD (LIGHTGBM 0.8 + XGBOOST 0.2) ===

# Weighted (lgbm 0.8, xgboost 0.2)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import ADASYN

# 1. Load the dataset and integer-encode its text columns
df = pd.read_csv('heart.csv')

categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
# The same encoder object is refit for every listed column, so afterwards it
# only remembers the mapping of the last one.
le = LabelEncoder()
df = df.assign(**{name: le.fit_transform(df[name]) for name in categorical_cols})

# Target column vs. feature columns
y = df['HeartDisease']
X = df.drop(columns='HeartDisease')

# 2. Split Data (Seed 369): 20% of the rows are held out for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=369
)

# 3. Scale Data: statistics are fit on the training rows only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ---------------------------------------------------------
# STEP 4: TRAIN MODELS ON THEIR SPECIALIZED DATA
# ---------------------------------------------------------
# Each model is fit on its own resampled copy of the scaled training split.

# Model A: LightGBM (Using SMOTETomek)
print("Training LightGBM (w/ SMOTETomek)...")
smt = SMOTETomek(random_state=369)
X_train_lgbm, y_train_lgbm = smt.fit_resample(X_train_scaled, y_train)
model_lgbm = LGBMClassifier(random_state=369, verbose=-1)
model_lgbm.fit(X_train_lgbm, y_train_lgbm)

# Model B: XGBoost (Using ADASYN)
# `use_label_encoder` is intentionally not passed: XGBoost has deprecated the
# argument and current releases only emit a "parameter not used" warning for it.
print("Training XGBoost (w/ ADASYN)...")
ada = ADASYN(random_state=369)
X_train_xgb, y_train_xgb = ada.fit_resample(X_train_scaled, y_train)
model_xgb = XGBClassifier(eval_metric='logloss', random_state=369)
model_xgb.fit(X_train_xgb, y_train_xgb)

# ---------------------------------------------------------
# STEP 5: APPLY WEIGHTED VOTING (0.8 / 0.2)
# ---------------------------------------------------------

# Ensemble weights, defined once so the blend and every printed message
# below always describe the same numbers.
W_LGBM, W_XGB = 0.8, 0.2

# Per-class probabilities from each model for every test row
probs_lgbm = model_lgbm.predict_proba(X_test_scaled)
probs_xgb = model_xgb.predict_proba(X_test_scaled)

# Apply Weights
# Formula: (Prob_LGBM * W_LGBM) + (Prob_XGB * W_XGB)
weighted_probs = (probs_lgbm * W_LGBM) + (probs_xgb * W_XGB)

# Convert to final prediction (Class with highest blended score)
y_pred_ensemble = np.argmax(weighted_probs, axis=1)

# ---------------------------------------------------------
# STEP 6: EVALUATE
# ---------------------------------------------------------

# Accuracy of each model on its own, and of the blend, on the same test rows
acc_lgbm = accuracy_score(y_test, model_lgbm.predict(X_test_scaled))
acc_xgb = accuracy_score(y_test, model_xgb.predict(X_test_scaled))
acc_ensemble = accuracy_score(y_test, y_pred_ensemble)

print("\n" + "="*50)
print(f"RESULTS (Seed 369 | Weights: LGBM={W_LGBM}, XGB={W_XGB})")
print("="*50)
print(f"1. LightGBM (Individual):  {acc_lgbm:.4%}")
print(f"2. XGBoost  (Individual):  {acc_xgb:.4%}")
print("-" * 50)
print(f"🏆 WEIGHTED ENSEMBLE:       {acc_ensemble:.4%}")
print("="*50)

# Compare the blend against the stronger of the two individual models.
# The split label (e.g. "80/20") is derived from the weights actually used.
best_single = max(acc_lgbm, acc_xgb)
weight_label = f"{W_LGBM * 100:.0f}/{W_XGB * 100:.0f}"

if acc_ensemble > best_single:
    print(f"✅ SUCCESS: The {weight_label} weighting improved the result!")
elif acc_ensemble == best_single:
    print("⚠️ SAME: The result matched the best individual model.")
else:
    print("❌ DECREASE: This weighting combination performed worse.")

In [None]:
# === SECTION 3: CONFUSION MATRIX & ROC Curve (PROPOSED METHOD) ===

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import ADASYN
from matplotlib.colors import LinearSegmentedColormap # <--- Added this

# 1. Read the CSV and turn each text column into integer codes
df = pd.read_csv('heart.csv')

categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
# One encoder is reused and refit column by column; once this statement has
# run it only carries the mapping of the final column.
le = LabelEncoder()
df = df.assign(**{name: le.fit_transform(df[name]) for name in categorical_cols})

# Features are every column except the 'HeartDisease' target
y = df['HeartDisease']
X = df.drop(columns='HeartDisease')

# 2. Split Data (Seed 369): keep 20% of the rows aside as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=369
)

# 3. Scale Data: fit on the training rows, reuse the same statistics for test
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ---------------------------------------------------------
# STEP 4: TRAIN MODELS
# ---------------------------------------------------------
# Each model is fit on its own resampled copy of the scaled training split.

# Model A: LightGBM (Using SMOTETomek)
smt = SMOTETomek(random_state=369)
X_train_lgbm, y_train_lgbm = smt.fit_resample(X_train_scaled, y_train)
model_lgbm = LGBMClassifier(random_state=369, verbose=-1)
model_lgbm.fit(X_train_lgbm, y_train_lgbm)

# Model B: XGBoost (Using ADASYN)
# `use_label_encoder` is intentionally not passed: XGBoost has deprecated the
# argument and current releases only emit a "parameter not used" warning for it.
ada = ADASYN(random_state=369)
X_train_xgb, y_train_xgb = ada.fit_resample(X_train_scaled, y_train)
model_xgb = XGBClassifier(eval_metric='logloss', random_state=369)
model_xgb.fit(X_train_xgb, y_train_xgb)

# ---------------------------------------------------------
# STEP 5: WEIGHTED ENSEMBLE (0.8 / 0.2)
# ---------------------------------------------------------

# Ensemble weights, named once instead of being repeated as bare literals
W_LGBM, W_XGB = 0.8, 0.2

# Per-class probabilities from each model for every test row
probs_lgbm = model_lgbm.predict_proba(X_test_scaled)
probs_xgb = model_xgb.predict_proba(X_test_scaled)

# Blend the probabilities, then derive both outputs the plots need:
# the hard class prediction and the score of class index 1.
weighted_probs = (probs_lgbm * W_LGBM) + (probs_xgb * W_XGB)
y_pred_ensemble = np.argmax(weighted_probs, axis=1)
y_prob_positive = weighted_probs[:, 1]

acc_ensemble = accuracy_score(y_test, y_pred_ensemble)
print(f"🏆 WEIGHTED ENSEMBLE ACCURACY: {acc_ensemble:.4%}")

# ---------------------------------------------------------
# STEP 6: PLOTTING
# ---------------------------------------------------------

# Custom teal colormap for the heatmap
# (White -> Pastel Teal -> Dark Teal)
colors_teal = ["#F2FBF9", "#48C9B0", "#00796B"]
cmap_custom = LinearSegmentedColormap.from_list("CustomTeal", colors_teal)

# One row, two panels: confusion matrix (left) and ROC curve (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# --- Plot 1: Confusion Matrix ---
# labels=[0, 1] pins the matrix to 2x2 with class 0 first, so the four cell
# names, the reshape below and the two tick labels stay valid even if one
# class happens to be missing from the test labels and the predictions.
cm = confusion_matrix(y_test, y_pred_ensemble, labels=[0, 1])
group_names = ['True Negative','False Positive','False Negative','True Positive']
group_counts = [f"{value:.0f}" for value in cm.flatten()]
group_percentages = [f"{value:.2%}" for value in cm.flatten() / np.sum(cm)]
# Each cell shows: name, raw count, share of all test rows
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names, group_counts, group_percentages)]
labels = np.asarray(labels).reshape(2,2)

# fmt='' because the annotations are pre-formatted strings, not numbers
sns.heatmap(cm, annot=labels, fmt='', cmap=cmap_custom, cbar=False,
            ax=axes[0], annot_kws={"fontsize":12, "fontweight":"bold"})
axes[0].set_title('Confusion Matrix\n(Weighted Ensemble)', fontsize=14, fontweight='bold', pad=15)
axes[0].set_xlabel('Predicted', fontsize=12)
axes[0].set_ylabel('Actual', fontsize=12)
axes[0].set_xticklabels(['Normal', 'Heart Disease'])
axes[0].set_yticklabels(['Normal', 'Heart Disease'])

# --- Plot 2: ROC Curve ---
# Built from the blended score of class index 1 rather than the hard predictions
fpr, tpr, thresholds = roc_curve(y_test, y_prob_positive)
roc_auc = auc(fpr, tpr)

axes[1].plot(fpr, tpr, color='#FF6F61', lw=3, label=f'ROC curve (AUC = {roc_auc:.4f})')
# Dashed diagonal drawn as a visual reference line
axes[1].plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
axes[1].set_xlim([0.0, 1.0])
axes[1].set_ylim([0.0, 1.05])
axes[1].set_xlabel('False Positive Rate', fontsize=12)
axes[1].set_ylabel('True Positive Rate', fontsize=12)
axes[1].set_title('Receiver Operating Characteristic (ROC)', fontsize=14, fontweight='bold', pad=15)
axes[1].legend(loc="lower right", fontsize=11)
axes[1].grid(alpha=0.3)

plt.tight_layout()
plt.show()