In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold, cross_validate
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    roc_auc_score, confusion_matrix, classification_report
)
from sklearn.utils.class_weight import compute_class_weight
import warnings
# NOTE(review): called with only the 'ignore' action (no category/module
# restriction), so this filter matches every warning category. Any warning
# raised by later cells is hidden as well -- consider narrowing to the
# specific categories that are known to be noise.
warnings.filterwarnings('ignore')

# Status line confirming the import cell ran to completion.
print("Libraries imported successfully")

Libraries imported successfully


In [2]:
# Read the scaled feature matrices and the label files from ../data.
X_train_scaled = pd.read_csv("../data/X_train_scaled.csv")
X_test_scaled = pd.read_csv("../data/X_test_scaled.csv")

# Each label CSV is flattened to a 1-D array before being used as y.
y_train = np.ravel(pd.read_csv("../data/y_train.csv").values)
y_test = np.ravel(pd.read_csv("../data/y_test.csv").values)

# Pull the sizes out once so the summary lines below stay short.
n_train_rows, n_features = X_train_scaled.shape
n_test_rows = len(X_test_scaled)

section_rule = "=" * 80
print(section_rule)
print("MODEL TRAINING & EVALUATION")
print(section_rule)
print(f"\nTrain set: {n_train_rows} samples")
print(f"Test set: {n_test_rows} samples")
print(f"Features: {n_features}")

MODEL TRAINING & EVALUATION

Train set: 45824 samples
Test set: 11457 samples
Features: 23


In [3]:
# Compute class weights
# 'balanced' asks scikit-learn to derive one weight per class from the label
# frequencies in y_train; the returned array is ordered like `classes`.
classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=classes, y=y_train)

# Key the mapping by the labels actually present in y_train instead of
# hard-coding 0 and 1: the pairing stays correct for any label encoding and
# no longer raises IndexError when fewer than two classes are present.
# For 0/1 labels the resulting dict is equal to {0: w0, 1: w1}.
class_weight_dict = dict(zip(classes.tolist(), class_weights.tolist()))

# Define evaluation function
def evaluate_model(y_true, y_pred, y_pred_prob, set_name=""):
    """Print and return the headline classification metrics for one split.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Hard class predictions; used for accuracy, precision, recall and F1.
    y_pred_prob : array-like
        Scores handed to ``roc_auc_score``.
    set_name : str, optional
        Heading printed above the metric lines.

    Returns
    -------
    dict
        Metric values under the keys ``accuracy``, ``precision``,
        ``recall``, ``f1`` and ``roc_auc``.
    """
    # All metrics are computed before anything is printed.
    # zero_division=0 makes precision/recall/F1 return 0 instead of warning
    # when their denominator is empty.
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'f1': f1_score(y_true, y_pred, zero_division=0),
        'roc_auc': roc_auc_score(y_true, y_pred_prob),
    }

    report_lines = [
        f"\n{set_name}:",
        f"  Accuracy:  {metrics['accuracy']:.4f}",
        f"  Precision: {metrics['precision']:.4f}",
        f"  Recall:    {metrics['recall']:.4f}",
        f"  F1-Score:  {metrics['f1']:.4f}",
        f"  ROC-AUC:   {metrics['roc_auc']:.4f}",
    ]
    print("\n".join(report_lines))

    return metrics

print("Evaluation function defined")

Evaluation function defined


## Model 1: Random Forest

In [4]:
banner = "=" * 80
print("\n" + banner)
print("MODEL 1: RANDOM FOREST")
print(banner)

# Hyperparameters gathered in one place so they are easy to scan and tweak.
# class_weight reuses the per-class weights computed from y_train earlier.
rf_params = {
    'n_estimators': 200,
    'max_depth': 15,
    'min_samples_split': 20,
    'min_samples_leaf': 10,
    'max_features': 'sqrt',
    'class_weight': class_weight_dict,
    'random_state': 42,
    'n_jobs': -1,
}
rf_model = RandomForestClassifier(**rf_params)

rf_model.fit(X_train_scaled, y_train)
print("Random Forest trained successfully")


MODEL 1: RANDOM FOREST
Random Forest trained successfully


In [5]:
# Hard labels plus column 1 of predict_proba, for the train split...
y_train_pred_rf = rf_model.predict(X_train_scaled)
y_train_pred_prob_rf = rf_model.predict_proba(X_train_scaled)[:, 1]
# ...and for the held-out test split.
y_test_pred_rf = rf_model.predict(X_test_scaled)
y_test_pred_prob_rf = rf_model.predict_proba(X_test_scaled)[:, 1]

rf_test_metrics = evaluate_model(
    y_test,
    y_test_pred_rf,
    y_test_pred_prob_rf,
    set_name="Random Forest - TEST",
)

# Rows of the matrix are true classes, columns are predicted classes.
cm_rf = confusion_matrix(y_test, y_test_pred_rf)
(rf_tn, rf_fp), (rf_fn, rf_tp) = cm_rf
print("\nConfusion Matrix:")
print(f"  TN: {rf_tn}, FP: {rf_fp}")
print(f"  FN: {rf_fn}, TP: {rf_tp}")


Random Forest - TEST:
  Accuracy:  0.9383
  Precision: 0.8195
  Recall:    0.8157
  F1-Score:  0.8175
  ROC-AUC:   0.9656

Confusion Matrix:
  TN: 9166, FP: 349
  FN: 358, TP: 1584


In [6]:
# Rank the Random Forest's feature importances, highest first.
print("\nTop 10 Feature Importances:")
feature_importance_rf = (
    pd.DataFrame({
        'feature': X_train_scaled.columns,
        'importance': rf_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)

# Walk the two columns in lockstep; the row index is not needed for printing.
rf_top10 = feature_importance_rf.head(10)
for feature_name, importance in zip(rf_top10['feature'], rf_top10['importance']):
    print(f"  {feature_name:30s}: {importance:.4f}")


Top 10 Feature Importances:
  avg_glucose_level             : 0.2132
  age                           : 0.2044
  bmi                           : 0.1945
  smoking_risk                  : 0.0598
  high_risk_score               : 0.0346
  gender                        : 0.0319
  Residence_type_Urban          : 0.0294
  ever_married                  : 0.0265
  work_type_Self-employed       : 0.0195
  work_type_Govt_job            : 0.0187


## Model 2: Gradient Boosting

In [7]:
banner = "=" * 80
print("\n" + banner)
print("MODEL 2: GRADIENT BOOSTING")
print(banner)

# NOTE(review): unlike the Random Forest cell, no class weighting is applied
# here. GradientBoostingClassifier has no class_weight argument; if weighting
# is wanted, fit() accepts sample_weight -- confirm that training unweighted
# is intentional.
gb_params = {
    'n_estimators': 300,
    'learning_rate': 0.1,
    'max_depth': 5,
    'min_samples_split': 20,
    'min_samples_leaf': 10,
    'subsample': 0.8,   # each tree is fit on a random 80% of the rows
    'random_state': 42,
}
gb_model = GradientBoostingClassifier(**gb_params)

gb_model.fit(X_train_scaled, y_train)
print("Gradient Boosting trained successfully")


MODEL 2: GRADIENT BOOSTING
Gradient Boosting trained successfully


In [8]:
# Hard labels plus column 1 of predict_proba for the held-out test split.
y_test_pred_gb = gb_model.predict(X_test_scaled)
y_test_pred_prob_gb = gb_model.predict_proba(X_test_scaled)[:, 1]

gb_test_metrics = evaluate_model(
    y_test,
    y_test_pred_gb,
    y_test_pred_prob_gb,
    set_name="Gradient Boosting - TEST",
)

# Rows of the matrix are true classes, columns are predicted classes.
cm_gb = confusion_matrix(y_test, y_test_pred_gb)
(gb_tn, gb_fp), (gb_fn, gb_tp) = cm_gb
print("\nConfusion Matrix:")
print(f"  TN: {gb_tn}, FP: {gb_fp}")
print(f"  FN: {gb_fn}, TP: {gb_tp}")


Gradient Boosting - TEST:
  Accuracy:  0.8404
  Precision: 0.9014
  Recall:    0.0659
  F1-Score:  0.1228
  ROC-AUC:   0.8471

Confusion Matrix:
  TN: 9501, FP: 14
  FN: 1814, TP: 128


In [9]:
# Rank the Gradient Boosting model's feature importances, highest first.
print("\nTop 10 Feature Importances:")
feature_importance_gb = (
    pd.DataFrame({
        'feature': X_train_scaled.columns,
        'importance': gb_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)

# Walk the two columns in lockstep; the row index is not needed for printing.
gb_top10 = feature_importance_gb.head(10)
for feature_name, importance in zip(gb_top10['feature'], gb_top10['importance']):
    print(f"  {feature_name:30s}: {importance:.4f}")


Top 10 Feature Importances:
  avg_glucose_level             : 0.3673
  age                           : 0.2117
  bmi                           : 0.2043
  smoking_risk                  : 0.0393
  gender                        : 0.0226
  high_risk_score               : 0.0207
  ever_married                  : 0.0178
  Residence_type_Urban          : 0.0159
  work_type_Private             : 0.0156
  work_type_Govt_job            : 0.0143


## Model 3: Stacking Ensemble

In [10]:
banner = "=" * 80
print("\n" + banner)
print("MODEL 3: STACKING ENSEMBLE (RF + GB)")
print(banner)

# Level-0 learners: the two models configured in the earlier cells.
base_learners = [
    ('rf', rf_model),
    ('gb', gb_model),
]

# Level-1 learner: a class-balanced logistic regression.
meta_learner = LogisticRegression(
    class_weight='balanced',
    max_iter=500,
    solver='lbfgs',
)

# stack_method='predict_proba' feeds the base models' probabilities to the
# meta-learner; passthrough=False keeps the original features out of it.
stack_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    stack_method='predict_proba',
    n_jobs=-1,
    passthrough=False,
)

print("Training Stacking Ensemble...")
stack_clf.fit(X_train_scaled, y_train)
print("Stacking Ensemble trained successfully")


MODEL 3: STACKING ENSEMBLE (RF + GB)
Training Stacking Ensemble...
Stacking Ensemble trained successfully


In [11]:
# Train-split predictions (hard labels and column 1 of predict_proba).
y_train_pred_stack = stack_clf.predict(X_train_scaled)
y_train_prob_stack = stack_clf.predict_proba(X_train_scaled)[:, 1]

# Test-split predictions, same two forms.
y_test_pred_stack = stack_clf.predict(X_test_scaled)
y_test_prob_stack = stack_clf.predict_proba(X_test_scaled)[:, 1]

# Train metrics are printed for comparison only; the returned dict is discarded.
print("Stacking Ensemble - TRAIN")
_ = evaluate_model(
    y_train,
    y_train_pred_stack,
    y_train_prob_stack,
    set_name="Train Set",
)

# Test metrics are kept in stack_test_metrics.
print("\nStacking Ensemble - TEST")
stack_test_metrics = evaluate_model(
    y_test,
    y_test_pred_stack,
    y_test_prob_stack,
    set_name="Test Set",
)

Stacking Ensemble - TRAIN

Train Set:
  Accuracy:  0.9387
  Precision: 0.7368
  Recall:    0.9925
  F1-Score:  0.8458
  ROC-AUC:   0.9964

Stacking Ensemble - TEST

Test Set:
  Accuracy:  0.8988
  Precision: 0.6391
  Recall:    0.9248
  F1-Score:  0.7559
  ROC-AUC:   0.9656


## Cross-Validation

In [12]:
banner = "=" * 80
print("\n" + banner)
print("CROSS-VALIDATION (5-FOLD STRATIFIED)")
print(banner)

# Every scorer is registered under its own scikit-learn name, so the result
# keys come back as 'test_<name>'.
cv_metric_names = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
scoring = {name: name for name in cv_metric_names}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("\nStacking Ensemble (RF + GB):")
stack_cv = cross_validate(
    stack_clf,
    X_train_scaled,
    y_train,
    cv=cv,
    scoring=scoring,
    n_jobs=-1,
)

# Report the mean and standard deviation across folds for each metric.
for metric_name in cv_metric_names:
    fold_scores = stack_cv[f'test_{metric_name}']
    print(f"  {metric_name.upper():10s}: {fold_scores.mean():.4f} ± {fold_scores.std():.4f}")


CROSS-VALIDATION (5-FOLD STRATIFIED)

Stacking Ensemble (RF + GB):
  ACCURACY  : 0.8632 ± 0.0026
  PRECISION : 0.5614 ± 0.0049
  RECALL    : 0.8802 ± 0.0070
  F1        : 0.6856 ± 0.0056
  ROC_AUC   : 0.9367 ± 0.0023


In [13]:
# Persist the fitted models and the stacking predictions for the next notebook.
import pickle

# One pickle file per fitted estimator, written in the order listed.
model_artifacts = [
    ('../data/rf_model.pkl', rf_model),
    ('../data/gb_model.pkl', gb_model),
    ('../data/stack_clf.pkl', stack_clf),
]
for artifact_path, fitted_model in model_artifacts:
    with open(artifact_path, 'wb') as handle:
        pickle.dump(fitted_model, handle)

# Test-set hard labels and probabilities from the stacking ensemble.
prediction_arrays = [
    ('../data/y_test_pred_stack.npy', y_test_pred_stack),
    ('../data/y_test_prob_stack.npy', y_test_prob_stack),
]
for array_path, values in prediction_arrays:
    np.save(array_path, values)

print("\nModels and predictions saved for evaluation notebook")


Models and predictions saved for evaluation notebook
