In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE

# Load the BRFSS 2015 health-indicator table (the path is hard-coded)
df = pd.read_csv('/content/diabetes_binary_health_indicators_BRFSS2015.csv')

# The two prediction targets, one Series each
y_diabetes, y_heart = df['Diabetes_binary'], df['HeartDiseaseorAttack']

# Feature matrix: every column except the two targets and 'Age'.
# 'Age' stays available on `df`, where the per-age evaluation reads it.
X = df.drop(['Diabetes_binary', 'HeartDiseaseorAttack', 'Age'], axis=1)

# Hold out 20% of the rows as the diabetes test set (seeded for repeatability)
(X_train, X_test,
 y_train_diabetes, y_test_diabetes) = train_test_split(
    X,
    y_diabetes,
    random_state=42,
    test_size=0.2,
)

# Oversample with SMOTE to counter class imbalance. Only the training split is
# resampled; the test split keeps its original class distribution.
# NOTE(review): plain SMOTE interpolates feature values; if the columns are
# binary/categorical indicators, SMOTENC may be a better fit - confirm.
smote = SMOTE(random_state=42)
(X_train_resampled_diabetes,
 y_train_resampled_diabetes) = smote.fit_resample(X_train, y_train_diabetes)

# Candidate classifiers, keyed by the label used in the printed reports.
# Every estimator gets the same seed so that runs are repeatable.
models = {
    "RandomForest": RandomForestClassifier(random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42),
}

# Direct handles to the very same estimator objects stored in `models`
rf_classifier = models["RandomForest"]
gb_classifier = models["GradientBoosting"]
ada_classifier = models["AdaBoost"]
xgb_classifier = models["XGBoost"]

# Fit every model on the SMOTE-resampled diabetes training data and report its
# metrics on the untouched test split
for name, model in models.items():
    model.fit(X_train_resampled_diabetes, y_train_resampled_diabetes)

    # Hard labels feed the threshold-based metrics; column 1 of predict_proba
    # (probability of the second class) feeds AUC-ROC
    y_pred_diabetes = model.predict(X_test)
    y_pred_proba_diabetes = model.predict_proba(X_test)[:, 1]

    auc_roc_diabetes = roc_auc_score(y_test_diabetes, y_pred_proba_diabetes)
    accuracy_diabetes, precision_diabetes, recall_diabetes, f1_diabetes = (
        metric(y_test_diabetes, y_pred_diabetes)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )

    # One print call; the trailing empty entry yields the blank separator line
    report_lines = [
        f"{name} - Diabetes Prediction Metrics:",
        f"AUC-ROC: {auc_roc_diabetes:.3f}",
        f"Accuracy: {accuracy_diabetes:.3f}",
        f"Precision: {precision_diabetes:.3f}",
        f"Recall: {recall_diabetes:.3f}",
        f"F1 Score: {f1_diabetes:.3f}",
        "",
    ]
    print("\n".join(report_lines))

# Heart disease prediction: same features, seed and 80/20 split as above, with
# the heart-disease column as target; SMOTE again touches the training part only
(X_train, X_test,
 y_train_heart, y_test_heart) = train_test_split(
    X,
    y_heart,
    random_state=42,
    test_size=0.2,
)
(X_train_resampled_heart,
 y_train_resampled_heart) = smote.fit_resample(X_train, y_train_heart)

# Refit each model (the same estimator objects are reused) and report test metrics
for name, model in models.items():
    model.fit(X_train_resampled_heart, y_train_resampled_heart)

    # Hard labels for the threshold metrics, column-1 probabilities for AUC-ROC
    y_pred_heart = model.predict(X_test)
    y_pred_proba_heart = model.predict_proba(X_test)[:, 1]

    auc_roc_heart = roc_auc_score(y_test_heart, y_pred_proba_heart)
    accuracy_heart = accuracy_score(y_test_heart, y_pred_heart)
    precision_heart = precision_score(y_test_heart, y_pred_heart)
    recall_heart = recall_score(y_test_heart, y_pred_heart)
    f1_heart = f1_score(y_test_heart, y_pred_heart)

    heart_report = (
        ("AUC-ROC", auc_roc_heart),
        ("Accuracy", accuracy_heart),
        ("Precision", precision_heart),
        ("Recall", recall_heart),
        ("F1 Score", f1_heart),
    )
    # Header, one "<metric>: <value>" line each, then a blank separator line
    print(
        f"{name} - Heart Disease Prediction Metrics:",
        *(f"{metric_name}: {metric_value:.3f}"
          for metric_name, metric_value in heart_report),
        "",
        sep="\n",
    )

# Age group performance
# For every distinct value of df['Age'], train and score all models on that
# subgroup only, once per target. Results are nested dicts of the form
# {model name: {age value: test AUC-ROC}}.
results_diabetes = {}
results_heart = {}

# Age groups with fewer rows than this are not evaluated at all.
MIN_AGE_GROUP_ROWS = 50
# `smote` is created with default settings (k_neighbors=5), so it needs at
# least 6 minority-class rows in the training split to find neighbours;
# with fewer it raises instead of resampling.
MIN_MINORITY_TRAIN_ROWS = 6


def _auc_per_model(X_subset, y_subset):
    """Split one subgroup, oversample its training part and score every model.

    The split lives in local variables so the module-level ``X_train`` /
    ``X_test`` from the full-dataset experiments are not overwritten.

    Args:
        X_subset: Feature rows of the subgroup.
        y_subset: Matching binary target values (pandas Series).

    Returns:
        A ``(scores, reason)`` pair. ``scores`` maps model name to the test
        AUC-ROC. When the subgroup cannot be evaluated, ``scores`` is ``None``
        and ``reason`` is a short explanation; otherwise ``reason`` is ``None``.
    """
    X_tr, X_te, y_tr, y_te = train_test_split(
        X_subset, y_subset, test_size=0.2, random_state=42
    )

    # AUC-ROC is undefined when the held-out labels contain a single class;
    # roc_auc_score raises ValueError in that case.
    if y_te.nunique() < 2:
        return None, "test split contains a single class"

    train_counts = y_tr.value_counts()
    if len(train_counts) < 2 or train_counts.min() < MIN_MINORITY_TRAIN_ROWS:
        return None, "too few minority-class training samples for SMOTE"

    X_res, y_res = smote.fit_resample(X_tr, y_tr)

    scores = {}
    for name, model in models.items():
        model.fit(X_res, y_res)
        scores[name] = roc_auc_score(y_te, model.predict_proba(X_te)[:, 1])
    return scores, None


# (label used in messages, full target Series, dict collecting the results);
# diabetes is evaluated before heart disease for every age value.
age_targets = (
    ("diabetes", y_diabetes, results_diabetes),
    ("heart disease", y_heart, results_heart),
)

for age_value in sorted(df['Age'].unique()):
    # Build the row mask once and reuse it for the features and both targets
    age_mask = df['Age'] == age_value
    X_age = X[age_mask]

    if X_age.shape[0] < MIN_AGE_GROUP_ROWS:
        print(f"Skipping age value {age_value} due to insufficient samples.")
        continue

    for target_label, y_full, results in age_targets:
        scores, skip_reason = _auc_per_model(X_age, y_full[age_mask])
        if scores is None:
            # Skip just this target/age pair instead of aborting the whole run
            print(f"Skipping {target_label} for age value {age_value}: {skip_reason}.")
            continue
        for name, auc in scores.items():
            results.setdefault(name, {})[age_value] = auc

# Columns are model names, rows are age values
results_df_diabetes = pd.DataFrame.from_dict(results_diabetes)
results_df_heart = pd.DataFrame.from_dict(results_heart)

print("Age Value Model Evaluation Results for Diabetes")
print(results_df_diabetes)
print("Age Value Model Evaluation Results for Heart Disease")
print(results_df_heart)


RandomForest - Diabetes Prediction Metrics:
AUC-ROC: 0.768
Accuracy: 0.847
Precision: 0.401
Recall: 0.219
F1 Score: 0.284

GradientBoosting - Diabetes Prediction Metrics:
AUC-ROC: 0.813
Accuracy: 0.839
Precision: 0.418
Recall: 0.432
F1 Score: 0.425

AdaBoost - Diabetes Prediction Metrics:
AUC-ROC: 0.805
Accuracy: 0.803
Precision: 0.360
Recall: 0.545
F1 Score: 0.433

XGBoost - Diabetes Prediction Metrics:
AUC-ROC: 0.815
Accuracy: 0.864
Precision: 0.518
Recall: 0.197
F1 Score: 0.285

RandomForest - Heart Disease Prediction Metrics:
AUC-ROC: 0.767
Accuracy: 0.892
Precision: 0.329
Recall: 0.147
F1 Score: 0.203

GradientBoosting - Heart Disease Prediction Metrics:
AUC-ROC: 0.808
Accuracy: 0.881
Precision: 0.358
Recall: 0.333
F1 Score: 0.345

AdaBoost - Heart Disease Prediction Metrics:
AUC-ROC: 0.799
Accuracy: 0.857
Precision: 0.312
Recall: 0.432
F1 Score: 0.362

XGBoost - Heart Disease Prediction Metrics:
AUC-ROC: 0.812
Accuracy: 0.903
Precision: 0.451
Recall: 0.129
F1 Score: 0.201

Age Va