In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score
# from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier # Import XGBoost classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold


In [4]:
diabetes_df = pd.read_csv('diabetes_preprocessed.csv')
diabetes_df

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_binary
0,1,1,1,0.325581,1,0,0,0,0,1,...,0,5,0.6,0.500000,1,0,0.666667,4,3,0
1,0,0,0,0.151163,1,0,0,1,0,0,...,1,3,0.0,0.000000,0,0,0.500000,6,1,0
2,1,1,1,0.186047,0,0,0,0,1,0,...,1,5,1.0,1.000000,1,0,0.666667,4,8,0
3,1,0,1,0.174419,0,0,0,1,1,1,...,0,2,0.0,0.000000,0,0,0.833333,3,6,0
4,1,1,1,0.139535,0,0,0,1,1,1,...,0,2,0.1,0.000000,0,0,0.833333,5,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,1,1,1,0.383721,0,0,0,0,1,1,...,0,3,0.0,0.166667,0,1,0.333333,6,7,0
253676,1,1,1,0.069767,0,0,0,0,0,0,...,0,4,0.0,0.000000,1,0,0.833333,2,4,1
253677,0,0,1,0.186047,0,0,0,1,1,0,...,0,1,0.0,0.000000,0,0,0.083333,5,2,0
253678,1,0,1,0.127907,0,0,0,0,1,1,...,0,3,0.0,0.000000,0,1,0.500000,5,1,0


In [33]:
def train_models_by_age_group(df, age_column='Age', target_column='Diabetes_binary'):
    # Dictionary to hold models per age
    models = {}
    f1_scores = {}

    # Drop rows where the target is NaN
    df = df.dropna(subset=[target_column])
    
    # Get unique age groups
    age_groups = df[age_column].unique()
    
    for age in age_groups:
        # Filter data for current age group
        age_group_df = df[df[age_column] == age]

        # Separate features and target
        X = age_group_df.drop(target_column, axis=1)
        y = age_group_df[target_column]

        # Skip this group if it's too small
        if len(y) < 5 or y.nunique() < 2:
            continue

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Train model
        model = RandomForestClassifier(random_state=42)
        model.fit(X_train, y_train)

        # Save the model
        models[age] = model

        # Predict and calculate F1 score
        y_pred = model.predict(X_test)
        f1 = f1_score(y_test, y_pred)
        f1_scores[age] = f1

    return models

In [35]:
def evaluate_models_on_test_data(test_df, models, age_column='Age', target_column='Diabetes_binary'):
    test_df = test_df.dropna(subset=[target_column])
    
    y_true_all = []
    y_pred_all = []

    for age, model in models.items():
        age_df = test_df[test_df[age_column] == age]
        if age_df.empty:
            continue

        X = age_df.drop(target_column, axis=1)
        y = age_df[target_column]
        y_pred = model.predict(X)

        y_true_all.extend(y)
        y_pred_all.extend(y_pred)

    scores = {
        "overall_f1": f1_score(y_true_all, y_pred_all),
        "overall_precision": precision_score(y_true_all, y_pred_all),
        "overall_recall": recall_score(y_true_all, y_pred_all)
    }

    return scores, y_pred_all, y_true_all

In [37]:
# models_by_age, _ = train_models_by_age_group(diabetes_df)
# scores, _ , _ = evaluate_overall_f1(diabetes_df, models_by_age)
# print(scores)
# # print(f"Overall F1 Score: {overall_f1:.2f}")

# Step 1: Split the dataset
train_df, test_df = train_test_split(diabetes_df, test_size=0.2, random_state=42)

# Step 2: Train models per age group
models_by_age = train_models_by_age_group(train_df)

# Step 3: Evaluate on test data
scores, predictions, true_labels = evaluate_models_on_test_data(test_df, models_by_age)

print(scores)

{'overall_f1': 0.2545986622073579, 'overall_precision': 0.47374562427071176, 'overall_recall': 0.17407460340145778}


In [55]:
def train_models_by_age_group_grid(train_df, age_column='Age', target_column='Diabetes_binary'):
    models = {}
    train_df = train_df.dropna(subset=[target_column])
    age_groups = train_df[age_column].unique()

    # Define parameter grid for RandomForest
    param_grid = {
        'n_estimators': [10,50, 100],
        'max_depth': [None,3,5,8,10,12,15, 20],
        'min_samples_split': [2,3,5,8,10]
    }

    for age in age_groups:
        group_df = train_df[train_df[age_column] == age]
        X = group_df.drop(target_column, axis=1)
        y = group_df[target_column]

        if len(y) < 10 or y.nunique() < 2:
            continue

        # Stratified K-Fold to preserve class balance during CV
        cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

        rf = RandomForestClassifier(random_state=42)
        grid_search = GridSearchCV(rf, param_grid, cv=cv, scoring='f1', n_jobs=-1)

        grid_search.fit(X, y)

        # Store the best estimator
        models[age] = grid_search.best_estimator_

    return models

In [57]:
# models_by_age, _ = train_models_by_age_group(diabetes_df)
# scores, _ , _ = evaluate_overall_f1(diabetes_df, models_by_age)
# print(scores)
# # print(f"Overall F1 Score: {overall_f1:.2f}")

# Step 1: Split the dataset
train_df, test_df = train_test_split(diabetes_df, test_size=0.2, random_state=42)

# Step 2: Train models per age group
models_by_age = train_models_by_age_group_grid(train_df)

# Step 3: Evaluate on test data
scores, predictions, true_labels = evaluate_models_on_test_data(test_df, models_by_age)

print(scores)

{'overall_f1': 0.25491215303046055, 'overall_precision': 0.467581998474447, 'overall_recall': 0.1752179505502358}
