In [29]:
import pandas as pd
import joblib

from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate, train_test_split
from sklearn.ensemble import RandomForestClassifier

In [38]:
#df = pd.read_csv('Data/diabetes_binary_health_indicators_BRFSS2015.csv')

## Model Building Methodology

To avoid data leakage and have confidence in the model's generalizability: 
- Stratified K-Crossfold Validation is only applied in the Training dataset
- Model is (stratified) split 80-20 train test; Models are all to be fairly evaluated on the same unseen data

### Feel free to add/remove for feature selection (Diabetes_binary/012 required)

In [37]:
cols_to_drop = ['Diabetes_binary','NoDocbcCost','Sex','Education','Income','MentHlth',
                'AnyHealthcare','GenHlth'] 

In [40]:
X = df.drop(cols_to_drop, axis=1)
y = df['Diabetes_binary']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

### Change/Modify Model used here:

In [41]:
model = RandomForestClassifier(random_state=42, n_estimators = 20)

In [48]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']

results = cross_validate(
    model, X_train, y_train,
    cv=skf,
    scoring=metrics
)

In [49]:
print("Mean Accuracy:", results['test_accuracy'].mean())
print("Mean Precision:", results['test_precision_macro'].mean())
print("Mean Recall:", results['test_recall_macro'].mean())
print("Mean F1:", results['test_f1_macro'].mean())

Mean Accuracy: 0.8434149358977224
Mean Precision: 0.6312812079017713
Mean Recall: 0.5744647676981132
Mean F1: 0.5882187890219204


In [50]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
trained_results = cross_validate(model, X_test, y_test, scoring=metrics)


In [51]:
trained_results['test_recall_macro'].mean()

0.5641298066924121

In [53]:
joblib.dump(model, "Models/diabetes_rf_model_streamlit.pkl")

['Models/diabetes_rf_model_streamlit.pkl']

In [54]:
X_train.columns

Index(['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke',
       'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'PhysHlth', 'DiffWalk', 'Age'],
      dtype='object')

In [56]:
X_train.columns

Index(['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke',
       'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'PhysHlth', 'DiffWalk', 'Age'],
      dtype='object')