# Diabetes Prediction Analysis - Processed Dataset

This notebook implements a comprehensive machine learning analysis for diabetes prediction using the processed diabetes dataset (CDC BRFSS data). The analysis includes:

1. Data Exploration and Visualization
2. Data Preprocessing and Feature Engineering
3. Multiple Machine Learning Algorithms
4. Model Evaluation and Comparison
5. Feature Importance Analysis
6. Results Interpretation

**Dataset**: The processed diabetes dataset contains health indicators and lifestyle factors from the CDC's Behavioral Risk Factor Surveillance System (BRFSS).

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Machine Learning libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.decomposition import PCA

# Plot appearance for every figure drawn below: the 'seaborn-v0_8' matplotlib
# style sheet combined with seaborn's "husl" colour palette.
plt.style.use(style='seaborn-v0_8')
sns.set_palette(palette="husl")


In [None]:
# Load the processed dataset; the relative path is resolved against the current working directory.
df = pd.read_csv(filepath_or_buffer='processed_diabetes_data.csv')

In [None]:
# Preview the distinct values of the categorical-style columns present in the data.
categorical_cols = [
    'BMI_Category',
    'Age_Group',
    'GenHlth',
    'Sex',
    'Education',
    'Income',
]
for col in categorical_cols:
    # Columns absent from this dataset are skipped silently.
    if col not in df.columns:
        continue
    # Only the first 10 unique values are printed to keep the output short.
    print(f"{col}: {df[col].unique()[:10]}...")

In [None]:
# Names of every numeric column in the raw frame.
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
# Empty 2x2 grid of axes on a 15x12 inch figure.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 12))

In [None]:
# Key health indicators of interest for exploration.
health_indicators = [
    'HighBP',
    'HighChol',
    'Stroke',
    'HeartDiseaseorAttack',
    'PhysActivity',
    'Smoker',
    'HvyAlcoholConsump',
    'GenHlth',
    'MentHlth',
    'PhysHlth',
]

# Keep only the indicators that are actual columns of `df`, preserving the order above.
available_indicators = [
    indicator for indicator in health_indicators if indicator in df.columns
]

In [None]:
# Data Preprocessing
# Work on a copy so the raw frame `df` is left untouched.
df_processed = df.copy()

# Ordinal encodings for the text-valued categorical columns.
bmi_mapping = {'Underweight': 0, 'Normal': 1, 'Overweight': 2, 'Obese': 3}
age_mapping = {'18-24': 0, '25-34': 1, '35-54': 2, '55-64': 3, '65+': 4}

# Records the mapping applied to each column that was actually encoded.
categorical_mappings = {}
for column, mapping in (('BMI_Category', bmi_mapping), ('Age_Group', age_mapping)):
    if column not in df_processed.columns:
        continue
    # Skip columns that are already numeric. Testing for "not numeric" instead of
    # `dtype == 'object'` also covers pandas' dedicated string dtype, which would
    # otherwise leave the column unencoded without raising any error.
    if pd.api.types.is_numeric_dtype(df_processed[column]):
        continue

    # Go through object dtype so the encoded result gets a plain numeric dtype.
    encoded = df_processed[column].astype(object).map(mapping)

    # Labels that are missing from the mapping turn into NaN; report them rather
    # than letting them vanish silently.
    unmapped = df_processed.loc[encoded.isna() & df_processed[column].notna(), column].unique()
    if len(unmapped) > 0:
        print(f"Warning: {column} has labels without an encoding: {list(unmapped)}")

    df_processed[f'{column}_encoded'] = encoded
    categorical_mappings[column] = mapping

In [None]:
# Feature Engineering
# Derive composite features; each one is added only when its source columns exist.

# Sum of the MentHlth and PhysHlth columns.
if {'MentHlth', 'PhysHlth'}.issubset(df_processed.columns):
    df_processed['Overall_Health_Score'] = df_processed['MentHlth'].add(df_processed['PhysHlth'])

# 1 where both HighBP and HeartDiseaseorAttack equal 1, otherwise 0.
if {'HighBP', 'HeartDiseaseorAttack'}.issubset(df_processed.columns):
    df_processed['Health_Issue_Both'] = (
        (df_processed['HighBP'] == 1) & (df_processed['HeartDiseaseorAttack'] == 1)
    ).astype(int)

# Row-wise sum of the health-condition columns (needs at least two of them).
health_conditions = ['HighBP', 'HighChol', 'Stroke', 'HeartDiseaseorAttack']
available_conditions = [name for name in health_conditions if name in df_processed.columns]
if len(available_conditions) >= 2:
    df_processed['Multiple_Conditions'] = df_processed.loc[:, available_conditions].sum(axis='columns')

# Row-wise sum of the lifestyle columns (needs at least two of them).
lifestyle_factors = ['PhysActivity', 'Fruits', 'Veggies']
available_lifestyle = [name for name in lifestyle_factors if name in df_processed.columns]
if len(available_lifestyle) >= 2:
    df_processed['Healthy_Lifestyle_Score'] = df_processed.loc[:, available_lifestyle].sum(axis='columns')

# Row-wise sum of the risk-factor columns (needs both of them).
risk_factors = ['Smoker', 'HvyAlcoholConsump']
available_risks = [name for name in risk_factors if name in df_processed.columns]
if len(available_risks) >= 2:
    df_processed['Risk_Factors_Score'] = df_processed.loc[:, available_risks].sum(axis='columns')


# Names of all engineered features; some may be absent from df_processed when
# their source columns were missing.
new_features = [
    'Overall_Health_Score',
    'Health_Issue_Both',
    'Multiple_Conditions',
    'Healthy_Lifestyle_Score',
    'Risk_Factors_Score',
]

In [None]:
# Prepare features for machine learning
# Identify numerical features (excluding the target variable)
numerical_features = df_processed.select_dtypes(include=[np.number]).columns.tolist()
if 'Diabetes_binary' in numerical_features:
    numerical_features.remove('Diabetes_binary')

# Remove any string categorical columns that weren't encoded
string_columns = df_processed.select_dtypes(include=['object']).columns.tolist()

# Final feature list for ML
ml_features = [col for col in numerical_features if col not in string_columns]

# List the selected features. The loop previously had no body, which made the
# whole cell fail with an IndentationError.
print(f"\nFeatures selected for ML ({len(ml_features)}):")
for i, feature in enumerate(ml_features):
    print(f"{i+1:2d}. {feature}")

# Prepare X and y
X = df_processed[ml_features]
y = df_processed['Diabetes_binary']

# Split the data: 80/20, stratified on the target so both parts keep the same
# class proportions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [None]:
# Feature Scaling
# Both scalers are fitted on the training split only and then applied to the
# training and test splits alike.

# Standard Scaler
scaler_std = StandardScaler().fit(X_train)
X_train_std, X_test_std = (scaler_std.transform(split) for split in (X_train, X_test))

# MinMax Scaler
scaler_minmax = MinMaxScaler().fit(X_train)
X_train_minmax, X_test_minmax = (scaler_minmax.transform(split) for split in (X_train, X_test))

In [None]:
# Machine Learning Models
# Candidate classifiers keyed by display name; every estimator that accepts a
# seed is given random_state=42.
models = dict([
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=42)),
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('Naive Bayes', GaussianNB()),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('AdaBoost', AdaBoostClassifier(random_state=42)),
])


In [None]:
# Train and Evaluate Models with Pipeline Approach

results = []
model_performance = {}

# Wrap every classifier as: mean imputation -> standardisation -> classifier.
model_pipelines = {}
for name, model in models.items():
    model_pipelines[name] = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('classifier', model),
    ])

# Cross-validate on the training split, then fit on it and score the test split.
for name, pipeline in model_pipelines.items():
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Positive-class probabilities, when the classifier can produce them.
    y_pred_proba = None
    if hasattr(pipeline.named_steps['classifier'], 'predict_proba'):
        y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

    # Test-set metrics
    accuracy, precision, recall, f1 = (
        metric(y_test, y_pred)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )
    # ROC AUC needs probabilities; it stays None without them.
    roc_auc = None if y_pred_proba is None else roc_auc_score(y_test, y_pred_proba)

    result = dict(
        Model=name,
        CV_Mean=cv_scores.mean(),
        CV_Std=cv_scores.std(),
        Accuracy=accuracy,
        Precision=precision,
        Recall=recall,
        F1_Score=f1,
        ROC_AUC=roc_auc,
    )
    results.append(result)

    # Keep the fitted pipeline together with its test-set outputs.
    model_performance[name] = dict(
        model=pipeline,
        predictions=y_pred,
        probabilities=y_pred_proba,
    )

In [None]:
# Results table: rounded to 4 decimals and ordered from highest to lowest test accuracy.
results_df = (
    pd.DataFrame(results)
    .round(4)
    .sort_values('Accuracy', ascending=False)
)

# The first row after sorting is the best performing model.
top_row = results_df.iloc[0]
best_model_name = top_row['Model']
best_accuracy = top_row['Accuracy']
best_f1 = top_row['F1_Score']

# Print the three most accurate models with their rank.
for rank, (_, row) in enumerate(results_df.head(3).iterrows(), start=1):
    print(f"{rank}. {row['Model']}: Accuracy={row['Accuracy']:.4f}, F1={row['F1_Score']:.4f}")

In [None]:
# Detailed Analysis of Best Model
# Pull the fitted pipeline and its test-set outputs for the top-ranked model.
best_entry = model_performance[best_model_name]
best_model, best_predictions, best_probabilities = (
    best_entry[key] for key in ('model', 'predictions', 'probabilities')
)

In [None]:
# Feature Importance Analysis
# Unwrap the classifier step of the best pipeline; from here on `best_model`
# refers to that classifier rather than to the whole pipeline.
best_pipeline = model_performance[best_model_name]['model']
best_model = best_pipeline.named_steps['classifier']

# Choose the importance measure the classifier supports.
if hasattr(best_model, 'feature_importances_'):
    # Tree-based models expose importances directly.
    importance_type = "Feature Importance"
    feature_importance = best_model.feature_importances_
elif hasattr(best_model, 'coef_'):
    # Linear models: absolute value of the first row of coefficients.
    importance_type = "Coefficient Magnitude"
    feature_importance = np.abs(best_model.coef_[0])
else:
    # Anything else: permutation importance of the full pipeline on the test split.
    from sklearn.inspection import permutation_importance
    importance_type = "Permutation Importance"
    perm_importance = permutation_importance(best_pipeline, X_test, y_test, random_state=42)
    feature_importance = perm_importance.importances_mean

# One row per feature, most important first.
feature_imp_df = pd.DataFrame({'Feature': ml_features, 'Importance': feature_importance})
feature_imp_df = feature_imp_df.sort_values('Importance', ascending=False)


# Print the ten highest-ranked features.
for rank, (_, row) in enumerate(feature_imp_df.head(10).iterrows(), start=1):
    print(f"{rank:2d}. {row['Feature']}: {row['Importance']:.4f}")

In [None]:
# Hyperparameter Tuning for Best Model
# Grid-search the best model's pipeline (when a grid is defined for it) and
# compare the tuned pipeline's test-set scores with the untuned ones.
from sklearn.base import clone

# Parameter grids keyed by model name; the 'classifier__' prefix routes each
# parameter to the pipeline step named 'classifier'.
param_grids = {
    'Random Forest': {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [5, 10, 15, None],
        'classifier__min_samples_split': [2, 5, 10]
    },
    'Logistic Regression': {
        'classifier__C': [0.1, 1, 10, 100],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__solver': ['liblinear', 'saga']
    },
    'Gradient Boosting': {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__learning_rate': [0.05, 0.1, 0.2],
        'classifier__max_depth': [3, 5, 7]
    }
}

if best_model_name in param_grids:
    # Tune an unfitted copy. models[best_model_name] is the same object that is
    # already fitted inside the evaluated pipeline, so it must not be reused or
    # mutated here; clone() keeps its parameters but drops the fitted state.
    fresh_model = clone(models[best_model_name])
    if 'random_state' in fresh_model.get_params():
        fresh_model.set_params(random_state=42)

    # Same preprocessing steps as the evaluated pipelines.
    fresh_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('classifier', fresh_model)
    ])

    # Grid search with cross-validation (fewer folds for large training sets)
    cv_folds = 3 if len(X_train) > 50000 else 5
    grid_search = GridSearchCV(
        fresh_pipeline,
        param_grids[best_model_name],
        cv=cv_folds,
        scoring='f1',
        n_jobs=-1
    )

    grid_search.fit(X_train, y_train)

    # Evaluate the tuned model on the held-out test split
    tuned_model = grid_search.best_estimator_
    tuned_predictions = tuned_model.predict(X_test)
    tuned_accuracy = accuracy_score(y_test, tuned_predictions)
    tuned_f1 = f1_score(y_test, tuned_predictions)

    # Change relative to the untuned best model (positive means tuning helped)
    improvement_acc = tuned_accuracy - best_accuracy
    improvement_f1 = tuned_f1 - best_f1

In [None]:
# Cross-Validation Analysis
# Detailed 5-fold cross-validation of the three most accurate models.
from sklearn.model_selection import cross_validate

# Perform detailed cross-validation for top 3 models
top_models = results_df.head(3)['Model'].tolist()

cv_results = []
# Shuffled, stratified folds; the fixed seed keeps the splits reproducible.
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for model_name in top_models:

    # Get the pipeline for this model
    pipeline = model_pipelines[model_name]

    # Score all four metrics in one pass. Calling cross_val_score once per
    # metric would refit the pipeline on every fold four times over; a single
    # cross_validate call fits each fold once and evaluates every metric on it.
    fold_scores = cross_validate(
        pipeline,
        X_train,
        y_train,
        cv=skfold,
        scoring=['accuracy', 'precision', 'recall', 'f1'],
    )
    cv_accuracy = fold_scores['test_accuracy']
    cv_precision = fold_scores['test_precision']
    cv_recall = fold_scores['test_recall']
    cv_f1 = fold_scores['test_f1']

    # Mean and standard deviation of each metric across the folds
    cv_results.append({
        'Model': model_name,
        'Accuracy_Mean': cv_accuracy.mean(),
        'Accuracy_Std': cv_accuracy.std(),
        'Precision_Mean': cv_precision.mean(),
        'Precision_Std': cv_precision.std(),
        'Recall_Mean': cv_recall.mean(),
        'Recall_Std': cv_recall.std(),
        'F1_Mean': cv_f1.mean(),
        'F1_Std': cv_f1.std()
    })

cv_df = pd.DataFrame(cv_results)