# Diabetes Hospital Readmission Prediction

**Problem Statement:** Can we predict 30-day hospital readmission risk for diabetic patients ?

**Project Overview:** This project analyzes over 100,000 hospital admissions to develop models predicting early readmission in diabetic patients. Using 50+ features including medications, diagnoses, and procedures, we compare multiple ML algorithms to identify high-risk patients requiring enhanced post-discharge care.

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from skopt import BayesSearchCV
from skopt.space import Real, Integer
import pickle
import warnings
warnings.filterwarnings('ignore')

from ml_models import LogisticRegression, RandomForestModel, XGBoostModel

In [2]:
# Load preprocessed data
df = pd.read_csv('data/diabetic_data_preprocessed.csv')

print(f"Dataset shape: {df.shape}")
print(f"\nTarget distribution:")
print(df['readmitted'].value_counts())
print(f"\nReadmission rate: {df['readmitted'].mean():.2%}")

Dataset shape: (101766, 78)

Target distribution:
readmitted
0    90409
1    11357
Name: count, dtype: int64

Readmission rate: 11.16%


## 1. Data Splitting

In [3]:
# Separate features and target
X = df.drop('readmitted', axis=1)
y = df['readmitted']

# Train-test split (80-20)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"\nFeatures: {X_train.shape[1]}")
print(f"\nTrain readmission rate: {y_train.mean():.2%}")
print(f"Test readmission rate: {y_test.mean():.2%}")

Training set: 81412 samples
Test set: 20354 samples

Features: 77

Train readmission rate: 11.16%
Test readmission rate: 11.16%


## 2. Baseline Logistic Regression

In [4]:
# Train baseline logistic regression
print("Training Logistic Regression baseline...")
lr_baseline = LogisticRegression(random_state=42)
lr_baseline.train(X_train, y_train)

# Evaluate
lr_metrics = lr_baseline.evaluate(X_test, y_test)
print("\nLogistic Regression Results:")
for metric, value in lr_metrics.items():
    print(f"  {metric}: {value:.4f}")

# Save model
with open('ml_models/logistic_regression_baseline.pkl', 'wb') as f:
    pickle.dump(lr_baseline, f)
print("\n✓ Model saved: logistic_regression_baseline.pkl")

Training Logistic Regression baseline...


  return _ForkingPickler.loads(res)



Logistic Regression Results:
  accuracy: 0.6658
  precision: 0.1746
  recall: 0.5354
  f1_score: 0.2633
  roc_auc: 0.6501

✓ Model saved: logistic_regression_baseline.pkl


## 3. Random Forest with Bayesian Optimization

In [5]:
# Define search space for Random Forest
rf_search_space = {
    'n_estimators': Integer(50, 300),
    'max_depth': Integer(5, 30),
    'min_samples_split': Integer(2, 20),
    'min_samples_leaf': Integer(1, 10),
    'max_features': Real(0.1, 1.0)
}

print("Starting Bayesian optimization for Random Forest...")
print("This may take several minutes...\n")

# Create base estimator
from sklearn.ensemble import RandomForestClassifier
rf_base = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

# Bayesian search
rf_search = BayesSearchCV(
    rf_base,
    rf_search_space,
    n_iter=30,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=42,
    verbose=1
)

rf_search.fit(X_train, y_train)

print("\nBest parameters found:")
for param, value in rf_search.best_params_.items():
    print(f"  {param}: {value}")
print(f"\nBest CV ROC-AUC: {rf_search.best_score_:.4f}")

Starting Bayesian optimization for Random Forest...
This may take several minutes...

Fitting 3 folds for each of 1 candidates, totalling 3 fits


  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


Fitting 3 folds for each of 1 candidates, totalling 3 fits


  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


Fitting 3 folds for each of 1 candidates, totalling 3 fits


  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fi

  return _ForkingPickler.loads(res)


Fitting 3 folds for each of 1 candidates, totalling 3 fits

Best parameters found:
  max_depth: 8
  max_features: 0.6965603551981999
  min_samples_leaf: 6
  min_samples_split: 16
  n_estimators: 229

Best CV ROC-AUC: 0.6634


In [6]:
# Train final Random Forest with best parameters
rf_model = RandomForestModel(
    n_estimators=rf_search.best_params_['n_estimators'],
    max_depth=rf_search.best_params_['max_depth'],
    random_state=42
)
rf_model.model.set_params(
    min_samples_split=rf_search.best_params_['min_samples_split'],
    min_samples_leaf=rf_search.best_params_['min_samples_leaf'],
    max_features=rf_search.best_params_['max_features']
)
rf_model.train(X_train, y_train)

# Evaluate
rf_metrics = rf_model.evaluate(X_test, y_test)
print("\nRandom Forest Test Results:")
for metric, value in rf_metrics.items():
    print(f"  {metric}: {value:.4f}")

# Save model
with open('ml_models/random_forest_optimized.pkl', 'wb') as f:
    pickle.dump(rf_model, f)
print("\n✓ Model saved: random_forest_optimized.pkl")


Random Forest Test Results:
  accuracy: 0.6719
  precision: 0.1850
  recall: 0.5698
  f1_score: 0.2793
  roc_auc: 0.6781

✓ Model saved: random_forest_optimized.pkl


## 4. XGBoost with Bayesian Optimization

In [7]:
# Define search space for XGBoost
xgb_search_space = {
    'n_estimators': Integer(50, 300),
    'max_depth': Integer(3, 10),
    'learning_rate': Real(0.01, 0.3, prior='log-uniform'),
    'subsample': Real(0.6, 1.0),
    'colsample_bytree': Real(0.6, 1.0),
    'min_child_weight': Integer(1, 10),
    'gamma': Real(0, 0.5)
}

print("Starting Bayesian optimization for XGBoost...")
print("This may take several minutes...\n")

# Calculate scale_pos_weight
n_neg = np.sum(y_train == 0)
n_pos = np.sum(y_train == 1)
scale_pos_weight = n_neg / n_pos

# Create base estimator
import xgboost as xgb
xgb_base = xgb.XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    eval_metric='logloss',
    use_label_encoder=False,
    n_jobs=-1
)

# Bayesian search
xgb_search = BayesSearchCV(
    xgb_base,
    xgb_search_space,
    n_iter=30,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=42,
    verbose=1
)

xgb_search.fit(X_train, y_train)

print("\nBest parameters found:")
for param, value in xgb_search.best_params_.items():
    print(f"  {param}: {value}")
print(f"\nBest CV ROC-AUC: {xgb_search.best_score_:.4f}")

Starting Bayesian optimization for XGBoost...
This may take several minutes...

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candi

In [8]:
# Train final XGBoost with best parameters
xgb_model = XGBoostModel(
    n_estimators=xgb_search.best_params_['n_estimators'],
    max_depth=xgb_search.best_params_['max_depth'],
    learning_rate=xgb_search.best_params_['learning_rate'],
    random_state=42
)
xgb_model.model.set_params(
    subsample=xgb_search.best_params_['subsample'],
    colsample_bytree=xgb_search.best_params_['colsample_bytree'],
    min_child_weight=xgb_search.best_params_['min_child_weight'],
    gamma=xgb_search.best_params_['gamma']
)
xgb_model.train(X_train, y_train)

# Evaluate
xgb_metrics = xgb_model.evaluate(X_test, y_test)
print("\nXGBoost Test Results:")
for metric, value in xgb_metrics.items():
    print(f"  {metric}: {value:.4f}")

# Save model
with open('ml_models/xgboost_optimized.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)
print("\n✓ Model saved: xgboost_optimized.pkl")


XGBoost Test Results:
  accuracy: 0.6618
  precision: 0.1855
  recall: 0.5989
  f1_score: 0.2832
  roc_auc: 0.6863

✓ Model saved: xgboost_optimized.pkl


## 5. Model Comparison

In [9]:
# Consolidate results
results_df = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest', 'XGBoost'],
    'Accuracy': [lr_metrics['accuracy'], rf_metrics['accuracy'], xgb_metrics['accuracy']],
    'Precision': [lr_metrics['precision'], rf_metrics['precision'], xgb_metrics['precision']],
    'Recall': [lr_metrics['recall'], rf_metrics['recall'], xgb_metrics['recall']],
    'F1-Score': [lr_metrics['f1_score'], rf_metrics['f1_score'], xgb_metrics['f1_score']],
    'ROC-AUC': [lr_metrics['roc_auc'], rf_metrics['roc_auc'], xgb_metrics['roc_auc']]
})

print("Model Comparison:")
print(results_df.to_string(index=False))

# Save results
print("\n✓ Results saved: model_comparison_results.csv")

Model Comparison:
              Model  Accuracy  Precision   Recall  F1-Score  ROC-AUC
Logistic Regression  0.665766   0.174612 0.535447  0.263346 0.650090
      Random Forest  0.671907   0.184989 0.569793  0.279301 0.678054
            XGBoost  0.661786   0.185463 0.598855  0.283215 0.686289

✓ Results saved: model_comparison_results.csv


In [10]:
# Visualization 1: Metrics comparison
fig = go.Figure()

metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']
colors = ['#636EFA', '#EF553B', '#00CC96']

for i, model in enumerate(results_df['Model']):
    fig.add_trace(go.Bar(
        name=model,
        x=metrics,
        y=results_df.iloc[i, 1:].values,
        marker_color=colors[i],
        text=[f"{v:.3f}" for v in results_df.iloc[i, 1:].values],
        textposition='outside'
    ))

fig.update_layout(
    title='Model Performance Comparison',
    xaxis_title='Metrics',
    yaxis_title='Score',
    barmode='group',
    height=500,
    yaxis=dict(range=[0, 1.1])
)

fig.show()

In [11]:
# Visualization 2: Radar chart
fig = go.Figure()

for i, model in enumerate(results_df['Model']):
    fig.add_trace(go.Scatterpolar(
        r=results_df.iloc[i, 1:].values.tolist() + [results_df.iloc[i, 1]],
        theta=metrics + [metrics[0]],
        fill='toself',
        name=model,
        line_color=colors[i]
    ))

fig.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True,
            range=[0, 1]
        )
    ),
    title='Model Performance Radar Chart',
    height=600,
    showlegend=True
)

fig.show()

## 6. Model Improvement

In [13]:
from sklearn.metrics import fbeta_score, precision_recall_curve
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# Charger données
df = pd.read_csv('data/diabetic_data_preprocessed.csv')
X = df.drop('readmitted', axis=1)
y = df['readmitted']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Rééchantillonnage
sampling = ImbPipeline([
    ('over', SMOTE(sampling_strategy=0.3, random_state=42)),
    ('under', RandomUnderSampler(sampling_strategy=0.5, random_state=42))
])
X_train_res, y_train_res = sampling.fit_resample(X_train, y_train)

# Entraînement XGBoost optimisé
xgb_improved = XGBoostModel(
    n_estimators=300,
    max_depth=7,
    learning_rate=0.01,
    random_state=42
)
xgb_improved.model.set_params(
    subsample=0.6,
    colsample_bytree=0.6,
    min_child_weight=8,
    gamma=0.5
)
xgb_improved.train(X_train_res, y_train_res)

# Optimisation du seuil
y_proba = xgb_improved.predict_proba(X_test)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_test, y_proba)
f2_scores = (5 * precisions * recalls) / (4 * precisions + recalls + 1e-10)
optimal_idx = np.argmax(f2_scores)
optimal_threshold = thresholds[optimal_idx]

# Prédictions ajustées
y_pred = (y_proba >= optimal_threshold).astype(int)

# Métriques
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
print(f"\nSeuil optimal: {optimal_threshold:.3f}")
print(f"F2-Score: {f2_scores[optimal_idx]:.3f}")

              precision    recall  f1-score   support

           0       0.94      0.43      0.59     18083
           1       0.15      0.79      0.25      2271

    accuracy                           0.47     20354
   macro avg       0.54      0.61      0.42     20354
weighted avg       0.85      0.47      0.55     20354


Seuil optimal: 0.338
F2-Score: 0.422


In [15]:
# Courbe Precision-Recall
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Precision-Recall Curve', 'Threshold Impact on Metrics')
)

# Graphique 1: Courbe PR
fig.add_trace(
    go.Scatter(x=recalls, y=precisions, mode='lines', name='PR Curve',
               line=dict(color='blue', width=2)),
    row=1, col=1
)
fig.add_trace(
    go.Scatter(x=[recalls[optimal_idx]], y=[precisions[optimal_idx]], 
               mode='markers', name=f'Optimal (t={optimal_threshold:.3f})',
               marker=dict(color='red', size=12, symbol='star')),
    row=1, col=1
)

# Graphique 2: Métriques vs Seuil
threshold_range = thresholds[::10]  # Sous-échantillonnage
precision_range = precisions[::10]
recall_range = recalls[::10]
f2_range = f2_scores[::10]

fig.add_trace(
    go.Scatter(x=threshold_range, y=precision_range, name='Precision',
               line=dict(color='green')),
    row=1, col=2
)
fig.add_trace(
    go.Scatter(x=threshold_range, y=recall_range, name='Recall',
               line=dict(color='orange')),
    row=1, col=2
)
fig.add_trace(
    go.Scatter(x=threshold_range, y=f2_range, name='F2-Score',
               line=dict(color='purple', dash='dash')),
    row=1, col=2
)
fig.add_vline(x=optimal_threshold, line_dash="dash", line_color="red",
              annotation_text=f"Optimal: {optimal_threshold:.3f}",
              row=1, col=2)

fig.update_xaxes(title_text="Recall", row=1, col=1)
fig.update_yaxes(title_text="Precision", row=1, col=1)
fig.update_xaxes(title_text="Threshold", row=1, col=2)
fig.update_yaxes(title_text="Score", row=1, col=2)

fig.update_layout(height=500, showlegend=True, title_text="Model Performance Analysis")
fig.show()

In [17]:
# Confusion matrices comparison
cm_default = confusion_matrix(y_test, y_pred_default)
cm_optimal = confusion_matrix(y_test, y_pred_optimal)

fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Default Threshold (0.5)', f'Optimal Threshold ({optimal_threshold:.3f})')
)

# Default CM
fig.add_trace(
    go.Heatmap(
        z=cm_default,
        x=['Predicted: No', 'Predicted: Yes'],
        y=['Actual: No', 'Actual: Yes'],
        text=cm_default,
        texttemplate='%{text}',
        colorscale='Blues',
        showscale=False
    ),
    row=1, col=1
)

# Optimal CM
fig.add_trace(
    go.Heatmap(
        z=cm_optimal,
        x=['Predicted: No', 'Predicted: Yes'],
        y=['Actual: No', 'Actual: Yes'],
        text=cm_optimal,
        texttemplate='%{text}',
        colorscale='Reds',
        showscale=False
    ),
    row=1, col=2
)

fig.update_layout(height=400, title_text="Confusion Matrices Comparison")
fig.show()

print("\nConfusion Matrix - Default Threshold:")
print(f"  True Negatives:  {cm_default[0,0]:5,}")
print(f"  False Positives: {cm_default[0,1]:5,}")
print(f"  False Negatives: {cm_default[1,0]:5,}")
print(f"  True Positives:  {cm_default[1,1]:5,}")

print(f"\nConfusion Matrix - Optimal Threshold ({optimal_threshold:.3f}):")
print(f"  True Negatives:  {cm_optimal[0,0]:5,}")
print(f"  False Positives: {cm_optimal[0,1]:5,}")
print(f"  False Negatives: {cm_optimal[1,0]:5,}")
print(f"  True Positives:  {cm_optimal[1,1]:5,}")


Confusion Matrix - Default Threshold:
  True Negatives:  16,187
  False Positives: 1,896
  False Negatives: 1,677
  True Positives:    594

Confusion Matrix - Optimal Threshold (0.338):
  True Negatives:  7,761
  False Positives: 10,322
  False Negatives:   480
  True Positives:  1,791


In [18]:
import pickle
from datetime import datetime

# Créer un dictionnaire avec tous les composants nécessaires
model_package = {
    'model': xgb_improved,
    'optimal_threshold': optimal_threshold,
    'sampling_strategy': {
        'over_sampling': 0.3,
        'under_sampling': 0.5
    },
    'feature_names': list(X.columns),
    'training_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'performance': {
        'f2_score': f2_scores[optimal_idx],
        'optimal_threshold': optimal_threshold
    }
}

# Export du modèle
model_path = 'ml_models/xgboost_final.pkl'
with open(model_path, 'wb') as f:
    pickle.dump(model_package, f)

print(f"✓ Model saved: {model_path}")
print(f"  - Optimal threshold: {optimal_threshold:.3f}")
print(f"  - F2-Score: {f2_scores[optimal_idx]:.3f}")

✓ Model saved: ml_models/xgboost_final.pkl
  - Optimal threshold: 0.338
  - F2-Score: 0.422
