# Diabetes Hospital Readmission Prediction

**Problem Statement:** Can we predict 30-day hospital readmission risk for diabetic patients?

**Project Overview:** This project analyzes over 100,000 hospital admissions to develop models predicting early readmission in diabetic patients. Using 50+ features including medications, diagnoses, and procedures, we compare multiple ML algorithms to identify high-risk patients requiring enhanced post-discharge care.

## 1. Imports

In [28]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import LabelEncoder, StandardScaler
import warnings
warnings.filterwarnings('ignore')

## 2. Data Loading

In [29]:
# Load the raw dataset from the project's data folder
df = pd.read_csv('data/diabetic_data.csv')

# Report the size of what was read, then preview the first rows
n_rows, n_cols = df.shape
print(f"Dataset chargé: {n_rows} lignes, {n_cols} colonnes")
print("\nAperçu:")
df.head()

Dataset chargé: 101766 lignes, 50 colonnes

Aperçu:


Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


## 3. Drop columns

In [30]:
# Columns removed on the basis of the descriptive analysis
columns_to_drop = [
    # Identifiers: unique per encounter / per patient, not predictive
    'encounter_id',
    'patient_nbr',
    # Mostly missing (about 97%, 52% and 53% of values respectively)
    'weight',
    'payer_code',
    'medical_specialty',
    # Constant columns (always 'No')
    'examide',
    'citoglipton',
]

df = df.drop(columns_to_drop, axis=1)

n_dropped = len(columns_to_drop)
print(f"{n_dropped} deleted columns")

7 deleted columns


## 4. Missing values

In [31]:
# Count the remaining missing (NaN) values per column, as a count and as a
# percentage of the number of rows
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100)

# Keep only the columns that actually contain missing values, worst first
has_missing = missing > 0
missing_df = pd.DataFrame({
    'Missing': missing[has_missing],
    'Percentage': missing_pct[has_missing],
})
missing_df = missing_df.sort_values('Percentage', ascending=False)

if missing_df.shape[0] == 0:
    print("No missing values")
else:
    print("Missing values:")
    print(missing_df)

    # Horizontal bar chart, one bar per affected column
    fig = px.bar(
        missing_df,
        x='Percentage',
        y=missing_df.index,
        orientation='h',
        title='% of missing values per column',
        labels={'Percentage': 'Percentage (%)', 'index': 'Columns'},
    )
    fig.show()

Missing values:
               Missing  Percentage
max_glu_serum    96420   94.746772
A1Cresult        84748   83.277322


In [32]:
# Values recorded as '?' are treated as missing: in every text column that
# contains at least one '?', replace it by the most frequent other value.
for col in df.select_dtypes(include=['object']).columns:
    is_placeholder = df[col] == '?'
    if not is_placeholder.any():
        continue

    # Mode computed over the rows that are not '?'
    known_values = df.loc[~is_placeholder, col]
    mode_value = known_values.mode()[0]

    df[col] = df[col].replace('?', mode_value)
    print(f"Column '{col}': '?' replaced by '{mode_value}'")

Column 'race': '?' replaced by 'Caucasian'
Column 'diag_1': '?' replaced by '428'
Column 'diag_2': '?' replaced by '276'
Column 'diag_3': '?' replaced by '250'


In [33]:
# Give every remaining NaN in a categorical column an explicit category.
#
# - 'race': any NaN becomes 'Other'.
# - 'max_glu_serum' / 'A1Cresult': the missing-value check reports NaN in these
#   two columns. Left as NaN, the one-hot encoding step (drop_first=True)
#   encodes those rows as all zeros, which is the same encoding as the dropped
#   first level, so "missing" and that level become indistinguishable.
#   NOTE(review): the NaN presumably come from the literal string 'None' in the
#   CSV being parsed as missing by read_csv -- confirm against the raw file.
#
# The column is reassigned instead of calling fillna(..., inplace=True) on
# df[col]: the chained in-place form acts on a temporary object and does not
# update df when pandas Copy-on-Write is active.
na_fill_values = {
    'race': 'Other',
    'max_glu_serum': 'None',
    'A1Cresult': 'None',
}

for col, fill_value in na_fill_values.items():
    if col in df.columns and df[col].isnull().any():
        df[col] = df[col].fillna(fill_value)
        print(f"Missing '{col}' values set to '{fill_value}'")

## 5. Target variable

In [34]:
# Turn 'readmitted' into a binary target:
#   '<30'          -> 1 (readmitted within 30 days)
#   '>30' or 'NO'  -> 0
target_encoding = {'<30': 1, '>30': 0, 'NO': 0}

print("Distribution before processing :")
print(df['readmitted'].value_counts())

df['readmitted'] = df['readmitted'].map(target_encoding)

print("\nDistribution after processing :")
print(df['readmitted'].value_counts())

# The mean of a 0/1 column is the share of positives
readmission_rate = df['readmitted'].mean()
print(f"\nReadmission <30 rate: {readmission_rate:.2%}")

Distribution before processing :
readmitted
NO     54864
>30    35545
<30    11357
Name: count, dtype: int64

Distribution after processing :
readmitted
0    90409
1    11357
Name: count, dtype: int64

Readmission <30 rate: 11.16%


In [35]:
# Class balance of the binary target.
#
# plotly express uses `labels` to rename columns in axis/legend/hover titles,
# not to rename the values of a column, so the 0/1 codes are mapped to
# readable names explicitly before plotting. The chart is built from the
# aggregated counts (one row per class) rather than from every row of df.
target_names = {1: 'Readmission <30', 0: 'No Readmission <30'}

target_counts = (
    df['readmitted']
    .map(target_names)
    .value_counts()
    .rename_axis('Class')
    .reset_index(name='Count')
)

fig = px.pie(target_counts, names='Class', values='Count',
             title='Distribution of the target variable')
fig.show()

## 6. Preprocessing of categorical variables

In [36]:
# Collapse the ten 10-year age brackets into five broader groups.
# Each key is the new group, each value lists the raw brackets it absorbs.
age_groups = {
    '0-30': ['[0-10)', '[10-20)', '[20-30)'],
    '30-50': ['[30-40)', '[40-50)'],
    '50-70': ['[50-60)', '[60-70)'],
    '70-80': ['[70-80)'],
    '80+': ['[80-90)', '[90-100)'],
}

# Invert to the raw-bracket -> group lookup used by Series.map
age_mapping = {
    bracket: group
    for group, brackets in age_groups.items()
    for bracket in brackets
}

df['age'] = df['age'].map(age_mapping)

In [37]:
# Simplification of diagnostics (diag_1, diag_2, diag_3)

def simplify_diagnosis(diag_code):
    """Map a raw ICD-9 diagnosis code to a coarse disease group.

    Parameters
    ----------
    diag_code : str or number
        Raw diagnosis code, e.g. '428', '250.83', 'V45'. Missing values (NaN)
        are accepted.

    Returns
    -------
    str
        One of 'Circulatory', 'Respiratory', 'Digestive', 'Diabetes',
        'Injury', 'Musculoskeletal', 'Genitourinary', 'Neoplasms' or 'Other'.
        Missing values, 'V'/'E' codes and anything that is not a number map
        to 'Other'.
    """
    if pd.isna(diag_code):
        return 'Other'

    diag_code = str(diag_code)

    # Supplementary 'V' and 'E' codes are not numeric and are not grouped
    if diag_code.startswith(('V', 'E')):
        return 'Other'

    try:
        code_num = float(diag_code)
    except ValueError:
        # Not a numeric code (e.g. a placeholder such as '?')
        return 'Other'

    # ICD-9 grouping. The symptom categories 785-788 are matched as whole
    # 3-digit categories (e.g. 785 <= code < 786) so that sub-codes such as
    # 785.59 land in the same group as 785 instead of falling into 'Other'.
    if 390 <= code_num < 460 or 785 <= code_num < 786:
        return 'Circulatory'
    elif 460 <= code_num < 520 or 786 <= code_num < 787:
        return 'Respiratory'
    elif 520 <= code_num < 580 or 787 <= code_num < 788:
        return 'Digestive'
    elif 250 <= code_num < 251:
        return 'Diabetes'
    elif 800 <= code_num < 1000:
        return 'Injury'
    elif 710 <= code_num < 740:
        return 'Musculoskeletal'
    elif 580 <= code_num < 630 or 788 <= code_num < 789:
        return 'Genitourinary'
    elif 140 <= code_num < 240:
        return 'Neoplasms'
    else:
        return 'Other'

# Replace the raw code by its disease group in each of the three diagnosis columns
for diag_col in ('diag_1', 'diag_2', 'diag_3'):
    df[diag_col] = df[diag_col].apply(simplify_diagnosis)

In [38]:
# Reduce every medication column to a 0/1 flag:
# 0 when the value is 'No', 1 for any other value.
medication_cols = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
    'glimepiride', 'acetohexamide', 'glipizide', 'glyburide',
    'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
    'miglitol', 'troglitazone', 'tolazamide', 'insulin',
    'glyburide-metformin', 'glipizide-metformin',
    'glimepiride-pioglitazone', 'metformin-rosiglitazone',
    'metformin-pioglitazone',
]

for col in medication_cols:
    # Columns absent from the frame are skipped silently
    if col not in df.columns:
        continue
    df[col] = (df[col] != 'No').astype('int64')

In [39]:
# Two-level categorical columns encoded as 0/1.
# Note that 'Unknown/Invalid' gender shares the code 0 with 'Male'.
binary_mappings = dict(
    change={'No': 0, 'Ch': 1},
    diabetesMed={'No': 0, 'Yes': 1},
    gender={'Male': 0, 'Female': 1, 'Unknown/Invalid': 0},
)

for col in binary_mappings:
    # Only columns still present in the frame are converted
    if col not in df.columns:
        continue
    df[col] = df[col].map(binary_mappings[col])

## 7. Encoding of categorical variables

In [40]:
# One-hot encode every column that is still text (object dtype).
# drop_first=True drops the first level of each column, which then acts as
# the reference category.
categorical_cols = list(df.select_dtypes(include='object').columns)

df_encoded = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)

## 8. Outlier detection

In [41]:
# Numeric columns of the encoded frame, with the target left out
numeric_cols = [
    name
    for name in df_encoded.select_dtypes(include=[np.number]).columns
    if name != 'readmitted'
]

print(f"Colonnes numériques à analyser: {len(numeric_cols)}")

Colonnes numériques à analyser: 35


In [42]:
# Outlier detection with the 1.5 * IQR rule: a value is flagged when it lies
# below Q1 - 1.5*IQR or above Q3 + 1.5*IQR of its column.
iqr_checked_cols = [
    'time_in_hospital', 'num_lab_procedures', 'num_procedures',
    'num_medications', 'number_outpatient', 'number_emergency',
    'number_inpatient', 'number_diagnoses',
]

outliers_info = []

for col in iqr_checked_cols:
    if col not in df_encoded.columns:
        continue

    values = df_encoded[col]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    lower_bound = first_quartile - 1.5 * spread
    upper_bound = third_quartile + 1.5 * spread

    # Number of rows falling outside [lower_bound, upper_bound]
    outside = (values < lower_bound) | (values > upper_bound)
    n_outliers = int(outside.sum())

    outliers_info.append({
        'Column': col,
        'Outliers': n_outliers,
        'Percentage': (n_outliers / len(df_encoded)) * 100,
        'Lower_Bound': lower_bound,
        'Upper_Bound': upper_bound,
    })

# One summary row per analysed column
outliers_df = pd.DataFrame(outliers_info)
print("Détection des outliers:")
print(outliers_df)

Détection des outliers:
               Column  Outliers  Percentage  Lower_Bound  Upper_Bound
0    time_in_hospital      2252    2.212920         -4.0         12.0
1  num_lab_procedures       143    0.140518         -8.0         96.0
2      num_procedures      4954    4.868031         -3.0          5.0
3     num_medications      2557    2.512627         -5.0         35.0
4   number_outpatient     16739   16.448519          0.0          0.0
5    number_emergency     11383   11.185465          0.0          0.0
6    number_inpatient      7049    6.926675         -1.5          2.5
7    number_diagnoses       281    0.276124          1.5         13.5


In [43]:
# Bar chart of the share of outliers per column (skipped when the summary is empty)
if outliers_df.shape[0] > 0:
    outlier_axis_labels = {'Percentage': 'Pourcentage (%)', 'Column': 'Colonnes'}
    fig = px.bar(
        outliers_df,
        x='Column',
        y='Percentage',
        title="Pourcentage d'outliers par colonne numérique",
        labels=outlier_axis_labels,
    )
    fig.show()

In [44]:
# Winsorization: cap each count column at its own 1st and 99th percentiles.
# NOTE(review): the percentiles are computed on the full frame; if a
# train/test split happens later, confirm this is intended.
winsorized_cols = [
    'time_in_hospital', 'num_lab_procedures', 'num_procedures',
    'num_medications', 'number_outpatient', 'number_emergency',
    'number_inpatient', 'number_diagnoses',
]

for col in winsorized_cols:
    if col not in df_encoded.columns:
        continue

    lower, upper = df_encoded[col].quantile([0.01, 0.99])
    df_encoded[col] = df_encoded[col].clip(lower=lower, upper=upper)

## 9. Feature Engineering

In [45]:
# Derived features, all added as new columns of df_encoded.

# Sum of the three visit-count columns (outpatient, emergency, inpatient)
outpatient_visits = df_encoded['number_outpatient']
emergency_visits = df_encoded['number_emergency']
inpatient_visits = df_encoded['number_inpatient']
df_encoded['total_visits'] = outpatient_visits + emergency_visits + inpatient_visits

# Procedure and lab counts divided by (time_in_hospital + 1)
stay_plus_one = df_encoded['time_in_hospital'] + 1
df_encoded['procedures_per_day'] = df_encoded['num_procedures'] / stay_plus_one
df_encoded['lab_per_day'] = df_encoded['num_lab_procedures'] / stay_plus_one

# Row-wise sum of the single-drug medication columns that exist in the frame
diabetes_meds = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
    'glimepiride', 'acetohexamide', 'glipizide', 'glyburide',
    'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
    'miglitol', 'troglitazone', 'tolazamide', 'insulin',
]
diabetes_meds_present = [col for col in diabetes_meds if col in df_encoded.columns]
df_encoded['total_diabetes_meds'] = df_encoded[diabetes_meds_present].sum(axis=1)

# 0/1 flags: total_visits of at least 2, and number_emergency above 0
df_encoded['high_risk_patient'] = (df_encoded['total_visits'] >= 2).astype(int)
df_encoded['has_emergency'] = (emergency_visits > 0).astype(int)

new_feature_names = [
    'total_visits',
    'procedures_per_day',
    'lab_per_day',
    'total_diabetes_meds',
    'high_risk_patient',
    'has_emergency',
]
print("New features :")
for feature_name in new_feature_names:
    print(f"  - {feature_name}")

print(df_encoded.head(10))

New features :
  - total_visits
  - procedures_per_day
  - lab_per_day
  - total_diabetes_meds
  - high_risk_patient
  - has_emergency
   gender  admission_type_id  discharge_disposition_id  admission_source_id  \
0       1                  6                        25                    1   
1       1                  1                         1                    7   
2       1                  1                         1                    7   
3       0                  1                         1                    7   
4       0                  1                         1                    7   
5       0                  2                         1                    2   
6       0                  3                         1                    2   
7       0                  1                         1                    7   
8       1                  2                         1                    4   
9       1                  3                         3                    4

## 10. Standardization 

In [46]:
# Columns to standardize: the raw count columns followed by the engineered
# continuous features (binary flags are not included).
raw_count_cols = [
    'time_in_hospital', 'num_lab_procedures', 'num_procedures',
    'num_medications', 'number_outpatient', 'number_emergency',
    'number_inpatient', 'number_diagnoses',
]
engineered_cols = [
    'total_visits', 'procedures_per_day', 'lab_per_day', 'total_diabetes_meds',
]
cols_to_scale = raw_count_cols + engineered_cols

In [47]:
# Standardize the selected columns to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the full frame; if a train/test split
# happens later, confirm this is intended.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_encoded[cols_to_scale])
df_encoded[cols_to_scale] = scaled_values

# Sanity check: means should be ~0 and standard deviations ~1
scaled_summary = df_encoded[cols_to_scale].describe()
print(scaled_summary.loc[['mean', 'std']])

      time_in_hospital  num_lab_procedures  num_procedures  num_medications  \
mean      5.809126e-17        4.468559e-18   -6.479410e-17    -8.043406e-17   
std       1.000005e+00        1.000005e+00    1.000005e+00     1.000005e+00   

      number_outpatient  number_emergency  number_inpatient  number_diagnoses  \
mean      -3.574847e-17      1.787424e-17     -6.255982e-17      5.362271e-17   
std        1.000005e+00      1.000005e+00      1.000005e+00      1.000005e+00   

      total_visits  procedures_per_day   lab_per_day  total_diabetes_meds  
mean  2.681135e-17        2.122565e-17  1.184168e-16         7.596550e-17  
std   1.000005e+00        1.000005e+00  1.000005e+00         1.000005e+00  


## 11. Exporting preprocessed dataset

In [48]:
# Write the preprocessed frame to CSV, without the index column
output_path = 'data/diabetic_data_preprocessed.csv'
df_encoded.to_csv(path_or_buf=output_path, index=False)
print(f"✓ Dataset saved: {output_path}")

✓ Dataset saved: data/diabetic_data_preprocessed.csv
