# AI-Driven Risk Prediction Engine for Chronic Care Patients
## Data Exploration and Model Development

This notebook explores the EHR dataset and implements the ML pipeline for predicting a chronic care patient's risk of deterioration within 90 days.

### Project Objectives:
- Predict probability of deterioration within 90 days (0-100%)
- Achieve AUROC > 0.75
- Provide explainable predictions using SHAP
- Build clinician-friendly risk assessments

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
# Suppress every Python warning for the remainder of the session.
# NOTE(review): this blanket filter also hides pandas/numpy warnings that can
# point at real data problems — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Plot appearance: apply the matplotlib style sheet first, then the seaborn
# palette (same order as before).
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# pandas display options, applied one (option, value) pair at a time.
for _option, _value in (
    ('display.max_columns', None),
    ('display.width', None),
    ('display.max_colwidth', 50),
):
    pd.set_option(_option, _value)

print("Environment setup complete!")

## 1. Dataset Loading and Initial Exploration

In [None]:
# Read the cleaned EHR extract; the path is relative to the working directory.
df = pd.read_csv('dataset/ehr_cleaned_dataset.csv')

# Size summary: dimensions, deep memory footprint (bytes -> 1024**2 units),
# number of columns, and the first ten column names.
_bytes_used = df.memory_usage(deep=True).sum()
_column_names = df.columns.tolist()

print(f"Dataset Shape: {df.shape}")
print(f"Memory Usage: {_bytes_used / 1024**2:.2f} MB")
print(f"\nColumn Count: {len(_column_names)}")
print(f"Sample of columns: {_column_names[:10]}")

In [None]:
# How many columns there are of each dtype.
print("Data Types Distribution:")
print(df.dtypes.value_counts())

# Overall missingness: count null cells once, then express the count as a
# share of all cells (rows * columns).
_missing_total = df.isnull().sum().sum()
_cell_total = df.shape[0] * df.shape[1]
print(f"\nTotal Missing Values: {_missing_total:,}")
print(f"Percentage Missing: {(_missing_total / _cell_total * 100):.2f}%")

In [None]:
# Headline cohort numbers: distinct patient ids and the gender breakdown.
_patient_total = df['patient_id'].nunique()
print("Key Patient Statistics:")
print(f"Total Patients: {_patient_total:,}")
print("Gender Distribution:")
print(df['gender'].value_counts())

# Target variable: value counts plus the column mean, shown both as a fraction
# and as a percentage (presumably mortality is 0/1 coded — TODO confirm).
_mortality = df['mortality']
_mortality_rate = _mortality.mean()
print("\nMortality Distribution:")
print(_mortality.value_counts())
print(f"Mortality Rate: {_mortality_rate:.4f} ({_mortality_rate*100:.2f}%)")

## 2. Feature Categories Analysis

In [None]:
# Column names grouped by the kind of information they carry.
demographic_cols = ['patient_id', 'gender', 'birthdate']
temporal_cols = [
    'first_encounter',
    'last_encounter',
    'avg_encounter_duration_min',
    'deceaseddatetime',
]
vital_cols = ['Body_Height', 'Body_Mass_Index', 'Body_Weight', 'Oral_temperature']
lab_cols = [
    'Calcium',
    'Carbon_Dioxide',
    'Chloride',
    'Creatinine',
    'Glucose',
    'Hemoglobin_A1c_Hemoglobin_total_in_Blood',
    'Potassium',
    'Sodium',
    'Total_Cholesterol',
    'Triglycerides',
    'Urea_Nitrogen',
]
condition_cols = ['conditions']
target_col = ['mortality']

# Report the size of each group; the target line uses the singular "feature".
print("Feature Categories:")
for _label, _columns, _noun in (
    ("Demographic", demographic_cols, "features"),
    ("Temporal", temporal_cols, "features"),
    ("Vitals", vital_cols, "features"),
    ("Lab Results", lab_cols, "features"),
    ("Conditions", condition_cols, "features"),
    ("Target", target_col, "feature"),
):
    print(f"{_label}: {len(_columns)} {_noun}")

In [None]:
# Analyze chronic conditions
import ast

def extract_conditions(condition_str):
    """Parse one raw ``conditions`` cell into a list of condition labels.

    Parameters
    ----------
    condition_str : object
        Normally the string form of a Python list literal, for example
        ``"['Diabetes', 'Hypertension']"``. A value that is already a
        list, tuple or set is accepted and returned as a list, so the
        function can safely be re-applied to parsed data.

    Returns
    -------
    list
        The parsed conditions. An empty list is returned for missing values
        (None / NaN), empty or whitespace-only strings, strings that are not
        valid Python literals, and literals that are neither a collection nor
        a string. A bare string literal is wrapped in a one-element list so
        callers that ``extend`` with the result never split it into characters.
    """
    # Already-parsed containers pass straight through.
    if isinstance(condition_str, (list, tuple, set, frozenset)):
        return list(condition_str)

    # None, NaN and any other non-string value carry no text to parse.
    if not isinstance(condition_str, str) or not condition_str.strip():
        return []

    try:
        parsed = ast.literal_eval(condition_str)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Best-effort parsing: a malformed literal means "no conditions".
        # Only the errors literal_eval raises for bad input are caught, so
        # KeyboardInterrupt and genuine bugs still propagate.
        return []

    if isinstance(parsed, (list, tuple, set, frozenset)):
        return list(parsed)
    if isinstance(parsed, str):
        return [parsed]
    # Numbers, dicts, None, ... are not a list of conditions.
    return []

# Parse each row's raw conditions value into a Python list (stored as a new column).
df['conditions_list'] = df['conditions'].apply(extract_conditions)

# Flatten the per-row lists into one long list of condition labels.
all_conditions = [
    condition
    for row_conditions in df['conditions_list']
    for condition in row_conditions
]

# Occurrences of each condition across all rows; show the ten largest counts.
condition_counts = pd.Series(all_conditions).value_counts()
print("Top 10 Most Common Conditions:")
print(condition_counts.head(10))

## 3. Target Variable Analysis

In [None]:
# Two-panel figure: mortality rate by gender (left), age at death (right).
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
gender_ax, age_ax = axes

# Left panel: per-gender row count, sum and mean of `mortality`, rounded to
# four decimals; only the mean is drawn.
mortality_by_gender = (
    df.groupby('gender')['mortality']
    .agg(['count', 'sum', 'mean'])
    .round(4)
)
mortality_by_gender.plot(
    kind='bar',
    y='mean',
    ax=gender_ax,
    color=['skyblue', 'lightcoral'],
)
gender_ax.set_title('Mortality Rate by Gender')
gender_ax.set_ylabel('Mortality Rate')
gender_ax.tick_params(axis='x', rotation=0)

# Right panel: convert both date columns to datetimes (overwriting them on
# df), then store the gap between them in years (365.25 days per year).
# NOTE(review): assumes both columns parse to the same timezone-awareness — confirm.
df['birthdate'] = pd.to_datetime(df['birthdate'])
df['deceaseddatetime'] = pd.to_datetime(df['deceaseddatetime'])
_lifespan = df['deceaseddatetime'] - df['birthdate']
df['age_at_death'] = _lifespan.dt.days / 365.25

# Ages for rows flagged mortality == 1, skipping rows with no computable age.
deceased_patients = df.loc[df['mortality'] == 1, 'age_at_death'].dropna()
if not deceased_patients.empty:
    age_ax.hist(deceased_patients, bins=20, alpha=0.7, color='lightcoral')
    age_ax.set_title('Age Distribution at Death')
    age_ax.set_xlabel('Age at Death')
    age_ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

# Print the table behind the left panel.
print("\nMortality Statistics:")
print(mortality_by_gender)