# 03 - Modeling Prediction with Class Balancing
This notebook trains machine learning models to predict patient no-shows, using SMOTE to oversample the minority class in the training split only; the test split keeps its original class distribution.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE

# Load dataset (path is relative to the notebook's working directory)
df = pd.read_csv('../data/healthcare_noshows_appt.csv')

# Rename columns: fix the misspelled source headers and give the target and
# the precomputed date difference identifier-friendly names.
df = df.rename(columns={
    'Hipertension': 'Hypertension',
    'Handcap': 'Handicap',
    'Showed_up': 'ShowedUp',
    'Date.diff': 'LeadTimeDays',
})

# Convert date columns from strings to datetime64 so the `.dt` accessors work
df['ScheduledDay'] = pd.to_datetime(df['ScheduledDay'])
df['AppointmentDay'] = pd.to_datetime(df['AppointmentDay'])

# Feature engineering
# Day of week of the appointment (pandas convention: Monday=0 ... Sunday=6).
df['AppointmentWeekday'] = df['AppointmentDay'].dt.dayofweek

# Lead time in whole calendar days between booking and appointment.
# Both timestamps are truncated to midnight before subtracting: `.dt.days`
# floors the timedelta, so subtracting raw timestamps would turn a booking
# made at 14:00 for an appointment stored as 00:00 the same day into -1
# instead of 0. Normalizing is a no-op when the columns carry no time part.
df['LeadTime'] = (
    df['AppointmentDay'].dt.normalize() - df['ScheduledDay'].dt.normalize()
).dt.days
# NOTE(review): 'LeadTimeDays' (renamed from 'Date.diff') presumably holds the
# same quantity as 'LeadTime' - confirm, and drop one of them if so.

# Hour of day at which the appointment was booked.
# NOTE(review): this is 0 for every row if ScheduledDay has no time
# component in the CSV - verify against the raw data.
df['HourScheduled'] = df['ScheduledDay'].dt.hour

## Preprocessing & SMOTE Oversampling

In [None]:
# Drop identifier columns, the raw timestamps and Neighbourhood.
# errors='ignore' means columns that are already absent do not raise.
df = df.drop(
    columns=['PatientId', 'AppointmentID', 'ScheduledDay', 'AppointmentDay', 'Neighbourhood'],
    errors='ignore',
)

# Encode categorical variables
df['Gender'] = df['Gender'].map({'F': 0, 'M': 1})
# `map` silently turns anything other than 'F'/'M' into NaN, which would only
# surface later as an opaque "Input contains NaN" error inside SMOTE.
if df['Gender'].isna().any():
    raise ValueError(
        "Gender contains missing values or values other than 'F'/'M'; "
        "clean the column (or re-load the data) before encoding."
    )
df['ShowedUp'] = df['ShowedUp'].astype(int)

# Features and target
X = df.drop('ShowedUp', axis=1)
y = df['ShowedUp']

# Split before applying SMOTE to avoid data leakage.
# stratify=y keeps the class ratio identical in train and test, so the
# evaluation reflects the real class imbalance of the data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Apply SMOTE to training data only; the test split is left untouched.
# NOTE(review): SMOTE interpolates between neighbours, so 0/1 columns such as
# Gender can receive fractional synthetic values - consider imblearn's SMOTENC
# if the binary/categorical features should stay discrete.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

# Class counts before and after oversampling
print('Original dataset shape:', y_train.value_counts().to_dict())
print('Resampled dataset shape:', pd.Series(y_res).value_counts().to_dict())

## Train Model on Balanced Data

In [None]:
# Fit a Random Forest on the SMOTE-resampled training data.
# The fixed random_state makes the fitted forest reproducible across runs.
clf = RandomForestClassifier(random_state=42).fit(X_res, y_res)

# Evaluate on the test split, which was not resampled.
y_pred = clf.predict(X_test)

# Confusion matrix (rows: true class, columns: predicted class) followed by
# the per-class precision/recall/F1 report, each on its own line.
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)
# NOTE(review): plain accuracy can look good on imbalanced data even when the
# minority class is predicted poorly - read it alongside the report above.
print('Accuracy:', accuracy_score(y_test, y_pred))