# 03 - Modeling Prediction
This notebook builds machine learning models to predict whether a patient will show up for their medical appointment.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load the cleaned appointment dataset (relative path; resolved against the
# kernel's working directory).
df = pd.read_csv('../data/healthcare_noshows_appt.csv')

# Rename columns: fix the misspelled headers and use one naming style.
# `rename` silently skips any key that is not present in the frame.
df = df.rename(columns={
    'Hipertension': 'Hypertension',
    'Handcap': 'Handicap',
    'Showed_up': 'ShowedUp',
    'Date.diff': 'LeadTimeDays',
})

# Feature engineering
# Parse both date columns so the `.dt` accessors below are available.
df['ScheduledDay'] = pd.to_datetime(df['ScheduledDay'])
df['AppointmentDay'] = pd.to_datetime(df['AppointmentDay'])

# Day of the week of the appointment (Monday=0 ... Sunday=6).
df['AppointmentWeekday'] = df['AppointmentDay'].dt.dayofweek

# Lead time in whole calendar days between booking and appointment.
# Both timestamps are truncated to midnight first: subtracting the raw
# timestamps and taking `.dt.days` floors the difference, so a booking made
# at 10:00 for an appointment stamped 00:00 on the same date would yield -1
# instead of 0.
df['LeadTime'] = (
    df['AppointmentDay'].dt.normalize() - df['ScheduledDay'].dt.normalize()
).dt.days
# NOTE(review): if the CSV already ships a 'Date.diff' column (renamed to
# 'LeadTimeDays' above), 'LeadTime' is presumably the same quantity and the
# model will see it twice -- confirm and keep only one of the two.

# Hour of day at which the appointment was booked.
# NOTE(review): this is constant 0 if ScheduledDay carries no time
# component in this file -- verify against the data.
df['HourScheduled'] = df['ScheduledDay'].dt.hour


## Data Preprocessing

In [None]:
# Drop columns that are not used as model features. `errors='ignore'` keeps
# the cell re-runnable once the columns are already gone.
df = df.drop(
    columns=['PatientId', 'AppointmentID', 'ScheduledDay', 'AppointmentDay', 'Neighbourhood'],
    errors='ignore',
)

# Encode categorical values
# Gender: F -> 0, M -> 1. The dtype guard makes this step idempotent:
# pushing an already-encoded 0/1 column through the mapping again would turn
# every value into NaN, silently corrupting the feature on a cell re-run.
if not pd.api.types.is_numeric_dtype(df['Gender']):
    df['Gender'] = df['Gender'].map({'F': 0, 'M': 1})
df['ShowedUp'] = df['ShowedUp'].astype(int)

# Define features and label: every remaining column except the target is a
# feature.
X = df.drop(columns='ShowedUp')
y = df['ShowedUp']

# Train-test split: 80/20 hold-out with a fixed seed for reproducibility.
# `stratify=y` keeps the show / no-show proportions the same in both
# partitions, so test metrics are not skewed by an unlucky split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


## Random Forest Classifier

In [None]:
# Fit a random forest with default hyper-parameters; the fixed seed makes
# the fitted model reproducible across runs.
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out split.
y_pred = clf.predict(X_test)

# Compute the evaluation metrics on the held-out split, then print them in
# order: confusion matrix, per-class report, overall accuracy.
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)

print(test_confusion)
print(test_report)
print('Accuracy:', test_accuracy)