In [1]:
import sys
from pathlib import Path

# Detect project root (the folder that contains /src)
project_root = Path.cwd()

# If notebook is inside /notebooks, go one level up
if not (project_root / "src").exists():
    project_root = project_root.parent

# Add src/ to Python path
sys.path.append(str(project_root / "src"))

print("PYTHONPATH updated:", project_root / "src")

PYTHONPATH updated: c:\Users\elsingy\Documents\AMDARI DS\Internship\Project 3\horizonbedforecast\src


In [2]:
# Setup
import pandas as pd
from horizon_forecast.features.demand_aggregation import (
    build_demand_timeseries
)
from horizon_forecast.modeling.split import time_split
from horizon_forecast.evaluation.metrics import regression_metrics

In [3]:
# Load raw data
from horizon_forecast.ingestion.adm_load import load_admissions
from horizon_forecast.ingestion.edarr_load import load_ed_arrivals
from horizon_forecast.ingestion.electsurgeries_load import load_elective_surgeries
from horizon_forecast.ingestion.staff_load import load_staffing

adm = load_admissions()
ed = load_ed_arrivals()
elec = load_elective_surgeries()
staff = load_staffing()

In [4]:
adm["admission_datetime"] = pd.to_datetime(adm["admission_datetime"], errors="coerce")
ed["arrival_datetime"] = pd.to_datetime(ed["arrival_datetime"], errors="coerce")
elec["surgery_date"] = pd.to_datetime(elec["surgery_date"], errors="coerce")
staff["date"] = pd.to_datetime(staff["date"], errors="coerce")

In [5]:
def aggregate_admissions(adm_df: pd.DataFrame, freq="D"):
    df = adm_df.copy()
    df["admission_datetime"] = pd.to_datetime(df["admission_datetime"], errors="coerce")
    df["date"] = df["admission_datetime"].dt.floor(freq)


def aggregate_ed_arrivals(ed_df: pd.DataFrame, freq="D"):
    df = ed_df.copy()
    df["arrival_datetime"] = pd.to_datetime(df["arrival_datetime"], errors="coerce")
    df["date"] = df["arrival_datetime"].dt.floor(freq)


def aggregate_elective_surgeries(elec_df: pd.DataFrame, freq="D"):
    df = elec_df.copy()
    df["surgery_date"] = pd.to_datetime(df["surgery_date"], errors="coerce")
    df["date"] = df["surgery_date"].dt.floor(freq)

In [6]:
# Build modeling dataset
df = build_demand_timeseries(adm, ed, elec, freq="D")
df.head()

Unnamed: 0_level_0,admissions,ed_arrivals,elective_surgeries,dow,month,is_weekend
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-01-01 00:00:00,288.0,99.0,0.0,0,1,0
2024-01-02 00:00:00,240.0,121.0,0.0,1,1,0
2024-01-02 08:00:00,0.0,0.0,11.0,1,1,0
2024-01-03 00:00:00,290.0,130.0,0.0,2,1,0
2024-01-03 08:00:00,0.0,0.0,8.0,2,1,0


In [7]:
# Ensure sorted index
df = df.sort_index()

# Dynamic split points
train_end = df.index[int(len(df) * 0.6)]
val_end   = df.index[int(len(df) * 0.85)]

train, val, test = time_split(df, train_end, val_end)

len(train), len(val), len(test), train.index.min(), train.index.max()

(877,
 365,
 218,
 Timestamp('2024-01-01 00:00:00'),
 Timestamp('2025-03-14 08:00:00'))

In [8]:
# Train/validation/test split
train, val, test = time_split(
    df,
    train_end="2023-12-31",
    val_end="2024-06-30"
)

In [9]:
len(train), len(val), len(test)

(0, 362, 1098)

In [10]:
df.index.min(), df.index.max()

(Timestamp('2024-01-01 00:00:00'), Timestamp('2025-12-31 00:00:00'))

In [11]:
train_end = df.index[int(len(df)*0.6)]
val_end   = df.index[int(len(df)*0.85)]

In [12]:
train, val, test = time_split(
    df,
    train_end="2024-06-30",
    val_end="2025-03-31"
)

In [13]:
y_train = train["admissions"]
y_val   = val["admissions"]
y_test  = test["admissions"]

In [14]:
test["naive"] = y_train.iloc[-1]
regression_metrics(y_test, test["naive"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["naive"] = y_train.iloc[-1]


{'mae': 140.67090909090908,
 'rmse': np.float64(188.85316566533425),
 'mape': np.float64(13348363638.928583),
 'bias': np.float64(129.44545454545454)}

In [15]:
test["mean_baseline"] = y_train.mean()
regression_metrics(y_test, test["mean_baseline"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["mean_baseline"] = y_train.mean()


{'mae': 137.05766951280762,
 'rmse': np.float64(137.52326695607593),
 'mape': np.float64(6943112028.6049795),
 'bias': np.float64(1.804570567553988)}

In [16]:
rolling_mean = y_train.rolling(window=7).mean().iloc[-1]
test["rolling_mean_7"] = rolling_mean
regression_metrics(y_test, test["rolling_mean_7"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["rolling_mean_7"] = rolling_mean


{'mae': 137.13168831168832,
 'rmse': np.float64(139.28548584729853),
 'mape': np.float64(7964571449.4463415),
 'bias': np.float64(22.159740259740296)}

In [17]:
# Simple XGBoost prototype
from xgboost import XGBRegressor

FEATURES = ["ed_arrivals", "elective_surgeries", "dow", "month", "is_weekend"]

X_train, y_train = train[FEATURES], train["admissions"]
X_val,   y_val   = val[FEATURES],   val["admissions"]
X_test,  y_test  = test[FEATURES],  test["admissions"]

model = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
)
model.fit(X_train, y_train)

test["xgb"] = model.predict(X_test)
regression_metrics(y_test, test["xgb"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["xgb"] = model.predict(X_test)


{'mae': 8.930650784176859,
 'rmse': np.float64(18.65633986475359),
 'mape': np.float64(60537418.123633064),
 'bias': np.float64(-0.6096719962832603)}

In [18]:
# Quick backtesting loop (rolling origin)
import numpy as np

def rolling_backtest(df, features, horizon=7, step=7, min_train_days=90):
    df = df.sort_index()
    metrics_list = []

    for start in range(min_train_days, len(df) - horizon, step):
        train_slice = df.iloc[:start]
        test_slice  = df.iloc[start:start + horizon]

        X_tr, y_tr = train_slice[features], train_slice["admissions"]
        X_te, y_te = test_slice[features],  test_slice["admissions"]

        m = XGBRegressor(
            n_estimators=200,
            learning_rate=0.05,
            max_depth=4,
            subsample=0.9,
            colsample_bytree=0.9,
            random_state=42,
        )
        m.fit(X_tr, y_tr)
        preds = m.predict(X_te)

        metrics_list.append(regression_metrics(y_te, preds))

    return metrics_list

bt_results = rolling_backtest(df, FEATURES)
bt_results[:3]

[{'mae': 41.64394763324942,
  'rmse': np.float64(95.45325229341283),
  'mape': np.float64(18217551.53247561),
  'bias': np.float64(-35.740911106978146)},
 {'mae': 15.431449210803423,
  'rmse': np.float64(24.284740846341506),
  'mape': np.float64(3373765.3182534343),
  'bias': np.float64(-1.799511179594057)},
 {'mae': 8.742040475032159,
  'rmse': np.float64(17.924500471558908),
  'mape': np.float64(22819642.264063694),
  'bias': np.float64(-8.387702888143915)}]

In [19]:
avg_mae  = np.mean([m["mae"] for m in bt_results])
avg_rmse = np.mean([m["rmse"] for m in bt_results])
avg_mape = np.mean([m["mape"] for m in bt_results])
avg_mae, avg_rmse, avg_mape

(np.float64(9.138481729816016),
 np.float64(15.855015966552147),
 np.float64(41848415.45518705))

In [21]:
import joblib
from pathlib import Path

artifacts_dir = Path("artifacts")
artifacts_dir.mkdir(exist_ok=True, parents=True)

joblib.dump(model, artifacts_dir / "model.pkl")

['artifacts\\model.pkl']

In [22]:
import pandas as pd
from horizon_forecast.evaluation.metrics import regression_metrics

results = {}

# Naive
results["Naive"] = regression_metrics(
    y_test,
    np.full_like(y_test, y_train.iloc[-1])
)

# Mean baseline
results["Mean"] = regression_metrics(
    y_test,
    np.full_like(y_test, y_train.mean())
)

# Rolling mean (7-day)
rolling_mean_7 = y_train.rolling(7).mean().iloc[-1]
results["RollingMean7"] = regression_metrics(
    y_test,
    np.full_like(y_test, rolling_mean_7)
)

# Seasonal naive (7-day lag)
seasonal_naive = y_train.shift(7).iloc[-1]
results["SeasonalNaive"] = regression_metrics(
    y_test,
    np.full_like(y_test, seasonal_naive)
)

# Convert to DataFrame
baseline_table = pd.DataFrame(results).T
baseline_table

Unnamed: 0,mae,rmse,mape,bias
Naive,140.670909,188.853166,13348360000.0,129.445455
Mean,137.05767,137.523267,6943112000.0,1.804571
RollingMean7,137.131688,139.285486,7964571000.0,22.15974
SeasonalNaive,136.554545,193.795089,49.81818,-136.554545
