## Train/Test Split

Time-series aware train/test split (80% train, 20% test) while maintaining per-shelter grouping.


In [1]:
import pandas as pd
from pathlib import Path

project_root = Path().resolve().parent
processed_dir = project_root / "data" / "processed"

print("Loading processed features...")
df = pd.read_csv(processed_dir / "features.csv")
df['OCCUPANCY_DATE'] = pd.to_datetime(df['OCCUPANCY_DATE'])

print(f"Loaded {len(df)} rows")
df.head()


Loading processed features...
Loaded 165802 rows


Unnamed: 0,SHELTER_ID,OCCUPANCY_DATE,OCCUPIED_BEDS,ORGANIZATION_ID,SHELTER_ID.1,LOCATION_ID,PROGRAM_ID,SERVICE_USER_COUNT,CAPACITY_ACTUAL_BED,CAPACITY_FUNDING_BED,...,PROGRAM_AREA_Temporary Refugee Response,PROGRAM_AREA_Winter Programs,day_of_week_sin,day_of_week_cos,month_sin,month_cos,week_of_year_sin,week_of_year_cos,OCCUPANCY_RATE_BEDS,overcapacity
0,1,2001-01-21,35.0,2,1,1002.0,11794,35,35.0,36.0,...,False,False,-0.781831,0.62349,0.5,0.866025,0.354605,0.935016,100.0,1
1,1,2001-01-21,11.0,2,1,1013.0,11871,11,15.0,15.0,...,False,False,-0.781831,0.62349,0.5,0.866025,0.354605,0.935016,73.33,0
2,1,2001-01-22,39.0,2,1,1002.0,11794,39,44.0,44.0,...,False,False,0.0,1.0,0.5,0.866025,0.464723,0.885456,88.64,0
3,1,2001-01-22,13.0,2,1,1013.0,11871,13,17.0,17.0,...,False,False,0.0,1.0,0.5,0.866025,0.464723,0.885456,76.47,0
4,1,2001-02-21,31.0,2,1,1002.0,11794,31,35.0,36.0,...,False,False,0.974928,-0.222521,0.866025,0.5,0.822984,0.568065,88.57,0


In [2]:
# Sort by SHELTER_ID and OCCUPANCY_DATE
df = df.sort_values(['SHELTER_ID', 'OCCUPANCY_DATE']).reset_index(drop=True)

# Chronological split: 80% earliest dates, 20% latest dates
split_idx = int(len(df) * 0.8)

train_df = df.iloc[:split_idx].copy()
test_df = df.iloc[split_idx:].copy()

print(f"Train set: {len(train_df)} rows ({len(train_df) / len(df) * 100:.1f}%)")
print(f"Test set: {len(test_df)} rows ({len(test_df) / len(df) * 100:.1f}%)")
print(f"\nTrain date range: {train_df['OCCUPANCY_DATE'].min()} to {train_df['OCCUPANCY_DATE'].max()}")
print(f"Test date range: {test_df['OCCUPANCY_DATE'].min()} to {test_df['OCCUPANCY_DATE'].max()}")


Train set: 132641 rows (80.0%)
Test set: 33161 rows (20.0%)

Train date range: 2001-01-21 00:00:00 to 2031-12-22 00:00:00
Test date range: 2001-01-21 00:00:00 to 2031-12-22 00:00:00


In [4]:
# Save splits
train_path = processed_dir / "train.csv"
test_path = processed_dir / "test.csv"

print(f"Saving train set to {train_path}...")
train_df.to_csv(train_path, index=False)

print(f"Saving test set to {test_path}...")
test_df.to_csv(test_path, index=False)

print("✓ Train/test split complete!")


Saving train set to /Users/qadeermac/workspace/ML-Toronto-Shelter-Occupancy /ML-Toronto-Shelter-Occupancy/data/processed/train.csv...
Saving test set to /Users/qadeermac/workspace/ML-Toronto-Shelter-Occupancy /ML-Toronto-Shelter-Occupancy/data/processed/test.csv...
✓ Train/test split complete!
