## Train/Test Split

A time-series-aware train/test split (80% train, 20% test) that maintains per-shelter grouping.


In [None]:
import pandas as pd
from pathlib import Path

# The project root is taken to be the parent of the current working
# directory; processed data lives under <root>/data/processed.
project_root = Path().resolve().parent
processed_dir = project_root.joinpath("data", "processed")

print("Loading processed features...")
df = pd.read_csv(processed_dir.joinpath("features.csv"))
# Convert the date column to datetime so it sorts and compares as dates.
df = df.assign(OCCUPANCY_DATE=pd.to_datetime(df['OCCUPANCY_DATE']))

print(f"Loaded {len(df)} rows")
# Last expression of the cell: shows the first rows as the cell output.
df.head()


In [None]:
# Sort by SHELTER_ID and OCCUPANCY_DATE so that, inside each split, every
# shelter's rows stay together and in time order.
df = df.sort_values(['SHELTER_ID', 'OCCUPANCY_DATE']).reset_index(drop=True)

if df.empty:
    # Fail early with a clear message instead of a ZeroDivisionError in the
    # percentage report below.
    raise ValueError("Cannot create a train/test split from an empty DataFrame")

# Chronological split: ~80% earliest dates, ~20% latest dates.
# The boundary must be chosen on OCCUPANCY_DATE alone. Slicing the
# shelter-sorted frame by position would instead put the last shelters
# entirely in the test set and let the train/test date ranges overlap.
split_idx = int(len(df) * 0.8)

# Date of the row at the 80% position in global time order. Rows strictly
# before this date are train; rows on or after it are test, so a single
# date is never divided between the two sets. Because of ties on the cutoff
# date the train share can be slightly below 80%.
cutoff_date = df['OCCUPANCY_DATE'].sort_values().iloc[split_idx]
is_train = df['OCCUPANCY_DATE'] < cutoff_date

# Using the negated mask keeps the two sets a complete partition of df;
# rows with a missing date compare False and therefore land in the test set.
train_df = df[is_train].copy()
test_df = df[~is_train].copy()

print(f"Train set: {len(train_df)} rows ({len(train_df) / len(df) * 100:.1f}%)")
print(f"Test set: {len(test_df)} rows ({len(test_df) / len(df) * 100:.1f}%)")
print(f"\nTrain date range: {train_df['OCCUPANCY_DATE'].min()} to {train_df['OCCUPANCY_DATE'].max()}")
print(f"Test date range: {test_df['OCCUPANCY_DATE'].min()} to {test_df['OCCUPANCY_DATE'].max()}")


In [None]:
# Save splits as CSV files next to the processed features.
train_path = processed_dir / "train.csv"
test_path = processed_dir / "test.csv"

# index=False: the row index is not written to the CSV files.
print(f"Saving train set to {train_path}...")
train_df.to_csv(train_path, index=False)

print(f"Saving test set to {test_path}...")
test_df.to_csv(test_path, index=False)

# The check mark was previously stored as mojibake ("âœ“"); restored to the
# intended U+2713 character.
print("✓ Train/test split complete!")
