### Loading daily collisions and neighbourhood list

In [1]:
import pandas as pd
import geopandas as gpd

# Input locations (relative paths) and the neighbourhood columns we keep.
DAILY_PATH = "../data/interim/collisions_nbhd_daily.parquet"
NBHD_PATH = "../data/raw/boundaries/toronto_neighbourhoods.geojson"
NBHD_COLS = ["AREA_ID", "AREA_NAME"]

# Daily collision table; `date` is coerced to datetime so it can be
# compared and joined against a generated date range later on.
daily = pd.read_parquet(DAILY_PATH).assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)

# Neighbourhood lookup: id and name only, one row per distinct pair.
nbhd = gpd.read_file(NBHD_PATH)[NBHD_COLS].drop_duplicates()

print(daily.shape, nbhd.shape)
daily.head()


(362550, 4) (158, 2)


Unnamed: 0,date,AREA_ID,AREA_NAME,collision_count
0,2014-01-01,2502209,Humber Summit,1
1,2014-01-01,2502213,West Humber-Clairville,1
2,2014-01-01,2502218,Kingsview Village-The Westway,1
3,2014-01-01,2502222,Princess-Rosethorn,1
4,2014-01-01,2502227,Stonegate-Queensway,1


### Creating full date range

In [2]:
# First and last day present in the collision table.
date_col = daily["date"]
min_date, max_date = date_col.min(), date_col.max()

# One entry per calendar day between the two bounds (inclusive).
all_dates = pd.date_range(start=min_date, end=max_date, freq="D")
print(min_date, max_date, len(all_dates))


2014-01-01 00:00:00 2025-12-31 00:00:00 4383


### Cross joining neighbourhoods and dates

In [3]:
# Build the full neighbourhood x day grid: one row for every
# (AREA_ID, AREA_NAME, date) combination.
dates_df = pd.DataFrame({"date": all_dates})

# how="cross" yields the Cartesian product directly, so no dummy "key"
# column has to be added to (and left behind on) `nbhd` or `dates_df`.
base = nbhd.merge(dates_df, how="cross")

# Sanity check: every neighbourhood appears exactly once per day.
assert len(base) == len(nbhd) * len(dates_df), "cross join produced an unexpected row count"

print(base.shape)
base.head()


(692514, 3)


Unnamed: 0,AREA_ID,AREA_NAME,date
0,2502366,South Eglinton-Davisville,2014-01-01
1,2502366,South Eglinton-Davisville,2014-01-02
2,2502366,South Eglinton-Davisville,2014-01-03
3,2502366,South Eglinton-Davisville,2014-01-04
4,2502366,South Eglinton-Davisville,2014-01-05


### Left joining collisions and filling missing counts with 0

In [4]:
# Attach the observed daily counts to the neighbourhood x day grid.
# Grid rows without a matching record in `daily` get a count of 0.
n_grid_rows = len(base)

base = (
    base
    # Drop a count column left over from a previous run of this cell, so
    # re-running does not produce collision_count_x / collision_count_y.
    .drop(columns=["collision_count"], errors="ignore")
    .merge(
        daily,
        on=["date", "AREA_ID", "AREA_NAME"],
        how="left",
    )
)

# A left join must not add rows; extra rows would mean duplicated
# (date, AREA_ID, AREA_NAME) keys in `daily`.
assert len(base) == n_grid_rows, "left join changed the number of grid rows"

base["collision_count"] = base["collision_count"].fillna(0).astype(int)

# Records in `daily` whose keys do not appear in the grid (e.g. an
# AREA_NAME spelled differently from the boundary file) are silently
# excluded by the left join; surface that instead of hiding it.
unmatched = int(daily["collision_count"].sum() - base["collision_count"].sum())
if unmatched:
    print(f"WARNING: {unmatched} collisions in `daily` did not match any grid row")

print(base.shape)
base["collision_count"].describe()


(692514, 4)


count    692514.000000
mean          0.952180
std           1.274196
min           0.000000
25%           0.000000
50%           1.000000
75%           1.000000
max          28.000000
Name: collision_count, dtype: float64

### Saving processed base table

In [5]:
# Destination of the processed neighbourhood x day table.
OUT_PATH = "../data/processed/base_nbhd_day.parquet"

# Persist without the positional index, then confirm where it went.
base.to_parquet(path=OUT_PATH, index=False)
print("Saved:", OUT_PATH)

Saved: ../data/processed/base_nbhd_day.parquet
