## Retrieving the full 'Daily Shelter & Overnight Service Occupancy & Capacity' dataset from Toronto Open Data

In [3]:
import requests

base_url = "https://ckan0.cf.opendata.inter.prod-toronto.ca"
package_id = "daily-shelter-overnight-service-occupancy-capacity"
request_timeout = 60  # seconds; without a timeout a stalled connection would hang the cell forever

# Get package metadata. raise_for_status() turns an HTTP error into an exception
# instead of letting an error payload flow into the steps below.
package_response = requests.get(
    f"{base_url}/api/3/action/package_show",
    params={"id": package_id},
    timeout=request_timeout,
)
package_response.raise_for_status()
package = package_response.json()

# Find the first datastore-active resource; fail loudly if there is none so the
# next cell does not silently read a stale or missing CSV.
resource = next(
    (r for r in package["result"]["resources"] if r["datastore_active"]),
    None,
)
if resource is None:
    raise RuntimeError(f"No datastore-active resource found for package {package_id!r}")

resource_id = resource["id"]
csv_url = f"{base_url}/datastore/dump/{resource_id}"

# Download the entire CSV at once and write the raw bytes unchanged. This avoids
# depending on requests' charset guess and on text-mode newline translation.
csv_response = requests.get(csv_url, timeout=request_timeout)
csv_response.raise_for_status()
with open("../data/shelters.csv", "wb") as f:
    f.write(csv_response.content)
print("All data saved as data/shelters.csv")


All data saved as data/shelters.csv


In [9]:
import pandas as pd

# Load the CSV written by the download cell
df = pd.read_csv("../data/shelters.csv")

# Convert the occupancy date from text to datetime64 so the .dt accessors work later
df['OCCUPANCY_DATE'] = pd.to_datetime(df['OCCUPANCY_DATE'])

# Quick look
print(df.head())
# DataFrame.info() prints its own report and returns None, so it must not be
# wrapped in print() (that would append a stray "None" line to the output).
df.info()


   _id OCCUPANCY_DATE  ORGANIZATION_ID         ORGANIZATION_NAME  SHELTER_ID  \
0    1     2025-01-01               24  COSTI Immigrant Services          40   
1    2     2025-01-01               24  COSTI Immigrant Services          40   
2    3     2025-01-01               24  COSTI Immigrant Services          40   
3    4     2025-01-01               24  COSTI Immigrant Services          40   
4    5     2025-01-01               24  COSTI Immigrant Services          40   

            SHELTER_GROUP  LOCATION_ID               LOCATION_NAME  \
0  COSTI Reception Centre         1320   COSTI Hotel Program Dixon   
1  COSTI Reception Centre         1320   COSTI Hotel Program Dixon   
2  COSTI Reception Centre         1051      COSTI Reception Centre   
3  COSTI Reception Centre         1051      COSTI Reception Centre   
4  COSTI Reception Centre         1114  COSTI Uptown Hotel Program   

    LOCATION_ADDRESS LOCATION_POSTAL_CODE  ... OCCUPIED_BEDS UNOCCUPIED_BEDS  \
0      640 Dixon R

## Observations:

OCCUPIED_BEDS, CAPACITY_ACTUAL_BED, CAPACITY_FUNDING_BED → ~33k non-null

CAPACITY_ACTUAL_ROOM, OCCUPIED_ROOMS, OCCUPANCY_RATE_ROOMS → ~11k non-null

LOCATION_ADDRESS, LOCATION_POSTAL_CODE, LOCATION_CITY, LOCATION_PROVINCE → small fraction missing

Some rows only have room-based capacity, others bed-based.

__Bed-based columns are more complete than room-based ones, so we focus on beds for both regression and classification.__

In [33]:
# Keep only the rows that carry the regression target (OCCUPIED_BEDS)
bed_rows = df.dropna(subset=['OCCUPIED_BEDS'])

# Classification target: 1 when OCCUPANCY_RATE_BEDS is above 97, otherwise 0
is_over = bed_rows['OCCUPANCY_RATE_BEDS'].gt(97)
df = bed_rows.assign(overcapacity=is_over.astype(int))
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 33600 entries, 91 to 44919
Data columns (total 39 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   _id                     33600 non-null  int64         
 1   OCCUPANCY_DATE          33600 non-null  datetime64[ns]
 2   ORGANIZATION_ID         33600 non-null  int64         
 3   ORGANIZATION_NAME       33600 non-null  object        
 4   SHELTER_ID              33600 non-null  int64         
 5   SHELTER_GROUP           33600 non-null  object        
 6   LOCATION_ID             33600 non-null  int64         
 7   LOCATION_NAME           33600 non-null  object        
 8   LOCATION_ADDRESS        32017 non-null  object        
 9   LOCATION_POSTAL_CODE    32017 non-null  object        
 10  LOCATION_CITY           32006 non-null  object        
 11  LOCATION_PROVINCE       31628 non-null  object        
 12  PROGRAM_ID              33600 non-null  int64     

## Feature Engineering

In [35]:
# Temporal features derived from OCCUPANCY_DATE
occupancy_date = df['OCCUPANCY_DATE'].dt
df['day_of_week'] = occupancy_date.dayofweek  # Monday=0 ... Sunday=6
df['month'] = occupancy_date.month
# isocalendar().week comes back as pandas' nullable UInt32 extension dtype, unlike
# the plain NumPy integers above. Cast it so all three temporal features share an
# ordinary integer dtype. The cast would raise on missing dates; the info() output
# above shows OCCUPANCY_DATE fully populated.
df['week_of_year'] = occupancy_date.isocalendar().week.astype('int64')

In [36]:
# Lag & rolling features of OCCUPIED_BEDS
#
# Row order of df is kept as before (by shelter, then date).
df = df.sort_values(['SHELTER_ID', 'OCCUPANCY_DATE'])

# The head() output shows several rows for the same SHELTER_ID on the same date,
# so shifting within SHELTER_ID would pick up a neighbouring row from the same day
# rather than an earlier observation of the same series. Group by PROGRAM_ID instead.
# NOTE(review): assumes one row per PROGRAM_ID per OCCUPANCY_DATE - confirm.
# The groupby runs on a date-ordered view; results are assigned back by index label.
beds_by_program = (
    df.sort_values('OCCUPANCY_DATE', kind='stable')
      .groupby('PROGRAM_ID')['OCCUPIED_BEDS']
)

# "lag_k" is the value k rows earlier within the same program, i.e. k reporting
# days earlier if there are no gaps in the series.
df['lag_1'] = beds_by_program.shift(1)
df['lag_7'] = beds_by_program.shift(7)

# Mean of the previous 7 observations, excluding the current row. The shift is
# done inside each group; shifting the concatenated groupby-rolling result (as
# before) leaked the last value of one group into the first row of the next.
df['roll_mean_7'] = beds_by_program.transform(lambda s: s.shift(1).rolling(7).mean())

In [None]:
# One-hot encode the categorical columns. drop_first=True omits the first level
# of each column so the indicator columns are not perfectly collinear.
# (Indicator dtype depends on the pandas version: bool by default in pandas 2.x.)
cat_cols = [
    'SECTOR',
    'PROGRAM_MODEL',
    'OVERNIGHT_SERVICE_TYPE',
    'PROGRAM_AREA',
]
df = pd.get_dummies(data=df, columns=cat_cols, drop_first=True)

In [43]:
# Handle missing values (including the warm-up NaNs of the lag/rolling features).
#
# Only numeric columns are filled. The previous blanket fillna(0) also wrote the
# integer 0 into text columns such as LOCATION_ADDRESS, leaving them with mixed
# str/int values; those now keep their NaN.
# NOTE(review): filling every numeric column (e.g. the room-based capacity columns)
# with 0 is carried over from the original cell. A 0 lag cannot be told apart from
# a genuinely empty program - consider dropping warm-up rows instead.
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = df[numeric_cols].fillna(0)

# Report after the fill so the displayed non-null counts describe the current frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 33600 entries, 91 to 44919
Data columns (total 51 columns):
 #   Column                                                   Non-Null Count  Dtype         
---  ------                                                   --------------  -----         
 0   _id                                                      33600 non-null  int64         
 1   OCCUPANCY_DATE                                           33600 non-null  datetime64[ns]
 2   ORGANIZATION_ID                                          33600 non-null  int64         
 3   ORGANIZATION_NAME                                        33600 non-null  object        
 4   SHELTER_ID                                               33600 non-null  int64         
 5   SHELTER_GROUP                                            33600 non-null  object        
 6   LOCATION_ID                                              33600 non-null  int64         
 7   LOCATION_NAME                                        

91        0.0
92        0.0
93        0.0
228       0.0
229       0.0
         ... 
44336    50.0
44482    50.0
44629    50.0
44774    50.0
44919    50.0
Name: lag_7, Length: 33600, dtype: float64