In [1]:
import os
import re

import numpy as np
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
tqdm.pandas()

from shapely.geometry import MultiPolygon, Polygon
from shapely.validation import make_valid

In [2]:
from constants import CEN_YEARS, YEAR_CODES

# The GTA boundary is the first feature stored in GTA.gpkg; the pruning cells
# below measure every CSD/CT geometry against this shape.
mpoly_GTA = gpd.read_file('../data/geo/regions/GTA.gpkg').geometry.iloc[0]

Prune each year's geometries down to the features that overlap the GTA, and save them as GTA-only files.

In [6]:
# A CSD is kept when at least this share of its own area lies inside the GTA.
CSD_MIN_GTA_OVERLAP = 0.25

# Only census years from 1981 onwards are pruned at the CSD level.
cen_years_81 = [y for y in CEN_YEARS if y >= 1981]

for year in tqdm(cen_years_81):
    gdf_csd_all = gpd.read_file(f'../data/geo/{year}_csd/csd_{year}.geojson')

    # Share of each CSD's area that falls inside the GTA, computed on the whole
    # geometry column at once instead of a Python-level row-wise apply.
    # Zero-area or missing geometries are expected to give NaN here (and are
    # then dropped by the comparison) rather than raising ZeroDivisionError.
    # NOTE(review): geopandas may warn about `.area` if the layer uses a
    # geographic CRS; only the ratio of the two areas is used.
    gta_part_area = gdf_csd_all.geometry.intersection(mpoly_GTA).area
    gdf_csd_all['gta_overlap'] = gta_part_area / gdf_csd_all.geometry.area

    gdf_csd_gta = gdf_csd_all[gdf_csd_all['gta_overlap'] >= CSD_MIN_GTA_OVERLAP]
    gdf_csd_gta.to_file(f'../data/geo/{year}_csd/csd_gta_{year}.gpkg', driver="GPKG")

100%|██████████| 9/9 [02:19<00:00, 15.47s/it]


In [10]:
# A CT is kept when more than this share of its own area lies inside the GTA.
CT_MIN_GTA_OVERLAP = 0.5

# For each census year, save the CTs that lie (mostly) inside the GTA.
for year in tqdm(CEN_YEARS):
    gdf_ct_all = gpd.read_file(f"../data/geo/{year}_ct/ct_{year}.zip")

    # Share of each CT's area that falls inside the GTA, computed on the whole
    # geometry column at once instead of a Python-level row-wise apply.
    # Zero-area or missing geometries are expected to give NaN here (and are
    # then dropped by the comparison) rather than raising ZeroDivisionError.
    # NOTE(review): geopandas may warn about `.area` if the layer uses a
    # geographic CRS; only the ratio of the two areas is used.
    gta_part_area = gdf_ct_all.geometry.intersection(mpoly_GTA).area
    gdf_ct_all['gta_overlap'] = gta_part_area / gdf_ct_all.geometry.area

    gdf_ct_gta = gdf_ct_all[gdf_ct_all['gta_overlap'] > CT_MIN_GTA_OVERLAP]
    gdf_ct_gta.to_file(f"../data/geo/{year}_ct/ct_gta_{year}.gpkg", driver='GPKG')

100%|██████████| 15/15 [00:42<00:00,  2.82s/it]


Join the CSD and CT boundaries with their census data, then build a combined geographic dataset that covers the full GTA: CTs are used where they exist, and CSDs fill in the areas where CTs are missing, weighted in proportion to how much of each CSD is included.

For years without CSD data (before 1981), use the 2021 CSD geometry and fill its census values with NaN.

In [20]:
def add_census_values_to_gdf(gdf_full, gdf_small, cen_year, cen_var):
    """Copy one census variable from ``gdf_full`` into ``gdf_small`` in place.

    The source column names are looked up in ``YEAR_CODES[cen_year][cen_var]``:

    * no columns      -> the output column is filled with NaN;
    * one column      -> that column is copied as-is;
    * several columns -> the columns are summed row-wise.

    ``'num_not_vm_tot'`` is special-cased: it is never stored under its own
    name. The value written is ``total population - not-visible-minority
    count`` and the output column is called ``'num_vm_tot'`` (NaN when no
    source column exists for that year).

    Parameters
    ----------
    gdf_full : DataFrame-like
        Frame holding the raw census columns for ``cen_year``.
    gdf_small : DataFrame-like
        Frame that receives the new column; modified in place. Values are
        assigned as a Series, so rows are matched on the index.
    cen_year : hashable
        Key into ``YEAR_CODES``.
    cen_var : str
        Variable name; key into ``YEAR_CODES[cen_year]``.

    Returns
    -------
    None
    """
    cols = YEAR_CODES[cen_year][cen_var]
    is_not_vm = cen_var == 'num_not_vm_tot'
    target = 'num_vm_tot' if is_not_vm else cen_var

    if len(cols) == 0:
        # Variable not available for this census year.
        gdf_small[target] = np.nan
        return

    if len(cols) > 1:
        # Some variables (e.g. new immigrants) are spread over several tags.
        values = gdf_full.loc[:, cols].sum(axis=1)
    else:
        # Select by label so a Series (not a one-column frame) is assigned;
        # a single column is copied untouched so its NaNs are preserved.
        values = gdf_full[cols[0]]

    if is_not_vm:
        # Convert "not visible minority" into "visible minority" regardless of
        # how many source columns the count was assembled from.
        pop_total = gdf_full[YEAR_CODES[cen_year]['num_pop_tot'][0]]
        values = pop_total - values

    gdf_small[target] = values

In [21]:
for cen_year in tqdm(CEN_YEARS):
    # Census years before 1961 are not processed.
    if cen_year < 1961:
        continue

    # NOTE(review): this guard limits the run to 2021 and makes the check above
    # redundant; it looks like a development filter. Confirm before widening
    # it: GTA-pruned CSD files are only written for census years >= 1981.
    if cen_year != 2021:
        continue

    # Load the GTA-pruned geometries and the wide-format census tables
    gdf_csd_gta = gpd.read_file(f"../data/geo/{cen_year}_csd/csd_gta_{cen_year}.gpkg")
    gdf_ct_gta = gpd.read_file(f"../data/geo/{cen_year}_ct/ct_gta_{cen_year}.gpkg")

    df_csd_cen = pd.read_csv(f"../data/census/{cen_year}_csd_wide/census_wide_{cen_year}_csd.csv")
    df_ct_cen = pd.read_csv(f"../data/census/{cen_year}_ct_wide/census_wide_{cen_year}_ct.csv")

    # Merge keys must be strings on both sides
    gdf_csd_gta['geosid'] = gdf_csd_gta['geosid'].astype(str)
    df_csd_cen['geosid'] = df_csd_cen['geosid'].astype(str)
    gdf_ct_gta['geosid'] = gdf_ct_gta['geosid'].astype(str)

    # CT ids that were parsed as floats lose their trailing zeros
    # ('...01.00' -> '...01.0', '...01.10' -> '...01.1'). Pad every
    # single-digit fraction back to two digits, not just '.0', so those tracts
    # are not silently left unmatched by the left merge below.
    # NOTE(review): presumes the geometry file uses two-decimal CT ids, as the
    # original '.0' -> '.00' repair implies.
    df_ct_cen['geosid'] = (
        df_ct_cen['geosid']
        .astype(str)
        .str.replace(r'(\.\d)$', r'\g<1>0', regex=True)
    )

    # Attach census data to the geometries; a left merge keeps every geometry
    # even when no census row matches (its census columns are then NaN).
    gdf_csd_full = gdf_csd_gta.merge(df_csd_cen, on='geosid', how='left')
    gdf_ct_full = gdf_ct_gta.merge(df_ct_cen, on='geosid', how='left')

    # Slim frames that receive only the desired census variables
    gdf_csd_small = gdf_csd_full[['geosid', 'geometry']].copy()
    gdf_ct_small = gdf_ct_full[['geosid', 'geometry']].copy()

    for cen_var in ('num_pop_tot', 'num_imm_tot'):
        add_census_values_to_gdf(gdf_csd_full, gdf_csd_small, cen_year, cen_var)
        add_census_values_to_gdf(gdf_ct_full, gdf_ct_small, cen_year, cen_var)

    # NOTE(review): gdf_csd_small / gdf_ct_small are not saved or combined in
    # this cell; after the loop only the last processed year's frames remain.

100%|██████████| 15/15 [00:22<00:00,  1.53s/it]
