In [65]:
import os
import re

import numpy as np
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
tqdm.pandas()

from shapely.geometry import MultiPolygon, Polygon
from shapely.validation import make_valid

In [4]:
from constants import CENSUS_CODES
from constants import CENSUS_25, CENSUS_YEARS

# Names (as they appear in the CSDNAME column) of the municipalities that make up the study area.
MUNICIPALITIES = [
    'Toronto',
    'Mississauga',
    'Brampton',
    'Vaughan',
    'Richmond Hill',
    'Markham',
    'Pickering',
    'Ajax',
]

# Census-subdivision boundaries, reprojected to EPSG:4326.
gdf_gta_csd = gpd.read_file('../data/geo/regions/GTA_CSD.gpkg').to_crs(4326)

# Restrict to the target municipalities, then dissolve them into the single geometry
# that calculate_intersection() compares each census tract against.
# NOTE(review): `unary_union` is reported as deprecated in newer GeoPandas releases in
# favour of `union_all()` -- confirm against the pinned GeoPandas version before switching.
gdf_tmun_csd = gdf_gta_csd.loc[gdf_gta_csd['CSDNAME'].isin(MUNICIPALITIES)]
gdf_tmun_union = gdf_tmun_csd.unary_union

Tabulate the number and the percentage of speakers of each language in each census tract (CT).
Join the language data with the CT geometry, match the CT geometry against the 2021 municipal boundaries, keep only the CTs that fall within the set of target municipalities, and save the results.

In [67]:
def format_ctid(ctid):
    """
    Normalise a census tract id to the zero-padded ``NNNN.NN`` string form.

    Parameters:
    - ctid: tract id as a float, int or string (e.g. ``1.0``, ``535``, ``'12.5'``)

    Returns:
    - The id as a string whose part before the first '.' is left-padded with zeros
      to at least 4 characters and whose part after it is exactly 2 characters
      (right-padded with zeros or truncated). Missing values (None/NaN) are
      returned unchanged.
    """
    # Missing ids pass straight through.
    if pd.isna(ctid):
        return ctid

    # Only the first two dot-separated pieces are ever used.
    whole, *rest = str(ctid).split('.')
    fraction = rest[0] if rest else '00'

    # Appending zeros and slicing gives exactly two characters:
    # '' -> '00', '5' -> '50', '129' -> '12'.
    fraction = (fraction + '00')[:2]

    return whole.zfill(4) + '.' + fraction

def calculate_intersection(geom, region=None):
    """
    Return the fraction of a geometry's area that lies inside a reference region.

    Parameters:
    - geom: geometry object exposing ``is_empty``, ``area`` and ``intersection()``
      (may be None for a missing geometry)
    - region: geometry to intersect with; when omitted, the module-level
      ``gdf_tmun_union`` (union of the target municipalities) is used

    Returns:
    - ``intersection area / geom area`` as a float, or 0.0 when the geometry is
      missing, empty, or has no positive area (so the ratio is undefined).

    Areas are taken as-is from the geometries, so ``geom`` and ``region`` must be
    expressed in the same coordinate reference system.
    """
    # Missing or empty geometries cannot overlap anything.
    if geom is None or geom.is_empty:
        return 0.0

    # Guard the division: degenerate (zero-area) geometries would otherwise raise
    # ZeroDivisionError. `not (> 0)` also catches a NaN area.
    total_area = geom.area
    if not total_area > 0:
        return 0.0

    # Resolve the default lazily so the global is only needed when actually used.
    if region is None:
        region = gdf_tmun_union

    return geom.intersection(region).area / total_area

In [68]:
def join_geo_ct(gdf_ct_gta, df_ct_cen, intersection_threshold=0.12):
    """
    Join census tract geometries with census data and keep only the tracts whose
    area overlaps the target municipalities by at least the threshold fraction.

    Parameters:
    - gdf_ct_gta: GeoDataFrame with census tract geometries and a 'geosid' column
    - df_ct_cen: DataFrame with census data and a 'geosid' column
    - intersection_threshold: minimum fraction of a tract's area that must fall
      inside the target municipalities (default 0.12, i.e. 12%)

    Returns:
    - New GeoDataFrame with the census columns joined onto the geometries
      (left join: tracts without a census match are kept with missing values),
      filtered by the overlap threshold. The input frames are not modified.
    """
    # Work on copies so the caller's frames are left untouched.
    tracts = gdf_ct_gta.copy()
    tracts['geosid'] = tracts['geosid'].astype(str)

    census = df_ct_cen.copy()
    # Ids that were parsed as floats lose trailing zeros when stringified
    # ('5350001.0', '5350001.1'). Pad a single-digit fraction back to two digits
    # ('5350001.00', '5350001.10'); every other value is left as-is.
    # NOTE(review): assumes the geometry file stores ids with two decimals -- confirm.
    census['geosid'] = (
        census['geosid']
        .astype(str)
        .str.replace(r'\.(\d)$', r'.\g<1>0', regex=True)
    )

    # Left join keeps every tract geometry even if it has no census row.
    gdf_full = tracts.merge(census, on='geosid', how='left')

    # Fraction of each tract's area lying inside the target municipalities.
    overlap_ratio = gdf_full.geometry.apply(calculate_intersection)

    return gdf_full[overlap_ratio >= intersection_threshold].copy()

def join_geo_ct_chass(gdf_ct_gta, df_ct_cen, intersection_threshold=0.12):
    """
    Join census tract geometries with a census table keyed by 'ctid' and keep only
    the tracts whose area overlaps the target municipalities by at least the
    threshold fraction.

    Parameters:
    - gdf_ct_gta: GeoDataFrame with census tract geometries and a 'geosid' column
    - df_ct_cen: DataFrame with census data and a 'ctid' column
    - intersection_threshold: minimum fraction of a tract's area that must fall
      inside the target municipalities (default 0.12, i.e. 12%)

    Returns:
    - New GeoDataFrame with the census columns joined onto the geometries
      (left join on 'ctid'), filtered by the overlap threshold. The input frames
      are not modified.
    """
    # Work on copies so the caller's frames are left untouched.
    tracts = gdf_ct_gta.copy()
    tracts['geosid'] = tracts['geosid'].astype(str)
    # The join key is the geosid with its first three characters removed.
    # NOTE(review): presumably those three characters are a region prefix -- confirm.
    tracts['ctid'] = tracts['geosid'].str[3:]

    census = df_ct_cen.copy()
    # format_ctid stringifies values itself; calling it on the raw values (instead
    # of after astype(str)) lets its missing-value guard fire, so NaN stays NaN
    # rather than turning into the bogus key '0nan.00'.
    census['ctid'] = census['ctid'].apply(format_ctid)

    # Left join keeps every tract geometry even if it has no census row.
    gdf_full = tracts.merge(census, on='ctid', how='left')

    # Fraction of each tract's area lying inside the target municipalities.
    overlap_ratio = gdf_full.geometry.apply(calculate_intersection)

    return gdf_full[overlap_ratio >= intersection_threshold].copy()

In [69]:
def get_num_speakers(gdf_full, year):
    """
    Select the speaker-count columns for one census year.

    Parameters:
    - gdf_full: joined frame containing 'geosid', 'geometry' and the census
      columns named in CENSUS_CODES[year]
    - year: key into CENSUS_CODES

    Returns:
    - Copy with 'geosid', 'geometry', one column per CENSUS_CODES[year] entry that
      has at least one census code (taken from the first code; named 'num_tot'
      for the total and 'num_<key>' otherwise), plus 'num_not_eng'
      (= num_tot - num_eng) inserted at column position 3.
    """
    counts = gdf_full[['geosid', 'geometry']].copy()

    for lang_key, census_cols in CENSUS_CODES[year].items():
        # Entries without any census code are skipped.
        if not census_cols:
            continue

        # The total keeps its own name; every other key gets the 'num_' prefix.
        out_col = lang_key if lang_key == 'num_tot' else f'num_{lang_key}'

        # Only the first listed census code is used.
        counts.loc[:, out_col] = gdf_full.loc[:, census_cols[0]]

    non_english = counts['num_tot'] - counts['num_eng']
    counts.insert(3, 'num_not_eng', non_english)

    return counts

In [70]:
def get_pct_speakers(gdf_num_speakers, year):
    """
    Convert per-language speaker counts into fractions of the tract total.

    Parameters:
    - gdf_num_speakers: frame with 'geosid', 'geometry', 'num_tot' and the
      'num_<key>' count columns for the given year
    - year: key into CENSUS_CODES

    Returns:
    - Copy with 'geosid', 'geometry', one 'pct_<key>' column (num_<key> / num_tot)
      per CENSUS_CODES[year] entry that has census codes (the total excluded),
      the original 'num_tot', and 'pct_not_eng' (= 1 - pct_eng) inserted at
      column position 3.
    """
    shares = gdf_num_speakers[['geosid', 'geometry']].copy()
    totals = gdf_num_speakers['num_tot']

    # Keys that have census codes, leaving out the total itself.
    lang_keys = [
        key
        for key, census_cols in CENSUS_CODES[year].items()
        if census_cols and key != 'num_tot'
    ]
    for key in lang_keys:
        shares[f'pct_{key}'] = gdf_num_speakers[f'num_{key}'] / totals

    # Carry the denominator along with the fractions.
    shares['num_tot'] = totals

    shares.insert(3, 'pct_not_eng', 1 - shares['pct_eng'])

    return shares

In [73]:
# Years whose census table is stored in the `census_chass_*` file (keyed by 'ctid')
# rather than the `census_wide_*` file (keyed by 'geosid').
CHASS_YEARS = (1996, 2001)

for year in tqdm(CENSUS_YEARS):
    uses_chass = year in CHASS_YEARS

    # Load data. Only the census file that is actually needed for this year is read.
    gdf_ct_gta = gpd.read_file(f"../data/geo/{year}_ct/ct_gta_{year}.gpkg")
    census_source = 'chass' if uses_chass else 'wide'
    df_ct_cen = pd.read_csv(f"../data/census/{year}_ct_wide/census_{census_source}_{year}_ct.csv")

    # Join geometric data, using the join that matches the table's key column.
    join_fn = join_geo_ct_chass if uses_chass else join_geo_ct
    gdf_full = join_fn(gdf_ct_gta, df_ct_cen)

    # Compute number speakers per language (ie. select columns)
    gdf_num_speakers = get_num_speakers(gdf_full, year)

    # Compute percent speakers per language (ie. divide by total)
    gdf_pct_speakers = get_pct_speakers(gdf_num_speakers, year)

    # Create the per-year output folder if needed (no error when it already exists).
    out_dir = f'../data/language/{year}'
    os.makedirs(out_dir, exist_ok=True)

    gdf_num_speakers.to_file(f'{out_dir}/num_speakers_tmun_{year}.gpkg', driver='GPKG')
    gdf_pct_speakers.to_file(f'{out_dir}/pct_speakers_tmun_{year}.gpkg', driver='GPKG')

100%|██████████| 1/1 [00:09<00:00,  9.33s/it]
