In [1]:
import os

import numpy as np
import pandas as pd
import geopandas as gpd
from tqdm import tqdm

In [10]:
# Lookup table: census year -> {logical quantity -> column code in that year's
# wide census table}.
#
# Keys of each inner dict:
#   'num_pop_tot' -- total population column (used as the denominator of the
#                    proportions computed in this notebook)
#   'num_imm_tot' -- immigrant total column
#   'num_jew_tot' -- Jewish ethnic-origin column
#   'num_chn_tot' -- Chinese ethnic-origin column
#
# An empty string means no column is mapped for that year; consumers test the
# value for truthiness and skip the year when it is empty.
#
# NOTE(review): the codes are copied verbatim and their prefixes/suffixes are not
# uniform across years (e.g. 'eth_' vs 'ethm', 'tt1' vs 'tt3') -- presumably this
# follows the source data dictionary; verify before "normalising" any of them.
YEAR_CODES = {
    1951: {
        'num_pop_tot': 'agec_tot1951ttd',
        'num_imm_tot': '',
        'num_jew_tot': 'eth_euroothrjewi1951tt1',
        'num_chn_tot': 'eth_asia_tot1951tt1',  # Asian, not Chinese -- but Chinese are a majority in 1951
    }, 
    1956: {
        'num_pop_tot': 'agec_tot1956ttd',
        'num_imm_tot': '',
        'num_jew_tot': '',
        'num_chn_tot': '',
    }, 
    1961: {
        'num_pop_tot': 'agec_tot1961ttd',
        'num_imm_tot': 'imb__tot1961ttd',
        'num_jew_tot': 'eth_euroothrjewi1961tt1',
        'num_chn_tot': 'eth_asiaeastchin1961tt1',
    }, 
    1966: {
        'num_pop_tot': 'agec_tot1966ttd',
        'num_imm_tot': '',
        'num_jew_tot': '',
        'num_chn_tot': '',
    }, 
    1971: {
        'num_pop_tot': 'agec_tot1971ttd',
        'num_imm_tot': 'imb__tot1971ttd',
        'num_jew_tot': 'eth_euroothrjewi1971tt1',
        'num_chn_tot': 'eth_asiaeastchin1971tt1',
    }, 
    1976: {
        'num_pop_tot': 'agec_tot1976ttd',
        'num_imm_tot': '',
        'num_jew_tot': '',
        'num_chn_tot': '',
    }, 
    1981: {
        'num_pop_tot': 'agec_tot1981ttd',
        'num_imm_tot': 'imag_tot1981ttd',  # NOTE(review): 'imag_' prefix, most other years use 'imb__' -- confirm
        'num_jew_tot': '',
        'num_chn_tot': '',
    }, 
    1986: {
        'num_pop_tot': 'agec_tot1986ttd',
        'num_imm_tot': 'imb__tot1986ttd',
        'num_jew_tot': 'eth_euroothrjewi1986tt1',
        'num_chn_tot': 'eth_asiaeastchin1986tt1',
    }, 
    1991: {
        'num_pop_tot': 'agec_tot1991ttd',
        'num_imm_tot': 'imd__tot1991ttd',  # NOTE(review): 'imd__' prefix, most other years use 'imb__' -- confirm
        'num_jew_tot': 'eth_euroothrjewi1991tt1',
        'num_chn_tot': 'eth_asiaeastchin1991tt1',
    }, 
    1996: {
        'num_pop_tot': 'agec_tot1996ttd',
        'num_imm_tot': 'imb__tot1996ttd',
        'num_jew_tot': 'ethmeuroothrjewi1996tt1',
        'num_chn_tot': 'ethmasiaeastchin1996tt1',
    }, 
    2001: {
        'num_pop_tot': 'agec_tot2001ttd',
        'num_imm_tot': 'imb__tot2001ttd',
        'num_jew_tot': 'ethmeuroothrjewi2001tt1',
        'num_chn_tot': 'ethmasiaeastchin2001tt1',
    }, 
    2006: {
        'num_pop_tot': 'agec_tot2006ttd',
        'num_imm_tot': 'imb__tot2006ttd',
        'num_jew_tot': 'ethmeuroothrjewi2006tt3',
        'num_chn_tot': 'ethmasiaeastchin2006tt2',  # NOTE(review): ends in 'tt2' while the Jewish code for 2006 ends in 'tt3' -- confirm
    }, 
    2011: {
        'num_pop_tot': 'agec_tot2011ttd',
        'num_imm_tot': 'imb__tot2011ttd',
        'num_jew_tot': 'ethmeuroothrjewi2011tt3',
        'num_chn_tot': 'ethmasiaeastchin2011tt3',
    }, 
    2016: {
        'num_pop_tot': 'agec_tot2016ttd',
        'num_imm_tot': 'imb__tot2016ttd',
        'num_jew_tot': 'ethmeuroothrjewi2016tt3',
        'num_chn_tot': 'ethmasiaeastchin2016tt3',
    }, 
    2021: {
        'num_pop_tot': 'agec_tot2021ttd',
        'num_imm_tot': 'imb__tot2021ttd',
        'num_jew_tot': 'ethmeuroothrjewi2021tt1',
        'num_chn_tot': 'ethmasiaeastchin2021tt1',
    },
}

In [32]:
# Prototype on a single census year before running the export over every year.
year = 1981

# Wide census table for that year. 'geosid' is forced to text so the ids are
# not parsed as numbers (they look like '0010001.00' in the merged output).
df_cen = pd.read_csv(
    f"../data/census/{year}_ct_wide/census_wide_{year}_ct.csv",
    dtype={'geosid': str},
)

# Geometry layer for the same year, read from the zipped shapefile.
gdf_cur_geo = gpd.read_file(
    f"../data/geo/{year}_ct_shp/ct_{year}.zip"
)

In [43]:
# Column codes for the prototype year: population total (denominator) and
# immigrant total (numerator).
tag_total = YEAR_CODES[year]['num_pop_tot']
tag_part = YEAR_CODES[year]['num_imm_tot']

# Keep only the identifying columns plus the two counts, on an independent copy,
# and append the immigrant share as 'imm_prop'.
df_cur_cen = (
    df_cen[['geosid', 'geopart', tag_total, tag_part]]
    .copy(deep=True)
    .assign(imm_prop=lambda frame: frame[tag_part] / frame[tag_total])
)


In [55]:
# Preview the attribute join: geometry on the left, census values on the right.
# The outer join keeps ids that appear on only one side (with missing values).
gdf_cur_geo.merge(df_cur_cen, on="geosid", how='outer')

Unnamed: 0,geosid,time,level,geoname,lat_c,lon_c,lat_r,lon_r,areakm,source,timestamp,version,geometry,geopart,agec_tot1981ttd,imag_tot1981ttd,imm_prop
0,0010001.00,1981,1.0,001.00,47.541905,-52.700848,47.539509,-52.707117,10.410335,StatCan,Sun Oct 23 20:35:20 2022,1.0,"POLYGON ((-52.68954 47.53004, -52.69011 47.529...",,2340,15,0.006410
1,0010002.00,1981,1.0,002.00,47.543926,-52.729141,47.540174,-52.731720,1.979298,StatCan,Sun Oct 23 20:35:20 2022,1.0,"POLYGON ((-52.71822 47.54844, -52.71717 47.548...",,6625,220,0.033208
2,0010003.00,1981,1.0,003.00,47.537304,-52.750281,47.537665,-52.754373,3.608936,StatCan,Sun Oct 23 20:35:20 2022,1.0,"POLYGON ((-52.74120 47.52964, -52.74165 47.529...",,8140,345,0.042383
3,0010004.00,1981,1.0,004.00,47.554360,-52.766477,47.554116,-52.763585,9.324178,StatCan,Sun Oct 23 20:35:20 2022,1.0,"POLYGON ((-52.74217 47.56288, -52.74025 47.560...",,6580,195,0.029635
4,0010005.01,1981,1.0,005.01,47.553756,-52.732434,47.553069,-52.734042,1.072092,StatCan,Sun Oct 23 20:35:20 2022,1.0,"POLYGON ((-52.72475 47.55260, -52.72568 47.553...",,3435,50,0.014556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3314,9700019.00,1981,1.0,019.00,53.945858,-122.737133,53.946573,-122.735988,13.083714,StatCan,Sun Oct 23 20:35:20 2022,1.0,"POLYGON ((-122.71563 53.96250, -122.71403 53.9...",,1500,230,0.153333
3315,9700020.00,1981,1.0,020.00,53.961407,-122.783337,53.960020,-122.789842,12.526351,StatCan,Sun Oct 23 20:35:20 2022,1.0,"POLYGON ((-122.78685 53.98955, -122.78490 53.9...",,3320,320,0.096386
3316,9700021.00,1981,1.0,021.00,53.959558,-122.829140,53.954890,-122.815389,7.870314,StatCan,Sun Oct 23 20:35:20 2022,1.0,"POLYGON ((-122.85022 53.98118, -122.85024 53.9...",,2360,290,0.122881
3317,9700022.00,1981,1.0,022.00,54.005916,-122.819425,54.000055,-122.825697,29.778238,StatCan,Sun Oct 23 20:35:20 2022,1.0,"POLYGON ((-122.76043 54.03953, -122.79730 54.0...",,2020,195,0.096535


In [3]:
# Make sure the per-year output folders exist before any file is written.
# exist_ok=True makes the call idempotent and avoids the check-then-create race
# of testing os.path.exists() first.
for year in tqdm(YEAR_CODES.keys()):
    for topic in ('immigration', 'ethnicity'):
        out_dir = f'../data/{topic}/{year}'
        os.makedirs(out_dir, exist_ok=True)

100%|████████████████████████████████████████| 15/15 [00:00<00:00, 10997.13it/s]


In [56]:
def get_target_prop(year, df_cen, tag_total, tag_part, tag_prop_name, gdf_geo=None):
    """Attach a part/total proportion to the geometry layer of a census year.

    Args:
        year: Census year; only used to build the geometry file path when
            ``gdf_geo`` is not supplied.
        df_cen: Census table containing the columns 'geosid', 'geopart',
            ``tag_total`` and ``tag_part``. It is not modified.
        tag_total: Name of the denominator column.
        tag_part: Name of the numerator column.
        tag_prop_name: Name of the proportion column to create.
        gdf_geo: Optional already-loaded geometry frame with a 'geosid' column.
            When None (the default) the layer is read from
            ``../data/geo/{year}_ct_shp/ct_{year}.zip``. Passing it lets a caller
            reuse one loaded layer for several proportions of the same year.

    Returns:
        The outer merge on 'geosid' of the geometry frame (left) and the selected
        census columns plus the new proportion column (right). Ids present on
        only one side are kept, with missing values for the other side.

    Raises:
        KeyError: If any of the requested columns is missing from ``df_cen``.
    """
    df_cur_cen = df_cen[['geosid', 'geopart', tag_total, tag_part]].copy(deep=True)

    # A zero total would produce inf (or NaN for 0/0) as a "proportion"; mask
    # zero denominators so the share is uniformly NaN when it is undefined.
    denominator = df_cur_cen[tag_total].where(df_cur_cen[tag_total] != 0)
    df_cur_cen[tag_prop_name] = df_cur_cen[tag_part] / denominator

    if gdf_geo is None:
        gdf_geo = gpd.read_file(f"../data/geo/{year}_ct_shp/ct_{year}.zip")

    # Geometry frame stays on the left so the merge result keeps its type.
    return pd.merge(gdf_geo, df_cur_cen, on="geosid", how='outer')

In [16]:
# Each export is described by:
#   (YEAR_CODES key of the numerator column, output sub-directory under ../data,
#    name of the proportion column, which is also the output file prefix)
prop_targets = [
    ('num_imm_tot', 'immigration', 'imm_prop'),
    # Ethnicity exports are currently switched off; uncomment to write them too.
    # NOTE(review): the earlier inline version of these two used an inner merge,
    # whereas get_target_prop() does an outer merge -- confirm which is wanted.
    # ('num_jew_tot', 'ethnicity', 'jew_prop'),
    # ('num_chn_tot', 'ethnicity', 'chn_prop'),
]

for year in tqdm(YEAR_CODES.keys()):
    code_pop_tot = YEAR_CODES[year]['num_pop_tot']

    # Keep only the exports whose numerator column is mapped for this year
    # (an empty code means the column is not available).
    active_targets = [
        (YEAR_CODES[year][code_key], topic, prop_name)
        for code_key, topic, prop_name in prop_targets
        if YEAR_CODES[year][code_key]
    ]
    if not (code_pop_tot and active_targets):
        # Nothing to export for this year: skip before reading the census CSV.
        continue

    df_cen = pd.read_csv(f"../data/census/{year}_ct_wide/census_wide_{year}_ct.csv", dtype={'geosid': str})

    for code_part, topic, prop_name in active_targets:
        f_name = f'../data/{topic}/{year}/{prop_name}_{year}.gpkg'
        get_target_prop(year, df_cen, code_pop_tot, code_part, prop_name).to_file(f_name, driver='GPKG')

100%|███████████████████████████████████████████| 15/15 [00:25<00:00,  1.67s/it]
