In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import os
from scipy import sparse
from fastprogress.fastprogress import master_bar, progress_bar



In [2]:
# Model parameters live in 'parameter.csv'; rows are looked up by the
# 'variable' column and the setting itself is read from the 'value' column.
# NOTE: a later cell of this notebook reassigns min_travel_t / max_travel_t
# to hard-coded values, which then take precedence over the ones read here.
parameters = pd.read_csv('parameter.csv').set_index('variable')
min_travel_t = parameters.loc['min_travel_t', 'value']
max_travel_t = parameters.loc['max_travel_t', 'value']

In [3]:
def load_sociodemo(path_geometry,sociodemo):
    '''
    This function loads the zones of one city and attaches the population
    shares by migration background to them.

    The zones read from ``path_geometry`` are inner-joined with ``sociodemo``
    on 'Postcode'. The counts N_NL, N_WE and N_NW are multiplied by the zone's
    'area_ratio' and rounded, summed into 'pop', and turned into the shares
    P_NL, P_WE and P_NW (0 wherever 'pop' is not strictly positive). The count
    columns are dropped from the result. ``sociodemo`` itself is not modified.

           Parameters:
                    path_geometry (str): path of the file holding the zone geometries
                                         (read with geopandas.read_file).
                    sociodemo (pd.DataFrame): sociodemographic data with at least the
                                              columns 'Postcode', 'N_NL', 'N_WE' and 'N_NW'.
                                              The merged table must also contain 'area_ratio'
                                              (presumably provided by the zone file; verify
                                              against the data).

            Returns:
                    sociodemo: the zones merged with the sociodemographic data, with the
                               extra columns 'pop', 'P_NL', 'P_WE' and 'P_NW'.
    '''
    zones = gpd.read_file(path_geometry)

    # Presumably dropped to avoid a clash with a same-named column in
    # `sociodemo` during the merge -- TODO confirm.
    if 'gemeente' in zones.columns:
        zones = zones.drop(columns = 'gemeente')

    # merge() returns a new table, so the caller's `sociodemo` is never touched.
    merged = zones.merge(sociodemo, on = 'Postcode')

    groups = ['NL', 'WE', 'NW']
    count_cols = ['N_' + group for group in groups]

    # Scale the counts to the part of the postcode covered by the zone.
    for col in count_cols:
        merged[col] = (merged[col] * merged['area_ratio']).round()

    merged['pop'] = merged['N_NL'] + merged['N_WE'] + merged['N_NW']

    # Build the shares directly as float columns. Creating an integer column of
    # zeros and overwriting it with floats through .loc relies on an implicit
    # upcast that recent pandas versions deprecate.
    has_pop = merged['pop'] > 0
    for group in groups:
        merged['P_' + group] = (merged['N_' + group] / merged['pop']).where(has_pop, 0.0)

    return merged.drop(columns = count_cols)

In [4]:
def compute_cov_mat(shortest_path,sociodemo,min_t=None):
    '''
    This function computes the covariance matrix of exposure to people with a non-western migration background.

    Every origin zone contributes to the exposure of a destination zone with
    the weight pop(origin) * proximity(origin, destination), where
    proximity = min_t**2 / max(walk_t, min_t)**2. The weights are normalised so
    that they sum to one per destination zone. With W the resulting
    (origin x destination) matrix, the function returns W' W multiplied by the
    sample variance of 'P_NW' over the populated zones that appear both as
    origin and as destination in ``shortest_path``.

    Neither ``shortest_path`` nor ``sociodemo`` is modified.

           Parameters:
                    shortest_path (pd.DataFrame): travel times between zones, with the columns
                                                  'from_id_unit', 'to_id_unit' and 'walk_t'.
                                                  Each (from, to) pair must appear only once,
                                                  otherwise the pivot step raises a ValueError.
                    sociodemo (pd.DataFrame): sociodemographic data of the zones, with the
                                              columns 'id_unit', 'pop' and 'P_NW'.
                    min_t (number, optional): lower bound applied to the travel times. When
                                              omitted, the notebook-level `min_travel_t` is used.

            Returns:
                    cov_matrix (pd.DataFrame): Covariance matrix where the columns and rows are zones of interest
                                               (the 'to_id_unit' values).
                                               The cell contains the covariance of the exposure in two zones.
    '''
    if min_t is None:
        # Backward-compatible default: the setting defined at notebook level.
        min_t = min_travel_t

    # Travel times below the minimum are raised to the minimum, which caps the
    # proximity at 1. Only derived objects are built here, so the caller's
    # frame keeps its original 'walk_t' and gets no extra column.
    walk_t = shortest_path['walk_t'].clip(lower = min_t)
    pairs = shortest_path[['from_id_unit','to_id_unit']].assign(proximity = min_t**2 / walk_t**2)

    # Weights of each zone on the exposure of the other zones.
    weight_av = pairs.merge(sociodemo[['id_unit','pop']],
                            left_on = 'from_id_unit',
                            right_on = 'id_unit').drop(columns = 'id_unit')
    weight_av['weight_var'] = weight_av['pop'] * weight_av['proximity']

    # Total weights for a given zone. (The weights are normalized by the total weight).
    denom = (weight_av.groupby(by = 'to_id_unit')['weight_var']
                      .sum()
                      .rename('denom')
                      .reset_index())
    weight_av = weight_av.merge(denom, on = 'to_id_unit')

    # Float column built in one step; destinations whose total weight is not
    # positive get a weight of 0 instead of a division artefact.
    weight_av['weight_var_norm'] = (weight_av['weight_var'] / weight_av['denom']).where(weight_av['denom'] > 0, 0.0)

    # Matrix format. Pairs that are absent from the table carry no weight.
    weight_mat = weight_av.pivot(index = 'from_id_unit',
                                 columns = 'to_id_unit',
                                 values = 'weight_var_norm').fillna(0)

    # Definition of the covariance matrix.
    cov_matrix = weight_mat.transpose().dot(weight_mat)

    # The variance is taken over populated zones that are part of the network
    # both as origin and as destination.
    in_network = ((sociodemo['id_unit'].isin(shortest_path['from_id_unit'])) &
                  (sociodemo['id_unit'].isin(shortest_path['to_id_unit'])) &
                  (sociodemo['pop'] > 0))

    return cov_matrix * sociodemo.loc[in_network, 'P_NW'].var()

In [10]:
# Input and output folders, relative to the notebook.
dir_sociodemo = '../data/processed_data/sociodemographics/'
dir_geometry_edited = '../data/processed_data/zones_delineation/edited/'
dir_geometry = '../data/processed_data/zones_delineation/'
dir_shortest_path = '../data/processed_data/shortest_path/'
dir_cov_mat = '../data/processed_data/covariance_matrix/'
dir_gemeente = '../data/processed_data/city_boundary/'

# NOTE: these hard-coded values replace the ones read from 'parameter.csv'
# earlier in the notebook; compute_cov_mat picks up min_travel_t from here.
min_travel_t = 60
max_travel_t = 1200
parameter = min_travel_t**2

sociodemo_raw = pd.read_csv(dir_sociodemo + 'sociodemographics.csv')

# One '<city>.gpkg' file per city. Filtering on the extension skips the README
# (and any other stray file) without failing when it is absent; 'Buitenland'
# is excluded explicitly. Sorting makes the processing order deterministic.
list_files = sorted(name for name in os.listdir(dir_gemeente)
                    if name.endswith('.gpkg') and name != 'Buitenland.gpkg')

pb = progress_bar(list_files)

for file in pb:
    city = file[:-5]  # strip the '.gpkg' extension
    pb.comment = city

    # Skip the cities whose two output files already exist.
    if os.path.isfile(dir_cov_mat + city + '_cov_mat.npz') and os.path.isfile(dir_cov_mat + city + 'cov_mat_axis.csv'):
        continue

    # Amsterdam has two pieces.
    if city == 'Amsterdam':
        sociodemo_1 = load_sociodemo(dir_geometry_edited + 'PC_' + city + '_1.gpkg', sociodemo_raw)
        sociodemo_2 = load_sociodemo(dir_geometry_edited + 'PC_' + city + '_2.gpkg', sociodemo_raw)
        sociodemo = pd.concat([sociodemo_1, sociodemo_2], ignore_index=True)

    # The manually edited delineation takes precedence over the default one.
    elif os.path.isfile(dir_geometry_edited + 'PC_' + file):
        sociodemo = load_sociodemo(dir_geometry_edited + 'PC_' + file, sociodemo_raw)

    elif os.path.isfile(dir_geometry + 'PC_' + file):
        sociodemo = load_sociodemo(dir_geometry + 'PC_' + file, sociodemo_raw)

    else:
        # Without this branch the zones of the previous city would silently be
        # reused for this one (or a NameError raised on the first iteration).
        raise FileNotFoundError('No zone delineation file PC_' + file + ' found in '
                                + dir_geometry_edited + ' or ' + dir_geometry)

    shortest_path = pd.read_csv(dir_shortest_path + city + '_walk_t_unit_to_unit.csv')
    shortest_path = shortest_path.drop_duplicates()
    cov_mat = compute_cov_mat(shortest_path,sociodemo)

    # The zone ids labelling the rows and columns are stored next to the
    # matrix, because the sparse file only keeps the values.
    axis_cov_mat = pd.DataFrame({'rows' : cov_mat.index.values,
                                 'columns': cov_mat.columns.values})
    axis_cov_mat.to_csv(dir_cov_mat + city + 'cov_mat_axis.csv', index = False)

    cov_mat = sparse.coo_matrix(cov_mat.to_numpy())
    sparse.save_npz(dir_cov_mat + city + '_cov_mat.npz', cov_mat)

In [6]:

# Single-city run of the covariance computation, handy for checking one
# municipality. It relies on the dir_* folders, sociodemo_raw and the two
# functions defined in the other cells of this notebook.
file = 'Huizen.gpkg'
city = file[:-5]  # strip the '.gpkg' extension

# Amsterdam has two pieces.
if city == 'Amsterdam':
    sociodemo_1 = load_sociodemo(dir_geometry_edited + 'PC_' + city + '_1.gpkg', sociodemo_raw)
    sociodemo_2 = load_sociodemo(dir_geometry_edited + 'PC_' + city + '_2.gpkg', sociodemo_raw)
    sociodemo = pd.concat([sociodemo_1, sociodemo_2], ignore_index=True)

# The manually edited delineation takes precedence over the default one.
elif os.path.isfile(dir_geometry_edited + 'PC_' + file):
    sociodemo = load_sociodemo(dir_geometry_edited + 'PC_' + file, sociodemo_raw)

elif os.path.isfile(dir_geometry + 'PC_' + file):
    sociodemo = load_sociodemo(dir_geometry + 'PC_' + file, sociodemo_raw)

else:
    # Fail loudly instead of reusing a `sociodemo` left over in the kernel
    # from another city (or hitting a NameError on a fresh kernel).
    raise FileNotFoundError('No zone delineation file PC_' + file + ' found in '
                            + dir_geometry_edited + ' or ' + dir_geometry)


shortest_path = pd.read_csv(dir_shortest_path + city + '_walk_t_unit_to_unit.csv')
shortest_path = shortest_path.drop_duplicates()
cov_mat = compute_cov_mat(shortest_path,sociodemo)

# The zone ids labelling the rows and columns are stored next to the matrix,
# because the sparse file only keeps the values.
axis_cov_mat = pd.DataFrame({'rows' : cov_mat.index.values,
                             'columns': cov_mat.columns.values})
axis_cov_mat.to_csv(dir_cov_mat + city + 'cov_mat_axis.csv', index = False)
cov_mat = cov_mat.to_numpy()
cov_mat = sparse.coo_matrix(cov_mat)
sparse.save_npz(dir_cov_mat + city + '_cov_mat.npz', cov_mat)