In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
from datetime import datetime,timedelta
import os
from fastprogress.fastprogress import master_bar, progress_bar
# Make pandas raise (instead of only warning) on chained assignment,
# so accidental writes to a temporary copy cannot go unnoticed.
pd.options.mode.chained_assignment = 'raise'



# Geographic pre-processing

In [2]:
# Parameter table, indexed by its 'variable' column so values can be looked up by name.
parameters = pd.read_csv('parameter.csv').set_index('variable')

# Threshold read from the 'value' column of the 'ratio_in_gemeente' row.
ratio_in_gemeente = parameters.loc['ratio_in_gemeente', 'value']

In [3]:
# Postcode (PC6) layer: only the postcode identifier and its geometry are kept.
pc6_path = '../data/raw_data/sociodemographics/PC6/CBS_PC6_2017_v3.shp'
zones = gpd.read_file(pc6_path)[['Postcode', 'geometry']]

# Municipality (gemeente) polygons, read from the 'gemeente_2017_v3' layer of the GeoPackage.
gemeente_path = '../data/raw_data/sociodemographics/Gemeente/WijkBuurtkaart_2017_v3.gpkg'
gemeenten = gpd.read_file(gemeente_path, layer='gemeente_2017_v3')

In [4]:
# Normalise the municipality names: quotes and parentheses are stripped,
# spaces and hyphens become underscores. The substitutions are applied one
# after the other, in this order.
# Raw strings are used so that the backslashes reach the regex engine
# unchanged (a plain '\(' is an invalid Python escape sequence).
# NOTE(review): in r'.\)' the leading '.' is a regex wildcard, so the character
# preceding ')' is removed whatever it is. The pattern is kept as-is so that
# the resulting names do not change.
name_substitutions = (
    (r"'", ''),
    (r'\(', ''),
    (r'.\)', ''),
    (r' ', '_'),
    (r'-', '_'),
)
for pattern, replacement in name_substitutions:
    gemeenten['GM_NAAM'] = gemeenten['GM_NAAM'].str.replace(pattern, replacement, regex=True)

# Rows sharing the same cleaned name are merged into a single geometry.
gemeenten = gemeenten.dissolve(by='GM_NAAM').reset_index()

gemeenten = gemeenten.rename(columns={'GM_NAAM': 'gemeente'})[['gemeente', 'geometry']]

In [5]:
# Surface of each complete postcode, carried through the overlay as an attribute.
zones = zones.assign(surface_postcode=zones.area)

# Cut the postcodes along the gemeente boundaries.
pieces = gpd.overlay(zones, gemeenten, how='intersection', keep_geom_type=True)

# Share of the postcode surface that lies inside the intersected gemeente.
# A (postcode, gemeente) pair is kept only when this share exceeds the threshold.
share_inside = pieces.area / pieces['surface_postcode']
zones_overlay = pieces.loc[share_inside > ratio_in_gemeente]
zones_overlay = zones_overlay[['Postcode', 'gemeente', 'geometry']]

In [6]:
# Attach the gemeente to every postcode. The merge is an inner join, so postcodes
# without a retained gemeente are dropped. The helper surface column is no longer needed.
postcode_to_gemeente = zones_overlay[['Postcode', 'gemeente']]
zones = zones.merge(postcode_to_gemeente, on='Postcode').drop(columns='surface_postcode')

In [7]:
def area_building_footprint_res(buildings, zones):

    '''
    This function:
        - Breaks discontinuous postcodes into continuous spatial units.
        - Determines the centres of these units if they have residential buildings in it.
        - Computes the area_ratio, indicating how to allocate data from the postcodes to the spatial units.

    A building is treated as residential when its 'building' attribute differs from 'yes',
    and as non-residential when it equals 'yes'.

    Only the postcodes intersecting at least one residential building end up in
    zones_merged and centroids; zones_split contains every spatial unit.

           Parameters:
                    buildings (gpd.GeoDataFrame): building layout, with a 'building' column.
                    zones (gpd.GeoDataFrame): Postcodes, with 'Postcode' and 'gemeente' columns.

            Returns:
                    zones_split (gpd.GeoDataFrame): Spatial units deriving from postcodes, identified by 'id_unit'.
                    zones_merged (gpd.GeoDataFrame): Spatial units deriving from postcodes, with area_ratio.
                    centroids (gpd.GeoDataFrame): Centres of the spatial units.
    '''

    zones = zones.copy()

    def _centroids_of(units):
        # One centre per spatial unit, keyed by 'id_unit'.
        # The unnamed centroid series becomes column 0 after reset_index,
        # hence the rename to 'geometry'.
        points = units.set_index('id_unit').centroid.reset_index()
        points = points.rename(columns={0: 'geometry'})
        return points.set_crs('EPSG:28992')

    # Filtering residential buildings from non-residential ones.
    buildings_res = buildings.loc[buildings['building'] != 'yes']
    buildings_nres = buildings.loc[buildings['building'] == 'yes']

    # Determining the footprint of residential buildings in zones.
    # If a postcode is composed of two disjoint areas,
    # the footprint is used to allocate the population between the two disjoint areas.
    zones_res = gpd.overlay(zones, buildings_res, how='intersection')
    zones_res = zones_res.dissolve(by='Postcode')
    zones_res = zones_res.reset_index().loc[:, ['Postcode',
                                                'gemeente',
                                                'geometry']]
    zones_res['building_footprint_tot'] = zones_res.area

    # Breaking postcodes into smaller zones, if they are composed of several disjoint areas.
    # index_parts = True does not work. Providing unique id manually instead.
    zones_split = zones.explode(index_parts=False)

    # Defining an id for these new zones: the parts of a postcode are numbered
    # 0, 1, 2, ... in row order, and the number is appended to the postcode.
    # cumcount does this in a single pass. to_numpy() avoids any index alignment,
    # since the exploded frame has repeated index labels.
    part_number = zones_split.groupby('Postcode', sort=False, dropna=False).cumcount()
    zones_split['id_unit'] = part_number.to_numpy()
    zones_split['id_unit'] = zones_split['Postcode'] + zones_split['id_unit'].astype(str)

    # The footprint of the buildings is computed again for the newly formed areas.
    zones_res_split = gpd.overlay(zones_split,
                                  buildings_res,
                                  how='intersection')
    zones_res_split = zones_res_split.dissolve(by='id_unit')
    zones_res_split = zones_res_split.reset_index().loc[:, ['id_unit',
                                                            'Postcode',
                                                            'gemeente',
                                                            'geometry']]
    zones_res_split['building_footprint'] = zones_res_split.area

    # Adding the footprint of the entire postcode to the new data.
    zones_res_split = zones_res_split.merge(zones_res[['Postcode',
                                                       'building_footprint_tot']],
                                            on='Postcode')

    # Computing the ratio of the footprint of the postcode lying in the newly formed areas.
    # It is used to allocate the demographics of the postcode to the smaller areas.
    zones_res_split['area_ratio'] = (zones_res_split['building_footprint']
                                     / zones_res_split['building_footprint_tot'])

    # The centres of the zones are defined using the residential building footprint.
    centroids_res = _centroids_of(zones_res_split)

    # Masks over the spatial units, reused for the two fallbacks below.
    in_res_postcode = zones_split['Postcode'].isin(zones_res['Postcode'])
    has_res_footprint = zones_split['id_unit'].isin(zones_res_split['id_unit'])

    # Some of the newly formed areas have no residential building in them.
    # In this case, the centres of the zones are defined using
    # the non-residential building footprint.
    zones_nres_split = zones_split.loc[in_res_postcode & ~has_res_footprint]
    # Since no residential building lies in the zone, the ratio is set to 0.
    zones_nres_split = zones_nres_split.assign(area_ratio=0)
    zones_nres_split = gpd.overlay(zones_nres_split, buildings_nres, how='intersection')
    zones_nres_split = zones_nres_split.dissolve(by='id_unit')
    zones_nres_split = zones_nres_split.reset_index().loc[:, ['id_unit',
                                                              'Postcode',
                                                              'area_ratio',
                                                              'gemeente',
                                                              'geometry']]
    centroids_nres = _centroids_of(zones_nres_split)

    # Some of the newly formed areas have no building at all.
    # In this case, the centres of the zones are the geographic centres.
    has_nres_footprint = zones_split['id_unit'].isin(zones_nres_split['id_unit'])
    remainings = zones_split.loc[in_res_postcode & ~(has_res_footprint | has_nres_footprint)]
    remainings = remainings.assign(area_ratio=0)
    centroids_remainings = _centroids_of(remainings)

    # Aggregating the centroids of all newly formed areas.
    centroids = gpd.GeoDataFrame(pd.concat([centroids_res,
                                            centroids_nres,
                                            centroids_remainings],
                                           ignore_index=True),
                                 crs=centroids_res.crs)

    # One area_ratio per spatial unit, whatever the way its centre was obtained.
    zones_info = pd.concat([zones_res_split[['id_unit', 'area_ratio']],
                            zones_nres_split[['id_unit', 'area_ratio']],
                            remainings[['id_unit', 'area_ratio']]],
                           ignore_index=True)

    zones_merged = zones_split.merge(zones_info,
                                     on='id_unit')

    return zones_split, zones_merged, centroids

In [8]:
def area_building_footprint_nres(buildings, zones, zones_split):

    '''
    This function:
        - Determines the centres of spatial units if they have non-residential buildings in it.
        - Computes the area_ratio, indicating how to allocate data from the postcodes to these spatial units.

    Non-residential buildings are those whose 'building' attribute equals 'yes'.
    Only the spatial units of postcodes intersecting such a building are returned;
    within these postcodes, units without any such building get an area_ratio of 0
    and their geometric centre as centroid.

           Parameters:
                    buildings (gpd.GeoDataFrame): building layout.
                    zones (gpd.GeoDataFrame): postcodes.
                    zones_split (gpd.GeoDataFrame): spatial units.

            Returns:
                    zones_merged (gpd.GeoDataFrame): Spatial units with non-residential building, with area_ratio.
                    centroids (gpd.GeoDataFrame): Centres of spatial units with non-residential building.
    '''

    zones = zones.copy()

    # Only the spatial units belonging to the postcodes handed in are considered.
    in_scope = zones_split['Postcode'].isin(zones['Postcode'])
    zones_split = zones_split.loc[in_scope]

    # Extracting non-residential buildings.
    buildings_nres = buildings.loc[buildings['building'] == 'yes']

    # Footprint of these buildings per complete postcode.
    # It is the denominator of the area_ratio computed below.
    footprint_postcode = (gpd.overlay(zones, buildings_nres, how='intersection')
                          .dissolve(by='Postcode')
                          .reset_index()
                          .loc[:, ['Postcode', 'gemeente', 'geometry']])
    footprint_postcode['building_footprint_tot'] = footprint_postcode.area

    # Same footprint, this time per spatial unit.
    footprint_unit = (gpd.overlay(zones_split, buildings_nres, how='intersection')
                      .dissolve(by='id_unit')
                      .reset_index()
                      .loc[:, ['id_unit', 'Postcode', 'gemeente', 'geometry']])
    footprint_unit['building_footprint'] = footprint_unit.area

    # Bringing the postcode-wide footprint next to the unit footprint.
    footprint_unit = footprint_unit.merge(
        footprint_postcode[['Postcode', 'building_footprint_tot']],
        on='Postcode',
    )

    # Share of the postcode footprint lying in each unit.
    # It is used to allocate the demographics of the postcode to the smaller areas.
    footprint_unit['area_ratio'] = (footprint_unit['building_footprint']
                                    / footprint_unit['building_footprint_tot'])

    # The centres of these units are defined using the non-residential building footprint.
    footprint_unit = footprint_unit.set_index('id_unit')
    centroids_nres = (footprint_unit.centroid
                      .reset_index()
                      .rename(columns={0: 'geometry'})
                      .set_crs('EPSG:28992'))
    footprint_unit = footprint_unit.reset_index()

    # Units of the same postcodes that contain no building at all:
    # their centre is the geographic centre and they receive no demographics.
    in_built_postcode = zones_split['Postcode'].isin(footprint_postcode['Postcode'])
    has_footprint = zones_split['id_unit'].isin(footprint_unit['id_unit'])
    remainings = zones_split.loc[in_built_postcode & ~has_footprint]
    remainings = remainings.assign(area_ratio=0)
    remainings = remainings.set_index('id_unit')
    centroids_remainings = (remainings.centroid
                            .reset_index()
                            .rename(columns={0: 'geometry'})
                            .set_crs('EPSG:28992'))
    remainings = remainings.reset_index()

    # Aggregating the centroids of all the units.
    centroids = gpd.GeoDataFrame(
        pd.concat([centroids_nres, centroids_remainings], ignore_index=True),
        crs=centroids_nres.crs,
    )

    # One area_ratio per unit, then attached to the unit geometries.
    zones_info = pd.concat(
        [footprint_unit[['id_unit', 'area_ratio']],
         remainings[['id_unit', 'area_ratio']]],
        ignore_index=True,
    )
    zones_merged = zones_split.merge(zones_info, on='id_unit')

    return zones_merged, centroids

In [9]:
def geographic_preprocess(dir_building, zones, city, year):

    '''
    This function
        - Delineates spatial units from postcodes.
        - Determines their centres based on the building layout.
        - If postcodes are discontinuous, they are split into several spatial units,
            and area_ratio indicates the share of the demographics from the postcode that should
            be assigned to the spatial unit.

    The postcodes are handled in three groups, in this order:
        1. postcodes with residential buildings (ratio based on the residential footprint);
        2. among the others, postcodes with non-residential buildings
           (ratio based on the non-residential footprint);
        3. postcodes without any building (ratio based on the surface of the units).

           Parameters:
                    dir_building (str): directory for the buildings data; the file
                        'buildings_<city>.gpkg' is read from it.
                    zones (gpd.GeoDataFrame): postcodes, with 'Postcode' and 'gemeente' columns.
                    city (str): name of the city.
                    year (int): year of interest. Buildings whose start_date year is later
                        are discarded; buildings without a start_date are kept.

            Returns:
                    zones_city (gpd.GeoDataFrame): Spatial units.
                    centroids (gpd.GeoDataFrame): Centres of spatial units.
    '''

    # Selecting the zones inside the city of interest.
    zones_city = zones.loc[zones['gemeente'] == city].copy()

    # Taking the buildings lying within the city boundaries.
    # os.path.join gives the right path whether or not dir_building ends with a separator.
    buildings = gpd.read_file(os.path.join(dir_building, 'buildings_' + city + '.gpkg'))
    # The construction year is the first four digits of the "start_date" tag, when present.
    buildings['year'] = buildings['other_tags'].str.extract(r'"start_date"=>"(\d{4})').astype(float)
    buildings = buildings.drop(columns = 'other_tags')
    buildings = buildings.loc[(buildings['year']<= year) | (buildings['year'].isna())]

    # Computing the footprint of the residential buildings in spatial units.
    zones_split, zones_res, centroids_res = area_building_footprint_res(buildings, zones_city)

    # Postcodes not covered by the residential step.
    zones_nres = zones_city.loc[~zones_city['Postcode'].isin(zones_res['Postcode'])]

    # Computing the footprint of the non-residential buildings in these postcodes.
    zones_nres, centroids_nres = area_building_footprint_nres(buildings, zones_nres, zones_split)

    # Postcodes covered by neither of the two previous steps.
    zones_remainings = zones_city.loc[~((zones_city['Postcode'].isin(zones_res['Postcode'])) |
                                        (zones_city['Postcode'].isin(zones_nres['Postcode'])))]

    # Managing the remaining zones (that have no building in it):
    # the ratio is the surface of the unit over the surface of its postcode,
    # and the centre is the geographic centre of the unit.
    zones_remainings = zones_remainings.assign(area_tot = zones_remainings.area)
    zones_remainings = zones_split.merge(zones_remainings[['Postcode',
                                                           'area_tot']], on = 'Postcode')
    zones_remainings = zones_remainings.assign(area_ratio = zones_remainings.area / zones_remainings['area_tot'])
    zones_remainings = zones_remainings.set_index('id_unit')
    centroids_remainings = zones_remainings.centroid.reset_index()
    # The unnamed centroid series becomes column 0 after reset_index.
    centroids_remainings = centroids_remainings.rename(columns = {0:'geometry'})
    centroids_remainings = centroids_remainings.set_crs('EPSG:28992')
    zones_remainings = zones_remainings.reset_index()
    zones_remainings = zones_remainings.drop(columns = 'area_tot')

    # Concatenating everything.
    centroids = gpd.GeoDataFrame(pd.concat([centroids_res,
                                            centroids_nres,
                                            centroids_remainings],
                                           ignore_index=True),
                                 crs=centroids_res.crs)

    zones_city = gpd.GeoDataFrame(pd.concat([zones_res,
                                             zones_nres,
                                             zones_remainings],
                                            ignore_index=True),
                                  crs=zones_res.crs)

    return zones_city,centroids

In [10]:
# Input: one boundary file per city. Output: one zone-delineation file per city.
directory_city = '../data/processed_data/city_boundary/'
directory_pc_per_city = '../data/processed_data/zones_delineation/'
dir_building = '../data/raw_data/buildings/'

# Creates the output folder, together with any missing parent folder.
# Nothing happens when the folder already exists.
os.makedirs(directory_pc_per_city, exist_ok=True)

In [12]:
# Every GeoPackage of the boundary folder stands for one city. Any other file
# (such as the README) is ignored. Sorting makes the processing order reproducible.
list_files = sorted(name for name in os.listdir(directory_city)
                    if name.lower().endswith('.gpkg'))

pg = progress_bar(list_files)
for file in pg:
    # 'Buitenland.gpkg' is deliberately left out of the processing.
    if file == 'Buitenland.gpkg':
        continue
    # Removing the extension.
    city_name = os.path.splitext(file)[0]
    pg.comment = city_name

    # Cities whose output file already exists are not computed again.
    # NOTE(review): the file is considered complete as soon as it exists. If a run
    # is interrupted between the two to_file calls below, the 'centroid' layer
    # stays missing until the file is deleted by hand.
    output_path = os.path.join(directory_pc_per_city, 'PC_' + file)
    if os.path.isfile(output_path):
        continue

    zones_city = zones.loc[zones['gemeente'] == city_name]
    zones_city, centres_city = geographic_preprocess(dir_building, zones_city, city_name, 2017)

    # Keeping only the spatial units intersecting the city shape,
    # and the centres of these units.
    city_shape = gemeenten.loc[gemeenten['gemeente'] == city_name]
    zones_city = zones_city.sjoin(city_shape[['geometry']],
                                  predicate = 'intersects').drop(columns = 'index_right')
    centres_city = centres_city.loc[centres_city['id_unit'].isin(zones_city['id_unit'])]

    # Both layers are stored in the same GeoPackage.
    zones_city.to_file(output_path, layer = 'zone')
    centres_city.to_file(output_path, layer = 'centroid')

  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Inde