In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import os
from scipy import sparse
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from fastprogress.fastprogress import master_bar, progress_bar



In [2]:
# Input and output locations, all relative to the notebook's working directory.
dir_adjacency = '../data/processed_data/adjacency_matrix/'
dir_adjacency_edited = '../data/processed_data/adjacency_matrix/edited/'
dir_zones_ori = '../data/processed_data/zones_delineation/'
dir_zones_edited = '../data/processed_data/zones_delineation/edited/'
dir_shortest_path = '../data/processed_data/shortest_path/'
matrix_stats_per_city = pd.DataFrame({'city':[],'N_regions':[],'largest_comp':[],'N_postcodes':[]})

# One GeoPackage per city ('PC_<city>.gpkg'). Selecting on the extension skips the
# 'edited' sub-directory and the README without raising when either of them is absent,
# and ignores any other stray file that the name parsing downstream could not handle.
list_files = [name for name in os.listdir(dir_zones_ori) if name.endswith('.gpkg')]

# Initialization of the progress bar.
pb = progress_bar(range(len(list_files)))

# Build and save one adjacency matrix per city. Two zones end up adjacent when the
# spatial self-join pairs them AND the shortest-path table does not rule the pair out.
# Cities whose graph is still split afterwards are reported with a printed message.
for i in pb:
    file = list_files[i]

    # Zone files are named 'PC_<city>.gpkg': drop the 3-character prefix and the
    # 5-character extension to recover the city name.
    city_name = file[3:-5]

    # Skip cities whose matrix has already been written by a previous run.
    if os.path.isfile(dir_adjacency + city_name + '_adjacency_matrix.npz'):
        continue

    zones = gpd.read_file(dir_zones_ori + file)
    zones = zones[['id_unit','geometry']]

    shortest_path = pd.read_csv(dir_shortest_path + city_name + '_walk_t_unit_to_unit.csv')
    # Some shortest paths are duplicated. Keeping only one.
    # (Sorting by 'walk_t' first means the smallest value of each pair is the one kept.)
    shortest_path = shortest_path.sort_values(by = 'walk_t')
    shortest_path = shortest_path.drop_duplicates(subset=['from_id_unit','to_id_unit'])

    # Spatial self-join: one row per (id_unit, id_unit_2) pair matched by sjoin.
    adj_tab = zones.sjoin(zones.rename(columns = {'id_unit':'id_unit_2'}))
    adj_tab = adj_tab.rename(columns = {'index_right':'adjacency'})

    # Every matched pair is adjacent; pairs absent from the join become 0 after the pivot.
    adj_tab['adjacency'] = 1
    adj_mat = adj_tab[['id_unit','id_unit_2','adjacency']].pivot(index = 'id_unit',
                                                                 columns = 'id_unit_2',
                                                                 values = 'adjacency')
    adj_mat = adj_mat.replace(np.nan,0)

    # Put both axes in the same order as `zones` (reindex rows, transpose, reindex rows again).
    adj_mat = adj_mat.reindex(zones['id_unit'])
    adj_mat = adj_mat.transpose()
    adj_mat = adj_mat.reindex(zones['id_unit'].rename('id_unit_2'))


    # Distance matrix.
    # Only the existence of a path is used from here on: every listed pair gets 1,
    # except a unit paired with itself, which gets 0.
    shortest_path['walk_t'] = 1
    same_unit = shortest_path['from_id_unit'] == shortest_path['to_id_unit']
    # Mask the column itself rather than assigning a whole masked frame to it.
    shortest_path['walk_t'] = shortest_path['walk_t'].mask(same_unit, 0).astype(np.int8)

    distance_mat = shortest_path.pivot(index= 'from_id_unit',
                                       columns = 'to_id_unit',
                                       values = 'walk_t')
    distance_mat = distance_mat.reindex(zones['id_unit'].rename('from_id_unit'))
    distance_mat = distance_mat.replace(np.nan,0)

    # For the zones that have no shortest path to other zones on the map, we dismiss the shortest path.
    # For that, we set the column and the row to 1.
    # NOTE(review): the condition is a Series of per-column sums; how it is broadcast over
    # the frame follows pandas' alignment rules for `mask` - confirm it hits the intended
    # rows/columns on the pandas version in use.
    distance_mat = distance_mat.mask(distance_mat.sum() == 0, 1)
    distance_mat = distance_mat.transpose()
    distance_mat = distance_mat.reindex(zones['id_unit'].rename('to_id_unit'))
    distance_mat = distance_mat.mask(distance_mat.sum() == len(distance_mat), 1)

    distance_mat = distance_mat.to_numpy()
    # A unit is always considered linked to itself.
    np.fill_diagonal(distance_mat,1)

    # Keep a spatial adjacency only where the path matrix is non-zero as well.
    adj_mat = np.logical_and(distance_mat, adj_mat)

    adj_mat_ar_coo = adj_mat.to_numpy()
    adj_mat_ar_coo = sparse.coo_matrix(adj_mat_ar_coo)

    N_component, labels = sparse.csgraph.connected_components(adj_mat_ar_coo)

    # If there is no isolated island, we can save directly the matrix.
    if N_component == 1:
        sparse.save_npz(dir_adjacency + city_name + '_adjacency_matrix.npz', adj_mat_ar_coo)
        continue

    # Otherwise we need to reconnect isolated components.
    zones['label'] = labels
    # We reconstruct the adjacency matrix.
    # 'label' exists on both sides of the join, so it comes back as 'label_1' / 'label_2'.
    adj_tab_reco = zones.sjoin(zones.rename(columns = {'id_unit':'id_unit_2'}),lsuffix = '1',rsuffix = '2')
    adj_tab_reco = adj_tab_reco.rename(columns = {'index_right':'adjacency'})

    # Adjacency is 1 if two zones are adjacent.
    adj_tab_reco['adjacency'] = 1
    # We set adjacency to 0 for the adjacent zones that are in the same connected component.
    # What remains are the spatially matched pairs that bridge two different components.
    adj_tab_reco['adjacency'] = adj_tab_reco['adjacency'].mask(adj_tab_reco['label_1'] == adj_tab_reco['label_2'], 0)

    adj_mat_reco = adj_tab_reco[['id_unit','id_unit_2','adjacency']].pivot(index = 'id_unit',
                                                                           columns = 'id_unit_2',
                                                                           values = 'adjacency')
    adj_mat_reco = adj_mat_reco.replace(np.nan,0)
    adj_mat_reco = adj_mat_reco.reindex(zones['id_unit'])
    adj_mat_reco = adj_mat_reco.transpose()
    adj_mat_reco = adj_mat_reco.reindex(zones['id_unit'].rename('id_unit_2'))

    # Boolean -> 0/1 with an explicit cast instead of chained replace(True/False) calls.
    adj_mat = adj_mat.astype(int)
    # Add the bridging links to the path-filtered matrix; a pair present in both counts once.
    adj_mat_fin = adj_mat_reco + adj_mat
    adj_mat_fin = adj_mat_fin.replace(2,1)

    if len(adj_mat_fin)!= len(zones):
        print(city_name + ' has a problem. The length of the zones is different from the adjacency matrix.')

    adj_mat_fin = adj_mat_fin.to_numpy()
    adj_mat_fin = sparse.coo_matrix(adj_mat_fin)

    # The matrix is saved even if it turns out to be disconnected; the check below only reports it.
    sparse.save_npz(dir_adjacency + city_name + '_adjacency_matrix.npz', adj_mat_fin)

    N_component, labels = sparse.csgraph.connected_components(adj_mat_fin)

    if N_component>1:
        print(city_name + ' has a problem. The matrix is not fully connected. There are ' + str(N_component) + ' components.')

Landerd has a problem. The matrix is not fully connected. There are 2 components.
Veere has a problem. The matrix is not fully connected. There are 2 components.
Amsterdam has a problem. The matrix is not fully connected. There are 2 components.
Baarle_Nassau has a problem. The matrix is not fully connected. There are 8 components.
Staphorst has a problem. The matrix is not fully connected. There are 2 components.
Losser has a problem. The matrix is not fully connected. There are 2 components.
Zuidplas has a problem. The matrix is not fully connected. There are 2 components.
Groningen has a problem. The matrix is not fully connected. There are 2 components.
Oss has a problem. The matrix is not fully connected. There are 3 components.
Rotterdam has a problem. The matrix is not fully connected. There are 2 components.


# Connect disconnected components in adjacency matrices per city.

In [3]:
# The per-city cells below read their zone files through `dir_zones`; point it at the
# original (unedited) delineation directory.
dir_zones = dir_zones_ori

In [4]:
# Reload every saved adjacency matrix and list the cities whose graph is made of
# more than one connected component.
for i in list_files:
    city_name = i[3:-5]
    adjacency_matrix = sparse.load_npz(dir_adjacency + city_name + '_adjacency_matrix.npz')

    N_component, labels = sparse.csgraph.connected_components(adjacency_matrix)
    if N_component <= 1:
        # Fully connected: nothing to report for this city.
        continue

    print(city_name + ' has ' + str(N_component) + ' components.')

Landerd has 2 components.
Veere has 2 components.
Amsterdam has 2 components.
Baarle_Nassau has 8 components.
Staphorst has 2 components.
Losser has 2 components.
Zuidplas has 2 components.
Groningen has 2 components.
Oss has 3 components.
Rotterdam has 2 components.


- Veere: Remove 4361SR0
- Landerd: Remove 5411VD0, 5409TC0
- Staphorst: Remove 8035RL0
- Losser: Remove 7574PE1
- Zuidplas: Remove 2741JW0
- Groningen: Remove 9747AK0
- Oss: Remove 5386LE1, 5374RA1
- Rotterdam: Remove 3198LD1

### Veere

In [5]:
# NOTE(review): this shows whatever `zones` earlier cells left in the kernel (the displayed
# frame carries the `label` column set in the adjacency-building loop); the Veere zones are
# only loaded in the next cell. The cell depends on leftover state - consider removing it.
zones

Unnamed: 0,id_unit,geometry,label
0,3011AB0,"POLYGON ((92671.110 437161.384, 92699.230 4371...",0
1,3011AB1,"POLYGON ((92673.700 437196.146, 92658.060 4371...",0
2,3011AC0,"POLYGON ((92741.660 437146.960, 92740.490 4371...",0
3,3011AC1,"POLYGON ((92712.520 437207.453, 92718.771 4371...",0
4,3011AG0,"POLYGON ((92661.434 437135.605, 92688.560 4371...",0
...,...,...,...
16373,3198LZ0,"POLYGON ((74981.705 435363.066, 74982.575 4353...",0
16374,3201AW0,"POLYGON ((83098.556 430293.617, 83014.754 4302...",0
16375,3201LS0,"POLYGON ((83413.691 430257.228, 83422.635 4302...",0
16376,3202LB0,"POLYGON ((82895.319 431669.821, 82898.160 4316...",0


In [6]:
city = 'Veere'

# Load the city's zones and its saved adjacency matrix.
zones = gpd.read_file(dir_zones + 'PC_' + city + '.gpkg')
adjacency_matrix = sparse.load_npz(dir_adjacency + city + '_adjacency_matrix.npz')

# Tag every zone with the connected component it belongs to, then switch the
# matrix to a dense array so rows and columns can be deleted afterwards.
N_component, labels = sparse.csgraph.connected_components(adjacency_matrix)
zones['component'] = labels
adjacency_matrix = adjacency_matrix.toarray()

# Zones lying outside component 0.
zones[zones['component'] != 0]

Unnamed: 0,Postcode,gemeente,id_unit,area_ratio,geometry,component
1333,4361SR,Veere,4361SR0,1.0,"POLYGON ((19899.768 394428.251, 19909.724 3944...",1


In [7]:
# Remove the rows and columns of every zone outside component 0. The positions come from
# the component labels instead of a hard-coded row number, so the cell stays correct if the
# zone file changes and always agrees with the `component == 0` filter applied to `zones`.
isolated_idx = np.flatnonzero(zones['component'].to_numpy() != 0)
adjacency_matrix = np.delete(adjacency_matrix, isolated_idx, axis=0)
adjacency_matrix = np.delete(adjacency_matrix, isolated_idx, axis=1)

In [8]:
# Keep the zones of component 0 only, renumber them, and drop the helper column before writing.
zones = (
    zones.loc[zones['component'] == 0]
    .reset_index(drop = True)
    .drop(columns ='component')
)
zones.to_file(dir_zones_edited + 'PC_' + city + '.gpkg')

# Write the trimmed matrix in sparse COO form to the edited-matrices directory.
adjacency_matrix = sparse.coo_matrix(adjacency_matrix)
sparse.save_npz(dir_adjacency_edited + city + '_adjacency_matrix.npz', adjacency_matrix)

  pd.Int64Index,


### Landerd

In [9]:
city = 'Landerd'

# Load the city's zones and its saved adjacency matrix.
zones = gpd.read_file(dir_zones + 'PC_' + city + '.gpkg')
adjacency_matrix = sparse.load_npz(dir_adjacency + city + '_adjacency_matrix.npz')

# Tag every zone with the connected component it belongs to, then switch the
# matrix to a dense array so rows and columns can be deleted afterwards.
N_component, labels = sparse.csgraph.connected_components(adjacency_matrix)
zones['component'] = labels
adjacency_matrix = adjacency_matrix.toarray()

# Zones lying outside component 0.
zones[zones['component'] != 0]

Unnamed: 0,Postcode,gemeente,id_unit,area_ratio,geometry,component
540,5411VD,Landerd,5411VD0,1.0,"POLYGON ((179437.021 407495.832, 179400.705 40...",1
549,5409TC,Landerd,5409TC0,1.0,"POLYGON ((179513.621 407091.494, 179407.096 40...",1


In [10]:
# Remove the rows and columns of every zone outside component 0. The positions come from
# the component labels instead of hard-coded row numbers; deleting them in one call also
# removes the need to delete in descending order to keep positions valid.
isolated_idx = np.flatnonzero(zones['component'].to_numpy() != 0)
adjacency_matrix = np.delete(adjacency_matrix, isolated_idx, axis=0)
adjacency_matrix = np.delete(adjacency_matrix, isolated_idx, axis=1)

In [11]:
# Keep the zones of component 0 only, renumber them, and drop the helper column before writing.
zones = (
    zones.loc[zones['component'] == 0]
    .reset_index(drop = True)
    .drop(columns ='component')
)
zones.to_file(dir_zones_edited + 'PC_' + city + '.gpkg')

# Write the trimmed matrix in sparse COO form to the edited-matrices directory.
adjacency_matrix = sparse.coo_matrix(adjacency_matrix)
sparse.save_npz(dir_adjacency_edited + city + '_adjacency_matrix.npz', adjacency_matrix)

  pd.Int64Index,


### Staphorst

In [12]:
city = 'Staphorst'

# Load the city's zones and its saved adjacency matrix.
zones = gpd.read_file(dir_zones + 'PC_' + city + '.gpkg')
adjacency_matrix = sparse.load_npz(dir_adjacency + city + '_adjacency_matrix.npz')

# Tag every zone with the connected component it belongs to, then switch the
# matrix to a dense array so rows and columns can be deleted afterwards.
N_component, labels = sparse.csgraph.connected_components(adjacency_matrix)
zones['component'] = labels
adjacency_matrix = adjacency_matrix.toarray()

# Zones lying outside component 0.
zones[zones['component'] != 0]

Unnamed: 0,Postcode,gemeente,id_unit,area_ratio,geometry,component
575,8035RL,Staphorst,8035RL0,0.3479,"POLYGON ((206985.783 511050.780, 207010.660 51...",1


In [13]:
# Remove the rows and columns of every zone outside component 0. The positions come from
# the component labels instead of a hard-coded row number, so the cell stays correct if the
# zone file changes and always agrees with the `component == 0` filter applied to `zones`.
isolated_idx = np.flatnonzero(zones['component'].to_numpy() != 0)
adjacency_matrix = np.delete(adjacency_matrix, isolated_idx, axis=0)
adjacency_matrix = np.delete(adjacency_matrix, isolated_idx, axis=1)

In [14]:
# Keep the zones of component 0 only, renumber them, and drop the helper column before writing.
zones = (
    zones.loc[zones['component'] == 0]
    .reset_index(drop = True)
    .drop(columns ='component')
)
zones.to_file(dir_zones_edited + 'PC_' + city + '.gpkg')

# Write the trimmed matrix in sparse COO form to the edited-matrices directory.
adjacency_matrix = sparse.coo_matrix(adjacency_matrix)
sparse.save_npz(dir_adjacency_edited + city + '_adjacency_matrix.npz', adjacency_matrix)

  pd.Int64Index,


### Losser

In [15]:
city = 'Losser'

# Load the city's zones and its saved adjacency matrix.
zones = gpd.read_file(dir_zones + 'PC_' + city + '.gpkg')
adjacency_matrix = sparse.load_npz(dir_adjacency + city + '_adjacency_matrix.npz')

# Tag every zone with the connected component it belongs to, then switch the
# matrix to a dense array so rows and columns can be deleted afterwards.
N_component, labels = sparse.csgraph.connected_components(adjacency_matrix)
zones['component'] = labels
adjacency_matrix = adjacency_matrix.toarray()

# Zones lying outside component 0.
zones[zones['component'] != 0]

Unnamed: 0,Postcode,gemeente,id_unit,area_ratio,geometry,component
958,7574PE,Losser,7574PE1,0.787229,"POLYGON ((260556.553 479386.872, 260571.348 47...",1


In [16]:
# Remove the rows and columns of every zone outside component 0. The positions come from
# the component labels instead of a hard-coded row number, so the matrix always matches
# the `component == 0` filter applied to `zones` just below.
isolated_idx = np.flatnonzero(zones['component'].to_numpy() != 0)
adjacency_matrix = np.delete(adjacency_matrix, isolated_idx, axis=0)
adjacency_matrix = np.delete(adjacency_matrix, isolated_idx, axis=1)

# Keep the zones of component 0 only, renumber them, and drop the helper column before writing.
zones = zones.loc[zones['component'] == 0].reset_index(drop = True)
zones = zones.drop(columns ='component')
zones.to_file(dir_zones_edited + 'PC_'  + city + '.gpkg')

# Write the trimmed matrix in sparse COO form to the edited-matrices directory.
adjacency_matrix = sparse.coo_matrix(adjacency_matrix)

sparse.save_npz(dir_adjacency_edited + city + '_adjacency_matrix.npz', adjacency_matrix)

  pd.Int64Index,


### Zuidplas

In [17]:
city = 'Zuidplas'

# Load the city's zones and its saved adjacency matrix.
zones = gpd.read_file(dir_zones + 'PC_' + city + '.gpkg')
adjacency_matrix = sparse.load_npz(dir_adjacency + city + '_adjacency_matrix.npz')

# Tag every zone with the connected component it belongs to, then switch the
# matrix to a dense array so rows and columns can be deleted afterwards.
N_component, labels = sparse.csgraph.connected_components(adjacency_matrix)
zones['component'] = labels
adjacency_matrix = adjacency_matrix.toarray()

# Zones lying outside component 0.
zones[zones['component'] != 0]

Unnamed: 0,Postcode,gemeente,id_unit,area_ratio,geometry,component
1187,2741JW,Zuidplas,2741JW0,1.0,"POLYGON ((105777.337 448458.308, 105774.787 44...",1


In [18]:
# Remove the rows and columns of every zone outside component 0. The positions come from
# the component labels instead of a hard-coded row number, so the matrix always matches
# the `component == 0` filter applied to `zones` just below.
isolated_idx = np.flatnonzero(zones['component'].to_numpy() != 0)
adjacency_matrix = np.delete(adjacency_matrix, isolated_idx, axis=0)
adjacency_matrix = np.delete(adjacency_matrix, isolated_idx, axis=1)

# Keep the zones of component 0 only, renumber them, and drop the helper column before writing.
zones = zones.loc[zones['component'] == 0].reset_index(drop = True)
zones = zones.drop(columns ='component')
zones.to_file(dir_zones_edited + 'PC_'  + city + '.gpkg')

# Write the trimmed matrix in sparse COO form to the edited-matrices directory.
adjacency_matrix = sparse.coo_matrix(adjacency_matrix)

sparse.save_npz(dir_adjacency_edited + city + '_adjacency_matrix.npz', adjacency_matrix)

  pd.Int64Index,


### Groningen

In [19]:
city = 'Groningen'

# Load the city's zones and its saved adjacency matrix.
zones = gpd.read_file(dir_zones + 'PC_' + city + '.gpkg')
adjacency_matrix = sparse.load_npz(dir_adjacency + city + '_adjacency_matrix.npz')

# Tag every zone with the connected component it belongs to, then switch the
# matrix to a dense array so rows and columns can be deleted afterwards.
N_component, labels = sparse.csgraph.connected_components(adjacency_matrix)
zones['component'] = labels
adjacency_matrix = adjacency_matrix.toarray()

# Zones lying outside component 0.
zones[zones['component'] != 0]

Unnamed: 0,Postcode,gemeente,id_unit,area_ratio,geometry,component
6605,9747AK,Groningen,9747AK0,1.0,"POLYGON ((229897.117 586134.703, 229897.523 58...",1


In [20]:
# Remove the rows and columns of every zone outside component 0. The positions come from
# the component labels instead of a hard-coded row number, so the matrix always matches
# the `component == 0` filter applied to `zones` just below.
isolated_idx = np.flatnonzero(zones['component'].to_numpy() != 0)
adjacency_matrix = np.delete(adjacency_matrix, isolated_idx, axis=0)
adjacency_matrix = np.delete(adjacency_matrix, isolated_idx, axis=1)

# Keep the zones of component 0 only, renumber them, and drop the helper column before writing.
zones = zones.loc[zones['component'] == 0].reset_index(drop = True)
zones = zones.drop(columns ='component')
zones.to_file(dir_zones_edited + 'PC_'  + city + '.gpkg')

# Write the trimmed matrix in sparse COO form to the edited-matrices directory.
adjacency_matrix = sparse.coo_matrix(adjacency_matrix)

sparse.save_npz(dir_adjacency_edited + city + '_adjacency_matrix.npz', adjacency_matrix)

  pd.Int64Index,


### Oss

In [21]:
city = 'Oss'

# Load the city's zones and its saved adjacency matrix.
zones = gpd.read_file(dir_zones + 'PC_' + city + '.gpkg')
adjacency_matrix = sparse.load_npz(dir_adjacency + city + '_adjacency_matrix.npz')

# Tag every zone with the connected component it belongs to, then switch the
# matrix to a dense array so rows and columns can be deleted afterwards.
N_component, labels = sparse.csgraph.connected_components(adjacency_matrix)
zones['component'] = labels
adjacency_matrix = adjacency_matrix.toarray()

# Zones lying outside component 0.
zones[zones['component'] != 0]

Unnamed: 0,Postcode,gemeente,id_unit,area_ratio,geometry,component
2740,5386LE,Oss,5386LE1,0.0,"POLYGON ((162457.224 416231.795, 162479.074 41...",1
3215,5374RA,Oss,5374RA1,0.263695,"POLYGON ((168712.247 416074.095, 168713.071 41...",2


In [22]:
# Remove the rows and columns of every zone outside component 0. The positions come from
# the component labels instead of hard-coded row numbers; deleting them in one call also
# removes the need to delete in descending order to keep positions valid.
isolated_idx = np.flatnonzero(zones['component'].to_numpy() != 0)
adjacency_matrix = np.delete(adjacency_matrix, isolated_idx, axis=0)
adjacency_matrix = np.delete(adjacency_matrix, isolated_idx, axis=1)

# Keep the zones of component 0 only, renumber them, and drop the helper column before writing.
zones = zones.loc[zones['component'] == 0].reset_index(drop = True)
zones = zones.drop(columns ='component')
zones.to_file(dir_zones_edited + 'PC_'  + city + '.gpkg')

# Write the trimmed matrix in sparse COO form to the edited-matrices directory.
adjacency_matrix = sparse.coo_matrix(adjacency_matrix)

sparse.save_npz(dir_adjacency_edited + city + '_adjacency_matrix.npz', adjacency_matrix)

  pd.Int64Index,


### Rotterdam

In [23]:
city = 'Rotterdam'

# Load the city's zones and its saved adjacency matrix.
zones = gpd.read_file(dir_zones + 'PC_' + city + '.gpkg')
adjacency_matrix = sparse.load_npz(dir_adjacency + city + '_adjacency_matrix.npz')

# Tag every zone with the connected component it belongs to, then switch the
# matrix to a dense array so rows and columns can be deleted afterwards.
N_component, labels = sparse.csgraph.connected_components(adjacency_matrix)
zones['component'] = labels
adjacency_matrix = adjacency_matrix.toarray()

# Zones lying outside component 0.
zones[zones['component'] != 0]

Unnamed: 0,Postcode,gemeente,id_unit,area_ratio,geometry,component
13770,3198LD,Rotterdam,3198LD1,1.0,"POLYGON ((66624.824 438500.664, 66623.970 4383...",1


In [24]:
# Remove the rows and columns of every zone outside component 0. The positions come from
# the component labels instead of a hard-coded row number, so the matrix always matches
# the `component == 0` filter applied to `zones` just below.
isolated_idx = np.flatnonzero(zones['component'].to_numpy() != 0)
adjacency_matrix = np.delete(adjacency_matrix, isolated_idx, axis=0)
adjacency_matrix = np.delete(adjacency_matrix, isolated_idx, axis=1)

# Keep the zones of component 0 only, renumber them, and drop the helper column before writing.
zones = zones.loc[zones['component'] == 0].reset_index(drop = True)
zones = zones.drop(columns ='component')
zones.to_file(dir_zones_edited + 'PC_'  + city + '.gpkg')

# Write the trimmed matrix in sparse COO form to the edited-matrices directory.
adjacency_matrix = sparse.coo_matrix(adjacency_matrix)

sparse.save_npz(dir_adjacency_edited + city + '_adjacency_matrix.npz', adjacency_matrix)

  pd.Int64Index,


### Amsterdam

In [25]:
city = 'Amsterdam'

zones =  gpd.read_file(dir_zones + 'PC_'+ city + '.gpkg')
adjacency_matrix = sparse.load_npz(dir_adjacency + city + '_adjacency_matrix.npz')

N_component, labels = sparse.csgraph.connected_components(adjacency_matrix)
adjacency_matrix = adjacency_matrix.toarray()

zones['component'] = labels
# NOTE(review): an earlier comment here read "Remove 6891ZZ0 6961LJ0", but nothing in this
# cell removes those units - confirm whether that instruction is still relevant.

# For this city no zone is dropped: components 0 and 1 are each saved as their own
# zones file and adjacency matrix (suffixes _1 and _2).
# Each sub-matrix is selected with the same mask as its zones, so the two always have
# matching sizes - also if the graph ever has more than two components (zones of any
# further component are then left out of both outputs).
in_component_0 = (zones['component'] == 0).to_numpy()
in_component_1 = (zones['component'] == 1).to_numpy()

zones_1 = zones.loc[in_component_0].reset_index(drop = True)
adjacency_matrix_1 = adjacency_matrix[in_component_0][:, in_component_0]

zones_2 = zones.loc[in_component_1].reset_index(drop = True)
adjacency_matrix_2 = adjacency_matrix[in_component_1][:, in_component_1]

adjacency_matrix_1 = sparse.coo_matrix(adjacency_matrix_1)
sparse.save_npz(dir_adjacency_edited + city + '_adjacency_matrix_1.npz', adjacency_matrix_1)
zones_1 = zones_1.drop(columns ='component')
zones_1.to_file(dir_zones_edited + 'PC_'  + city + '_1.gpkg')
adjacency_matrix_2 = sparse.coo_matrix(adjacency_matrix_2)
sparse.save_npz(dir_adjacency_edited + city + '_adjacency_matrix_2.npz', adjacency_matrix_2)
zones_2 = zones_2.drop(columns ='component')
zones_2.to_file(dir_zones_edited + 'PC_'  + city + '_2.gpkg')

  pd.Int64Index,
  pd.Int64Index,
