In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
from scipy import sparse
import os
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from fastprogress.fastprogress import master_bar, progress_bar

In [None]:
# Locations of the inputs read by this notebook (relative paths).
dir_exposure = '../data/results/exposure/'
dir_adjacency_matrix = '../data/processed_data/adjacency_matrix/'
dir_zones = '../data/processed_data/zones_delineation/'
dir_cov_mat = '../data/processed_data/covariance_matrix/'
dir_gemeente = '../data/processed_data/city_boundary/'

# Parameter table: indexed by the 'variable' column so that a setting can be
# looked up by name in the 'value' column.
parameters = pd.read_csv('parameter.csv').set_index('variable')
dissimilarity_threshold = parameters.loc['dissimilarity_threshold', 'value']

In [None]:
def cluster_analysis(zones,adjacency_matrix,exposure,list_N_clusters):

    '''
    Group spatial units into regions using the 'expos_NW' exposure value.

    For every requested number of regions, a Ward agglomerative clustering is
    fitted on 'expos_NW' with the adjacency matrix passed as connectivity
    constraint, and the resulting labels are stored in a 'region_<n>' column.

            Parameters:
                    zones (gpd.GeoDataFrame): Spatial units. Only the 'id_unit' and
                        'geometry' columns are kept; the input is not modified.
                    adjacency_matrix (array or sparse matrix): Adjacency of the spatial
                        units, handed to AgglomerativeClustering as `connectivity`.
                    exposure (pd.DataFrame): Exposure per spatial unit, joined on
                        'id_unit'. Must provide 'expos_NW' and 'share_NW_c'.
                    list_N_clusters (iterable of int): Numbers of regions to compute;
                        one clustering is fitted per entry.

            Returns:
                    (gpd.GeoDataFrame): The spatial units with the exposure columns,
                        an 'expos_rel' column and one 'region_<n>' label column per
                        entry of list_N_clusters.

    '''
    # Work on a copy restricted to identifier and geometry, then attach exposure.
    units = zones.copy().loc[:, ['id_unit', 'geometry']]
    units = units.merge(exposure, how='left', on='id_unit')

    # Units without an exposure value receive the mean of 'share_NW_c'.
    fallback = units['share_NW_c'].mean()
    units['expos_NW'] = units['expos_NW'].where(units['expos_NW'].notna(), fallback)

    # Exposure relative to the share; set to 0 where the share is exactly 0.
    ratio = units['expos_NW'] / units['share_NW_c']
    units['expos_rel'] = ratio.where(units['share_NW_c'] != 0, 0)

    # Single-feature design matrix of shape (n_units, 1).
    features = units['expos_NW'].to_numpy().reshape(-1, 1)

    for n_regions in list_N_clusters:
        model = AgglomerativeClustering(linkage='ward',
                                        connectivity=adjacency_matrix,
                                        n_clusters=n_regions)
        units['region_' + str(n_regions)] = model.fit(features).labels_

    return units

In [None]:
# Zone file to process; its base name is the key for the per-city inputs.
file = 'Leiden.gpkg'
# Drop the extension by name rather than assuming it is five characters long.
city = os.path.splitext(file)[0]

# Numbers of regions for which a clustering is computed.
list_N_clusters = [200,100,50,35,25]

# Per-city inputs: exposure table, adjacency matrix and zone geometries.
exposure = pd.read_csv(dir_exposure + city + '_exposure.csv')
adjacency_matrix = sparse.load_npz(dir_adjacency_matrix + city
                                    + '_adjacency_matrix.npz')
zones_raw = gpd.read_file(dir_zones + 'PC_' + file, layer = 'zone')

# `zones` holds the units with one 'region_<n>' column per entry of list_N_clusters.
zones = cluster_analysis(zones_raw, adjacency_matrix, exposure, list_N_clusters)

In [None]:
# Output folder for the region layers. It is created on demand so that this
# cell also runs when the folder does not exist yet.
dir_animation = '../data/results/regions/animation/'
os.makedirs(dir_animation, exist_ok=True)

# Initial state: every spatial unit is exported as its own region
# ('id_unit' renamed to 'region').
zones[['id_unit','geometry']].rename(columns={'id_unit':'region'}).to_file(dir_animation + 'regions_init.gpkg')

# One file per clustering level; 'iteration' numbers the levels from 1 in the
# order of list_N_clusters.
for k, j in enumerate(list_N_clusters, start=1):
    region_col = 'region_' + str(j)
    # Merge the geometries of all units sharing a label into one region.
    regions = (zones[[region_col, 'geometry']]
               .rename(columns = {region_col: 'region'})
               .dissolve(by = 'region')
               .reset_index())
    regions['iteration'] = k
    regions.to_file(dir_animation + 'regions_' + str(j) + '.gpkg')