# Nodes

Calculate the degree of the nodes (i.e. the cities) in each corpus.  
Additionally calculate the predicted degree and residuals of each node.

At the end there is one edge-related section in which the predictions and residuals are added to the edge shapefile, so that they can be mapped for the final report.

In [1]:
# packages
import pandas as pd
import geopandas as gpd
import numpy as np
import os
import unidecode

## Functions

In [2]:
def degree_count(df, en_column, fr_column, city_names=None):
    '''Sum an English and a French edge value over all edges touching each city.

    Every row of ``df`` is treated as one edge between ``city_A`` and
    ``city_B``. The row's values in ``en_column`` and ``fr_column`` are added
    to the running totals of both endpoint cities.

    Parameters
    ----------
    df : pandas.DataFrame
        Edge table with the columns ``city_A``, ``city_B``, ``en_column`` and
        ``fr_column``.
    en_column, fr_column : str
        Names of the columns holding the English and French values to sum.
    city_names : iterable of str, optional
        Cities to report on. When omitted, the notebook-level ``cities`` list
        is used, so existing three-argument calls keep working.

    Returns
    -------
    tuple of (dict, dict)
        ``(en_degrees, fr_degrees)``, each mapping city -> summed value in the
        order of ``city_names``. Cities that appear in no edge keep the value 0.

    Raises
    ------
    KeyError
        If an edge references a city that is not in ``city_names``.
    '''
    if city_names is None:
        # fall back to the list defined in the "Load File" section
        city_names = cities

    # two independent dictionaries, both starting at zero for every city
    en_degrees = dict.fromkeys(city_names, 0)
    fr_degrees = dict.fromkeys(city_names, 0)

    # walk over the four relevant columns only instead of building a Series per row
    edge_values = zip(df['city_A'], df['city_B'], df[en_column], df[fr_column])
    for city_a, city_b, en_value, fr_value in edge_values:
        en_degrees[city_a] += en_value
        en_degrees[city_b] += en_value
        fr_degrees[city_a] += fr_value
        fr_degrees[city_b] += fr_value

    return en_degrees, fr_degrees

## Load File

In [3]:
# relative locations of the project folders
INDIR, OUTDIR, DATADIR = "../../input", "../../output", "../../../../data"

# file name of the city list, read from INDIR
FILE_cities = "List_of_cities_300k.csv"

In [4]:
# read the semicolon-separated city list from INDIR
cities_path = os.path.join(INDIR, FILE_cities)
list_of_cities = pd.read_csv(cities_path, sep=';')

# transliterate the names in column Mua_en to plain ASCII
cities = list(map(unidecode.unidecode, list_of_cities.Mua_en))

# one-column table that the following sections extend with per-city metrics
df_cities = pd.DataFrame(cities, columns = ["city"])

In [5]:
# load nodes shapefile from INDIR into a GeoDataFrame
# (joined with the per-city metrics in the "Merge with Coordinates" section)
fp = os.path.join(INDIR, "nodes.shp")
nodes = gpd.read_file(fp)

In [31]:
# load the edges shapefile from OUTDIR into a GeoDataFrame
# (it receives the prediction and residual columns in the "Edges" section)
fp = os.path.join(OUTDIR, "edges_nz.shp")
edges_gdf = gpd.read_file(fp)

In [6]:
# load the edge table with predictions and residuals (semicolon-separated CSV in OUTDIR)
fp = os.path.join(OUTDIR, "edges_nz.csv")
edges = pd.read_csv(fp, sep=';')

In [7]:
# preview the first rows of the edge table
edges.head()

Unnamed: 0,city_A,city_B,POP_A,POP_B,CC_A,CC_B,distance,border_DUM,reg_border,FR_con,...,col_en,col_fr,col_prop_fr,col_prop_en,res_en2,res_fr2,pred_en2,pred_fr2,pred_en2_exp,pred_fr2_exp
0,Paris,London,9591,8256,FR,UK,342.719841,1,0,1,...,21874,10733,1.787238,2.095814,0.935661,1.138099,9.057393,8.142979,8581.745601,3439.148052
1,Paris,Madrid,9591,4955,FR,ES,1047.105521,1,1,1,...,3394,2404,0.400309,0.325189,0.545147,1.127986,7.584617,6.656904,1967.693075,778.137758
2,Paris,Berlin,9591,3776,FR,DE,879.835916,1,0,1,...,7899,4723,0.786465,0.756827,1.615544,2.009756,7.358948,6.450443,1570.183558,632.98268
3,Paris,Milan,9591,3698,FR,IT,640.685161,1,1,1,...,3342,2713,0.451763,0.320207,0.552024,1.240146,7.562301,6.665664,1924.267515,784.984794
4,Paris,Barcelona,9591,3659,FR,ES,827.813778,1,1,1,...,2397,1696,0.282415,0.229664,0.41954,0.978479,7.362434,6.457549,1575.666582,637.496707


### Degrees (occurrences of a node)

In [8]:
# calculate occurrences in French and English
en_degrees, fr_degrees = degree_count(edges, 'col_en', 'col_fr')


# add calculated occurrences to the dataframe; values are looked up by city name,
# so the result does not depend on the dictionaries' key order
df_cities['en_degree'] = df_cities['city'].map(en_degrees)
df_cities['fr_degree'] = df_cities['city'].map(fr_degrees)

In [9]:
# ten cities with the highest English degree
df_cities.sort_values('en_degree', ascending=False).head(10)

Unnamed: 0,city,en_degree,fr_degree
1,London,186465,49256
0,Paris,133309,143167
3,Berlin,86893,38241
7,Rome,59845,41235
12,Manchester,58826,8950
18,Vienna,54401,30030
19,Munich,44445,21051
31,Liverpool,42161,6466
4,Milan,40709,26289
2,Madrid,40165,22624


In [10]:
# calculate proportional occurrences in French and English
en_degrees, fr_degrees = degree_count(edges, 'col_prop_en', 'col_prop_fr')


# add calculated occurrences to the dataframe; values are looked up by city name,
# so the result does not depend on the dictionaries' key order
df_cities['en_deg_prop'] = df_cities['city'].map(en_degrees)
df_cities['fr_deg_prop'] = df_cities['city'].map(fr_degrees)

In [11]:
# preview the first rows of the city table
df_cities.head()

Unnamed: 0,city,en_degree,fr_degree,en_deg_prop,fr_deg_prop
0,Paris,133309,143167,12.772737,23.83989
1,London,186465,49256,17.865773,8.202013
2,Madrid,40165,22624,3.84833,3.767304
3,Berlin,86893,38241,8.32548,6.367817
4,Milan,40709,26289,3.900452,4.377593


## Predicted degrees and residuals

In [12]:
# calculate predicted ln(occurrences) in French and English, summed over each city's edges
# NOTE(review): pred_en2 / pred_fr2 are on the log scale (pred_*2_exp holds the
# exponentiated values) - confirm that summing log-scale predictions is intended
en_degrees, fr_degrees = degree_count(edges, 'pred_en2', 'pred_fr2')


# add calculated predicted occurrences to the dataframe; values are looked up by
# city name, so the result does not depend on the dictionaries' key order
df_cities['fr_pred'] = df_cities['city'].map(fr_degrees)
df_cities['en_pred'] = df_cities['city'].map(en_degrees)

In [13]:
# total residuals of cities in French and English, summed over each city's edges
en_degrees, fr_degrees = degree_count(edges, 'res_en2', 'res_fr2')


# add total residuals to the dataframe; values are looked up by city name,
# so the result does not depend on the dictionaries' key order
df_cities['fr_res'] = df_cities['city'].map(fr_degrees)
df_cities['en_res'] = df_cities['city'].map(en_degrees)

In [14]:
# percentage residual of prediction
def estimation(row, pred_col, res_col):
    '''Express a residual as a percentage of its prediction, rounded to 2 decimals.

    Parameters
    ----------
    row : mapping or pandas.Series
        Record holding both values; accessed as ``row[pred_col]`` and
        ``row[res_col]``.
    pred_col : str
        Key of the predicted value (the denominator).
    res_col : str
        Key of the residual (the numerator).

    Returns
    -------
    float
        ``residual / prediction * 100`` rounded to two decimals, or NaN when
        the prediction is 0, because the percentage is undefined in that case.
    '''
    predicted = row[pred_col]

    # a prediction of 0 would otherwise raise ZeroDivisionError or produce inf
    if predicted == 0:
        return float('nan')

    percentage = (row[res_col] / predicted) * 100

    return round(percentage, 2)

In [15]:
# residual as a percentage of the prediction, computed row by row for each language
df_cities['fr_estimation'] = df_cities.apply(
    lambda city_row: estimation(city_row, 'fr_pred', 'fr_res'),
    axis=1,
)
df_cities['en_estimation'] = df_cities.apply(
    lambda city_row: estimation(city_row, 'en_pred', 'en_res'),
    axis=1,
)

In [16]:
# ten cities with the lowest French percentage residual (ascending sort on fr_estimation)
df_cities[['city','en_pred','fr_pred','en_res','fr_res','fr_estimation', 'en_estimation']].sort_values('fr_estimation', ascending=True).head(10)

Unnamed: 0,city,en_pred,fr_pred,en_res,fr_res,fr_estimation,en_estimation
137,Busto Arsizio,37.082886,29.200033,-27.200418,-21.915212,-75.05,-73.35
61,Gelsenkirchen-Bottrop,268.85172,216.700212,-149.399945,-143.358783,-66.16,-55.57
11,Katowice,349.723997,287.761829,-174.970657,-185.959607,-64.62,-50.03
120,Castellammare di Stabia-Torre Annunziata,55.40952,42.788507,-35.716614,-27.049788,-63.22,-64.46
118,Las Palmas,74.894463,48.250903,12.228059,-29.547223,-61.24,16.33
104,Wuppertal,234.219352,182.403369,-85.545929,-99.208532,-54.39,-36.52
51,Duisburg,333.385294,269.224736,-125.962653,-143.133951,-53.17,-37.78
94,Bielefeld,253.871667,194.637994,-77.489561,-102.092487,-52.45,-30.52
47,Bochum-Herne,381.329922,307.005568,-153.697651,-155.873645,-50.77,-40.31
106,Bournemouth,211.916206,160.344725,-15.199802,-77.189548,-48.14,-7.17


## Merge with Coordinates for mapping

In [17]:
# preview the first rows of the nodes GeoDataFrame
nodes.head()

Unnamed: 0,MUA,POP,CC,WEST,SOUTH,NORTH,CEAST,REGION,EN_DUM,FR_DUM,geometry
0,Utrecht,390,NL,1,0,0,0,west,0,0,POINT (5.11148 52.09863)
1,Eindhoven,316,NL,1,0,0,0,west,0,0,POINT (5.47234 51.45709)
2,Amsterdam,1052,NL,1,0,0,0,west,0,0,POINT (4.87429 52.37074)
3,Rotterdam,1025,NL,1,0,0,0,west,0,0,POINT (4.47374 51.92364)
4,Poznan,679,PL,0,0,0,1,central_east,0,0,POINT (16.91751 52.40734)


In [18]:
# left join: keep every node and attach the city metrics where MUA equals city;
# nodes without a matching city name get NaN in the added columns
nodes = pd.merge(nodes, df_cities, how='left', left_on = 'MUA', right_on = 'city')

In [19]:
# remove the join key brought in from the city table; MUA already holds the name
nodes = nodes.drop(columns=['city'])

In [20]:
# preview the first rows of the nodes table to check its columns
nodes.head()

Unnamed: 0,MUA,POP,CC,WEST,SOUTH,NORTH,CEAST,REGION,EN_DUM,FR_DUM,...,en_degree,fr_degree,en_deg_prop,fr_deg_prop,fr_pred,en_pred,fr_res,en_res,fr_estimation,en_estimation
0,Utrecht,390,NL,1,0,0,0,west,0,0,...,6156.0,3100.0,0.589825,0.516206,220.212995,297.376867,29.019504,34.189293,13.18,11.5
1,Eindhoven,316,NL,1,0,0,0,west,0,0,...,2889.0,1569.0,0.276804,0.261267,185.476805,255.34514,17.982678,10.609184,9.7,4.15
2,Amsterdam,1052,NL,1,0,0,0,west,0,0,...,31807.0,15897.0,3.047524,2.647137,377.48726,474.041058,107.733873,118.825963,28.54,25.07
3,Rotterdam,1025,NL,1,0,0,0,west,0,0,...,11199.0,5337.0,1.07301,0.888707,365.500195,456.38953,-28.176055,-9.660149,-7.71,-2.12
4,Poznan,679,PL,0,0,0,1,central_east,0,0,...,5942.0,1904.0,0.569321,0.31705,259.733788,339.856803,-63.653444,-51.532566,-24.51,-15.16


In [21]:
# save to .shp file
# NOTE(review): shapefile field names are limited to 10 characters, so longer column
# names (e.g. 'fr_estimation', 'en_deg_prop') are shortened in the written file
fp = os.path.join(OUTDIR, "nodes.shp")
nodes.to_file(fp)

  pd.Int64Index,
  nodes.to_file(fp)


In [22]:
# save the attribute table as csv, without the geometry column
# (the column is dropped on a copy, so `nodes` keeps its geometry and the cell can be re-run)
fp = os.path.join(OUTDIR, "nodes.csv")
nodes.drop(columns=['geometry']).to_csv(fp, index = False, sep=';')

## Edges
Add the predicted values and residuals to the edges shapefile for mapping in QGIS.

In [32]:
# display the edge table (pandas shows a truncated view of the first and last rows)
edges

Unnamed: 0,city_A,city_B,POP_A,POP_B,CC_A,CC_B,distance,border_DUM,reg_border,FR_con,...,col_en,col_fr,col_prop_fr,col_prop_en,res_en2,res_fr2,pred_en2,pred_fr2,pred_en2_exp,pred_fr2_exp
0,Paris,London,9591,8256,FR,UK,342.719841,1,0,1,...,21874,10733,1.787238,2.095814,0.935661,1.138099,9.057393,8.142979,8581.745601,3439.148052
1,Paris,Madrid,9591,4955,FR,ES,1047.105521,1,1,1,...,3394,2404,0.400309,0.325189,0.545147,1.127986,7.584617,6.656904,1967.693075,778.137758
2,Paris,Berlin,9591,3776,FR,DE,879.835916,1,0,1,...,7899,4723,0.786465,0.756827,1.615544,2.009756,7.358948,6.450443,1570.183558,632.982680
3,Paris,Milan,9591,3698,FR,IT,640.685161,1,1,1,...,3342,2713,0.451763,0.320207,0.552024,1.240146,7.562301,6.665664,1924.267515,784.984794
4,Paris,Barcelona,9591,3659,FR,ES,827.813778,1,1,1,...,2397,1696,0.282415,0.229664,0.419540,0.978479,7.362434,6.457549,1575.666582,637.496707
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8049,A Coruna,Ghent,311,300,ES,BE,1248.057073,1,1,0,...,1,4,0.000666,0.000096,-0.747780,1.203873,0.747780,0.182421,2.112306,1.200120
8050,Craiova,Brasov,311,307,RO,RO,206.805706,0,0,0,...,86,12,0.001998,0.008240,0.600303,-0.345852,3.854045,2.830759,47.183524,16.958327
8051,Coventry,Ghent,308,300,UK,BE,391.905700,1,0,0,...,5,1,0.000167,0.000479,0.028996,-1.055360,1.580442,1.055360,4.857104,2.873010
8052,Bonn,Ghent,306,300,DE,BE,239.931587,1,0,0,...,22,14,0.002331,0.002108,1.160043,1.215916,1.930999,1.423141,6.896396,4.150135


In [37]:
# spot check: the row at position 8052 names the same city pair in both tables
# NOTE(review): this compares a single row only; it does not show that all rows are aligned
edges[['city_A', 'city_B']].iloc[8052] == edges_gdf[['city_A', 'city_B']].iloc[8052]

city_A    True
city_B    True
Name: 8052, dtype: bool

In [40]:
# add predictions and residuals to the edges GeoDataFrame
# the values are copied row by row, so first verify that both tables share the same
# index and list the same city pairs in the same order
assert edges.index.equals(edges_gdf.index), \
    "edges and edges_gdf do not share the same row index"
assert (edges[['city_A', 'city_B']].to_numpy() == edges_gdf[['city_A', 'city_B']].to_numpy()).all(), \
    "edges and edges_gdf do not list the same city pairs in the same order"

edges_gdf[['res_en2', 'res_fr2', 'pred_en2', 'pred_fr2']] = edges[['res_en2', 'res_fr2', 'pred_en2', 'pred_fr2']]

In [53]:
# save edges geodataframe, now including the residual and prediction columns,
# as a new shapefile in OUTDIR
fp = os.path.join(OUTDIR, 'edges_nz2.shp')
edges_gdf.to_file(fp)

  pd.Int64Index,
