# Workflow #5

Comparative assessment: Node subsampling distance variants

In [None]:
import algo.net_helper as nh
import algo.helper as h
import os.path
import geopandas as gpd
import numpy as np
import pandas as pd
import contextily as cx
import matplotlib.pyplot as plt
import plotly.graph_objects as go

# --- run configuration -------------------------------------------------------
# areas of interest (AOIs) to process
aoi_names = [
    "at_wien",
    "at_zs",
    "at_ib",
    "at_no",
    "at_zw",
    "at_graz_15",
]

# recalc=False: reuse the result CSVs from a previous run when they are available
recalc = False
generate_plots = True
plot_dir = os.path.join("plots", "centr_subsampling")

# optional suffix appended to the per-AOI edge file name
file_suffix = ""

dir_data = "data"

mode = "bike_incwalk"
# Tolerable access is determined by the input network: all segments that have an
# index value assigned (other than NULL, > 0) but have mode access set to False.
access = "bicycle"

# --- plot settings -----------------------------------------------------------
P_SIZE_S = (4, 3)
# colors: distance variants
P_C_D2 = "#2C8DBC"
P_C_D4 = "#056390"
P_C_D7 = "#003650"
# colors: min, mean, max
P_C_MIN = "#00A246"
P_C_MEAN = "#270D9D"
P_C_MAX = "#C60049"

# --- derived parameters ------------------------------------------------------
# "<aoi_name>" is a placeholder that gets replaced by the AOI name per run
file = os.path.join(dir_data, "r_<aoi_name>_edges" + file_suffix + ".gpkg")
f_diffstats_subsampling = os.path.join(dir_data, "centr_diffstats_subsampling.csv")
f_cdf_full = os.path.join(dir_data, "cdf_full.csv")

In [None]:
def run_comparison(aoi, centr_summary, centr_cdf, plot_types=("sbc",), plot_nws=(300, 600, 1500)):
    """Compare subsampled centrality columns of one AOI against their reference columns.

    Reads the AOI's edge file (global ``file`` template with ``<aoi_name>``
    replaced), clips it to the core extent returned by ``h.get_aoi_extent``,
    stores the result of ``h.save_cdf`` and then, for every reference column
    (``centr_*_sum`` without ``_nws_``), runs ``h.centr_comparison`` against
    every column that contains the reference name, ends with ``_sum`` and
    contains ``_nws_``. Finally the cumulative distribution of the reference
    column and its compared columns is plotted and saved via ``h.save_plot``.

    Parameters
    ----------
    aoi : str
        Name of the area of interest.
    centr_summary : list
        Output list; the return value of each ``h.centr_comparison`` call is appended.
    centr_cdf : list
        Output list; one ``{"aoi": aoi, "df": <result of h.save_cdf>}`` dict is appended.
    plot_types : iterable
        Centrality types (``c.type``) for which detail plots may be generated.
    plot_nws : iterable
        Subsampling distances (``c.nws``) for which detail plots may be generated.

    Returns
    -------
    None
        Results are delivered through ``centr_summary`` and ``centr_cdf``. If the
        input file does not exist, an error is printed and nothing is appended.
    """
    fn = file.replace("<aoi_name>", aoi)
    if not os.path.exists(fn):
        print(f"ERR: file '{fn}' not found.")
        return
    dir_detail_plot = os.path.join("plots", aoi)
    aoi_core_extent = h.get_aoi_extent(aoi)
    # read file (clipped to bounding box of core extent - needs to be clipped to exact extent after import)
    centr_df = gpd.read_file(fn, bbox=aoi_core_extent.iloc[0].geometry.bounds, engine='pyogrio')  # pot. speedup: use arrow
    print("loaded input gdf.", len(centr_df), "edges")
    print("clipping to core AOI extent...")
    centr_df = centr_df.clip(aoi_core_extent)
    print("done.", len(centr_df), "edges")
    cols = centr_df.columns
    # compute cumulative distribution per centr col
    print("generating cumulative distr. for centrality columns...")
    ccdf = h.save_cdf(centr_df, os.path.join(dir_detail_plot, "centr_cdf.csv"))
    centr_cdf.append({"aoi": aoi, "df": ccdf})

    prefix = "centr_"
    cref_label = "orig"
    # now run comparisons
    for refcol in cols:
        # reference columns: centrality sums that are NOT subsampled variants
        if not refcol.startswith(prefix) or not refcol.endswith("_sum") or "_nws_" in refcol:
            continue
        cref = h.CentralityDef.from_str(refcol)
        ref_name = cref.to_str()
        cols_cdf_plot = [ref_name]
        print("processing ref col:", cref)
        # Remove the literal "centr_" prefix. str.lstrip("centr_") must not be used
        # here: it strips any leading run of the characters c/e/n/t/r/_ and would
        # therefore also eat the start of the remaining name (e.g. "cc_..." -> "").
        c_type = ref_name[len(prefix):] if ref_name.startswith(prefix) else ref_name
        comp_label = f"{c_type}_subsampling"
        # find compare target cols
        for col in cols:
            if ref_name not in col or not col.endswith("_sum") or "_nws_" not in col:
                continue
            c = h.CentralityDef.from_str(col)
            c_label = f"{c.nws} m"
            # detail plots only for the selected types / subsampling distances
            enable_plots = bool(generate_plots and c.type in plot_types and c.nws in plot_nws)
            comp_variant_label = f"{c_type} {c_label}"
            # run comparison
            centr_summary.append(
                h.centr_comparison(aoi, centr_df, c, cref, dir_detail_plot=dir_detail_plot, centr_diff_name=comp_label,
                                   c_label=c_label, cref_label=cref_label, ccomp_label=comp_variant_label, generate_plots=enable_plots)
                )
            cols_cdf_plot.append(c.to_str())
        # plot cumulative centrality distribution
        ccdf[cols_cdf_plot].plot()
        h.save_plot(f"cdf_{cref}", dir_detail_plot)
    

In [None]:
# Load cached results, or recompute them for all AOIs.
# The cached branch reads BOTH result files, so both must exist; otherwise recompute.
cache_available = os.path.exists(f_diffstats_subsampling) and os.path.exists(f_cdf_full)
if not recalc and cache_available:
    diffstats = pd.read_csv(f_diffstats_subsampling)
    full_cdf = pd.read_csv(f_cdf_full)
    display(diffstats.head())
else:
    centr_summary = []
    centr_cdf = []
    for aoi in aoi_names:
        run_comparison(aoi, centr_summary, centr_cdf)

    # run_comparison appends nothing for an AOI whose input file is missing;
    # fail with a clear message before any (empty) result file is written
    if not centr_cdf:
        raise RuntimeError("no AOI could be processed - check that the input edge files exist")

    diffstats = pd.DataFrame.from_dict(centr_summary, orient="columns")
    display(diffstats.head())
    diffstats.to_csv(f_diffstats_subsampling)
    # collect and join all cumulative distribution results to single df and save to CSV
    collect_cdf = []
    for result in centr_cdf:
        cur_aoi = result["aoi"]
        cur_df = result["df"]
        # prefix every column with the AOI name so the joined frame has unique columns
        cur_df.rename(columns=lambda x: f"{cur_aoi}__{x}", inplace=True)
        print(cur_aoi)
        collect_cdf.append(cur_df)
    full_cdf = collect_cdf[0].join(collect_cdf[1:])
    full_cdf.to_csv(f_cdf_full)

In [None]:
# Descriptive statistics of the changed-HC-segment share,
# one row per comparison pair (label_compare) across all cases.
share_changed_by_pair = diffstats.groupby(["label_compare"])["hc_seg_share_changed"]
hc_ch_grp = share_changed_by_pair.describe()
hc_ch_grp

In [None]:
# peek at the comparison rows of two selected reference columns
ref_variants = ["centr_sbc_c2000_sp_sum", "centr_sbc_c4000_sp_sum"]
diffstats[diffstats["name_cref"].isin(ref_variants)].head()

In [None]:
# Share of HC segments changed per comparison pair: min, mean and max over the
# AOIs (one value per AOI), ordered by the mean. Only pairs with count > 4 are shown.
enough_values = hc_ch_grp["count"] > 4
aoi_spread = hc_ch_grp.loc[enough_values, ["min", "mean", "max"]].sort_values(by="mean")
aoi_spread.plot(figsize=(20, 4), color=[P_C_MIN, P_C_MEAN, P_C_MAX])
h.save_plot("hc_share_chg_aoi_variation", plot_dir, show=True)

## High-Centrality segments -- FILTERED: only sbc bp_d4
### Share changed

In [None]:
# subset: rows of centrality type "sbc" that are flagged c_is_bp
is_sbc = diffstats["c_type"] == "sbc"
is_bp = diffstats["c_is_bp"]
ds_sbc_bp = diffstats[is_sbc & is_bp]
ds_sbc_bp.head()

In [None]:
# HC segments (p-based): share changed, relative to all segments that were
# classified as HC in cref as well as in c.
# Summary per subsampling label over all AOIs and centrality variants (6x6).
sbc_bp_share_by_dist = ds_sbc_bp.groupby("label_c")["hc_seg_share_changed"]
sbc_bp_share_by_dist.describe()

In [None]:
## USE ##
# HC segments (p-based): share changed, relative to all segments that were
# classified as HC in cref as well as in c.
# min / mean / max over all AOIs and centrality variants, per subsampling distance.
subsample_dists = ["300 m", "600 m", "900 m", "1200 m", "1500 m"]
grouped_values = ds_sbc_bp.groupby("label_c")["hc_seg_share_changed"]
curves = [
    ("min", grouped_values.min(), P_C_MIN),
    ("mean", grouped_values.mean(), P_C_MEAN),
    ("max", grouped_values.max(), P_C_MAX),
]
for pos, (curve_label, curve, curve_color) in enumerate(curves):
    # only the first plot call carries the figure size
    size_kw = {"figsize": P_SIZE_S} if pos == 0 else {}
    curve[subsample_dists].plot(legend=True, label=curve_label, color=curve_color, **size_kw)
plt.xlabel("subsampling distance")
h.save_plot("sbc_bp__hc_share_chg_sd_all", plot_dir, show=True)

In [None]:
# changed-HC-segment share per subsampling label, restricted to c_cut == 7000
sbc_bp_cut_7000 = ds_sbc_bp[ds_sbc_bp["c_cut"] == 7000]
sbc_bp_cut_7000.groupby(["label_c"])["hc_seg_share_changed"].describe()

In [None]:
## USE ##
# HC segments: mean share changed per subsampling distance,
# one line per route distance cutoff (decay variants included)
cutoffs = [
    (2000, "2 km", P_C_D2),
    (4000, "4 km", P_C_D4),
    (7000, "7 km", P_C_D7),
]
for pos, (cut_m, cut_label, cut_color) in enumerate(cutoffs):
    cut_rows = ds_sbc_bp[ds_sbc_bp.c_cut == cut_m]
    cut_mean = cut_rows.groupby(["label_c"]).hc_seg_share_changed.mean()
    # only the first plot call carries the figure size
    size_kw = {"figsize": P_SIZE_S} if pos == 0 else {}
    cut_mean[subsample_dists].plot(legend=True, label=cut_label, color=cut_color, **size_kw)
plt.xlabel("subsampling distance")
h.save_plot("sbc_bp__hc_share_chg_sd_rdist", plot_dir, show=True)

### Normalized difference

In [None]:
# summary of hcp_dn_abs_mean per subsampling label (sbc / bp subset)
sbc_bp_dn_abs_by_dist = ds_sbc_bp.groupby("label_c")["hcp_dn_abs_mean"]
sbc_bp_dn_abs_by_dist.describe()

In [None]:
## USE ##
# HC (p-based), absolute values (mean abs. norm. delta):
# min / mean / max per subsampling distance
subsample_dists = ["300 m", "600 m", "900 m", "1200 m", "1500 m"]
grouped_values = ds_sbc_bp.groupby("label_c")["hcp_dn_abs_mean"]
curves = [
    ("min", grouped_values.min(), P_C_MIN),
    ("mean", grouped_values.mean(), P_C_MEAN),
    ("max", grouped_values.max(), P_C_MAX),
]
for pos, (curve_label, curve, curve_color) in enumerate(curves):
    # only the first plot call carries the figure size
    size_kw = {"figsize": P_SIZE_S} if pos == 0 else {}
    curve[subsample_dists].plot(legend=True, label=curve_label, color=curve_color, **size_kw)
plt.xlabel("subsampling distance")
h.save_plot("sbc_bp__hc_dn_sd_all_abs", plot_dir, show=True)

In [None]:
# hcp_dn_abs_mean per subsampling label, restricted to c_cut == 7000
sbc_bp_cut_7000 = ds_sbc_bp[ds_sbc_bp["c_cut"] == 7000]
sbc_bp_cut_7000.groupby("label_c")["hcp_dn_abs_mean"].describe()

In [None]:
# HC segments (sbc / bp subset): mean absolute normalized delta per subsampling
# distance, by route distance cutoff (including decay variants).
# NOTE: this cell belongs to the filtered section and saves to "sbc_bp__...",
# so it uses the filtered frame ds_sbc_bp (it previously plotted the unfiltered
# diffstats, which duplicated the unfiltered "hc_dn_sd_rdist" figure).
ds_sbc_bp[ds_sbc_bp.c_cut == 2000].groupby(["label_c"]).hcp_dn_abs_mean.mean()[subsample_dists].plot(legend=True, label="2 km", color=P_C_D2, figsize=P_SIZE_S)
ds_sbc_bp[ds_sbc_bp.c_cut == 4000].groupby(["label_c"]).hcp_dn_abs_mean.mean()[subsample_dists].plot(legend=True, label="4 km", color=P_C_D4)
ds_sbc_bp[ds_sbc_bp.c_cut == 7000].groupby(["label_c"]).hcp_dn_abs_mean.mean()[subsample_dists].plot(legend=True, label="7 km", color=P_C_D7)
plt.xlabel("subsampling distance")
h.save_plot("sbc_bp__hc_dn_sd_rdist", plot_dir, show=True)

## High-Centrality segments
### Share changed

In [None]:
# HC segments (p-based): share changed, relative to all segments that were
# classified as HC in cref as well as in c.
# min / mean / max over all AOIs and centrality variants, per subsampling distance.
subsample_dists = ["300 m", "600 m", "900 m", "1200 m", "1500 m"]
grouped_values = diffstats.groupby("label_c")["hc_seg_share_changed"]
curves = [
    ("min", grouped_values.min(), P_C_MIN),
    ("mean", grouped_values.mean(), P_C_MEAN),
    ("max", grouped_values.max(), P_C_MAX),
]
for pos, (curve_label, curve, curve_color) in enumerate(curves):
    # only the first plot call carries the figure size
    size_kw = {"figsize": P_SIZE_S} if pos == 0 else {}
    curve[subsample_dists].plot(legend=True, label=curve_label, color=curve_color, **size_kw)
plt.xlabel("subsampling distance")
h.save_plot("hc_share_chg_sd_all", plot_dir, show=True)

In [None]:
# HC segments: mean share changed per subsampling distance,
# one line per route distance cutoff (decay variants included)
cutoffs = [
    (2000, "2 km", P_C_D2),
    (4000, "4 km", P_C_D4),
    (7000, "7 km", P_C_D7),
]
for pos, (cut_m, cut_label, cut_color) in enumerate(cutoffs):
    cut_rows = diffstats[diffstats.c_cut == cut_m]
    cut_mean = cut_rows.groupby(["label_c"]).hc_seg_share_changed.mean()
    # only the first plot call carries the figure size
    size_kw = {"figsize": P_SIZE_S} if pos == 0 else {}
    cut_mean[subsample_dists].plot(legend=True, label=cut_label, color=cut_color, **size_kw)
plt.xlabel("subsampling distance")
h.save_plot("hc_share_chg_sd_rdist", plot_dir, show=True)

In [None]:
# HC segments: mean share changed per subsampling distance,
# one line per route distance cutoff; non-decay variants only (c_decay_from < 0)
no_decay = diffstats.c_decay_from < 0
cutoffs = [
    (2000, "2 km", P_C_D2),
    (4000, "4 km", P_C_D4),
    (7000, "7 km", P_C_D7),
]
for pos, (cut_m, cut_label, cut_color) in enumerate(cutoffs):
    cut_rows = diffstats[(diffstats.c_cut == cut_m) & no_decay]
    cut_mean = cut_rows.groupby(["label_c"]).hc_seg_share_changed.mean()
    # only the first plot call carries the figure size
    size_kw = {"figsize": P_SIZE_S} if pos == 0 else {}
    cut_mean[subsample_dists].plot(legend=True, label=cut_label, color=cut_color, **size_kw)
plt.xlabel("subsampling distance")
h.save_plot("hc_share_chg_sd_rdist_nodecay", plot_dir, show=True)

### Normalized difference

In [None]:
# summary of hcp_dn_mean per subsampling label (all rows)
hcp_dn_by_dist = diffstats.groupby("label_c")["hcp_dn_mean"]
hcp_dn_by_dist.describe()

In [None]:
### HC (p-based): min / mean / max of hcp_dn_mean per subsampling distance
subsample_dists = ["300 m", "600 m", "900 m", "1200 m", "1500 m"]
grouped_values = diffstats.groupby("label_c")["hcp_dn_mean"]
curves = [
    ("min", grouped_values.min(), P_C_MIN),
    ("mean", grouped_values.mean(), P_C_MEAN),
    ("max", grouped_values.max(), P_C_MAX),
]
for pos, (curve_label, curve, curve_color) in enumerate(curves):
    # only the first plot call carries the figure size
    size_kw = {"figsize": P_SIZE_S} if pos == 0 else {}
    curve[subsample_dists].plot(legend=True, label=curve_label, color=curve_color, **size_kw)
plt.xlabel("subsampling distance")
h.save_plot("hc_dn_sd_all", plot_dir, show=True)

In [None]:
### HC (p-based), absolute values: min / mean / max of hcp_dn_abs_mean per subsampling distance
subsample_dists = ["300 m", "600 m", "900 m", "1200 m", "1500 m"]
grouped_values = diffstats.groupby("label_c")["hcp_dn_abs_mean"]
curves = [
    ("min", grouped_values.min(), P_C_MIN),
    ("mean", grouped_values.mean(), P_C_MEAN),
    ("max", grouped_values.max(), P_C_MAX),
]
for pos, (curve_label, curve, curve_color) in enumerate(curves):
    # only the first plot call carries the figure size
    size_kw = {"figsize": P_SIZE_S} if pos == 0 else {}
    curve[subsample_dists].plot(legend=True, label=curve_label, color=curve_color, **size_kw)
plt.xlabel("subsampling distance")
h.save_plot("hc_dn_sd_all_abs", plot_dir, show=True)


In [None]:
# HC segments: mean of hcp_dn_abs_mean per subsampling distance,
# one line per route distance cutoff (decay variants included)
cutoffs = [
    (2000, "2 km", P_C_D2),
    (4000, "4 km", P_C_D4),
    (7000, "7 km", P_C_D7),
]
for pos, (cut_m, cut_label, cut_color) in enumerate(cutoffs):
    cut_rows = diffstats[diffstats.c_cut == cut_m]
    cut_mean = cut_rows.groupby(["label_c"]).hcp_dn_abs_mean.mean()
    # only the first plot call carries the figure size
    size_kw = {"figsize": P_SIZE_S} if pos == 0 else {}
    cut_mean[subsample_dists].plot(legend=True, label=cut_label, color=cut_color, **size_kw)
plt.xlabel("subsampling distance")
h.save_plot("hc_dn_sd_rdist", plot_dir, show=True)

In [None]:
# HC segments: mean of hcp_dn_abs_mean per subsampling distance,
# one line per route distance cutoff; non-decay variants only (c_decay_from < 0)
no_decay = diffstats.c_decay_from < 0
cutoffs = [
    (2000, "2 km", P_C_D2),
    (4000, "4 km", P_C_D4),
    (7000, "7 km", P_C_D7),
]
for pos, (cut_m, cut_label, cut_color) in enumerate(cutoffs):
    cut_rows = diffstats[(diffstats.c_cut == cut_m) & no_decay]
    cut_mean = cut_rows.groupby(["label_c"]).hcp_dn_abs_mean.mean()
    # only the first plot call carries the figure size
    size_kw = {"figsize": P_SIZE_S} if pos == 0 else {}
    cut_mean[subsample_dists].plot(legend=True, label=cut_label, color=cut_color, **size_kw)
plt.xlabel("subsampling distance")
h.save_plot("hc_dn_sd_rdist_nodecay", plot_dir, show=True)

In [None]:
# HC segments: mean of hcp_dn_mean (signed) per subsampling distance,
# one line per route distance cutoff; non-decay variants only (c_decay_from < 0)
no_decay = diffstats.c_decay_from < 0
cutoffs = [
    (2000, "2 km", P_C_D2),
    (4000, "4 km", P_C_D4),
    (7000, "7 km", P_C_D7),
]
for pos, (cut_m, cut_label, cut_color) in enumerate(cutoffs):
    cut_rows = diffstats[(diffstats.c_cut == cut_m) & no_decay]
    cut_mean = cut_rows.groupby(["label_c"]).hcp_dn_mean.mean()
    # only the first plot call carries the figure size
    size_kw = {"figsize": P_SIZE_S} if pos == 0 else {}
    cut_mean[subsample_dists].plot(legend=True, label=cut_label, color=cut_color, **size_kw)
plt.xlabel("subsampling distance")

### (q-based)

In [None]:
### HC (q-based): min / mean / max of hcq_dn_mean per subsampling distance
subsample_dists = ["300 m", "600 m", "900 m", "1200 m", "1500 m"]
grouped_values = diffstats.groupby("label_c")["hcq_dn_mean"]
curves = [
    ("min", grouped_values.min(), P_C_MIN),
    ("mean", grouped_values.mean(), P_C_MEAN),
    ("max", grouped_values.max(), P_C_MAX),
]
for pos, (curve_label, curve, curve_color) in enumerate(curves):
    # only the first plot call carries the figure size
    size_kw = {"figsize": P_SIZE_S} if pos == 0 else {}
    curve[subsample_dists].plot(legend=True, label=curve_label, color=curve_color, **size_kw)
plt.xlabel("subsampling distance")

## All segments

In [None]:
### All segments: normalized change - min / mean / max of dn_mean per subsampling distance
subsample_dists = ["300 m", "600 m", "900 m", "1200 m", "1500 m"]
grouped_values = diffstats.groupby("label_c")["dn_mean"]
curves = [
    ("min", grouped_values.min(), P_C_MIN),
    ("mean", grouped_values.mean(), P_C_MEAN),
    ("max", grouped_values.max(), P_C_MAX),
]
for pos, (curve_label, curve, curve_color) in enumerate(curves):
    # only the first plot call carries the figure size
    size_kw = {"figsize": P_SIZE_S} if pos == 0 else {}
    curve[subsample_dists].plot(legend=True, label=curve_label, color=curve_color, **size_kw)
plt.xlabel("subsampling distance")

In [None]:
### All segments: normalized change (absolute values) - min / mean / max of dn_abs_mean per subsampling distance
subsample_dists = ["300 m", "600 m", "900 m", "1200 m", "1500 m"]
grouped_values = diffstats.groupby("label_c")["dn_abs_mean"]
curves = [
    ("min", grouped_values.min(), P_C_MIN),
    ("mean", grouped_values.mean(), P_C_MEAN),
    ("max", grouped_values.max(), P_C_MAX),
]
for pos, (curve_label, curve, curve_color) in enumerate(curves):
    # only the first plot call carries the figure size
    size_kw = {"figsize": P_SIZE_S} if pos == 0 else {}
    curve[subsample_dists].plot(legend=True, label=curve_label, color=curve_color, **size_kw)
plt.xlabel("subsampling distance")

In [None]:
# All segments: mean of dn_abs_mean (normalized change, absolute values) per
# subsampling distance, one line per route distance cutoff (decay variants included)
cutoffs = [
    (2000, "2 km", P_C_D2),
    (4000, "4 km", P_C_D4),
    (7000, "7 km", P_C_D7),
]
for pos, (cut_m, cut_label, cut_color) in enumerate(cutoffs):
    cut_rows = diffstats[diffstats.c_cut == cut_m]
    cut_mean = cut_rows.groupby(["label_c"]).dn_abs_mean.mean()
    # only the first plot call carries the figure size
    size_kw = {"figsize": P_SIZE_S} if pos == 0 else {}
    cut_mean[subsample_dists].plot(legend=True, label=cut_label, color=cut_color, **size_kw)
plt.xlabel("subsampling distance")

In [None]:
# All segments: mean of dn_mean (normalized change) per subsampling distance,
# one line per route distance cutoff (decay variants included)
cutoffs = [
    (2000, "2 km", P_C_D2),
    (4000, "4 km", P_C_D4),
    (7000, "7 km", P_C_D7),
]
for pos, (cut_m, cut_label, cut_color) in enumerate(cutoffs):
    cut_rows = diffstats[diffstats.c_cut == cut_m]
    cut_mean = cut_rows.groupby(["label_c"]).dn_mean.mean()
    # only the first plot call carries the figure size
    size_kw = {"figsize": P_SIZE_S} if pos == 0 else {}
    cut_mean[subsample_dists].plot(legend=True, label=cut_label, color=cut_color, **size_kw)
plt.xlabel("subsampling distance")