In [1]:
import pandas as pd
import geopandas as gpd
from shapely import  wkb
import sqlalchemy
from scipy import stats
import matplotlib.pyplot as plt

## settings

In [None]:
# Integer CRS code handed as `crs=` to every GeoDataFrame built in this notebook
# (EPSG:3006, i.e. SWEREF99 TM).
local_crs = 3006

# When True, sample_and_combine prints the share of sample points it could join.
# NOTE(review): despite the name, none of the visible plotting cells read this flag.
plot = True

## read flows

In [None]:
# Connection settings for the PostgreSQL database that the queries below read from.
# create_engine() only builds the engine; connections are opened by the
# `engine.connect()` blocks further down.
url = sqlalchemy.URL.create(
    drivername="postgresql+psycopg",
    username="username",
    host="host",
    port=5432,
    database="database",
)
engine = sqlalchemy.create_engine(url)

In [None]:
sql_query_flows_sthlm = "SELECT * FROM flows.sthlm_flows_2024_random1to11;"
sql_query_flows_gbg = "SELECT * FROM flows.gbg_flows_2024_random1to2;"

# Both tables are read over a single connection. pd.read_sql runs the statement
# itself, so no separate conn.execute() is needed (the extra execute made the
# database run every query twice and threw the first result away).
with engine.connect() as conn_flowsense:
    df_flows_sthlm = pd.read_sql(sqlalchemy.text(sql_query_flows_sthlm), con=conn_flowsense)
    df_flows_gbg = pd.read_sql(sqlalchemy.text(sql_query_flows_gbg), con=conn_flowsense)

# The 'geometry' column arrives WKB-encoded; decode it into shapely geometries
# and tag the frames with the notebook-wide CRS.
flows_sthlm = gpd.GeoDataFrame(df_flows_sthlm, geometry=df_flows_sthlm['geometry'].apply(wkb.loads), crs=local_crs)
flows_gbg = gpd.GeoDataFrame(df_flows_gbg, geometry=df_flows_gbg['geometry'].apply(wkb.loads), crs=local_crs)

#### sum flows in both directions per road

In [None]:
def sum_and_renormalize(flows):
    """Collapse directed edges into one row per road and rescale relative flows.

    Rows sharing an ``ISA_index`` (an edge and its reverse) are merged into a
    single row: every numeric column is summed, while ``geometry`` and ``RLID``
    are taken from the first row of each group. Afterwards every ``relflow_*``
    column is divided by its own maximum.

    Parameters
    ----------
    flows : geopandas.GeoDataFrame
        Must provide ``ISA_index`` (as a column, or as the index of an already
        aggregated frame), ``geometry`` and ``RLID``. The frame is not modified.

    Returns
    -------
    geopandas.GeoDataFrame
        One row per ``ISA_index`` (used as the index, in sorted order) holding
        the summed numeric columns followed by ``geometry`` and ``RLID``, with
        the module-level ``local_crs`` as CRS.
    """
    # Accept a frame that was already aggregated (ISA_index moved to the index)
    # so that the calling cell can be re-run without reloading the data.
    if 'ISA_index' not in flows.columns and 'ISA_index' in flows.index.names:
        flows = flows.reset_index()

    # Sum once for all columns; the aggregation does not depend on which
    # relflow_* column is being looked at.
    # NOTE(review): the previous code passed a [relflow, trajcount] column list
    # to .sum(), but that argument is `numeric_only`, so all numeric columns
    # were (and still are) summed. Confirm that summing the remaining numeric
    # columns over both directions is intended.
    summed = flows.groupby('ISA_index').sum(numeric_only=True)

    # Non-additive attributes: keep the first occurrence per road.
    first_attrs = (
        flows[['ISA_index', 'geometry', 'RLID']]
        .drop_duplicates('ISA_index', keep='first')
        .set_index('ISA_index')
    )
    # Avoid a name clash in the join should RLID be stored as a number.
    summed = summed.drop(columns=[c for c in first_attrs.columns if c in summed.columns])
    combined = summed.join(first_attrs, how='left')

    combined = gpd.GeoDataFrame(combined, geometry=combined['geometry'], crs=local_crs)

    # renormalize flows now that values are summed
    for col in combined.columns:
        if col.startswith('relflow_'):
            peak = combined[col].max()
            # A maximum of 0 would turn the whole column into NaN (0/0);
            # leave such a column untouched instead.
            if peak != 0:
                combined[col] = combined[col] / peak

    return combined

In [None]:
# Collapse the per-direction rows into one row per road (ISA_index) and rescale
# the relflow_* columns. The raw frames are overwritten here; re-run the
# loading cell above to get them back.
flows_sthlm = sum_and_renormalize(flows_sthlm)
flows_gbg = sum_and_renormalize(flows_gbg)

## read ground truth data

In [None]:
sql_query_sthlm_2024 = "SELECT * FROM ground_truth.flows_cars_sthlm_2024;"

# pd.read_sql executes the statement itself; a separate conn.execute() would
# make the database run the same query a second time for nothing.
with engine.connect() as conn_flowsense:
    df_carflows_sthlm_2024 = pd.read_sql(sqlalchemy.text(sql_query_sthlm_2024), con=conn_flowsense)

# decode the WKB 'geometry' column and attach the notebook-wide CRS
carflows_sthlm_2024 = gpd.GeoDataFrame(df_carflows_sthlm_2024, geometry=df_carflows_sthlm_2024['geometry'].apply(wkb.loads), crs=local_crs)

In [None]:
sql_query_gbg_local_2023 = "SELECT * FROM ground_truth.flows_cars_gbg_local_2023;"

# pd.read_sql executes the statement itself; a separate conn.execute() would
# make the database run the same query a second time for nothing.
with engine.connect() as conn_flowsense:
    df_carflows_gbg_local_2023 = pd.read_sql(sqlalchemy.text(sql_query_gbg_local_2023), con=conn_flowsense)

# decode the WKB 'geometry' column and attach the notebook-wide CRS
carflows_gbg_local_2023 = gpd.GeoDataFrame(df_carflows_gbg_local_2023, geometry=df_carflows_gbg_local_2023['geometry'].apply(wkb.loads), crs=local_crs)

In [None]:
sql_query_gbg_highway_2023 = "SELECT * FROM ground_truth.flows_cars_gbg_highway_2023;"

# pd.read_sql executes the statement itself; a separate conn.execute() would
# make the database run the same query a second time for nothing.
with engine.connect() as conn_flowsense:
    df_carflows_gbg_highway_2023 = pd.read_sql(sqlalchemy.text(sql_query_gbg_highway_2023), con=conn_flowsense)

# decode the WKB 'geometry' column and attach the notebook-wide CRS
carflows_gbg_highway_2023 = gpd.GeoDataFrame(df_carflows_gbg_highway_2023, geometry=df_carflows_gbg_highway_2023['geometry'].apply(wkb.loads), crs=local_crs)

## combine flows and ground truth

#### sample one point per groundtruth line, and find corresponding flow value

In [None]:
def sample_and_combine(flows, groundtruth, max_distance=10):
    """Attach the nearest estimated-flow road to every ground-truth location.

    Each ground-truth geometry is reduced to one sample point (the midpoint
    when the ground truth consists of LineStrings, otherwise the geometry
    itself) and joined to the nearest ``flows`` geometry within
    ``max_distance``. Sample points without a flow geometry in range are
    dropped; when several flow geometries are equally near, the first one
    returned by the join is kept.

    Parameters
    ----------
    flows : geopandas.GeoDataFrame
        Estimated flows, one geometry per row. Not modified.
    groundtruth : geopandas.GeoDataFrame
        Ground-truth counts, one geometry per row. Not modified.
    max_distance : float, default 10
        Search radius for the nearest-neighbour join, in the units of the
        frames' CRS.

    Returns
    -------
    geopandas.GeoDataFrame
        At most one row per ground-truth row, ordered by the ground-truth
        index, with the ground-truth columns, the matched flow columns and the
        untouched geometries in ``original_geom_gt`` / ``original_geom_flows``.
        The active geometry is the sample point.

    Notes
    -----
    Prints the share of joined sample points when the module-level ``plot``
    flag is truthy.
    """
    # store original line geometry; work on a copy so the caller's frame does
    # not silently gain a column
    flows = flows.copy()
    flows['original_geom_flows'] = flows.geometry

    # sample a point per groundtruth line
    sample_points = groundtruth.copy()
    sample_points['original_geom_gt'] = sample_points.geometry
    # Positional lookup: the index need not contain the label 0, and the frame
    # may be empty. The type of the first geometry decides for the whole frame.
    if len(sample_points) > 0 and sample_points.geom_type.iloc[0] == 'LineString':
        sample_points.geometry = sample_points.geometry.interpolate(0.5, normalized=True)

    # find flow-line within maximum distance away
    sample_points['_id'] = sample_points.index
    joined = gpd.sjoin_nearest(
        sample_points, flows, how='inner',
        max_distance=max_distance, distance_col='_dist')

    # sjoin_nearest already returns only the nearest neighbour(s); more than one
    # row per sample point means an exact distance tie, so keeping the first
    # row per point is sufficient. The stable sort keeps the join's order
    # within each point.
    joined = (
        joined
        .sort_values('_id', kind='stable')
        .drop_duplicates(subset=['_id'], keep='first')
        .reset_index(drop=True)
        .drop(columns=['_id', '_dist'])
    )

    if plot:
        # guard against an empty ground-truth frame
        share = 100 * len(joined) / len(sample_points) if len(sample_points) else 0.0
        print('{}% of sample points successfully joined'.format(round(share, 2)))

    return joined

In [None]:
# Pair every ground-truth location with its nearest estimated-flow road.
# flows_gbg is matched against two separate ground-truth sets (local and highway).
carflows_sthlm_2024_joined = sample_and_combine(flows_sthlm, carflows_sthlm_2024)
carflows_gbg_local_2023_joined = sample_and_combine(flows_gbg, carflows_gbg_local_2023)
carflows_gbg_highway_2023_joined = sample_and_combine(flows_gbg, carflows_gbg_highway_2023)

## Comparison

In [None]:
# p-value at or below which a Spearman correlation is reported as significant
sign_threshold = 0.01

In [None]:
df = carflows_sthlm_2024_joined
gt_col = 'trafikflöde__fordon_dygn__alla_fordon'

# ground-truth values, shared by every comparison in this cell
y = df[gt_col]

# rank-correlate each estimated relative-flow column with the ground truth
for col in df.columns:
    if not col.startswith('relflow_'):
        continue

    relflow_col = col
    x = df[relflow_col]
    r, p = stats.spearmanr(x, y)

    if p <= sign_threshold:
        verdict = '--> significant correlation between estimated and ground truth flows, Spearman R: {}'.format(round(r,3))
    else:
        verdict = '--> NO significant correlation'

    print(relflow_col)
    print(r, p)
    print(verdict)
    print('')

In [None]:
df = carflows_gbg_local_2023_joined
gt_col = 'ådt_normalized'

# ground-truth values, shared by every comparison in this cell
y = df[gt_col]

# rank-correlate each estimated relative-flow column with the ground truth
for col in df.columns:
    if not col.startswith('relflow_'):
        continue

    relflow_col = col
    x = df[relflow_col]
    r, p = stats.spearmanr(x, y)

    if p <= sign_threshold:
        verdict = '--> significant correlation between estimated and ground truth flows, Spearman R: {}'.format(round(r,3))
    else:
        verdict = '--> NO significant correlation'

    print(relflow_col)
    print(r, p)
    print(verdict)
    print('')

In [None]:
df = carflows_gbg_highway_2023_joined
gt_col = 'adt_samtliga_fordon_normalized'

# ground-truth values, shared by every comparison in this cell
y = df[gt_col]

# rank-correlate each estimated relative-flow column with the ground truth
for col in df.columns:
    if not col.startswith('relflow_'):
        continue

    relflow_col = col
    x = df[relflow_col]
    r, p = stats.spearmanr(x, y)

    if p <= sign_threshold:
        verdict = '--> significant correlation between estimated and ground truth flows, Spearman R: {}'.format(round(r,3))
    else:
        verdict = '--> NO significant correlation'

    print(relflow_col)
    print(r, p)
    print(verdict)
    print('')

In [None]:
df = carflows_sthlm_2024_joined
gt_col = 'trafikflöde__fordon_dygn__alla_fordon_normalized'
relflow_col = 'relflow_minavgspeed0'

y = df[gt_col]
x = df[relflow_col]

# Draw on a dedicated figure/axes instead of pyplot's implicit "current axes",
# so this scatter cannot end up on top of another cell's plot when the notebook
# runs with a non-inline backend or as a script.
fig, ax = plt.subplots()
ax.scatter(x, y, alpha=0.2)
ax.set_xlabel('estimated flow')
ax.set_ylabel('ground truth flow')
ax.set_title('any trajectory at Sthlm local/major roads')
plt.show()

In [None]:
df = carflows_gbg_local_2023_joined
gt_col = 'ådt_normalized'
relflow_col = 'relflow_minavgspeed0'

y = df[gt_col]
x = df[relflow_col]

# Draw on a dedicated figure/axes instead of pyplot's implicit "current axes",
# so this scatter cannot end up on top of another cell's plot when the notebook
# runs with a non-inline backend or as a script.
fig, ax = plt.subplots()
ax.scatter(x, y, alpha=0.2)
ax.set_xlabel('estimated flow')
ax.set_ylabel('ground truth flow')
ax.set_title('any trajectory at Gbg local/major roads')
plt.show()

In [None]:
df = carflows_gbg_highway_2023_joined
gt_col = 'adt_samtliga_fordon_normalized'
relflow_col = 'relflow_minavgspeed0'

y = df[gt_col]
x = df[relflow_col]

# Draw on a dedicated figure/axes instead of pyplot's implicit "current axes",
# so this scatter cannot end up on top of another cell's plot when the notebook
# runs with a non-inline backend or as a script.
fig, ax = plt.subplots()
ax.scatter(x, y, alpha=0.2)
ax.set_xlabel('estimated flow')
ax.set_ylabel('ground truth flow')
ax.set_title('any trajectory at Gbg highway locations')
plt.show()

In [None]:
df = carflows_gbg_highway_2023_joined
gt_col = 'adt_samtliga_fordon_normalized'
relflow_col = 'relflow_minavgspeed20'

y = df[gt_col]
x = df[relflow_col]

# Draw on a dedicated figure/axes instead of pyplot's implicit "current axes",
# so this scatter cannot end up on top of another cell's plot when the notebook
# runs with a non-inline backend or as a script.
fig, ax = plt.subplots()
ax.scatter(x, y, alpha=0.2)
ax.set_xlabel('estimated flow')
ax.set_ylabel('ground truth flow')
ax.set_title('trajectories avg_speed > 20 km/h at Gbg highway locations')
plt.show()