Compare model results with survey data to see how the results change or improve when we use coefficients estimated from our survey.
- data: persons
- variables: distance_to_work, distance_to_school, is_worker, is_grade/highschool/university
- distance bins from [workplace_location.csv](https://github.com/psrc/psrc_activitysim/blob/main/configs_dev/workplace_location.csv)



In [1]:
import os
import toml
import pandas as pd
import numpy as np
import validation_data_input
import psrc_theme
import plotly.express as px

# to show plotly figures in quarto HTML file
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook_connected"
# NOTE(review): the "psrc_color" template is presumably registered by the
# psrc_theme import above -- confirm
pio.templates.default = "simple_white+psrc_color"

# load the validation settings from validation_configuration.toml in the
# current working directory
config = toml.load(os.path.join(os.getcwd(), 'validation_configuration.toml'))

# get shared data
# (IPython %store magic: restores the `validation_data` variable that was
# stored by an earlier notebook/session)
%store -r validation_data

In [2]:
# person records taken from the shared validation data
per_data = validation_data.persons_data_uncloned.copy()

# household records with the land use attributes of their home zone attached
hh_data = validation_data.hh_data_uncloned.merge(
    validation_data.land_use,
    how="left",
    left_on='home_zone_id',
    right_on='zone_id',
)
# the shared object is no longer needed once both tables are extracted
del validation_data

# bring household (and home-zone land use) columns onto every person
per_data = per_data.merge(hh_data, how="left", on=['household_id', 'source'])

# keep persons with a work distance above -1 who do not work from home
has_work_distance = per_data['distance_to_work'] > -1
not_work_from_home = ~per_data['work_from_home']
per_data = per_data[has_work_distance & not_work_from_home]


In [3]:
# distance to work bins from workplace_location.csv
# include_lowest=True keeps a distance of exactly 0 in the first bin; by
# default pd.cut leaves the lowest edge open, which would turn those rows
# into NaN and silently drop them from every chart below
per_data['distance_to_work_bin'] = pd.cut(per_data['distance_to_work'], bins=[0,1,2,5,15,9999],
                                    labels=['util_dist_0_1', 'util_dist_1_2', 'util_dist_2_5',
                                            'util_dist_5_15', 'util_dist_15_up'],
                                    include_lowest=True)
# Create bins: bins of 2 miles up to 60 miles
# each bin is labelled with its lower edge ("0", "2", ..., "58");
# distances above max_bin fall outside the bins and become NaN
max_bin = 60
bin_size = 2
per_data['d_work_bin_60mi'] = pd.cut(per_data['distance_to_work'],
                                     bins=np.arange(0, max_bin+bin_size, bin_size),
                                     labels=[str(i) for i in np.arange(0, max_bin, bin_size)],
                                     include_lowest=True)


# cut points for grouping income and hh density into very low, low, medium,
# medium-high and high: the 12.5%, 25%, 50% and 75% quantiles of the
# model results households
# todo: check if we should be using model data for grouping
var_group = hh_data.loc[hh_data['source']=="model results", ['income','log_hh_1']].quantile([.125, .25, .50, .75])

var_group

Unnamed: 0,income,log_hh_1
0.125,24000.0,4.434358
0.25,43000.0,5.089858
0.5,82000.0,5.721498
0.75,135000.0,6.457123


In [4]:
# add income group and hh density group
# both use the same five labels; the inner edges are the quantiles stored in
# var_group, padded with wide sentinel edges on either side
group_labels = ['very low', 'low', 'medium', 'medium-high', 'high']
for value_col, group_col in (('income', 'hhincome_group'),
                             ('log_hh_1', 'hh_density_group')):
    group_edges = [-9999999.0] + var_group[value_col].tolist() + [9999999.0]
    per_data[group_col] = pd.cut(per_data[value_col], bins=group_edges, labels=group_labels)

## Distance to work

In [5]:
def _weighted_share_by_source(df: pd.DataFrame, bin_col: str) -> pd.DataFrame:
    """Sum person_weight per (source, bin_col) and add each bin's percentage within its source."""
    # notna() is required here: comparing against float('nan') with != is
    # True for every row and therefore filters nothing
    binned = df.loc[df[bin_col].notna()]
    shares = binned.groupby(['source', bin_col])['person_weight'].sum().reset_index()
    shares['percentage'] = shares.groupby(['source'], group_keys=False)['person_weight'].\
            apply(lambda x: 100 * x / float(x.sum()))
    return shares


def plot_distance(df:pd.DataFrame, group:str, title_name:str):
    """Print model person counts and plot distance-to-work distributions by source.

    Shows two figures built from the rows of ``df`` where ``df[group]`` is
    True: a grouped bar chart over ``distance_to_work_bin`` and a line chart
    over ``d_work_bin_60mi``. Both show the weighted percentage
    (``person_weight``) of each bin within each ``source``.

    Args:
        df: person table with the columns ``source``, ``person_weight``,
            ``distance_to_work_bin``, ``d_work_bin_60mi`` and ``group``.
        group: name of a boolean column of ``df`` used as the row filter.
        title_name: title used for both figures.
    """
    # restrict to the requested person group once; everything below uses
    # this subset of the df argument (not the notebook-level table)
    in_group = df.loc[df[group]]

    count = in_group.loc[in_group['source']=='model results', ['distance_to_work_bin']].value_counts()
    print(f"model person count =\n"
          f"{count.sort_values()}")

    # plot1: grouped bars over the distance_to_work_bin categories
    df_plot = _weighted_share_by_source(in_group, 'distance_to_work_bin')
    fig1 = px.bar(df_plot, x='distance_to_work_bin', y="percentage", color="source", barmode="group",
                  title=title_name)
    # keep only the value part of any "name=value" annotation text
    fig1.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
    fig1.update_layout(height=400, width=700, font=dict(size=11))
    fig1.show()

    # plot2: line chart over the d_work_bin_60mi categories
    df_plot = _weighted_share_by_source(in_group, 'd_work_bin_60mi')
    fig2 = px.line(df_plot, x='d_work_bin_60mi', y="percentage", color="source",
                   title=title_name)
    fig2.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
    fig2.update_layout(height=400, width=700, font=dict(size=11))
    fig2.show()

# distance to work
# 'is_worker' names the boolean column that plot_distance uses as its row filter
plot_distance(per_data,'is_worker',"worker: distance to work")

model person count =
distance_to_work_bin
util_dist_0_1            59073
util_dist_1_2           109844
util_dist_2_5           255592
util_dist_15_up         545475
util_dist_5_15          621501
dtype: int64


## Distance to work by segments

In [6]:
# weighted person totals per source, income group and distance_to_work_bin
# for workers; notna() is used because comparing against float('nan') with
# != is True for every row and filters nothing
df_plot = per_data.loc[(per_data['is_worker']) & (per_data['distance_to_work_bin'].notna())].groupby(['source','hhincome_group','distance_to_work_bin'])['person_weight'].sum().reset_index()
# share of each distance bin within its (source, income group) combination
df_plot['percentage'] = df_plot.groupby(['source','hhincome_group'], group_keys=False)['person_weight']. \
    apply(lambda x: 100 * x / float(x.sum()))

fig = px.bar(df_plot, x="distance_to_work_bin", y="percentage", color="source",barmode="group",
             facet_col="hhincome_group", facet_col_wrap=2,
             title="Distance to work by income group")
# facet titles read "hhincome_group=<value>"; keep only the value
fig.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
fig.update_layout(height=900, width=700, font=dict(size=11))
fig.show()

In [7]:
# weighted person totals per source, income group and d_work_bin_60mi for
# workers; notna() is used because comparing against float('nan') with != is
# True for every row and filters nothing
df_plot = per_data.loc[(per_data['is_worker']) & (per_data['d_work_bin_60mi'].notna())].groupby(['source','hhincome_group','d_work_bin_60mi'])['person_weight'].sum().reset_index()
# share of each distance bin within its (source, income group) combination
df_plot['percentage'] = df_plot.groupby(['source','hhincome_group'], group_keys=False)['person_weight']. \
    apply(lambda x: 100 * x / float(x.sum()))

fig2 = px.line(df_plot, x='d_work_bin_60mi', y="percentage", color="source",
               facet_col="hhincome_group", facet_col_wrap=2,
               title="Distance to work by income group")
# facet titles read "hhincome_group=<value>"; keep only the value
fig2.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
fig2.update_layout(height=400, width=700, font=dict(size=11))
fig2.show()

- Household density groups are based on the household density (`log_hh_1`) of the zone where a person's home is located

In [8]:
# weighted person totals per source, household density group and
# distance_to_work_bin for workers; notna() is used because comparing against
# float('nan') with != is True for every row and filters nothing
df_plot = per_data.loc[(per_data['is_worker']) & (per_data['distance_to_work_bin'].notna())].groupby(['source','hh_density_group','distance_to_work_bin'])['person_weight'].sum().reset_index()
# share of each distance bin within its (source, density group) combination
df_plot['percentage'] = df_plot.groupby(['source','hh_density_group'], group_keys=False)['person_weight']. \
    apply(lambda x: 100 * x / float(x.sum()))

fig = px.bar(df_plot, x="distance_to_work_bin", y="percentage", color="source",barmode="group",
             facet_col="hh_density_group", facet_col_wrap=2,
             title="Distance to work by household density")
# facet titles read "hh_density_group=<value>"; keep only the value
fig.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
fig.update_layout(height=900, width=700, font=dict(size=11))
fig.show()

In [9]:
# weighted person totals per source, household density group and
# d_work_bin_60mi for workers; notna() is used because comparing against
# float('nan') with != is True for every row and filters nothing
df_plot = per_data.loc[(per_data['is_worker']) & (per_data['d_work_bin_60mi'].notna())].groupby(['source','hh_density_group','d_work_bin_60mi'])['person_weight'].sum().reset_index()
# share of each distance bin within its (source, density group) combination
df_plot['percentage'] = df_plot.groupby(['source','hh_density_group'], group_keys=False)['person_weight']. \
    apply(lambda x: 100 * x / float(x.sum()))

fig2 = px.line(df_plot, x='d_work_bin_60mi', y="percentage", color="source",
               facet_col="hh_density_group", facet_col_wrap=2,
               title="Distance to work by household density")
# facet titles read "hh_density_group=<value>"; keep only the value
fig2.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
fig2.update_layout(height=400, width=700, font=dict(size=11))
fig2.show()