In [1]:
import os
import toml
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots

# to show plotly figures in quarto HTML file
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook_connected"

# Validation settings (input file paths, colour palette, ...) are read from a
# TOML file that must sit in the current working directory.
config = toml.load(
    os.path.join(os.getcwd(), 'validation_configuration.toml')
)

In [2]:
# read data
# The weight column `hh_weight_2017_2019` lives on the person tables only; the
# tour tables pick it up later through the tour -> person merge.

land_use = pd.read_csv(config['p_survey_landuse'])

# model data
per_data_model = pd.read_parquet(config['p_model_persons']).reset_index()
tour_data_model = pd.read_parquet(config['p_model_tours']).reset_index()
# model records are unweighted: give every modelled person a weight of 1
per_data_model['hh_weight_2017_2019'] = 1

# survey data
# Keep one row per `person_id_elmer`. `drop_duplicates` keeps the first row
# intact, whereas `groupby(...).first()` takes the first *non-null* value of
# every column and can therefore stitch together values from different
# duplicate rows. Rows without an id are dropped and the result is ordered by
# id, as the groupby-based version did.
per_data_survey = (
    pd.read_csv(config['p_survey_persons'])
    .dropna(subset=['person_id_elmer'])
    .drop_duplicates(subset='person_id_elmer', keep='first')
    .sort_values('person_id_elmer')
    .reset_index(drop=True)
)
tour_data_survey = pd.read_csv(config['p_survey_tours'])

# unweighted survey data: same records, every person counted once
tour_data_survey_unweighted = tour_data_survey.copy()

per_data_survey_unweighted = per_data_survey.copy()
per_data_survey_unweighted['hh_weight_2017_2019'] = 1

## person types


In [4]:
#| echo: true

# Person type code -> display label. Codes are the consecutive integers 1..8,
# so they are generated by numbering the labels from 1.
ptype_cat = dict(enumerate([
    "1: Full-Time Worker",
    "2: Part-Time Worker",
    "3: University Student",
    "4: Non-Working Adult Age <65",
    "5: Non-Working Adult Age 65+",
    "6: High School Student Age 16+",
    "7: Child Age 5-15",
    "8: Child Age 0-4",
], start=1))

- household density groups

In [5]:
# household density groups
density_var = 'log_hh_1'

# Bin edges: the 0 %, 12.5 %, 25 %, 50 %, 75 % and 100 % quantiles of the
# density variable across all zones.
var_group = land_use[density_var].quantile([.00, .125, .25, .50, .75, 1.00])
density_labels = ['very low', 'low', 'medium', 'medium-high', 'high']

# pd.cut builds right-closed bins and include_lowest is not set, so values equal
# to the lowest edge (the column minimum) get no bin and become NaN. Per the
# original note these are the zones with no households.
land_use['household_density_bin'] = pd.cut(
    land_use[density_var],
    bins=var_group.tolist(),
    labels=density_labels,
)

var_group

0.000    0.000000
0.125    3.330215
0.250    4.468057
0.500    5.521530
0.750    6.240536
1.000    9.074037
Name: log_hh_1, dtype: float64

In [6]:
tours_list = ['tour_id', 'tour_mode', 'tour_type', 'person_id', 'household_id']
persons_list = ['person_id', 'household_id', 'home_zone_id', 'ptype', 'hh_weight_2017_2019']
land_use_list = ['zone_id', 'log_hh_1', 'household_density_bin']


def _attach_person_and_zone(tours: pd.DataFrame, persons: pd.DataFrame, source: str) -> pd.DataFrame:
    """Add person and home-zone attributes to a tour table and tag its source.

    Tours are left-joined to ``persons`` on (person_id, household_id) and then
    to ``land_use`` on the person's ``home_zone_id``; tours without a match keep
    NaN in the joined columns.

    Args:
        tours: Tour table containing at least the columns in ``tours_list``.
        persons: Person table containing at least the columns in ``persons_list``.
        source: Label written to the ``source`` column of every returned row.

    Returns:
        A new DataFrame with the selected tour, person and land-use columns
        plus ``source``.
    """
    # NOTE(review): the joins assume one person row per (person_id, household_id)
    # and one land-use row per zone_id; duplicate keys would multiply tours.
    merged = (
        tours[tours_list]
        .merge(persons[persons_list], how="left", on=['person_id', 'household_id'])
        .merge(land_use[land_use_list], how="left", left_on='home_zone_id', right_on='zone_id')
    )
    merged['source'] = source
    return merged


# weighted survey, unweighted survey and model tours, all built the same way
_df1 = _attach_person_and_zone(tour_data_survey, per_data_survey, "survey data")
_df2 = _attach_person_and_zone(tour_data_survey_unweighted, per_data_survey_unweighted, "unweighted survey")
_df3 = _attach_person_and_zone(tour_data_model, per_data_model, "model results")

# Only the unweighted survey and the model go into the combined table; the
# weighted survey frame (_df1) is used for the tour-count summary only.
# ignore_index avoids duplicate row labels in the concatenated frame.
tour_mode_data = pd.concat([_df2, _df3], ignore_index=True)
tour_mode_data['ptype_des'] = tour_mode_data['ptype'].map(ptype_cat)

In [7]:
# Tour totals per source: the model is a plain row count, the survey figures
# are sums of the person weight carried on each merged tour row.
n_model_tours = len(tour_data_model)
survey_weighted_total = _df1['hh_weight_2017_2019'].sum()
survey_unweighted_total = _df2['hh_weight_2017_2019'].sum()

print(f"tour counts \n"
      f"- model results: {n_model_tours}\n"
      f"- survey results: {survey_weighted_total}\n"
      f"- unweighted survey: {survey_unweighted_total}\n"
      )

tour counts 
- model results: 5125139
- survey results: 2856862.0277673956
- unweighted survey: 10076.0



## Total tour mode choice

In [8]:
# Total weight per (source, tour_mode).
df_plot = (
    tour_mode_data
    .groupby(['source', 'tour_mode'])['hh_weight_2017_2019']
    .sum()
    .reset_index()
)
# Share of each mode within its source, in percent. transform('sum') broadcasts
# the per-source total back onto every row, so no per-group lambda is needed.
source_total = df_plot.groupby('source')['hh_weight_2017_2019'].transform('sum')
df_plot['percentage'] = 100 * df_plot['hh_weight_2017_2019'] / source_total

fig = px.bar(df_plot, x="tour_mode", y="percentage", color="source", barmode="group",
             template="simple_white",
             color_discrete_sequence=config['psrc_color'],
             title="Tour mode choice")
# No facets in this figure, hence no facet annotations to relabel.
fig.update_layout(height=400, width=700, font=dict(size=11))
fig.show()

In [9]:
def plot_mode_choice(df: pd.DataFrame, grp_var: str, n_nol: int, height: int) -> None:
    """Show tour mode shares per source as horizontal bars, one facet per ``grp_var`` value.

    The weights in ``hh_weight_2017_2019`` are summed per
    (source, grp_var, tour_mode) and expressed as a percentage of the
    (source, grp_var) total, so the bars of one source inside one facet add up
    to 100.

    Args:
        df: Tour records with the columns ``source``, ``tour_mode``,
            ``hh_weight_2017_2019`` and the column named by ``grp_var``.
        grp_var: Name of the column to facet by.
        n_nol: Number of facet columns per row (passed to ``facet_col_wrap``).
        height: Figure height passed to the plotly layout.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    weight_col = 'hh_weight_2017_2019'

    # observed=True: when grp_var is categorical, group only on the category
    # combinations present in the data instead of expanding to every possible
    # combination (which adds zero-weight rows and 0/0 = NaN percentages).
    df_plot = (
        df.groupby(['source', grp_var, 'tour_mode'], observed=True)[weight_col]
        .sum()
        .reset_index()
    )
    # Broadcast each (source, grp_var) total back to its rows to get the share.
    group_total = df_plot.groupby(['source', grp_var], observed=True)[weight_col].transform('sum')
    df_plot['percentage'] = 100 * df_plot[weight_col] / group_total

    fig = px.bar(df_plot, x="percentage", y="tour_mode", color="source", barmode="group",
                 facet_col=grp_var, facet_col_wrap=n_nol, orientation='h', template="simple_white",
                 color_discrete_sequence=config['psrc_color'],
                 title="Tour mode choice by " + grp_var)
    # Facet titles come as "<column>=<value>"; keep only the value.
    fig.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
    fig.update_layout(height=height, width=700, font=dict(size=11))
    # Apply the ordering to every facet's y axis, not just the first one
    # (layout.yaxis addresses only the first subplot's axis).
    fig.update_yaxes(categoryorder='category descending')
    fig.show()


## Tour modes by different segments

In [10]:
# Mode shares faceted by tour purpose, three facets per row.
plot_mode_choice(tour_mode_data, grp_var='tour_type', n_nol=3, height=1200)

In [11]:
# Mode shares faceted by person type label, two facets per row.
plot_mode_choice(tour_mode_data, grp_var='ptype_des', n_nol=2, height=1200)

In [12]:
# Mode shares faceted by home-zone household density bin, three facets per row.
plot_mode_choice(tour_mode_data, grp_var='household_density_bin', n_nol=3, height=600)