- distance bins from [workplace_location.csv](https://github.com/psrc/psrc_activitysim/blob/main/configs_dev/workplace_location.csv)



In [1]:
import os
import toml
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots

# to show plotly figures in quarto HTML file
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook_connected"

# Parse validation_configuration.toml from the current working directory, so the
# notebook has to be executed from the folder that contains that file.
config = toml.load(os.path.join(os.getcwd(), 'validation_configuration.toml'))

In [2]:
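# One-time setup, kept for reference: builds validation_data and persists it with
# %store so that a later cell can restore it via `%store -r validation_data`.
#import util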
# import toml
# import os 
# config = toml.load(os.path.join(os.getcwd(), 'validation_configuration.toml'))

# validation_data = util.ValidationData(config)

# %store validation_data

## Person data

In [3]:
# read data
# get shared data
%store -r validation_data

#config = toml.load(os.path.join(os.getcwd(), 'validation_configuration.toml'))

per_data = validation_data.persons_data_uncloned.copy()

del validation_data

n_model_results = len(per_data.loc[per_data['source']=='model results'])
n_survey_results = len(per_data.loc[per_data['source']=='survey data'])


print(f"person counts \n"
      f"- model results: {str(n_model_results)}\n"
      f"- survey results: {str(n_survey_results)}\n"
      # f"group dividers:\n"
      # f"{var_group}"
      )

person counts 
- model results: 4053154
- survey results: 7561



In [4]:
# Coarse distance bands, labelled after the util_dist_* terms they correspond to
per_data['distance_to_school_bin'] = pd.cut(
    per_data['distance_to_school'],
    bins=[0, 1, 2, 5, 15, 9999],
    labels=[
        'util_dist_0_1',
        'util_dist_1_2',
        'util_dist_2_5',
        'util_dist_5_15',
        'util_dist_15_up',
    ],
)

# Fine bands: 2-mile steps up to 60 miles, each labelled by its lower edge
max_bin = 60
bin_size = 2
per_data['d_school_bin_60mi'] = pd.cut(
    per_data['distance_to_school'],
    bins=np.arange(0, max_bin + bin_size, bin_size),
    labels=list(map(str, range(0, max_bin, bin_size))),
)

In [5]:
#| warning: false

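# Legacy preprocessing, retained for reference only (everything below is commented out);
# the same distance binning is applied to per_data in the preceding cell.
# process model and survey data for summary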
# def data_process(df: pd.DataFrame, data_source: str) -> pd.DataFrame:

#     # add data source
#     df['source'] = data_source
#     # distance to school bins from workplace_location.csv
#     df['distance_to_school_bin'] = pd.cut(df['distance_to_school'], bins=[0,1,2,5,15,9999],
#                                           labels=['util_dist_0_1', 'util_dist_1_2', 'util_dist_2_5',
#                                                   'util_dist_5_15', 'util_dist_15_up'])
#     # Create bins: bins of 2 miles up to 60 miles
#     max_bin = 60
#     bin_size = 2
#     df['d_school_bin_60mi'] = pd.cut(df['distance_to_school'], bins=np.arange(0, max_bin+bin_size, bin_size), labels=[str(i) for i in np.arange(0, max_bin, bin_size)])

#     return df

# # match columns and concat all source into hh_data
# col_list = ['person_id', 'household_id', 'hh_weight',
#             'is_student', 'is_gradeschool', 'is_highschool', 'is_university','school_segment', 'distance_to_school']

# # combine both sets of data
# per_data = pd.concat([data_process(per_data_model[col_list], "model results").copy(),
#                       data_process(per_data_survey[col_list], "survey data").copy()])


# per_data

## Distance to school

In [6]:
def _weighted_share(df: pd.DataFrame, group: str, bin_col: str) -> pd.DataFrame:
    """Return each bin's percentage share of summed person_weight, per source.

    Rows are kept only where the boolean column `group` is True and `bin_col`
    is not missing. The result has one row per (source, bin) with the columns
    'source', `bin_col`, 'person_weight' and 'percentage'.
    """
    selected = df.loc[df[group] & df[bin_col].notna()]
    # observed=False keeps bins without any rows in the result; this is the
    # historical pandas default for categorical keys, stated explicitly here.
    shares = (
        selected.groupby(['source', bin_col], observed=False)['person_weight']
        .sum()
        .reset_index()
    )
    source_totals = shares.groupby('source')['person_weight'].transform('sum')
    shares['percentage'] = 100 * shares['person_weight'] / source_totals
    return shares


def plot_distance(df:pd.DataFrame, group:str, title_name:str) -> None:
    """Print model person counts and plot distance-to-school distributions.

    Two figures are shown, both comparing the sources found in df['source']
    by their weighted percentage share:
      1. a grouped bar chart over 'distance_to_school_bin'
      2. a line chart over 'd_school_bin_60mi'

    Parameters
    ----------
    df : pd.DataFrame
        Person records. Must contain 'source', 'person_weight',
        'distance_to_school_bin', 'd_school_bin_60mi' and the `group` column.
    group : str
        Name of a boolean column selecting the persons to include.
    title_name : str
        Title used for both figures.
    """
    # Unweighted count of model persons per coarse distance band
    model_counts = df.loc[(df['source']=='model results') & (df[group]),['distance_to_school_bin']].value_counts()
    print(f"model person count =\n"
          f"{model_counts.sort_values()}")

    # Missing bins are filtered with .notna(); the former `!= float('nan')`
    # test was always True because NaN never compares equal to anything.
    band_shares = _weighted_share(df, group, 'distance_to_school_bin')
    fig1 = px.bar(band_shares, x='distance_to_school_bin', y="percentage", color="source", barmode="group",template="simple_white",
                 title=title_name)
    # Strip any "name=" prefix from annotation texts (does nothing if the figure has none)
    fig1.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
    fig1.update_layout(height=400, width=700, font=dict(size=11))
    fig1.show()

    # Built from the `df` argument; this previously read the global `per_data`,
    # so the line chart ignored whatever frame the caller passed in.
    fine_shares = _weighted_share(df, group, 'd_school_bin_60mi')
    fig2 = px.line(fine_shares, x='d_school_bin_60mi', y="percentage", color="source", template="simple_white",
                  title=title_name)
    fig2.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
    fig2.update_layout(height=400, width=700, font=dict(size=11))
    fig2.show()

# distance to school: all students
plot_distance(df=per_data, group='is_student', title_name="All students: distance to school")

model person count =
distance_to_school_bin
util_dist_15_up            41532
util_dist_1_2             225216
util_dist_2_5             227671
util_dist_5_15            265606
util_dist_0_1             299369
dtype: int64


In [7]:
# distance to school: grade school students only
plot_distance(df=per_data, group='is_gradeschool', title_name="grade school: distance to school")

model person count =
distance_to_school_bin
util_dist_15_up            10409
util_dist_0_1              92548
util_dist_2_5             104479
util_dist_5_15            108302
util_dist_1_2             108639
dtype: int64


In [8]:
# distance to school: high school students only
plot_distance(df=per_data, group='is_highschool', title_name="high school: distance to school")

model person count =
distance_to_school_bin
util_dist_15_up            2528
util_dist_0_1              8978
util_dist_1_2             37753
util_dist_2_5             64876
util_dist_5_15            95713
dtype: int64


In [9]:
# distance to school: university students only
plot_distance(df=per_data, group='is_university', title_name="university: distance to school")

model person count =
distance_to_school_bin
util_dist_0_1             16973
util_dist_1_2             19078
util_dist_15_up           28417
util_dist_2_5             37474
util_dist_5_15            58543
dtype: int64
