- distance bins from [workplace_location.csv](https://github.com/psrc/psrc_activitysim/blob/main/configs_dev/workplace_location.csv)



In [1]:
import os
import toml
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots

# to show plotly figures in quarto HTML file
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook_connected"

config = toml.load(os.path.join(os.getcwd(), 'validation_configuration.toml'))

## Person data

In [2]:
# read data

# model data
per_data_model = pd.read_parquet(config['p_model_persons']).reset_index()
# add weight to model data with all 1
per_data_model['hh_weight_2017_2019'] = np.repeat(1, len(per_data_model))


# survey data
per_data_survey = pd.read_csv(config['p_survey_persons']).groupby('person_id_elmer').first().reset_index() # remove duplicates

print(f"person counts \n"
      f"- model results: {len(per_data_model)}\n"
      f"- survey results: {per_data_survey['hh_weight_2017_2019'].sum()}\n"
      # f"group dividers:\n"
      # f"{var_group}"
      )

person counts 
- model results: 4053154
- survey results: 3190247.7283157064



In [3]:
#| warning: false

# process model and survey data for summary
def data_process(df: pd.DataFrame, data_source: str) -> pd.DataFrame:

    # add data source
    df['source'] = data_source
    # distance to school bins from workplace_location.csv
    df['distance_to_school_bin'] = pd.cut(df['distance_to_school'], bins=[0,1,2,5,15,9999],
                                          labels=['util_dist_0_1', 'util_dist_1_2', 'util_dist_2_5',
                                                  'util_dist_5_15', 'util_dist_15_up'])
    # Create bins: bins of 2 miles up to 60 miles
    max_bin = 60
    bin_size = 2
    df['d_school_bin_60mi'] = pd.cut(df['distance_to_school'], bins=np.arange(0, max_bin+bin_size, bin_size), labels=[str(i) for i in np.arange(0, max_bin, bin_size)])

    return df

# match columns and concat all source into hh_data
col_list = ['person_id', 'household_id', 'hh_weight_2017_2019',
            'is_student', 'is_gradeschool', 'is_highschool', 'is_university','school_segment', 'distance_to_school']

# combine both sets of data
per_data = pd.concat([data_process(per_data_model[col_list], "model results").copy(),
                      data_process(per_data_survey[col_list], "survey data").copy()])


# per_data



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

## Distance to school

In [4]:
def plot_distance(df:pd.DataFrame, group:str, title_name:str):
    count = df.loc[(df['source']=='model results') & (df[group]),['distance_to_school_bin']].value_counts()
    print(f"model person count =\n"
          f"{count.sort_values()}")
    df_plot = df.loc[(df[group]) & (df['distance_to_school_bin'] != float('nan'))].groupby(['source','distance_to_school_bin'])['hh_weight_2017_2019'].sum().reset_index()
    df_plot['percentage'] = df_plot.groupby(['source'], group_keys=False)['hh_weight_2017_2019'].\
            apply(lambda x: 100 * x / float(x.sum()))
    # df_plot
    #
    fig1 = px.bar(df_plot, x='distance_to_school_bin', y="percentage", color="source", barmode="group",template="simple_white",
                 title=title_name)
    fig1.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
    fig1.update_layout(height=400, width=700, font=dict(size=11))
    fig1.show()

    df_plot = per_data.loc[(per_data[group]) & (per_data['d_school_bin_60mi'].notna())].groupby(['source','d_school_bin_60mi'])['hh_weight_2017_2019'].sum().reset_index()
    df_plot['percentage'] = df_plot.groupby(['source'], group_keys=False)['hh_weight_2017_2019'].\
                apply(lambda x: 100 * x / float(x.sum()))
    # df_plot
    #
    fig2 = px.line(df_plot, x='d_school_bin_60mi', y="percentage", color="source", template="simple_white",
                  title=title_name)
    fig2.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
    fig2.update_layout(height=400, width=700, font=dict(size=11))
    fig2.show()

# distance to school: grade school
plot_distance(per_data,'is_student',"All students: distance to school")

model person count =
distance_to_school_bin
util_dist_15_up            23859
util_dist_0_1             181303
util_dist_1_2             236331
util_dist_2_5             281980
util_dist_5_15            335921
dtype: int64


In [5]:
# distance to school: grade school
plot_distance(per_data,'is_gradeschool',"grade school: distance to school")

model person count =
distance_to_school_bin
util_dist_15_up             3164
util_dist_0_1             152497
util_dist_1_2             175169
util_dist_2_5             176285
util_dist_5_15            181946
dtype: int64


In [6]:
# distance to school: high school
plot_distance(per_data,'is_highschool',"high school: distance to school")

model person count =
distance_to_school_bin
util_dist_15_up             502
util_dist_0_1              9365
util_dist_1_2             39555
util_dist_2_5             66791
util_dist_5_15            93635
dtype: int64


In [7]:
# distance to school: university
plot_distance(per_data,'is_university',"university: distance to school")

model person count =
distance_to_school_bin
util_dist_0_1             19441
util_dist_15_up           20193
util_dist_1_2             21607
util_dist_2_5             38904
util_dist_5_15            60340
dtype: int64
