In [1]:
import os
import toml
import pandas as pd
import numpy as np
import plotly.express as px
import psrc_theme

# to show plotly figures in quarto HTML file
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook_connected"
pio.templates.default = "simple_white+psrc_color" # set plotly template

config = toml.load(os.path.join(os.getcwd(), 'validation_configuration.toml'))

## Person data

In [2]:
# read data

# model data
per_data_model = pd.read_parquet(config['p_model_persons']).reset_index()
# add weight to model data with all 1
per_data_model['hh_weight_2017_2019'] = np.repeat(1, len(per_data_model))

# survey data
per_data_survey = pd.read_csv(config['p_survey_persons']).groupby('person_id_elmer').first().reset_index() # remove duplicates
# unweighted survey data
per_data_survey_unweighted = per_data_survey.copy()
per_data_survey_unweighted['hh_weight_2017_2019'] = 1

In [3]:
print(f"person counts \n"
      f"- model results: {len(per_data_model)}\n"
      f"- survey results: {per_data_survey['hh_weight_2017_2019'].sum()}\n"
      f"- unweighted survey: {per_data_survey_unweighted['hh_weight_2017_2019'].sum()}\n"
      # f"group dividers:\n"
      # f"{var_group}"
      )

person counts 
- model results: 4053154
- survey results: 3190247.7283157064
- unweighted survey: 10169



## Person types

In [4]:
#| echo: true

ptype_cat = {1: "1: Full-Time Worker",
             2: "2: Part-Time Worker",
             3: "3: University Student",
             4: "4: Non-Working Adult Age <65",
             5: "5: Non-Working Adult Age 65+",
             6: "6: High School Student Age 16+",
             7: "7: Child Age 5-15",
             8: "8: Child Age 0-4"}

In [5]:
#| warning: false

def data_process(df: pd.DataFrame, data_source: str) -> pd.DataFrame:

    # add data source
    df['source'] = data_source
    # add person type description
    df['ptype_des'] = df['ptype'].map(ptype_cat)

    return df

# match columns and concat all source into hh_data
col_list = ['person_id', 'household_id', 'hh_weight_2017_2019',
            'ptype','cdap_activity']

# combine both sets of data

# combine both sets of data
per_data = pd.concat([data_process(per_data_model[col_list], "model results").copy(),
                      data_process(per_data_survey[col_list], "survey data").copy(),
                      data_process(per_data_survey_unweighted[col_list], "unweighted survey data").copy()])


# per_data



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

## Day activity pattern distribution

- **Mandatory**: the person engages in travel to at least one out-of-home mandatory activity - work, university, or school. The mandatory pattern may also include non-mandatory activities such as separate home-based tours or intermediate stops on mandatory tours.
- **Non-mandatory**: the person engages in only maintenance and discretionary tours, which, by definition, do not contain mandatory activities.
- **Home**: the person does not travel outside the home.

In [6]:
df_plot = per_data.groupby(['source','cdap_activity'])['hh_weight_2017_2019'].sum().reset_index()
df_plot['percentage'] = df_plot.groupby(['source'], group_keys=False)['hh_weight_2017_2019'].\
            apply(lambda x: x / float(x.sum()))

# df_plot
fig = px.bar(df_plot, x="cdap_activity", y="percentage", barmode="group",color="source",
            title="CDAP")
# fig.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
fig.update_layout(height=400, width=700, yaxis=dict(tickformat=".1%"))
fig.show()

# CDAP activity by person type

In [7]:
df_plot = per_data.groupby(['source','ptype_des','cdap_activity'])['hh_weight_2017_2019'].sum().reset_index()
df_plot['percentage'] = df_plot.groupby(['source','ptype_des'], group_keys=False)['hh_weight_2017_2019'].\
    apply(lambda x: x / float(x.sum()))

fig = px.bar(df_plot, x="cdap_activity", y="percentage", color="source",barmode="group",
                facet_col="ptype_des", facet_col_wrap=2,
                title="cdap_activity and person types")
fig.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
fig.for_each_yaxis(lambda a: a.update(tickformat = ".1%"))
fig.update_layout(height=900, width=800)
fig.show()