In [1]:
import polars as pl
import os
import pandas as pd
import numpy as np
import validation_data_input
import plotly.express as px
import toml
from pathlib import Path
import util
import psrc_theme

# to show plotly figures in quarto HTML file
import plotly.io as pio

# Default renderer: "plotly_mimetype+notebook_connected" combines two plotly renderers
# so figures are emitted in both forms.
pio.renderers.default = "plotly_mimetype+notebook_connected"
# Default template: "simple_white" combined with "psrc_color".
# NOTE(review): "psrc_color" is presumably registered by importing psrc_theme - confirm.
pio.templates.default = "simple_white+psrc_color" # set plotly template

In [2]:
# Legacy alternative (kept for reference): reuse the object stored by another notebook.
# %store -r validation_data
#
# config = validation_data.config.copy()
# hh = validation_data.hh.copy()
# person = validation_data.person.copy()

# Build the configuration directory from separate path components rather than a
# backslash-separated string literal: '\.' is an invalid escape sequence in a
# regular (non-raw) string, and a backslash is only a path separator on Windows.
config_dir = Path.cwd().joinpath('..', '..', '..', '..', 'configuration')
config = toml.load(config_dir / 'validation_configuration.toml')
input_config = toml.load(config_dir / 'input_configuration.toml')

# Request only the 'hh' and 'person' tables from the validation data helper.
data = util.ValidationData(config, input_config, ['hh', 'person'])

# The rest of the notebook works on pandas DataFrames.
hh = data.hh.to_pandas()
person = data.person.to_pandas()

In [3]:
# Try to load the PSRC household survey person table from Elmer.
# This is best-effort: on any failure fall back to an empty DataFrame, which the
# later cells detect through person_elmer.empty.
try:
    # str() keeps the query buildable when base_year is stored as an integer.
    survey_year = str(input_config['base_year'])
    person_elmer = validation_data_input.load_elmer_table(
        "HHSurvey.v_persons_labels",
        sql="SELECT * FROM HHSurvey.v_persons_labels"
            " WHERE survey_year in (" + survey_year + ")")
except Exception as err:
    # Catch Exception rather than using a bare except, so KeyboardInterrupt and
    # SystemExit still propagate, and report why the Elmer data is missing
    # instead of hiding the failure.
    print(f"Elmer person table not loaded ({type(err).__name__}: {err}); continuing without it.")
    person_elmer = pd.DataFrame()

Total Persons

In [4]:
# Weighted person total (sum of psexpfac) for each data source.
df = (
    person.groupby('source')['psexpfac']
    .sum()
    .reset_index()
    .rename(columns={'psexpfac': 'Total Persons'})
)

# When the Elmer person table was loaded, append its weighted total
# (sum of person_weight) as an extra "Full Survey Data" row.
if not person_elmer.empty:
    elmer_row = pd.DataFrame({
        'source': ['Full Survey Data'],
        'Total Persons': [person_elmer['person_weight'].sum()],
    })
    df = pd.concat([df, elmer_row], ignore_index=True)

# Display the totals as comma-separated whole numbers.
df['Total Persons'] = df['Total Persons'].map("{:,.0f}".format)
df

Unnamed: 0,source,Total Persons
0,model,5788853
1,survey,3782136
2,Full Survey Data,4221182


Person Types

In [5]:
# Readable labels for the numeric person type codes stored in person['pptyp'].
ptype_cat = {
    1: "1: full time worker",
    2: "2: part time worker",
    3: "3: non-worker age 65+",
    4: "4: other non-working adult",
    5: "5: university student",
    6: "6: grade school student/child age 16+",
    7: "7: child age 5-15",
    8: "8: child age 0-4",
}

# Series.map with a dict yields NaN for any code that is not a key of ptype_cat.
pptyp_codes = person['pptyp']
person['pptyp_label'] = pptyp_codes.map(ptype_cat)

In [6]:
# One grouped aggregation gives both the weighted total (sum of psexpfac) and the
# number of non-null psexpfac records for every source / person type pair.
# This replaces two identical groupbys that were merged back together.
df_plot = (
    person.groupby(['source', 'pptyp_label'])['psexpfac']
    .agg(['sum', 'count'])
    .rename(columns={'sum': 'psexpfac', 'count': 'sample count'})
    .reset_index()
)

# Share of each person type within its source. transform('sum') broadcasts the
# per-source total back onto every row, so no Python-level apply is needed.
source_totals = df_plot.groupby('source')['psexpfac'].transform('sum')
df_plot['percentage'] = df_plot['psexpfac'] / source_totals

# Keep the same column order the table had before this rewrite.
df_plot = df_plot[['source', 'pptyp_label', 'psexpfac', 'percentage', 'sample count']]

# Grouped bars: one bar per source for each person type; the sample count is
# shown on hover only.
fig = px.bar(df_plot.sort_values(by=['source']), x="pptyp_label", y="percentage", color="source",
             hover_data=['sample count'],
             barmode="group", title="person type")
fig.update_layout(height=400, width=700, font=dict(size=11),
                  yaxis=dict(tickformat=".2%"))
fig.show()