In [1]:
import polars as pl
import os
import pandas as pd
import numpy as np
from sqlalchemy import create_engine,text
import validation_data_input
import plotly.express as px
import toml
from pathlib import Path
import util
import psrc_theme

# to show plotly figures in quarto HTML file
import plotly.io as pio

pio.renderers.default = "plotly_mimetype+notebook_connected"
pio.templates.default = "simple_white+psrc_color" # set plotly template

In [2]:
# Load the validation and input configuration files from the shared
# configuration folder four levels above the notebook's working directory.
# The path is assembled from separate components instead of a single
# '..\..\..' string: that literal depended on the invalid "\." escape
# sequence and only resolved correctly on Windows.
config_dir = Path.cwd().joinpath('..', '..', '..', '..', 'configuration')
config = toml.load(config_dir / 'validation_configuration.toml')
input_config = toml.load(config_dir / 'input_configuration.toml')

# Household and land-use tables come from the project's ValidationData helper
# and are converted to pandas for the rest of the notebook.
data = util.ValidationData(config,input_config,['hh', 'land_use'])

hh = data.hh.to_pandas()
df_parcel = data.land_use.to_pandas()

# Parcel geography is read directly from the SQLite inputs database rather
# than from `data`. The table name is derived from the configured base year.
conn = create_engine('sqlite:///../../../../inputs/db/'+input_config['db_name'])
parcel_geog_query = text(f"SELECT * FROM parcel_{input_config['base_year']}_geography")
# Use the connection as a context manager so it is released after the read
# instead of being left open for the life of the kernel.
with conn.connect() as db_connection:
    parcel_geog = pd.read_sql(parcel_geog_query, con=db_connection)

In [3]:
# Try to load the household survey table from PSRC's Elmer database.
# The load is best-effort: if it fails for any reason, report the cause and
# fall back to an empty frame so that `hh_elmer` is always defined.
try:
    hh_elmer = validation_data_input.load_elmer_table(
        "HHSurvey.v_households_labels",
        sql="SELECT * FROM HHSurvey.v_households_labels"
            " WHERE survey_year in ("+input_config['base_year']+")",
    )
except Exception as elmer_error:
    # The fallback previously assigned `person_elmer`, leaving `hh_elmer`
    # undefined; a bare `except:` also hid the reason for the failure.
    print(f"Could not load household survey data from Elmer: {elmer_error!r}")
    hh_elmer = pd.DataFrame()

In [4]:
# Attach the land-use record and the parcel geography record of each
# household's home parcel. Left joins keep every household row even when a
# parcel has no match in either lookup table.
df_hh = (
    hh
    .merge(df_parcel, how='left', left_on='hhparcel', right_on='parcelid')
    .merge(parcel_geog, how='left', left_on='hhparcel', right_on='ParcelID')
)


Total Households

In [5]:
# Sum of household expansion factors per data source, formatted with
# thousands separators for display.
df = df_hh.groupby('source')['hhexpfac'].sum().reset_index()
df['Total Households'] = df['hhexpfac'].map("{:,.0f}".format)
df = df[['source','Total Households']]

# Append the weighted total of the full survey table only when that table
# actually carries weights; an empty or weight-less frame would otherwise
# raise a KeyError here.
if 'hh_weight' in hh_elmer.columns:
    total_hh_elmer = hh_elmer['hh_weight'].sum()
    full_survey_row = pd.DataFrame(data = {'source': ['Full Survey Data'],
                                           'Total Households': [f"{total_hh_elmer:,.0f}"]})
    # ignore_index gives the combined table a clean 0..n-1 index instead of
    # repeating label 0 for the appended row.
    df = pd.concat([df, full_survey_row], ignore_index=True)
else:
    total_hh_elmer = None

df

Unnamed: 0,source,Total Households
0,model,2421059
1,survey,1690793
0,Full Survey Data,1733407


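- Income, household density, and employment density are each grouped into five classes (very low, low, medium, medium-high, and high), using the 12.5th, 25th, 50th, and 75th percentiles of the model households as cut points.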

In [6]:
# Cut points for grouping income, hh density, and employment density:
# the 12.5th, 25th, 50th and 75th percentiles of the model households.
# Four cut points split each variable into five groups.
model_rows = df_hh['source'] == 'model'
grouping_columns = ['hhincome','emptot_1','hh_1']
var_group = df_hh.loc[model_rows, grouping_columns].quantile([.125, .25, .50, .75])

var_group

Unnamed: 0,hhincome,emptot_1,hh_1
0.125,29544.0,1.231649,114.835858
0.25,53659.0,27.619542,228.695183
0.5,102045.0,334.533535,552.434048
0.75,174711.0,1653.558366,1550.123455


In [7]:
# data manipulation
# Assign every household to one of five classes for income, hh density and
# employment density, using the cut points stored in var_group.
group_labels = ['very low', 'low', 'medium', 'medium-high', 'high']
grouped_variables = [
    ('hhincome', 'hhincome_group'),      # income group
    ('hh_1', 'hh_density_group'),        # hh density group
    ('emptot_1', 'emp_density_group'),   # employment density group
]
for value_column, group_column in grouped_variables:
    # Open-ended outer bins: with the former +/-9999999.0 sentinels any value
    # beyond them fell outside every bin and silently became NaN.
    bin_edges = [-np.inf] + var_group[value_column].tolist() + [np.inf]
    df_hh[group_column] = pd.cut(df_hh[value_column], bins=bin_edges, labels=group_labels)


In [8]:
def plot_hh_stat(df:pd.DataFrame, var:str, title_cat:str, wid = 700):
    """Show a grouped bar chart of the household share per category, by source.

    For every combination of ``source`` and ``var`` the ``hhexpfac`` values
    are summed and divided by the ``hhexpfac`` total of that source, so the
    bars of each source add up to 100%. The number of non-null ``hhexpfac``
    rows per combination is attached as ``sample count`` and shown on hover.

    Args:
        df: Frame containing the columns ``source``, ``hhexpfac`` and ``var``.
        var: Name of the column whose categories go on the x axis.
        title_cat: Figure title.
        wid: Figure width in pixels (the height is fixed at 400).

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    keys = ['source', var]

    # Weighted total per source/category and its share of the source total.
    weighted = df.groupby(keys)['hhexpfac'].sum().reset_index()
    source_totals = weighted.groupby('source')['hhexpfac'].transform('sum')
    weighted['percentage'] = weighted['hhexpfac'] / source_totals

    # Unweighted number of records behind each bar.
    record_counts = (
        df.groupby(keys)['hhexpfac']
        .count()
        .reset_index()
        .rename(columns={'hhexpfac': 'sample count'})
    )
    summary = weighted.merge(record_counts, on=keys)

    fig = px.bar(
        summary.sort_values(by=['source']),
        x=var,
        y="percentage",
        color="source",
        hover_data=['sample count'],
        barmode="group",
        title=title_cat,
    )
    fig.update_layout(
        height=400,
        width=wid,
        font=dict(size=11),
        yaxis=dict(tickformat=".2%"),
    )
    fig.show()

## Demographics

In [9]:
# Share of households in each income group (quantile bins of hhincome), by source
plot_hh_stat(df_hh, 'hhincome_group', 'household income')







## Home Location

In [10]:
# Unweighted number of model household records per home county
model_households = df_hh.loc[df_hh['source']=="model"]
model_households['CountyName'].value_counts()

CountyName
King              1288223
Pierce             500736
Snohomish          482010
Kitsap             150085
Outside Region          5
Name: count, dtype: int64

In [11]:
# Share of households per value of CountyName, compared across sources
plot_hh_stat(df_hh, 'CountyName', 'home county')

In [12]:
# Share of households per value of district_name; drawn 900 px wide instead of
# the 700 px default
plot_hh_stat(df_hh, 'district_name', 'home district', wid=900)

In [13]:
# Share of households in each hh density group (quantile bins of hh_1), by source
plot_hh_stat(df_hh, 'hh_density_group', 'home density')





