In [1]:
import os
import pandas as pd
import numpy as np
from sqlalchemy import create_engine,text
# import validation_data_input
import plotly.express as px
import toml
import polars as pl
from pathlib import Path
import util
import psrc_theme

# to show plotly figures in quarto HTML file
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook_connected"
pio.templates.default = "simple_white+psrc_color" # set plotly template

In [2]:
# %store -r validation_data

# config = validation_data.config.copy()
# input_config = validation_data.input_config.copy()
# person = validation_data.person.copy()
# hh = validation_data.hh.copy()
# df_parcel = validation_data.land_use.copy()
# parcel_geog = validation_data.parcel_geog.copy()

# Locate the shared configuration directory four levels above the working
# directory. The path is assembled from separate components instead of the
# literal '..\..\..\..\configuration': '\.' is an invalid string escape that
# newer Python versions warn about, and a backslash is only a path separator
# on Windows.
config_dir = Path.cwd() / '..' / '..' / '..' / '..' / 'configuration'
config = toml.load(config_dir / 'validation_configuration.toml')
input_config = toml.load(config_dir / 'input_configuration.toml')

# Load the household, person and land-use tables through the project helper.
data = util.ValidationData(config,input_config,['hh', 'person', 'land_use'])

# Convert each table to pandas for the merges and plots that follow.
hh = data.hh.to_pandas()
person = data.person.to_pandas()
# parcel_geog = data.parcel_geog.to_pandas()
df_parcel = data.land_use.to_pandas()

# Read the base-year parcel geography lookup from the model's SQLite input database.
conn = create_engine('sqlite:///'+config['model_dir']+'/inputs/db/'+input_config['db_name'])

# The table name embeds the base year; str() keeps the concatenation valid
# whether the TOML value was written as a string or as an integer.
parcel_geog_table = "parcel_" + str(input_config["base_year"]) + "_geography"

# Open the connection in a context manager so it is released once the query
# has been read, instead of leaving an unclosed connection behind.
with conn.connect() as db_connection:
    parcel_geog = pd.read_sql(text("SELECT * FROM " + parcel_geog_table), con=db_connection)

In [3]:
# Attach the land-use record of each household's parcel (hhparcel -> parcelid).
# The left join keeps every household row, matched or not.
# NOTE: `hh` is rebound here, so re-running this cell without reloading `hh`
# merges the parcel columns a second time.
hh = hh.merge(df_parcel, left_on='hhparcel', right_on='parcelid', how='left')

In [4]:
# Group income, hh density, and employment density into 4 groups
# The 25th/50th/75th percentile break points are computed from the model
# households only, one row per household record (no weighting is applied).
var_group = hh.loc[hh['source'] == 'model',['hhincome','emptot_1','hh_1']].quantile([.25, .50, .75])

# var_group

In [5]:
# Classify every household into four groups per measure. Each measure is cut
# on the break points stored in var_group, with wide sentinel values closing
# the lowest and highest bins.
quartile_labels = ['low', 'medium', 'medium-high', 'high']
grouped_columns = {
    'hhincome_group': 'hhincome',      # income groups
    'hh_density_group': 'hh_1',        # hh density groups
    'emp_density_group': 'emptot_1',   # employment density groups
}
for group_col, value_col in grouped_columns.items():
    bin_edges = [-9999999.0, *var_group[value_col].tolist(), 9999999.0]
    hh[group_col] = pd.cut(hh[value_col], bins=bin_edges, labels=quartile_labels)


In [6]:
# Attach household attributes to each person record, matching on household
# number within the same data source; the left join keeps every person row.
person = person.merge(hh, on=['hhno','source'], how='left')

In [7]:
# Join the parcel geography lookup twice: first on the work parcel (pwpcl),
# then on the home parcel (hhparcel). In the second merge the geography
# columns added by the first merge collide with the incoming ones, so the
# existing (work) columns get the '_work' suffix and the new (home) columns
# get '_home'.
# NOTE(review): this assumes the parcel_geog column names do not already
# collide with person/hh columns in the first merge — confirm.
person = person.merge(parcel_geog, left_on='pwpcl', right_on='ParcelID', how='left')
person = person.merge(parcel_geog, left_on='hhparcel', right_on='ParcelID', how='left', suffixes=['_work', '_home'])

In [8]:
# Keep only persons with a positive worker-type code (pwtyp > 0); the copy
# lets later cells add columns without touching `person`.
df_workers = person.loc[person['pwtyp']>0, :].copy()

In [9]:
# Regional mean distance to work by car for each data source, weighted by
# psexpfac. Only workers with a positive pwaudist are included.
df = (
    df_workers.loc[df_workers['pwaudist'] > 0, :]
    .assign(wt_dist=lambda workers: workers['pwaudist'] * workers['psexpfac'])
    .groupby(['source'])[['wt_dist', 'psexpfac']]
    .sum()
    .reset_index()
)
# Weighted mean = sum(distance * weight) / sum(weight)
df['Mean Distance to Work by Car (pwaudist)'] = df['wt_dist'].div(df['psexpfac'])
df[['source','Mean Distance to Work by Car (pwaudist)']]

Unnamed: 0,source,Mean Distance to Work by Car (pwaudist)
0,model,10.480631
1,survey,11.319597


In [10]:

def plot_work_location(df:pd.DataFrame, var:str, title_cat:str, sub_name:str):
    """Show a grouped bar chart of the weighted mean car distance to work.

    Rows with ``pwaudist`` <= 0 are excluded. For every combination of
    ``source`` and ``var`` the mean of ``pwaudist`` weighted by ``psexpfac``
    is plotted, and the number of contributing records is exposed in the
    hover text as "sample count".

    Parameters
    ----------
    df : pd.DataFrame
        Person records containing 'source', 'pwaudist', 'psexpfac' and ``var``.
    var : str
        Column used for the x axis categories.
    title_cat : str
        Text appended to "Work Distance by " in the figure title.
    sub_name : str
        Title of the x axis.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    df = df[df['pwaudist'] > 0].copy()
    df['wt_dist'] = df['pwaudist']*df['psexpfac']

    # Aggregate sums and record counts in a single pass. The previous separate
    # count frame also carried a 'wt_dist' column, which left confusing
    # 'wt_dist_x'/'wt_dist_y' columns behind after the merge.
    # observed=False spells out the grouping behaviour the notebook relied on
    # for categorical `var` columns (all categories are kept).
    df_plot = (
        df.groupby(['source', var], observed=False)
        .agg(**{
            'wt_dist': ('wt_dist', 'sum'),
            'psexpfac': ('psexpfac', 'sum'),
            'sample count': ('psexpfac', 'count'),
        })
        .reset_index()
    )
    # Weighted mean = sum(distance * weight) / sum(weight)
    df_plot['average_wt_pwaudist'] = df_plot['wt_dist']/df_plot['psexpfac']

    # NOTE(review): 'source' is listed in `y` as well as in `color`; plotly
    # presumably treats it as an id column only. Left unchanged so the
    # rendered figure keeps its current axis labelling — confirm and simplify.
    fig = px.bar(df_plot, x=var, y=['average_wt_pwaudist', 'source'], color="source",
                    barmode="group",
                     hover_data=['sample count'],
                    title="Work Distance by "+ title_cat)
    fig.for_each_annotation(lambda a: a.update(text = sub_name + "=<br>" + a.text.split("=")[-1]))
    fig.update_xaxes(title_text=sub_name)
    fig.update_layout(height=400, width=800, font=dict(size=11),
                      yaxis=dict(tickformat=".2f"))
    fig.for_each_yaxis(lambda a: a.update(tickformat = ".2f"))
    fig.show()

## Work Location by Workplace Geography

In [11]:
# Mean car distance to work by the county of the work parcel, per data source.
plot_work_location(df_workers, 'CountyName_work', 'Work County', 'County')

In [12]:
# Mean car distance to work by the regional geography of the work parcel.
plot_work_location(df_workers, 'rg_proposed_work', 'Regional Geography', 'Regional Geography')

## Workplace by Home Geography

In [13]:
# Mean car distance to work by the county of the home parcel, per data source.
plot_work_location(df_workers, 'CountyName_home', 'Home County', 'County')

In [14]:
# Mean car distance to work by the regional geography of the home parcel.
plot_work_location(df_workers, 'rg_proposed_home', 'Home Regional Geography', 'Regional Geography')

## Workplace Location by Person/Household Characteristics

In [15]:
# Mean car distance to work by worker-type code (pwtyp); the x axis shows the raw codes.
plot_work_location(df_workers, 'pwtyp', 'Employment Type', 'employment type')

In [16]:
# Mean car distance to work by household income quartile group.
plot_work_location(df_workers, 'hhincome_group', 'Income', 'Income Group')

In [17]:
# Mean car distance to work by household density group (quartiles of hh_1 at the home parcel).
plot_work_location(df_workers, 'hh_density_group', 'Household Density', 'Household Density Group')

In [18]:
# Mean car distance to work by employment density group (quartiles of emptot_1 at the home parcel).
plot_work_location(df_workers, 'emp_density_group', 'Employment Density', 'Employment Density Group')

In [19]:
# Coarse distance-to-work bins (the source comment cites workplace_location.csv
# as the origin of these break points). Intervals are right-closed, so a
# pwaudist of exactly 0 falls outside every bin and becomes NaN.
coarse_edges = [0,1,2,5,15,9999]
coarse_labels = ['0-1', '1-2', '2-5', '5-15', '15+']
df_workers['distance_to_work_bin'] = pd.cut(
    df_workers['pwaudist'], bins=coarse_edges, labels=coarse_labels
)

# Fine bins: 2-mile steps up to 60 miles, each labelled by its lower edge.
# Distances above 60 miles are left unbinned (NaN).
max_bin = 60
bin_size = 2
fine_edges = np.arange(0, max_bin+bin_size, bin_size)
fine_labels = [str(lower_edge) for lower_edge in fine_edges[:-1]]
df_workers['d_work_bin_60mi'] = pd.cut(
    df_workers['pwaudist'], bins=fine_edges, labels=fine_labels
)

## Distance Bins

In [20]:
def plot_distance(df:pd.DataFrame, group:str, title_name:str):
    """Plot the weighted distribution of distance to work for flagged rows.

    Two figures are shown, each comparing the data sources in ``source``:
    a grouped bar chart over ``distance_to_work_bin`` and a line chart over
    ``d_work_bin_60mi``. Values are the percentage of summed ``psexpfac``
    falling in each bin, computed separately for each source.

    Parameters
    ----------
    df : pd.DataFrame
        Records containing 'source', 'psexpfac', 'distance_to_work_bin',
        'd_work_bin_60mi' and the ``group`` column.
    group : str
        Column used as a row filter: rows whose value is missing or zero are
        excluded.
    title_name : str
        Title used for both figures.

    Returns
    -------
    None
        Both figures are displayed with ``show()``.
    """
    # Build an explicit boolean filter. The previous `mask & df[group]` is a
    # bitwise AND when the column holds integers, so a code of 2 combined
    # with True gave 0 and those rows were silently dropped.
    in_group = df[group].notna() & df[group].ne(0)
    df_group = df.loc[in_group]

    def _bin_shares(bin_col: str) -> pd.DataFrame:
        """Weighted share (in percent) and record count per source and bin."""
        # `!= float('nan')` is always True, so use notna() to drop unbinned rows.
        binned = df_group.loc[df_group[bin_col].notna()]
        shares = (
            binned.groupby(['source', bin_col], observed=False)
            .agg(**{
                'psexpfac': ('psexpfac', 'sum'),
                # counted on the same filtered rows as the weighted sum
                'sample count': ('psexpfac', 'count'),
            })
            .reset_index()
        )
        source_total = shares.groupby('source')['psexpfac'].transform('sum')
        shares['percentage'] = 100 * shares['psexpfac'] / source_total
        return shares

    # plot1: coarse bins as grouped bars
    df_plot = _bin_shares('distance_to_work_bin')
    fig1 = px.bar(df_plot, x='distance_to_work_bin', y="percentage", color="source", barmode="group",hover_data=['sample count'],
                  title=title_name)
    fig1.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
    fig1.update_layout(height=400, width=700, font=dict(size=11))
    fig1.show()

    # plot2: fine bins as lines
    df_plot = _bin_shares('d_work_bin_60mi')
    fig2 = px.line(df_plot, x='d_work_bin_60mi', y="percentage", color="source",
                   title=title_name)
    fig2.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
    fig2.update_layout(height=400, width=700, font=dict(size=11))
    fig2.show()

# distance to work
# 'pwtyp' is passed as the row-filter column of plot_distance.
plot_distance(df_workers,'pwtyp',"worker: distance to work (miles)")

In [21]:
def plot_group(df:pd.DataFrame, group:str, title_name:str, height:int = 900):
    """Show faceted bar charts of the distance-to-work bin shares.

    For every combination of ``source`` and ``group`` the share of summed
    ``psexpfac`` in each ``distance_to_work_bin`` is plotted, with one facet
    per ``group`` value (two facets per row).

    Parameters
    ----------
    df : pd.DataFrame
        Records containing 'source', 'psexpfac', 'distance_to_work_bin' and
        the ``group`` column.
    group : str
        Column that defines the facets.
    title_name : str
        Text appended to "Distance to work by " in the figure title.
    height : int, default 900
        Figure height in pixels.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # Use the frame that was passed in; the previous body ignored `df` and
    # always read the notebook-level `df_workers`.
    df_plot = (
        df.groupby(['source', group, 'distance_to_work_bin'], observed=False)
        .agg(**{
            'psexpfac': ('psexpfac', 'sum'),
            'sample count': ('psexpfac', 'count'),
        })
        .reset_index()
    )
    # Share of each bin within its (source, group) total, kept as a fraction
    # because the axis is formatted as a percentage.
    group_total = df_plot.groupby(['source', group], observed=False)['psexpfac'].transform('sum')
    df_plot['percentage'] = df_plot['psexpfac'] / group_total

    fig = px.bar(df_plot, x="distance_to_work_bin", y="percentage", color="source",barmode="group",
                facet_col=group, facet_col_wrap=2, hover_data=['sample count'],
                title="Distance to work by "+title_name)
    fig.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
    fig.update_layout(height=height, width=700, font=dict(size=11))
    # Apply the percent format to every facet's y axis, not only the first.
    fig.for_each_yaxis(lambda a: a.update(tickformat = ".0%"))
    fig.show()

def plot_group_line(df:pd.DataFrame, group:str, title_name:str, height:int = 400):
    """Show faceted line charts of the 2-mile distance-to-work bin shares.

    For every combination of ``source`` and ``group`` the share of summed
    ``psexpfac`` in each ``d_work_bin_60mi`` bin is plotted, with one facet
    per ``group`` value (two facets per row).

    Parameters
    ----------
    df : pd.DataFrame
        Records containing 'source', 'psexpfac', 'd_work_bin_60mi' and the
        ``group`` column.
    group : str
        Column that defines the facets.
    title_name : str
        Text appended to "Distance to work by " in the figure title.
    height : int, default 400
        Figure height in pixels.

    Returns
    -------
    None
        The figure is displayed with ``fig2.show()``.
    """
    # Use the frame that was passed in; the previous body ignored `df` and
    # always read the notebook-level `df_workers`.
    df_plot = (
        df.groupby(['source', group, 'd_work_bin_60mi'], observed=False)['psexpfac']
        .sum()
        .reset_index()
    )
    # Share of each bin within its (source, group) total, kept as a fraction
    # because the axis is formatted as a percentage.
    group_total = df_plot.groupby(['source', group], observed=False)['psexpfac'].transform('sum')
    df_plot['percentage'] = df_plot['psexpfac'] / group_total

    fig2 = px.line(df_plot, x='d_work_bin_60mi', y="percentage", color="source",
                facet_col=group, facet_col_wrap=2,
                title="Distance to work by "+title_name)
    fig2.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
    fig2.update_layout(height=height, width=700, font=dict(size=11))
    # Apply the percent format to every facet's y axis, not only the first.
    fig2.for_each_yaxis(lambda a: a.update(tickformat = ".0%"))
    fig2.show()

In [22]:
# Label the worker-type codes for display; any code other than 1 or 2 maps to NaN.
df_workers['worker_type'] = df_workers.pwtyp.map({1: 'full-time', 2: 'part-time'})
# Distance-to-work bin shares by worker type (bar chart).
plot_group(df_workers, 'worker_type', 'worker type', height= 400)

In [23]:
# Same breakdown by worker type on the 2-mile bins (line chart).
plot_group_line(df_workers, 'worker_type', 'worker type')

In [24]:
# Distance-to-work bin shares by household income group; 500 is the figure height.
plot_group(df_workers, 'hhincome_group', 'income group',500)

In [25]:
# Same breakdown by household income group on the 2-mile bins (line chart).
plot_group_line(df_workers, 'hhincome_group', 'household income',500)

In [26]:
# Distance-to-work bin shares by household density group; 500 is the figure height.
plot_group(df_workers, 'hh_density_group', 'density group',500)

In [27]:
# Same breakdown by household density group on the 2-mile bins (line chart).
plot_group_line(df_workers, 'hh_density_group', 'household density',500)

In [28]:
# Distance-to-work bin shares by county of the work parcel; 500 is the figure height.
plot_group(df_workers, 'CountyName_work', 'work county',500)

In [29]:
# Same breakdown by county of the work parcel on the 2-mile bins (line chart).
plot_group_line(df_workers, 'CountyName_work', 'Work County',500)

In [30]:
# Distance-to-work bin shares by regional geography of the work parcel; 700 is the figure height.
plot_group(df_workers, 'rg_proposed_work', 'work geography',700)

In [31]:
# Same breakdown by regional geography of the work parcel on the 2-mile bins (line chart).
plot_group_line(df_workers, 'rg_proposed_work', 'Work Geography',700)

## Origin-Destination Flows

In [32]:
# Load the observed LEHD district-to-district flows from the model input database.
conn = create_engine('sqlite:///'+config['model_dir']+'/inputs/db/'+input_config['db_name'])
# Open the connection in a context manager so it is released after the read.
with conn.connect() as db_connection:
    df_lehd = pd.read_sql(text("SELECT * FROM observed_lehd_district_flows"), con=db_connection)

# Keep only the rows prepared for the model base year. The explicit copy makes
# the filtered frame independent, so adding the 'source' column below does not
# write into a slice of the query result (SettingWithCopyWarning).
df_lehd = df_lehd[df_lehd['model_year'].astype('int') == int(input_config['base_year'])].copy()
df_lehd['source'] = 'LEHD'
df_lehd

Unnamed: 0,home_dist,work_dist,S000,model_year,data_year,source
0,0,0,1030967,2023,2021,LEHD
1,0,1,19431,2023,2021,LEHD
2,0,2,18937,2023,2021,LEHD
3,0,3,12253,2023,2021,LEHD
4,0,4,29293,2023,2021,LEHD
...,...,...,...,...,...,...
139,11,7,3304,2023,2021,LEHD
140,11,8,4982,2023,2021,LEHD
141,11,9,1882,2023,2021,LEHD
142,11,10,2695,2023,2021,LEHD


In [33]:
# Display labels for district ids 1 through 10. Ids that are not listed here
# (for example 11 in the LEHD table) map to NaN.
dist_name = {
    1: '1: Suburban Snohomish',
    2: '2: Everett-Lynwood-Edmonds',
    3: '3: North Seattle-Shoreline',
    4: '4: Seattle CBD',
    5: '5: West-South Seattle',
    6: '6: East Side',
    7: '7: Renton-FedWay-Kent',
    8: '8: Tacoma',
    9: '9: Kitsap',
    10: '10: South Pierce',
}

# Add the label columns for the home and the work end of every LEHD flow.
for id_col, name_col in (('home_dist', 'home_dist_name'), ('work_dist', 'work_dist_name')):
    df_lehd[name_col] = df_lehd[id_col].map(dist_name)

In [34]:
# Model flows: summed psexpfac for each home-district / work-district pair.
# Select 'psexpfac' before summing; the previous form summed every column of
# `person` (including text columns) and only then kept this one, which is slow
# and can fail on non-numeric columns. Rows with a missing district are
# dropped by the groupby.
df_model = (
    person[person['source'] == 'model']
    .groupby(['District_home','District_work'])[['psexpfac']]
    .sum()
    .reset_index()
)
df_model['source'] = 'model'

# District ids are cast to int so they match the integer keys of dist_name.
df_model[['District_home','District_work']] = df_model[['District_home','District_work']].astype('int')
df_model['home_dist_name'] = df_model['District_home'].map(dist_name)
df_model['work_dist_name'] = df_model['District_work'].map(dist_name)

# Give both sources the same measure name. rename() returns a new frame, which
# avoids in-place edits on a frame that may itself be a filtered slice.
df_model = df_model.rename(columns={'psexpfac': 'workers_total'})
df_lehd = df_lehd.rename(columns={'S000': 'workers_total'})

# Drop LEHD data to/from external zones
df_lehd = df_lehd[(df_lehd['home_dist'] > 0) & (df_lehd['work_dist'] > 0)].copy()

In [35]:
# Home-district (rows) by work-district (columns) matrices of total workers,
# built the same way for the model and for LEHD.
flow_matrix_kwargs = dict(
    index='home_dist_name',
    columns='work_dist_name',
    values='workers_total',
    aggfunc='sum',
)
_df_model = df_model.pivot_table(**flow_matrix_kwargs)
_df_lehd = df_lehd.pivot_table(**flow_matrix_kwargs)

In [36]:

# Total workers by work district, LEHD next to the model.
var = 'work_dist_name'
df = pd.concat([df_lehd,df_model])
df_plot = df.groupby(['source',var])[['workers_total']].sum().reset_index()

# Rows are ordered by source (descending) before plotting; the x axis follows
# the district order defined in dist_name.
bars_by_source = df_plot.sort_values(by='source', ascending=False)
fig = px.bar(
    bars_by_source,
    x=var,
    y=['workers_total', 'source'],
    color="source",
    barmode="group",
    title="Work District Location",
)
fig.update_layout(
    height=400,
    width=800,
    font=dict(size=11),
    xaxis=dict(categoryarray= list(dist_name.values())),
    yaxis=dict(tickformat=".0f"),
)
fig.show()

In [37]:

# Total workers by home district, LEHD next to the model.
var = 'home_dist_name'
df = pd.concat([df_model,df_lehd])
df_plot = df.groupby(['source',var])[['workers_total']].sum().reset_index()

# Rows are ordered by source (descending) before plotting; the x axis follows
# the district order defined in dist_name.
bars_by_source = df_plot.sort_values(by='source', ascending=False)
fig = px.bar(
    bars_by_source,
    x=var,
    y=['workers_total', 'source'],
    color="source",
    barmode="group",
    title="Home District",
)
fig.update_layout(
    height=400,
    width=800,
    font=dict(size=11),
    xaxis=dict(categoryarray= list(dist_name.values())),
    yaxis=dict(tickformat=".0f"),
)
fig.show()

In [38]:
# _df_model/_df_model['workers_total'].sum()
# df_model_dist = _df_model/_df_model.sum().sum()
# df_lehd_dist = _df_lehd/_df_lehd.sum().sum()

#### District Distribution: Model - LEHD

In [39]:
# seaborn and matplotlib are only referenced by the commented-out styling code
# further down in this cell.
import seaborn as sns
import matplotlib
# Relative difference of the model flow matrix from LEHD, cell by cell:
# (model - LEHD) / LEHD.
df = (_df_model-_df_lehd)/_df_lehd

# NOTE(review): the cell displays the LEHD flow table, not the relative
# difference `df` computed above, and the styled table is commented out —
# confirm which output is intended under this heading.
df_lehd
# # get max and min for color ramp
# abs_max = df.abs().values.max()
# abs_min = -abs_max

# def make_pretty(styler):
    
#     def background_with_norm(s):
#         cm = sns.diverging_palette(10, 240, n=9, as_cmap=True)
#         norm = matplotlib.colors.TwoSlopeNorm(vmin=abs_min, vcenter=0, vmax=abs_max)
#         return ['background-color: {:s}'.format(matplotlib.colors.to_hex(c.flatten())) for c in cm(norm(s.values))]

#     styler.set_caption("District distribution")
#     styler.format("{:.2%}")
#     styler.apply(background_with_norm)
#     styler.set_properties(**{'color': 'black'})
#     return styler

# df[list(dist_name.values())].reindex(list(dist_name.values())).style.pipe(make_pretty)

Unnamed: 0,home_dist,work_dist,workers_total,model_year,data_year,source,home_dist_name,work_dist_name
13,1,1,55803,2023,2021,LEHD,1: Suburban Snohomish,1: Suburban Snohomish
14,1,2,50851,2023,2021,LEHD,1: Suburban Snohomish,2: Everett-Lynwood-Edmonds
15,1,3,12434,2023,2021,LEHD,1: Suburban Snohomish,3: North Seattle-Shoreline
16,1,4,20744,2023,2021,LEHD,1: Suburban Snohomish,4: Seattle CBD
17,1,5,11215,2023,2021,LEHD,1: Suburban Snohomish,5: West-South Seattle
...,...,...,...,...,...,...,...,...
139,11,7,3304,2023,2021,LEHD,,7: Renton-FedWay-Kent
140,11,8,4982,2023,2021,LEHD,,8: Tacoma
141,11,9,1882,2023,2021,LEHD,,9: Kitsap
142,11,10,2695,2023,2021,LEHD,,10: South Pierce
