In [1]:
import os
import pandas as pd
import numpy as np
import validation_data_input
import plotly.express as px
import toml
import psrc_theme

# to show plotly figures in quarto HTML file
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook_connected"
pio.templates.default = "simple_white+psrc_color" # set plotly template

# Resolve the shared configuration directory from separate path components so the
# lookup does not depend on Windows-style '\\' separators.
config_dir = os.path.join(os.getcwd(), '..', '..', '..', '..', 'configuration')
config = toml.load(os.path.join(config_dir, 'validation_configuration.toml'))
input_config = toml.load(os.path.join(config_dir, 'input_configuration.toml'))
# model_dir = os.path.join(os.getcwd(), '..\\..\\..\\..\\')

In [2]:
# os.getcwd()

In [3]:
# Load the person and household tables through the project's validation_data_input helper.
# NOTE(review): later cells filter and merge on a `source` column ('model' vs. survey),
# so get_data presumably returns model and survey records stacked together — confirm.
person = validation_data_input.get_data('person')
hh = validation_data_input.get_data('household')

In [4]:
# Parcel land use data: only the parcel id plus the `emptot_1` and `hh_1` columns are read.
# `sep=r'\s+'` replaces `delim_whitespace=True` (deprecated since pandas 2.2), and the
# path is joined component-wise so it does not rely on Windows separators.
df_parcel = pd.read_csv(
    os.path.join(config['model_dir'], 'outputs', 'landuse', 'buffered_parcels.txt'),
    sep=r'\s+',
    usecols=['parcelid', 'emptot_1', 'hh_1'],
)

In [5]:
# Parcel-to-geography lookup for the configured base year, read from the SQLite inputs database
geog_table = 'parcel_' + input_config['base_year'] + '_geography'
inputs_db_url = 'sqlite:///' + config['model_dir'] + '/inputs/db/' + input_config['db_name']
geog_columns = ['ParcelID', 'CountyName', 'rg_proposed', 'District', 'district_name']

parcel_geog = pd.read_sql_table(geog_table, inputs_db_url, columns=geog_columns)

In [6]:
# Attach the parcel land use columns to each household via its home parcel id
hh = pd.merge(
    hh,
    df_parcel,
    how='left',
    left_on='hhparcel',
    right_on='parcelid',
)

In [7]:
# Quartile cut points (25th / 50th / 75th percentile) of income, employment density and
# household density, computed from model households only; used to split each variable into 4 groups
is_model_hh = hh['source'] == 'model'
quartile_cols = ['hhincome', 'emptot_1', 'hh_1']
var_group = hh.loc[is_model_hh, quartile_cols].quantile([.25, .50, .75])

# var_group

In [8]:
# Classify every household into one of four groups per variable, using the model
# quartile cut points in `var_group` as the inner bin edges.
# The outer edges are open-ended (+/-inf) instead of +/-9999999 sentinels, so values beyond
# that magnitude are still grouped rather than silently becoming NaN.
quartile_labels = ['low', 'medium', 'medium-high', 'high']
for source_col, group_col in [('hhincome', 'hhincome_group'),        # income groups
                              ('hh_1', 'hh_density_group'),          # hh density groups
                              ('emptot_1', 'emp_density_group')]:    # employment density groups
    bin_edges = [-np.inf] + var_group[source_col].tolist() + [np.inf]
    hh[group_col] = pd.cut(hh[source_col], bins=bin_edges, labels=quartile_labels)


In [9]:
# Bring the household attributes (including the group columns) onto each person,
# matching on household number within the same source
person = pd.merge(
    person,
    hh,
    how='left',
    on=['hhno', 'source'],
)

In [10]:
# First pass: geography of the workplace parcel (columns arrive unsuffixed).
person = pd.merge(person, parcel_geog, how='left', left_on='pwpcl', right_on='ParcelID')

# Second pass: geography of the home parcel. The lookup columns now collide, so the
# ones already present get '_work' and the newly joined ones get '_home'.
geog_suffixes = ['_work', '_home']
person = pd.merge(person, parcel_geog, how='left', left_on='hhparcel', right_on='ParcelID',
                  suffixes=geog_suffixes)

In [11]:
# Workers are the persons with a positive `pwtyp` code; copy so later column
# additions do not touch `person`
is_worker = person['pwtyp'] > 0
df_workers = person[is_worker].copy()

In [12]:
# Weighted mean `pwaudist` per source, over workers with a positive distance:
# sum(pwaudist * psexpfac) / sum(psexpfac)
mean_col = 'Mean Distance to Work by Car (pwaudist)'
df = (
    df_workers.loc[df_workers['pwaudist'] > 0, :]
    .assign(wt_dist=lambda w: w['pwaudist'] * w['psexpfac'])
    .groupby(['source'])[['wt_dist', 'psexpfac']]
    .sum()
    .reset_index()
)
df[mean_col] = df['wt_dist'] / df['psexpfac']
df[['source', mean_col]]

Unnamed: 0,source,Mean Distance to Work by Car (pwaudist)
0,model,11.635203
1,survey,11.789571
2,survey (2017/2019),11.967041


In [13]:
# Debug helper: copying the whole worker table to the OS clipboard is a side effect that
# fails where no clipboard is available (e.g. headless renders), so it is opt-in.
COPY_WORKERS_TO_CLIPBOARD = False
if COPY_WORKERS_TO_CLIPBOARD:
    df_workers.to_clipboard()

In [14]:

def plot_work_location(df:pd.DataFrame, var:str, title_cat:str, sub_name:str):
    """Show a grouped bar chart of the weighted mean `pwaudist` by `var`, one bar per source.

    Only rows with `pwaudist` > 0 are used. Within each (source, var) group the mean is
    sum(pwaudist * psexpfac) / sum(psexpfac); the number of rows in the group is
    exposed in the hover text as "sample count".

    Args:
        df: records with `source`, `pwaudist`, `psexpfac` and the `var` column.
        var: column to break the averages out by (x axis).
        title_cat: text appended to the "Work Distance by " figure title.
        sub_name: x-axis title.

    Returns:
        None; the figure is displayed with `fig.show()`.
    """
    df = df[df['pwaudist'] > 0].copy()
    df['wt_dist'] = df['pwaudist']*df['psexpfac']
    by_group = df.groupby(['source',var])

    df_plot = by_group[['wt_dist','psexpfac']].sum().reset_index()
    df_plot['average_wt_pwaudist'] = df_plot['wt_dist']/df_plot['psexpfac']

    # Count `psexpfac` only: counting `wt_dist` as well made the merge below emit
    # suffixed `wt_dist_x` / `wt_dist_y` columns.
    df_plot_ct = by_group['psexpfac'].count().reset_index(). \
        rename(columns={'psexpfac':'sample count'})
    df_plot = df_plot.merge(df_plot_ct, on=['source',var])

    # NOTE(review): 'source' is listed in `y` and also used for `color`; presumably Plotly
    # keeps it as an id column and plots only 'average_wt_pwaudist' — confirm before simplifying.
    fig = px.bar(df_plot, x=var, y=['average_wt_pwaudist', 'source'], color="source",
                    barmode="group",
                     hover_data=['sample count'],
                    title="Work Distance by "+ title_cat)
    fig.for_each_annotation(lambda a: a.update(text = sub_name + "=<br>" + a.text.split("=")[-1]))
    fig.update_xaxes(title_text=sub_name)
    fig.update_layout(height=400, width=800, font=dict(size=11),
                      yaxis=dict(tickformat=".2f"))
    fig.for_each_yaxis(lambda a: a.update(tickformat = ".2f"))
    fig.show()

## Work Location by Workplace Geography

In [15]:
plot_work_location(df_workers, 'CountyName_work', 'Work County', 'County')

In [16]:
plot_work_location(df_workers, 'rg_proposed_work', 'Regional Geography', 'Regional Geography')

## Workplace by Home Geography

In [17]:
plot_work_location(df_workers, 'CountyName_home', 'Home County', 'County')

In [18]:
plot_work_location(df_workers, 'rg_proposed_home', 'Home Regional Geography', 'Regional Geography')

## Workplace Location by Person/Household Characteristics

In [19]:
plot_work_location(df_workers, 'pwtyp', 'Employment Type', 'employment type')

In [20]:
plot_work_location(df_workers, 'hhincome_group', 'Income', 'Income Group')

In [21]:
plot_work_location(df_workers, 'hh_density_group', 'Household Density', 'Household Density Group')

In [22]:
plot_work_location(df_workers, 'emp_density_group', 'Employment Density', 'Employment Density Group')

In [23]:
# Coarse distance-to-work bins. pd.cut intervals are right-closed by default, so a
# distance of exactly 0 falls outside the first bin.
coarse_edges = [0,1,2,5,15,9999]
coarse_labels = ['0-1', '1-2', '2-5', '5-15', '15+']
df_workers['distance_to_work_bin'] = pd.cut(df_workers['pwaudist'],
                                            bins=coarse_edges,
                                            labels=coarse_labels)

# Fine bins: 2-mile steps up to 60 miles, each labelled with its lower edge
max_bin = 60
bin_size = 2
fine_edges = np.arange(0, max_bin+bin_size, bin_size)
fine_labels = [str(lower_edge) for lower_edge in fine_edges[:-1]]
df_workers['d_work_bin_60mi'] = pd.cut(df_workers['pwaudist'],
                                       bins=fine_edges,
                                       labels=fine_labels)

## Distance Bins

In [24]:
def plot_distance(df:pd.DataFrame, group:str, title_name:str):
    """Show the distance-to-work distribution per source as a bar chart and a line chart.

    Rows are included when their `group` value is greater than 0. Within each source the
    `psexpfac` weights are summed per bin and expressed as a percentage of that source's
    total. The bar chart uses `distance_to_work_bin` and carries the per-bin row count
    as "sample count" hover data; the line chart uses `d_work_bin_60mi`.

    Args:
        df: records with `source`, `psexpfac`, `distance_to_work_bin`,
            `d_work_bin_60mi` and the `group` column.
        group: name of a numeric (or boolean) column; values > 0 select the row.
        title_name: title used for both figures.

    Returns:
        None; both figures are displayed with `.show()`.
    """
    # Compare explicitly instead of using the raw column as a mask: and-ing an integer
    # column with a boolean mask is evaluated bitwise, which silently drops even codes
    # (e.g. pwtyp == 2).
    in_group = df[group] > 0

    # plot 1: coarse bins. `.notna()` is the working null check (`x != float('nan')` is
    # always True), and the sample counts come from the same filtered rows as the shares.
    binned = df.loc[in_group & df['distance_to_work_bin'].notna()]
    by_bin = binned.groupby(['source','distance_to_work_bin'])['psexpfac']
    df_plot = by_bin.sum().reset_index()
    df_plot['percentage'] = 100 * df_plot['psexpfac'] / \
        df_plot.groupby('source')['psexpfac'].transform('sum')

    df_plot_ct = by_bin.count().reset_index().rename(columns={'psexpfac':'sample count'})
    df_plot = df_plot.merge(df_plot_ct, on=['source','distance_to_work_bin'])

    fig1 = px.bar(df_plot, x='distance_to_work_bin', y="percentage", color="source", barmode="group",hover_data=['sample count'],
                  title=title_name)
    fig1.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
    fig1.update_layout(height=400, width=700, font=dict(size=11))
    fig1.show()

    # plot 2: fine bins
    binned_fine = df.loc[in_group & df['d_work_bin_60mi'].notna()]
    df_plot = binned_fine.groupby(['source','d_work_bin_60mi'])['psexpfac'].sum().reset_index()
    df_plot['percentage'] = 100 * df_plot['psexpfac'] / \
        df_plot.groupby('source')['psexpfac'].transform('sum')

    fig2 = px.line(df_plot, x='d_work_bin_60mi', y="percentage", color="source",
                   title=title_name)
    fig2.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
    fig2.update_layout(height=400, width=700, font=dict(size=11))
    fig2.show()

# distance to work
plot_distance(df_workers,'pwtyp',"worker: distance to work (miles)")

In [25]:
def plot_group(df:pd.DataFrame, group:str, title_name:str, height:int = 900):
    """Show faceted bar charts of the `distance_to_work_bin` distribution, one facet per `group` value.

    Within each (source, group) pair the `psexpfac` weights are summed per bin and
    divided by that pair's total, so the bars of one source sum to 1 in each facet.
    The per-bin row count is shown as "sample count" hover data.

    Args:
        df: records with `source`, `psexpfac`, `distance_to_work_bin` and the
            `group` column.
        group: column used for the facets.
        title_name: text appended to the "Distance to work by " figure title.
        height: figure height in pixels.

    Returns:
        None; the figure is displayed with `fig.show()`.
    """
    # Aggregate the frame that was passed in; the previous version read the global
    # `df_workers` and ignored the `df` argument.
    keys = ['source', group, 'distance_to_work_bin']
    by_bin = df.groupby(keys)['psexpfac']

    df_plot = by_bin.sum().reset_index()
    df_plot['percentage'] = df_plot['psexpfac'] / \
        df_plot.groupby(['source', group])['psexpfac'].transform('sum')

    df_plot_ct = by_bin.count().reset_index().rename(columns={'psexpfac':'sample count'})
    df_plot = df_plot.merge(df_plot_ct, on=keys)

    fig = px.bar(df_plot, x="distance_to_work_bin", y="percentage", color="source",barmode="group",
                facet_col=group, facet_col_wrap=2, hover_data=['sample count'],
                title="Distance to work by "+title_name)
    # Facet labels arrive as "<column>=<value>"; keep only the value
    fig.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
    fig.update_layout(height=height, width=700, font=dict(size=11),
                      yaxis=dict(tickformat=".0%"))
    fig.show()

def plot_group_line(df:pd.DataFrame, group:str, title_name:str, height:int = 400):
    """Show faceted line charts of the `d_work_bin_60mi` distribution, one facet per `group` value.

    Within each (source, group) pair the `psexpfac` weights are summed per bin and
    divided by that pair's total.

    Args:
        df: records with `source`, `psexpfac`, `d_work_bin_60mi` and the `group` column.
        group: column used for the facets.
        title_name: text appended to the "Distance to work by " figure title.
        height: figure height in pixels.

    Returns:
        None; the figure is displayed with `fig2.show()`.
    """
    # Aggregate the frame that was passed in; the previous version read the global
    # `df_workers` and ignored the `df` argument.
    df_plot = df.groupby(['source',group,'d_work_bin_60mi'])['psexpfac'].sum().reset_index()
    df_plot['percentage'] = df_plot['psexpfac'] / \
        df_plot.groupby(['source', group])['psexpfac'].transform('sum')

    fig2 = px.line(df_plot, x='d_work_bin_60mi', y="percentage", color="source",
                facet_col=group, facet_col_wrap=2,
                title="Distance to work by "+title_name)
    # Facet labels arrive as "<column>=<value>"; keep only the value
    fig2.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
    fig2.update_layout(height=height, width=700, font=dict(size=11),
                       yaxis=dict(tickformat=".0%"))
    fig2.show()

In [26]:
# Readable labels for the `pwtyp` codes, then the faceted bar chart by worker type
worker_type_labels = {1: 'full-time', 2: 'part-time'}
df_workers['worker_type'] = df_workers['pwtyp'].map(worker_type_labels)
plot_group(df_workers, 'worker_type', 'worker type', height= 400)

In [27]:
plot_group_line(df_workers, 'worker_type', 'worker type')

In [28]:
plot_group(df_workers, 'hhincome_group', 'income group',500)

In [29]:
plot_group_line(df_workers, 'hhincome_group', 'household income',500)

In [30]:
plot_group(df_workers, 'hh_density_group', 'density group',500)

In [31]:
plot_group_line(df_workers, 'hh_density_group', 'household density',500)

In [32]:
plot_group(df_workers, 'CountyName_work', 'work county',500)

In [33]:
plot_group_line(df_workers, 'CountyName_work', 'Work County',500)

In [34]:
plot_group(df_workers, 'rg_proposed_work', 'work geography',700)

In [35]:
plot_group_line(df_workers, 'rg_proposed_work', 'Work Geography',700)

## Origin-Destination Flows

In [36]:
# Load the observed LEHD district-to-district flows from the inputs database
db_location = 'sqlite:///'+config['model_dir']+'/inputs/db/'
# db_location = 'sqlite:///R:\\e2projects_two\\SoundCast\\Inputs\\dev\\db\\'

df_lehd = pd.read_sql_table(
    'observed_lehd_district_flows',
    db_location+input_config['db_name'],
    )

# Keep only the configured model year. Copy the filtered rows so that adding `source`
# writes to an independent frame rather than a slice of the full table
# (avoids SettingWithCopyWarning).
is_model_year = df_lehd['model_year'].astype('int') == int(input_config['model_year'])
df_lehd = df_lehd[is_model_year].copy()
df_lehd['source'] = 'LEHD'

In [37]:
# District id -> display label (the id prefix is part of each label)
dist_name = {
    1: '1: Suburban Snohomish',
    2: '2: Everett-Lynwood-Edmonds',
    3: '3: North Seattle-Shoreline',
    4: '4: Seattle CBD',
    5: '5: West-South Seattle',
    6: '6: East Side',
    7: '7: Renton-FedWay-Kent',
    8: '8: Tacoma',
    9: '9: Kitsap',
    10: '10: South Pierce',
}

# Label both ends of every LEHD flow
for flow_end in ('home', 'work'):
    df_lehd[flow_end + '_dist_name'] = df_lehd[flow_end + '_dist'].map(dist_name)

In [38]:
# Model worker totals by home district / work district: the sum of `psexpfac`.
# `psexpfac` is selected before aggregating; summing every column of `person` first is
# wasteful and fails on categorical / non-numeric columns in recent pandas versions.
df_model = (
    person[person['source'] == 'model']
    .groupby(['District_home','District_work'])[['psexpfac']]
    .sum()
    .reset_index()
)
df_model['source'] = 'model'

df_model[['District_home','District_work']] = df_model[['District_home','District_work']].astype('int')
df_model['home_dist_name'] = df_model['District_home'].map(dist_name)
df_model['work_dist_name'] = df_model['District_work'].map(dist_name)

# Give both sources the same measure column name (reassign instead of mutating in place)
df_model = df_model.rename(columns={'psexpfac': 'workers_total'})
df_lehd = df_lehd.rename(columns={'S000': 'workers_total'})

# Drop LEHD data to/from external zones
df_lehd = df_lehd[(df_lehd['home_dist'] > 0) & (df_lehd['work_dist'] > 0)].copy()

In [39]:
# Home-district x work-district matrices of total workers, one per source
od_pivot_args = dict(index='home_dist_name', columns='work_dist_name',
                     values='workers_total', aggfunc='sum')
_df_model = df_model.pivot_table(**od_pivot_args)
_df_lehd = df_lehd.pivot_table(**od_pivot_args)

In [40]:

# Total workers by work district, LEHD vs. model
var = 'work_dist_name'
df = pd.concat([df_lehd,df_model])
df_plot = df.groupby(['source',var])[['workers_total']].sum().reset_index()

district_order = list(dist_name.values())  # show districts in id order on the x axis
fig = px.bar(
    df_plot.sort_values(by='source', ascending=False),
    x=var,
    y=['workers_total', 'source'],
    color="source",
    barmode="group",
    title="Work District Location",
)
fig.update_layout(
    height=400,
    width=800,
    font=dict(size=11),
    xaxis=dict(categoryarray= district_order),
    yaxis=dict(tickformat=".0f"),
)
fig.show()

In [41]:

# Total workers by home district, model vs. LEHD
var = 'home_dist_name'
df = pd.concat([df_model,df_lehd])
df_plot = df.groupby(['source',var])[['workers_total']].sum().reset_index()

district_order = list(dist_name.values())  # show districts in id order on the x axis
fig = px.bar(
    df_plot.sort_values(by='source', ascending=False),
    x=var,
    y=['workers_total', 'source'],
    color="source",
    barmode="group",
    title="Home District",
)
fig.update_layout(
    height=400,
    width=800,
    font=dict(size=11),
    xaxis=dict(categoryarray= district_order),
    yaxis=dict(tickformat=".0f"),
)
fig.show()

In [42]:
# _df_model/_df_model['workers_total'].sum()
# df_model_dist = _df_model/_df_model.sum().sum()
# df_lehd_dist = _df_lehd/_df_lehd.sum().sum()

#### District Distribution: Relative Difference, (Model - LEHD) / LEHD

In [52]:
import seaborn as sns
import matplotlib.colors  # binds `matplotlib` and guarantees the `colors` submodule is loaded

# Relative difference between model and LEHD workers for every home/work district pair
df = (_df_model-_df_lehd)/_df_lehd

# Symmetric colour-ramp limits from the largest finite absolute difference.
# NaN / inf cells (a pair missing from one source, or a zero LEHD total) are ignored so
# they cannot turn the limits into NaN / inf.
_rel_diff = df.to_numpy(dtype=float)
_finite_diff = _rel_diff[np.isfinite(_rel_diff)]
abs_max = float(np.abs(_finite_diff).max()) if _finite_diff.size else 0.0
if not abs_max > 0:
    abs_max = 1.0  # TwoSlopeNorm needs vmin < vcenter < vmax
abs_min = -abs_max

def make_pretty(styler):
    """Format a Styler as a percentage heat map on a diverging palette centred on 0.

    Args:
        styler: pandas Styler wrapping the relative-difference table.

    Returns:
        The same styler with caption, percentage formatting, per-cell background
        colours and black text applied.
    """
    # Build the colormap and normalisation once rather than once per column
    cm = sns.diverging_palette(10, 240, n=9, as_cmap=True)
    norm = matplotlib.colors.TwoSlopeNorm(vmin=abs_min, vcenter=0, vmax=abs_max)

    def background_with_norm(s):
        # One CSS declaration per cell; non-finite cells are left unstyled
        rgba = cm(norm(s.values))
        return ['background-color: {:s}'.format(matplotlib.colors.to_hex(c)) if np.isfinite(v) else ''
                for v, c in zip(s.values, rgba)]

    styler.set_caption("District distribution")
    styler.format("{:.2%}")
    styler.apply(background_with_norm)
    styler.set_properties(**{'color': 'black'})
    return styler

df[list(dist_name.values())].reindex(list(dist_name.values())).style.pipe(make_pretty)

work_dist_name,1: Suburban Snohomish,2: Everett-Lynwood-Edmonds,3: North Seattle-Shoreline,4: Seattle CBD,5: West-South Seattle,6: East Side,7: Renton-FedWay-Kent,8: Tacoma,9: Kitsap,10: South Pierce
home_dist_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1: Suburban Snohomish,101.27%,12.58%,15.24%,-15.60%,-57.39%,7.42%,-63.93%,-88.67%,-94.33%,-92.08%
2: Everett-Lynwood-Edmonds,28.56%,82.96%,24.85%,3.04%,-26.67%,19.33%,-52.73%,-86.37%,-90.49%,-90.08%
3: North Seattle-Shoreline,7.87%,23.72%,112.03%,38.99%,-2.83%,-13.66%,15.83%,-68.86%,-80.81%,-74.49%
4: Seattle CBD,-33.75%,9.07%,31.16%,139.96%,-34.03%,-36.67%,16.02%,-60.75%,-79.40%,-73.28%
5: West-South Seattle,-54.37%,-48.71%,11.74%,8.41%,100.88%,-2.62%,49.23%,-22.94%,-54.43%,-24.68%
6: East Side,15.08%,-5.48%,8.04%,-11.91%,-32.10%,47.80%,16.19%,-55.65%,-82.93%,-58.68%
7: Renton-FedWay-Kent,-77.90%,-80.71%,-21.34%,9.23%,-7.22%,-10.05%,66.01%,7.77%,-54.69%,8.57%
8: Tacoma,-93.71%,-91.43%,-50.39%,-16.29%,-25.39%,-37.55%,3.23%,74.39%,-36.75%,10.77%
9: Kitsap,-89.55%,-72.43%,-70.91%,35.40%,-56.95%,-69.53%,-65.51%,-13.18%,123.80%,-44.53%
10: South Pierce,-89.95%,-91.84%,-46.16%,-19.05%,-32.94%,-34.04%,6.09%,18.21%,3.61%,68.82%
