In [1]:
import os
import pandas as pd
import numpy as np
import validation_data_input
import plotly.express as px
import toml
import psrc_theme

# to show plotly figures in quarto HTML file
import plotly.io as pio
# Renderer combination used for every figure in this notebook.
pio.renderers.default = "plotly_mimetype+notebook_connected"
# Default figure template. "psrc_color" is presumably registered by the
# psrc_theme import above - TODO confirm.
pio.templates.default = "simple_white+psrc_color"

# Both configuration files sit in the "configuration" folder four levels above
# the working directory. The path is assembled from separate parts so that
# os.path.join inserts the platform's own separator instead of relying on
# hard-coded Windows backslashes.
config_dir = os.path.join(os.getcwd(), '..', '..', '..', '..', 'configuration')
config = toml.load(os.path.join(config_dir, 'validation_configuration.toml'))
input_config = toml.load(os.path.join(config_dir, 'input_configuration.toml'))
# model_dir = os.path.join(os.getcwd(), '..\\..\\..\\..\\')

In [2]:
# Household table from the project's validation_data_input helper. Later cells
# filter it on a `source` column (e.g. 'model'), so it presumably stacks model
# output with other sources - TODO confirm against get_data.
hh = validation_data_input.get_data('household')

In [3]:
# parcel land use data
# Only the parcel id and the two columns later used for the employment- and
# household-density groups are read.
# sep=r'\s+' is the documented replacement for the deprecated
# delim_whitespace=True, and the path is built from separate parts so it does
# not depend on Windows backslashes.
df_parcel = pd.read_csv(
    os.path.join(config['model_dir'], 'outputs', 'landuse', 'buffered_parcels.txt'),
    sep=r'\s+',
    usecols=['parcelid', 'emptot_1', 'hh_1'],
)

In [4]:
# Attach the parcel-level columns to each household through its parcel id.
# A left join keeps households whose parcel is missing from df_parcel; their
# parcel columns are NaN.
hh = pd.merge(
    hh,
    df_parcel,
    how='left',
    left_on='hhparcel',
    right_on='parcelid',
)

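- Income, household density, and employment density are binned at the 12.5th, 25th, 50th and 75th percentiles of the model households; those break points are labelled very low, low, medium and high.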

In [5]:
# Break points for income, household density and employment density: the
# 12.5th / 25th / 50th / 75th percentiles of the model households, each tagged
# with the label of the group it closes.
var_group = (
    hh.loc[hh['source'] == 'model', ['hhincome', 'emptot_1', 'hh_1']]
    .quantile([.125, .25, .50, .75])
    .assign(labels=['very low', 'low', 'medium', 'high'])
)
# Show the density break points (the income column is left out of the display).
var_group[['labels', 'emptot_1', 'hh_1']]

Unnamed: 0,labels,emptot_1,hh_1
0.125,very low,0.0,82.0
0.25,low,8.0,167.594888
0.5,medium,116.427656,357.834326
0.75,high,550.059575,734.382212


In [6]:
# Income break points at the quartiles of the model households, each tagged
# with the label of the group it closes.
income_4group = (
    hh.loc[hh['source'] == 'model', ['hhincome']]
    .quantile([.25, .50, .75])
    .assign(labels=['low', 'medium', 'high'])
)
income_4group[['labels', 'hhincome']]

Unnamed: 0,labels,hhincome
0.25,low,56262.0
0.5,medium,105870.0
0.75,high,180100.0


In [7]:
# data manipulation
def _cut_at_breaks(values, breaks, labels):
    """Bin `values` at the quantile break points in `breaks`.

    Each entry of `breaks` is the upper edge of one bin, named by the matching
    entry of `labels`. One additional open-ended bin, labelled 'very high',
    collects everything above the last break point. Without it pd.cut returns
    NaN for every value above the highest break, which silently removed those
    households from the grouped charts. Missing input values stay NaN.
    """
    edges = [-np.inf] + breaks.tolist() + [np.inf]
    return pd.cut(values, bins=edges, labels=labels.tolist() + ['very high'])


def _top_code(counts, cap=4):
    """Return `counts` as labels with every value >= `cap` collapsed to '<cap>+'."""
    # Later cells compare these labels against strings such as '-1' and "0".
    return np.where(counts >= cap, f"{cap}+", counts)


# hhwkrs is not always accurate; recalculate from part and full time workers
hh['hhwkrs'] = hh['hhftw'] + hh['hhptw']
# Add column for (potential) drivers adults (all hh members 16 and above):
# household size minus hh515 and hhcu5 (presumably members aged 5-15 and
# under 5 - TODO confirm). Subtracting hh['hhhsc'] as well was disabled.
hh['drivers'] = hh['hhsize'] - hh['hh515'] - hh['hhcu5']

# add income group
hh['hhincome_group'] = _cut_at_breaks(hh['hhincome'], var_group['hhincome'], var_group['labels'])
hh['hhincome_4group'] = _cut_at_breaks(hh['hhincome'], income_4group['hhincome'], income_4group['labels'])
# add hh density groups
hh['hh_density_group'] = _cut_at_breaks(hh['hh_1'], var_group['hh_1'], var_group['labels'])
# add employment density groups
hh['emp_density_group'] = _cut_at_breaks(hh['emptot_1'], var_group['emptot_1'], var_group['labels'])

# add auto_ownership with 4+
hh['auto_ownership_simple'] = _top_code(hh['hhvehs'])
# add auto_ownership with 2+ (kept in its original form: anything that is not
# below 2 becomes "2+")
hh['auto_ownership_2'] = np.where(hh['hhvehs'] < 2, hh['hhvehs'], "2+")
# add hhsize with 4+
hh['hhsize_simple'] = _top_code(hh['hhsize'])
# add num_workers with 4+
hh['num_workers_simple'] = _top_code(hh['hhwkrs'])
# add num_drivers with 4+
hh['num_drivers_simple'] = _top_code(hh['drivers'])

In [8]:
# Load ACS data
# Household totals per vehicle count from the observed ACS table, shaped like
# the household summary it is later concatenated with
# (auto_ownership_simple / hhexpfac / source).
# The 'households' column is selected before summing so unrelated columns of
# the table are not aggregated and then thrown away.
df_acs = (
    pd.read_sql_table('observed_acs_vehicles_drivers', 'sqlite:///../../../../inputs/db/'+input_config['db_name'])
    .groupby('vehicles')[['households']].sum()
    .reset_index()
    .rename(columns={'households': 'hhexpfac', 'vehicles': 'auto_ownership_simple'})
)
# Only the vehicle-count column is recoded: a frame-wide replace(4, '4+') would
# also rewrite a household total that happened to equal 4.
df_acs['auto_ownership_simple'] = df_acs['auto_ownership_simple'].replace(4, '4+')
df_acs['source'] = 'ACS'

In [9]:
# df_plot = hh.groupby(['source','auto_ownership_simple'])['hhexpfac'].sum().reset_index()
# df_plot = pd.concat([df_plot,df_acs])
# df_plot = df_plot.reset_index(drop=True)
# df_plot['percentage'] = df_plot.groupby(['source'], group_keys=False)['hhexpfac'].\
#         apply(lambda x: x / float(x.sum()))

In [10]:
# Regional vehicle-ownership shares: household sources next to the ACS totals.
# Households whose vehicle label is '-1' are removed *before* the shares are
# computed (the same order plot_auto uses). Filtering afterwards left them in
# the denominator, so the remaining bars of that source did not add up to 100%.
hh_valid = hh.loc[hh['auto_ownership_simple'] != '-1']
hh_weights = hh_valid.groupby(['source','auto_ownership_simple'])['hhexpfac']

df_plot = pd.concat([hh_weights.sum().reset_index(), df_acs]).reset_index(drop=True)
# Share of each ownership class within its source.
df_plot['percentage'] = df_plot['hhexpfac'] / df_plot.groupby('source')['hhexpfac'].transform('sum')

# Unweighted record counts for the hover text; ACS rows have no match and stay NaN.
df_plot_ct = hh_weights.count().reset_index().rename(columns={'hhexpfac':'sample count'})
df_plot = df_plot.merge(df_plot_ct, on=['source','auto_ownership_simple'], how='left')

fig = px.bar(df_plot.sort_values(by=['source']), x="auto_ownership_simple", y="percentage", color="source",
             hover_data=['sample count'],
             barmode="group",title="Auto ownership")
fig.update_layout(height=400, width=700, font=dict(size=11),
                  yaxis=dict(tickformat=".2%"))
fig.show()

## Auto ownership by segments

In [11]:

def plot_auto(df:pd.DataFrame, var:str, title_cat:str, sub_name:str):
    """Show grouped bars of vehicle-ownership shares, one facet per value of `var`.

    Rows labelled '-1' in `auto_ownership_simple` are dropped first. Within
    every (source, `var`) combination the `hhexpfac` weights are summed per
    ownership class and divided by the combination's total; the unweighted
    record count is attached as hover information.

    Args:
        df: household table with `source`, `auto_ownership_simple`,
            `hhexpfac` and the `var` column.
        var: column whose values define the facets.
        title_cat: text appended to "Auto ownership by " in the figure title.
        sub_name: prefix shown in each facet label ("<sub_name>=<br><value>").

    Returns:
        None. The figure is displayed with fig.show().
    """
    class_keys = ['source', var, 'auto_ownership_simple']
    valid_hh = df[df['auto_ownership_simple'] != '-1'].copy()
    weights = valid_hh.groupby(class_keys)['hhexpfac']

    # Weighted totals per ownership class and their share of the facet total.
    shares = weights.sum().reset_index()
    facet_totals = shares.groupby(['source', var])['hhexpfac'].transform('sum')
    shares['percentage'] = shares['hhexpfac'] / facet_totals

    # Unweighted number of records behind each bar.
    counts = weights.count().reset_index().rename(columns={'hhexpfac':'sample count'})
    shares = shares.merge(counts, on=class_keys)

    fig = px.bar(
        shares,
        x="auto_ownership_simple",
        y="percentage",
        color="source",
        facet_col=var,
        barmode="group",
        hover_data=['sample count'],
        title="Auto ownership by "+ title_cat,
    )
    # Facet captions arrive as "<var>=<value>"; keep the value and prefix it with sub_name.
    fig.for_each_annotation(lambda a: a.update(text = sub_name + "=<br>" + a.text.split("=")[-1]))
    fig.update_xaxes(title_text="n of cars")
    fig.update_layout(height=400, width=800, font=dict(size=11))
    # Percent tick labels on the y-axis of every facet.
    fig.update_yaxes(tickformat=".2%")
    fig.show()

In [12]:
# Vehicle ownership faceted by household income group.
plot_auto(
    df=hh,
    var='hhincome_4group',
    title_cat='income level',
    sub_name='Income',
)

In [13]:
# Vehicle ownership faceted by household size (top-coded at 4+).
plot_auto(
    df=hh,
    var='hhsize_simple',
    title_cat='household size',
    sub_name='HH size',
)

In [14]:
# Vehicle ownership faceted by number of potential drivers; households labelled
# "0" are left out. The title previously read "poential".
plot_auto(
    df=hh.loc[hh['num_drivers_simple'] != "0"],
    var='num_drivers_simple',
    title_cat='number of (potential) drivers age 16+',
    sub_name='num drivers',
)

In [15]:
# Vehicle ownership faceted by number of workers (top-coded at 4+).
plot_auto(
    df=hh,
    var='num_workers_simple',
    title_cat='number of workers',
    sub_name='num workers',
)

In [16]:
# Vehicle ownership faceted by household-density group; households without a
# density group are dropped before plotting.
plot_auto(
    df=hh.dropna(subset=['hh_density_group']),
    var='hh_density_group',
    title_cat='household density',
    sub_name='density',
)

In [17]:
# Vehicle ownership faceted by employment-density group; households without a
# density group are dropped before plotting.
plot_auto(
    df=hh.dropna(subset=['emp_density_group']),
    var='emp_density_group',
    title_cat='employment density',
    sub_name='density',
)

## Validate auto ownership with ACS vehicle ownership data

In [18]:
# ACS auto ownership validation dataset, joined to the MAZ / block-group lookup
# on block_group_id.
# NOTE(review): this rebinds `df_acs`, which an earlier cell uses for the ACS
# regional totals - re-running that earlier cell afterwards would pick up this
# frame instead.
df_acs = pd.merge(
    pd.read_csv(config['p_acs_auto_ownership']),
    pd.read_csv(config['p_maz_bg_lookup']),
    on='block_group_id',
)

# Distinct TAZ / block-group pairs with the three ACS control totals.
taz_bg_columns = ['TAZ', 'block_group_id', 'cars_none_control', 'cars_one_control', 'cars_two_or_more_control']
df_acs_taz = df_acs.loc[:, taz_bg_columns].drop_duplicates()

# Tag every household with the block group(s) of its TAZ. A TAZ listed under
# several block groups yields one household row per block group.
hh_taz = pd.merge(hh, df_acs_taz, how='left', left_on='hhtaz', right_on='TAZ')


In [19]:

# Household side: weighted share (in percent) of each ownership class within
# every source / block group. Households labelled '-1' are removed before the
# shares are computed, the same way plot_auto treats that label, so they
# neither dilute the 0 / 1 / 2+ shares nor appear as a class of their own.
hh_taz_valid = hh_taz.loc[hh_taz['auto_ownership_2'] != '-1']
df = hh_taz_valid.groupby(['source','block_group_id','auto_ownership_2'])['hhexpfac'].sum().reset_index()
df['percentage'] = 100 * df['hhexpfac'] / df.groupby(['source','block_group_id'])['hhexpfac'].transform('sum')

# acs auto ownership data
acs_auto_ownership = pd.read_csv(config['p_acs_auto_ownership'], usecols=['cars_none_control', 'cars_one_control', 'cars_two_or_more_control', 'block_group_id'])

# calculate percentage of households having 0, 1 or 2+ vehicle(s) in each block group
# (a block group whose three controls sum to zero ends up with NaN shares)
acs_auto_ownership['total'] = acs_auto_ownership['cars_one_control'] + acs_auto_ownership['cars_two_or_more_control'] + acs_auto_ownership['cars_none_control']
acs_auto_ownership['0'] = 100 * acs_auto_ownership['cars_none_control']/acs_auto_ownership['total']
acs_auto_ownership['1'] = 100 * acs_auto_ownership['cars_one_control']/acs_auto_ownership['total']
acs_auto_ownership['2+'] = 100 * acs_auto_ownership['cars_two_or_more_control']/acs_auto_ownership['total']
acs_auto_ownership['source'] = "acs data"

# Reshape the ACS shares to long form: one row per block group and ownership class.
bg_auto_ownership = acs_auto_ownership[['source','block_group_id','0','1','2+']]
bg_auto_ownership = pd.melt(bg_auto_ownership, id_vars=['source','block_group_id'], value_vars=['0','1','2+'], var_name='auto_ownership_2',value_name='percentage')

# combine both sets of data
col_list = ['source','block_group_id','auto_ownership_2','percentage']
bg_auto_ownership = pd.concat([df[col_list].copy(),
                               bg_auto_ownership[col_list].copy()])

In [20]:
# One row per (block group, ownership class) with one percentage column per source.
df_plot = bg_auto_ownership.pivot(
    index=['block_group_id', 'auto_ownership_2'],
    columns='source',
    values='percentage',
).reset_index()

# Model share against ACS share, one panel per ownership class, each with an
# OLS trend line drawn in grey.
fig = px.scatter(
    df_plot,
    x="acs data",
    y="model",
    facet_col='auto_ownership_2',
    trendline="ols",
    trendline_color_override='rgb(136, 136, 136)',
    template="plotly_white",
    height=400,
    width=1000,
    title="Auto ownership model results validation with acs data",
)
fig.update_xaxes(dtick=20)
fig.update_yaxes(dtick=20, range=[0, 100])
fig.update_traces(marker_size=3)
# This layout call sets the final size (950 wide), replacing the width given above.
fig.update_layout(height=400, width=950, font=dict(size=11))
fig.show()