In [1]:
import os
import toml
import pandas as pd
import numpy as np
import validation_data_input
import plotly.express as px

# to show plotly figures in quarto HTML file
import plotly.io as pio

%store -r validation_data

# Renderer combination chosen so that plotly figures show up in the quarto HTML output
pio.renderers.default = "plotly_mimetype+notebook_connected"

# Read the validation settings from validation_configuration.toml in the current working directory
config = toml.load(os.path.join(os.getcwd(), 'validation_configuration.toml'))

## Household data

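- Income, household density and employment density are each grouped into five levels (very low, low, medium, medium-high and high), using the 12.5th, 25th, 50th and 75th percentiles of the model results as break points.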

In [2]:
# read data
# Both inputs come from the shared validation_data object restored via %store -r;
# copies are taken so the shared object can be released straight away.
land_use = validation_data.land_use[['zone_id','log_emptot_1','log_hh_1']].copy()
hh_data = validation_data.hh_data_uncloned.copy()
# delete shared data
del validation_data

# Attach the land-use densities of the home zone, then the block group of the home zone.
maz_bg_lookup = pd.read_csv(config['p_maz_bg_lookup'])[['MAZ', 'block_group_id']]
hh_data = hh_data.merge(land_use, how="left", left_on='home_zone_id', right_on='zone_id')
hh_data = hh_data.merge(maz_bg_lookup, how="left", left_on='home_zone_id', right_on='MAZ')

# grouping income, hh density, employment density into very low, low, medium, medium-high and high
# todo: check if we should be using model data for grouping
is_model_result = hh_data['source']=="model results"
var_group = hh_data.loc[is_model_result, ['income','log_emptot_1','log_hh_1']].quantile([.125, .25, .50, .75])

var_group

Unnamed: 0,income,log_emptot_1,log_hh_1
0.125,24000.0,1.321043,4.434358
0.25,43000.0,2.922615,5.089858
0.5,82000.0,4.726863,5.721498
0.75,135000.0,6.239594,6.457123


In [3]:
# data manipulation

# Labels for the five bins, ordered from the lowest to the highest bin.
group_labels = ['very low', 'low', 'medium', 'medium-high', 'high']

# Bin income, household density and employment density at the break points held in var_group.
# The outer bin edges are -inf / +inf so that every finite value lands in a bin; fixed
# +/-9999999 sentinels would leave anything beyond them without a group (NaN).
for value_col, group_col in [('income', 'hhincome_group'),          # add income group
                             ('log_hh_1', 'hh_density_group'),      # add hh density groups
                             ('log_emptot_1', 'emp_density_group')]:  # add employment density groups
    hh_data[group_col] = pd.cut(hh_data[value_col],
                                bins=[-np.inf] + var_group[value_col].tolist() + [np.inf],
                                labels=group_labels)

# Top-coded versions of the count columns. np.where mixes a text label with the original
# counts, so the new columns hold text labels (later cells compare them against "0").

# add auto_ownership with 4+
hh_data['auto_ownership_simple'] = np.where(hh_data['auto_ownership']>=4, "4+", hh_data['auto_ownership'])
# add auto_ownership with 2+
hh_data['auto_ownership_2'] = np.where(hh_data['auto_ownership']<2, hh_data['auto_ownership'], "2+")
# add hhsize, num_workers, num_adults and num_drivers with 4+
for count_col in ['hhsize', 'num_workers', 'num_adults', 'num_drivers']:
    hh_data[count_col + '_simple'] = np.where(hh_data[count_col]>=4, "4+", hh_data[count_col])


## Auto ownership across all households

In [4]:
# Weighted household total for every (source, auto ownership level) pair
df_plot = hh_data.groupby(['source','auto_ownership_simple'])['hh_weight'].sum().reset_index()

# Share of each ownership level within its source, in percent
source_total = df_plot.groupby(['source'])['hh_weight'].transform('sum')
df_plot['percentage'] = 100 * df_plot['hh_weight'] / source_total

# Fixed source order so sorting by source follows this sequence
source_order = ['model results', 'survey data', 'unweighted survey']
df_plot['source'] = df_plot['source'].astype(pd.CategoricalDtype(source_order))

fig = px.bar(
    df_plot.sort_values(by=['source']),
    x="auto_ownership_simple",
    y="percentage",
    color="source",
    barmode="group",
    template="simple_white",
    title="Auto ownership",
)
fig.update_layout(height=400, width=700, font=dict(size=11))
fig.show()

## Auto ownership by segments

In [5]:
# auto ownership by an arbitrary household segment
def plot_auto(df:pd.DataFrame, var:str, title_cat:str,sub_name:str) -> None:
    """Print model-result counts per level of ``var`` and plot auto ownership shares.

    The chart is a grouped bar chart with one facet per level of ``var``, one bar
    colour per ``source`` and the weighted percentage of households on the y axis.
    Percentages are computed within each (source, level of ``var``) pair.

    Args:
        df: household records; must contain the columns 'source', 'hh_weight',
            'auto_ownership_simple' and ``var``.
        var: name of the segmentation column used for the facets.
        title_cat: text appended to "Auto ownership by " in the figure title.
        sub_name: prefix shown in each facet label, followed by the level.

    Returns:
        None. The counts are printed and the figure is displayed with ``fig.show()``.
    """
    # Levels of `var` found in any source, in sorted order. Missing values are skipped
    # because value_counts() never reports them.
    levels = df[var].dropna().sort_values().unique().tolist()
    # reindex rather than label lookup: a level that is absent from the model results
    # is printed with a count of 0 instead of raising KeyError.
    model_counts = df.loc[df['source']=='model results',var].value_counts().reindex(levels, fill_value=0)
    print(f"n=\n"
          f"{model_counts}")

    df_plot = df.groupby(['source',var,'auto_ownership_simple'])['hh_weight'].sum().reset_index()
    # percentage of weighted households within each (source, level of var)
    segment_total = df_plot.groupby(['source',var])['hh_weight'].transform('sum')
    df_plot['percentage'] = 100 * df_plot['hh_weight'] / segment_total

    fig = px.bar(df_plot, x="auto_ownership_simple", y="percentage", color="source",
                 facet_col=var, barmode="group",template="simple_white",
                 title="Auto ownership by "+ title_cat)
    # plotly labels facets "<var>=<level>"; keep the level and put sub_name in front of it
    fig.for_each_annotation(lambda a: a.update(text = sub_name + "=<br>" + a.text.split("=")[-1]))
    fig.update_xaxes(title_text="n of cars")
    fig.update_layout(height=400, width=800, font=dict(size=11))
    fig.show()

In [6]:
# Auto ownership shares faceted by household income group
plot_auto(hh_data, var='hhincome_group', title_cat='income level', sub_name='Income')

n=
very low       202751
low            198701
medium         402805
medium-high    404394
high           396612
Name: hhincome_group, dtype: int64


In [7]:
# Auto ownership shares faceted by household size (top-coded at 4+)
plot_auto(hh_data, var='hhsize_simple', title_cat='household size', sub_name='HH size')

n=
1     452162
2     533682
3     254336
4+    365083
Name: hhsize_simple, dtype: int64


In [8]:
# Auto ownership shares faceted by number of adults; rows labelled "0" adults are filtered out
plot_auto(
    hh_data.loc[hh_data['num_adults_simple'] != "0"],
    var='num_adults_simple',
    title_cat='number of adults',
    sub_name='num adults',
)

n=
1     504125
2     762899
3     240181
4+     97983
Name: num_adults_simple, dtype: int64


In [9]:
# Auto ownership shares faceted by number of workers (top-coded at 4+)
plot_auto(hh_data, var='num_workers_simple', title_cat='number of workers', sub_name='num workers')

n=
0     355309
1     637690
2     504466
3      99225
4+      8573
Name: num_workers_simple, dtype: int64


In [10]:
# Auto ownership shares faceted by number of drivers; rows labelled "0" drivers are filtered out
plot_auto(
    hh_data.loc[hh_data['num_drivers_simple'] != "0"],
    var='num_drivers_simple',
    title_cat='number of drivers',
    sub_name='num drivers',
)

n=
1     493312
2     708568
3     274525
4+    128847
Name: num_drivers_simple, dtype: int64


In [11]:
# Auto ownership shares faceted by household density group; rows without a density group are dropped first
plot_auto(
    hh_data.dropna(subset=['hh_density_group']),
    var='hh_density_group',
    title_cat='household density',
    sub_name='density',
)

n=
very low       200677
low            200647
medium         401318
medium-high    401356
high           401265
Name: hh_density_group, dtype: int64


In [12]:
# Auto ownership shares faceted by employment density group; rows without a density group are dropped first
plot_auto(
    hh_data.dropna(subset=['emp_density_group']),
    var='emp_density_group',
    title_cat='employment density',
    sub_name='density',
)

n=
very low       200684
low            200685
medium         401476
medium-high    401132
high           401286
Name: emp_density_group, dtype: int64


## Validate auto ownership with ACS vehicle ownership data

- TO-DO:
    1. Get auto ownership values [0, 1, 2, 3, 4+] from ACS data.
    2. Check block_group_id values: they do not match the ACS block group IDs.

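- household counts by auto ownership level (0, 1, 2+) in the model results, followed by block-group shares combined with the ACS data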

In [13]:
# Number of model-result households in each 0 / 1 / 2+ auto ownership level
ownership_levels = hh_data['auto_ownership_2'].sort_values().unique()
model_counts = hh_data.loc[hh_data['source']=='model results','auto_ownership_2'].value_counts()
print(f"n=\n"
      f"{model_counts[ownership_levels]}")

# Weighted households per (source, block group, ownership level) ...
df = hh_data.groupby(['source','block_group_id','auto_ownership_2'])['hh_weight'].sum().reset_index()
# ... expressed as a percentage of the (source, block group) total
bg_total = df.groupby(['source','block_group_id'])['hh_weight'].transform('sum')
df['percentage'] = 100 * df['hh_weight'] / bg_total

# acs auto ownership data
acs_columns = ['cars_none_control', 'cars_one_control', 'cars_two_or_more_control', 'block_group_id']
acs_auto_ownership = pd.read_csv(config['p_acs_auto_ownership'])[acs_columns]

# calculate percentage of households having 0, 1 or 2+ vehicle(s) in each block group
acs_auto_ownership['total'] = (
    acs_auto_ownership['cars_one_control']
    + acs_auto_ownership['cars_two_or_more_control']
    + acs_auto_ownership['cars_none_control']
)
share_columns = {'0': 'cars_none_control', '1': 'cars_one_control', '2+': 'cars_two_or_more_control'}
for level, control_col in share_columns.items():
    acs_auto_ownership[level] = 100 * acs_auto_ownership[control_col]/acs_auto_ownership['total']
acs_auto_ownership['source'] = "acs data"

# Reshape the three share columns to long form: one row per (block group, ownership level)
bg_auto_ownership = acs_auto_ownership[['source','block_group_id','0','1','2+']].melt(
    id_vars=['source','block_group_id'],
    value_vars=['0','1','2+'],
    var_name='auto_ownership_2',
    value_name='percentage',
)

# combine both sets of data
col_list = ['source','block_group_id','auto_ownership_2','percentage']
bg_auto_ownership = pd.concat([df[col_list].copy(), bg_auto_ownership[col_list].copy()])

n=
0     105197
1     524821
2+    975245
Name: auto_ownership_2, dtype: int64


- scatterplot

In [14]:
# One row per (block group, ownership level) with one percentage column per source
df_plot = bg_auto_ownership.pivot(
    index=['block_group_id','auto_ownership_2'],
    columns='source',
    values='percentage',
).reset_index()

# Model shares against ACS shares, one facet per ownership level, with an OLS trendline
fig = px.scatter(
    df_plot,
    x="acs data",
    y="model results",
    trendline="ols",
    trendline_color_override='rgb(136, 136, 136)',
    template="plotly_white",
    facet_col='auto_ownership_2',
    height=400,
    width=1000,
    title="Auto ownership model results validation with acs data",
)
fig.update_xaxes(dtick=20)
fig.update_yaxes(dtick=20, range=[0, 100])
fig.update_traces(marker_size=3)
# final figure size; overrides the height/width given to px.scatter
fig.update_layout(height=360, width=800, font=dict(size=11))
fig.show()
