In [1]:
import os
import pandas as pd
import numpy as np
import validation_data_input
import plotly.express as px
import toml
import psrc_theme

# Renderer needed for plotly figures to appear in the Quarto HTML output.
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook_connected"
# "psrc_color" is layered on top of plotly's built-in "simple_white";
# presumably it is registered by the `psrc_theme` import above -- TODO confirm.
pio.templates.default = "simple_white+psrc_color"

# The configuration folder sits four directory levels above the working
# directory. The path is assembled from separate components (instead of a
# '..\\..\\' literal) so os.path.join picks the right separator on any OS.
config_dir = os.path.join(os.getcwd(), '..', '..', '..', '..', 'configuration')
config = toml.load(os.path.join(config_dir, 'validation_configuration.toml'))
input_config = toml.load(os.path.join(config_dir, 'input_configuration.toml'))

In [2]:
# Pull the person and household tables through the shared validation loader
# (person first, then household).
person, hh = (
    validation_data_input.get_data(table_name)
    for table_name in ('person', 'household')
)

In [3]:
# Parcel land use data: only the parcel id and the two columns (emptot_1, hh_1)
# that are binned into density groups further down.
df_parcel = pd.read_csv(
    # Path components are joined individually so the separator is portable.
    os.path.join(config['model_dir'], 'outputs', 'landuse', 'buffered_parcels.txt'),
    # Regex separator instead of `delim_whitespace=True`, which pandas 2.2
    # deprecated; both split fields on any run of whitespace.
    sep=r'\s+',
    usecols=['parcelid', 'emptot_1', 'hh_1'],
)

In [4]:
# Parcel geography lookup: the table name is keyed on the configured base year
# and the table is read from the SQLite database under the model directory.
geography_table = 'parcel_' + input_config['base_year'] + '_geography'
sqlite_url = 'sqlite:///' + config['model_dir'] + '/inputs/db/' + input_config['db_name']
parcel_geog = pd.read_sql_table(
    geography_table,
    sqlite_url,
    columns=['ParcelID', 'CountyName', 'rg_proposed'],
)

In [5]:
# Attach the parcel land use columns to each household via its home parcel id.
hh = pd.merge(hh, df_parcel, how='left', left_on='hhparcel', right_on='parcelid')

In [6]:
# Quantile cut points for household income, employment density (emptot_1) and
# household density (hh_1), computed from model households only. The four
# quantiles serve as inner bin edges, so each variable is split into five groups.
quantile_columns = ['hhincome', 'emptot_1', 'hh_1']
var_group = hh[hh['source'] == 'model'][quantile_columns].quantile([.125, .25, .50, .75])

In [7]:
# One label per interval between consecutive bin edges.
group_labels = ['very low', 'low', 'medium', 'medium-high', 'high']

# Bin income, household density and employment density using the quantile cut
# points in `var_group`, padded with -9999999.0 / 9999999.0 as the outer edges.
for group_col, value_col in [('hhincome_group', 'hhincome'),
                             ('hh_density_group', 'hh_1'),
                             ('emp_density_group', 'emptot_1')]:
    bin_edges = [-9999999.0] + var_group[value_col].tolist() + [9999999.0]
    hh[group_col] = pd.cut(hh[value_col], bins=bin_edges, labels=group_labels)


In [8]:
# Bring the household attributes onto each person record; the join is per
# source so model and survey households never mix.
person = pd.merge(person, hh, how='left', on=['hhno', 'source'])

In [9]:
# Attach the geography lookup twice: first keyed on `pspcl`, then on `hhparcel`.
# In the second merge the clashing lookup columns are disambiguated by suffix:
# those from the first merge become *_school, the newly added ones *_home.
person = pd.merge(person, parcel_geog, how='left', left_on='pspcl', right_on='ParcelID')
person = pd.merge(
    person,
    parcel_geog,
    how='left',
    left_on='hhparcel',
    right_on='ParcelID',
    suffixes=['_school', '_home'],
)

In [10]:
# Keep only persons with a positive `pstyp`; the copy lets later cells add
# columns without writing into a slice of `person`.
df_students = person[person['pstyp'] > 0].copy()

In [11]:
# Debug aid: copy the survey students to the system clipboard for inspection
# outside the notebook. Disabled by default because `to_clipboard` overwrites
# whatever is on the user's clipboard and raises on machines without a
# clipboard backend, which would stop a Restart & Run All / Quarto render.
COPY_SURVEY_STUDENTS_TO_CLIPBOARD = False
if COPY_SURVEY_STUDENTS_TO_CLIPBOARD:
    df_students[df_students['source']=='survey'].to_clipboard()

In [12]:
# Expansion-weighted mean of psaudist per source, using only records with a
# positive distance: sum(psaudist * psexpfac) / sum(psexpfac).
mean_dist_col = 'Mean Distance to School by Car (psaudist)'
df = (
    df_students[df_students['psaudist'] > 0]
    .assign(wt_dist=lambda d: d['psaudist'] * d['psexpfac'])
    .groupby(['source'])[['wt_dist', 'psexpfac']]
    .sum()
    .reset_index()
)
df[mean_dist_col] = df['wt_dist'] / df['psexpfac']
df[['source', mean_dist_col]]

Unnamed: 0,source,Mean Distance to School by Car (psaudist)
0,model,4.45432
1,survey,5.376894
2,survey (2017/2019),4.108154


In [13]:
# Weighted mean distance to school, compared across sources for one grouping column.
def plot_school_location(df:pd.DataFrame, var:str, title_cat:str, sub_name:str):
    """Show a grouped bar chart of mean school distance by `var` and source.

    Rows with `psaudist` <= 0 are dropped. For every (source, `var`) pair the
    expansion-weighted mean is sum(psaudist * psexpfac) / sum(psexpfac); the
    unweighted number of records is shown in the hover as 'sample count'.

    Args:
        df: Records with `source`, `psaudist`, `psexpfac` and `var` columns.
        var: Column to group by; its values become the x-axis categories.
        title_cat: Text appended to "School Distance by " in the figure title.
        sub_name: Title of the x axis.

    Returns:
        None. The figure is rendered with `fig.show()`.
    """
    df = df.loc[df['psaudist'] > 0, :].copy()
    df['wt_dist'] = df['psaudist']*df['psexpfac']

    # observed=False spells out the long-standing pandas default for categorical
    # group keys (all categories are kept), so the result does not depend on
    # the installed pandas version's default.
    grouped = df.groupby(['source',var], observed=False)
    df_plot = grouped[['wt_dist','psexpfac']].sum().reset_index()
    df_plot['average_wt_psaudist'] = df_plot['wt_dist']/df_plot['psexpfac']

    # Count psexpfac only: also counting wt_dist left a stray column that the
    # merge below had to rename to wt_dist_x / wt_dist_y.
    df_plot_ct = grouped['psexpfac'].count().reset_index(name='sample count')
    df_plot = df_plot.merge(df_plot_ct, on=['source',var])

    # `y` names only the numeric column; `source` is a grouping key and belongs
    # in `color`, not in the list of y values.
    fig = px.bar(df_plot, x=var, y='average_wt_psaudist', color="source",
                 barmode="group",
                 hover_data=['sample count'],
                 title="School Distance by "+ title_cat)
    fig.update_xaxes(title_text=sub_name)
    fig.update_layout(height=400, width=800, font=dict(size=11),
                      yaxis=dict(tickformat=".2f"))
    fig.show()

## School Location by School Geography

In [14]:
# Mean school distance grouped by the county of the school parcel.
plot_school_location(
    df=df_students,
    var='CountyName_school',
    title_cat='School County',
    sub_name='County',
)

In [15]:
# Mean school distance grouped by the regional geography of the school parcel.
# Title says "School ..." so it is distinguishable from the home-based chart,
# matching the 'School County' / 'Home County' naming of the sibling calls.
plot_school_location(df_students, 'rg_proposed_school', 'School Regional Geography', 'Regional Geography')

## School Location by Home Geography

In [16]:
# Mean school distance grouped by the county of the home parcel.
plot_school_location(
    df=df_students,
    var='CountyName_home',
    title_cat='Home County',
    sub_name='County',
)

In [17]:
# Mean school distance grouped by the regional geography of the home parcel.
plot_school_location(
    df=df_students,
    var='rg_proposed_home',
    title_cat='Home Regional Geography',
    sub_name='Regional Geography',
)

## School Location by Person/Household Characteristics

In [18]:
# Restrict to person types 6, 7 and 8, then group on the pptyp code itself.
student_pptyp_rows = df_students[df_students['pptyp'].isin([6,7,8])]
plot_school_location(student_pptyp_rows, 'pptyp', 'Person Type', 'student type')

In [19]:
# Mean school distance grouped by the household density group.
plot_school_location(
    df=df_students,
    var='hh_density_group',
    title_cat='Household Density',
    sub_name='Household Density Group',
)

In [20]:
# Mean school distance grouped by the employment density group.
plot_school_location(
    df=df_students,
    var='emp_density_group',
    title_cat='Employment Density',
    sub_name='Employment Density Group',
)

## Distance Bins

In [21]:
# Coarse distance-to-school bins (per the original note, taken from
# workplace_location.csv). Intervals are right-closed, so psaudist values
# <= 0 fall outside every bin and become NaN.
coarse_edges = [0, 1, 2, 5, 15, 9999]
coarse_labels = ['0-1', '1-2', '2-5', '5-15', '15+']
df_students['distance_to_school_bin'] = pd.cut(
    df_students['psaudist'], bins=coarse_edges, labels=coarse_labels
)

# Fine bins: 2-mile intervals up to 60 miles, each labelled with its lower edge.
max_bin = 60
bin_size = 2
fine_edges = np.arange(0, max_bin + bin_size, bin_size)
fine_labels = [str(lower_edge) for lower_edge in fine_edges[:-1]]
df_students['d_school_bin_60mi'] = pd.cut(
    df_students['psaudist'], bins=fine_edges, labels=fine_labels
)

In [22]:
def plot_distance(df:pd.DataFrame, group:str, title_name:str):
    """Plot the weighted distribution of distance to school for each source.

    Two figures are shown: a grouped bar chart over `distance_to_school_bin`
    and a line chart over `d_school_bin_60mi`. In both, the y value is each
    bin's share (in percent) of that source's summed `psexpfac`.

    Args:
        df: Records with `source`, `psexpfac`, both bin columns and `group`.
        group: Name of the column used as a row filter; rows whose value is
            non-zero and not missing are kept.
        title_name: Title used for both figures.

    Returns:
        None. Both figures are rendered with `.show()`.
    """
    # Build an explicit boolean mask. Combining the raw column with another
    # mask via `&` is evaluated bitwise when the column has an integer dtype,
    # which keeps only odd codes (a value of 2 would be dropped) rather than
    # every non-zero row.
    in_group = df[group].notna() & df[group].ne(0)
    selected = df.loc[in_group]

    def _share_by_bin(bin_col:str) -> pd.DataFrame:
        """Return psexpfac totals per (source, bin) plus each bin's percent of its source."""
        # Missing bins are tested with notna(); `!= float('nan')` is always True.
        binned = selected.loc[selected[bin_col].notna()]
        # observed=False keeps every bin category, as the pandas default has done.
        totals = binned.groupby(['source', bin_col], observed=False)['psexpfac'].sum().reset_index()
        source_total = totals.groupby('source')['psexpfac'].transform('sum')
        totals['percentage'] = 100 * totals['psexpfac'] / source_total
        return totals

    # plot1: coarse bins as grouped bars
    fig1 = px.bar(_share_by_bin('distance_to_school_bin'),
                  x='distance_to_school_bin', y="percentage", color="source", barmode="group",
                  title=title_name)
    fig1.update_layout(height=400, width=700, font=dict(size=11))
    fig1.show()

    # plot2: fine bins as lines
    fig2 = px.line(_share_by_bin('d_school_bin_60mi'),
                   x='d_school_bin_60mi', y="percentage", color="source",
                   title=title_name)
    fig2.update_layout(height=400, width=700, font=dict(size=11))
    fig2.show()

# Distance to school, using pstyp as the row filter.
plot_distance(df=df_students, group='pstyp', title_name="distance to school (miles)")

In [23]:
# Readable labels for person-type codes 5-8; any other pptyp maps to NaN.
student_type_labels = {
    5: 'University',
    6: 'High School 16+',
    7: 'Child Age 5-15',
    8: 'Child Age 0-4',
}
df_students['student_type'] = df_students['pptyp'].map(student_type_labels)

In [24]:
# Rows with a labelled student type and a valid coarse distance bin. Missing
# bins are tested with notna(); the earlier `!= float('nan')` comparison is
# always True and therefore filtered nothing.
typed_students = df_students.loc[
    df_students['student_type'].notna() & df_students['distance_to_school_bin'].notna()
]
# One grouping shared by the weighted totals and the record counts so both are
# guaranteed to describe the same rows. observed=False keeps every bin category.
by_type_bin = typed_students.groupby(
    ['source','distance_to_school_bin','student_type'], observed=False
)['psexpfac']

# Weighted total per group, then each bin's share within its (source, student type).
df_plot = by_type_bin.sum().reset_index()
type_total = df_plot.groupby(['source','student_type'])['psexpfac'].transform('sum')
df_plot['percentage'] = 100 * df_plot['psexpfac'] / type_total

# Unweighted record count, shown in the hover.
df_plot_ct = by_type_bin.count().reset_index().rename(columns={'psexpfac':'sample count'})
df_plot = df_plot.merge(df_plot_ct, on=['source','student_type','distance_to_school_bin'])

fig = px.bar(df_plot, x="distance_to_school_bin", y="percentage", color="source",barmode="group",
             facet_col="student_type", facet_col_wrap=2,
             hover_data=['sample count'],
             title="Distance to school by student type")
# Facet titles arrive as "student_type=<value>"; keep only the value.
fig.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
fig.update_layout(height=900, width=700, font=dict(size=11))
fig.show()



In [25]:

# Rows with a labelled student type and a valid 2-mile distance bin. Missing
# bins are tested with notna(); `!= float('nan')` is always True.
typed_students = df_students.loc[
    df_students['student_type'].notna() & df_students['d_school_bin_60mi'].notna()
]
# Weighted total per (source, student type, bin); observed=False keeps every bin category.
df_plot = (
    typed_students
    .groupby(['source','student_type','d_school_bin_60mi'], observed=False)['psexpfac']
    .sum()
    .reset_index()
)
# Each bin's share within its (source, student type).
type_total = df_plot.groupby(['source','student_type'])['psexpfac'].transform('sum')
df_plot['percentage'] = 100 * df_plot['psexpfac'] / type_total

# The facets are student types, so the title says so (it previously read
# "by income group", copied from the income chart).
fig2 = px.line(df_plot, x='d_school_bin_60mi', y="percentage", color="source",
               facet_col="student_type", facet_col_wrap=2,
               title="Distance to school by student type")
# Facet titles arrive as "student_type=<value>"; keep only the value.
fig2.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
fig2.update_layout(height=400, width=700, font=dict(size=11))
fig2.show()

In [26]:
# Rows with a person type and a valid coarse distance bin.
# - `pptyp > 0` replaces using the raw column as a mask: with an integer dtype,
#   `column & mask` is evaluated bitwise and keeps only odd pptyp codes.
# - notna() replaces `!= float('nan')`, which is always True.
valid_rows = df_students.loc[
    (df_students['pptyp'] > 0) & df_students['distance_to_school_bin'].notna()
]
# One grouping shared by the weighted totals and the record counts so the hover
# counts describe exactly the rows behind each percentage.
by_income_bin = valid_rows.groupby(
    ['source','hhincome_group','distance_to_school_bin'], observed=False
)['psexpfac']

# Weighted total per group, then each bin's share within its (source, income group).
df_plot = by_income_bin.sum().reset_index()
income_total = df_plot.groupby(['source','hhincome_group'], observed=False)['psexpfac'].transform('sum')
df_plot['percentage'] = 100 * df_plot['psexpfac'] / income_total

# Unweighted record count, shown in the hover.
df_plot_ct = by_income_bin.count().reset_index().rename(columns={'psexpfac':'sample count'})
df_plot = df_plot.merge(df_plot_ct, on=['source','hhincome_group','distance_to_school_bin'])

fig = px.bar(df_plot, x="distance_to_school_bin", y="percentage", color="source",barmode="group",
             facet_col="hhincome_group", facet_col_wrap=2,
             hover_data=['sample count'],
             title="Distance to school by household income")
# Facet titles arrive as "hhincome_group=<value>"; keep only the value.
fig.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
fig.update_layout(height=900, width=700, font=dict(size=11))
fig.show()

In [27]:

# Rows with a person type and a valid 2-mile distance bin.
# - `pptyp > 0` replaces using the raw column as a mask: with an integer dtype,
#   `column & mask` is evaluated bitwise and keeps only odd pptyp codes.
# - notna() replaces `!= float('nan')`, which is always True.
valid_rows = df_students.loc[
    (df_students['pptyp'] > 0) & df_students['d_school_bin_60mi'].notna()
]
# Weighted total per (source, income group, bin); observed=False keeps every category.
df_plot = (
    valid_rows
    .groupby(['source','hhincome_group','d_school_bin_60mi'], observed=False)['psexpfac']
    .sum()
    .reset_index()
)
# Each bin's share within its (source, income group).
income_total = df_plot.groupby(['source','hhincome_group'], observed=False)['psexpfac'].transform('sum')
df_plot['percentage'] = 100 * df_plot['psexpfac'] / income_total

fig2 = px.line(df_plot, x='d_school_bin_60mi', y="percentage", color="source",
               facet_col="hhincome_group", facet_col_wrap=2,
               title="Distance to school by income group")
# Facet titles arrive as "hhincome_group=<value>"; keep only the value.
fig2.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
fig2.update_layout(height=400, width=700, font=dict(size=11))
fig2.show()

In [28]:
# Rows with a person type and a valid coarse distance bin.
# - `pptyp > 0` replaces using the raw column as a mask: with an integer dtype,
#   `column & mask` is evaluated bitwise and keeps only odd pptyp codes.
# - notna() replaces `!= float('nan')`, which is always True.
valid_rows = df_students.loc[
    (df_students['pptyp'] > 0) & df_students['distance_to_school_bin'].notna()
]
# One grouping shared by the weighted totals and the record counts so the hover
# counts describe exactly the rows behind each percentage.
by_density_bin = valid_rows.groupby(
    ['source','hh_density_group','distance_to_school_bin'], observed=False
)['psexpfac']

# Weighted total per group, then each bin's share within its (source, density group).
df_plot = by_density_bin.sum().reset_index()
density_total = df_plot.groupby(['source','hh_density_group'], observed=False)['psexpfac'].transform('sum')
df_plot['percentage'] = 100 * df_plot['psexpfac'] / density_total

# Unweighted record count, shown in the hover.
df_plot_ct = by_density_bin.count().reset_index().rename(columns={'psexpfac':'sample count'})
df_plot = df_plot.merge(df_plot_ct, on=['source','hh_density_group','distance_to_school_bin'])

fig = px.bar(df_plot, x="distance_to_school_bin", y="percentage", color="source",barmode="group",
             facet_col="hh_density_group", facet_col_wrap=2,
             hover_data=['sample count'],
             title="Distance to school by household density")
# Facet titles arrive as "hh_density_group=<value>"; keep only the value.
fig.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
fig.update_layout(height=900, width=700, font=dict(size=11))
fig.show()

In [29]:

# Rows with a person type and a valid 2-mile distance bin.
# - `pptyp > 0` replaces using the raw column as a mask: with an integer dtype,
#   `column & mask` is evaluated bitwise and keeps only odd pptyp codes.
# - notna() replaces `!= float('nan')`, which is always True.
valid_rows = df_students.loc[
    (df_students['pptyp'] > 0) & df_students['d_school_bin_60mi'].notna()
]
# Weighted total per (source, density group, bin); observed=False keeps every category.
df_plot = (
    valid_rows
    .groupby(['source','hh_density_group','d_school_bin_60mi'], observed=False)['psexpfac']
    .sum()
    .reset_index()
)
# Each bin's share within its (source, density group).
density_total = df_plot.groupby(['source','hh_density_group'], observed=False)['psexpfac'].transform('sum')
df_plot['percentage'] = 100 * df_plot['psexpfac'] / density_total

fig2 = px.line(df_plot, x='d_school_bin_60mi', y="percentage", color="source",
               facet_col="hh_density_group", facet_col_wrap=2,
               title="Distance to school by household density")
# Facet titles arrive as "hh_density_group=<value>"; keep only the value.
fig2.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
fig2.update_layout(height=400, width=700, font=dict(size=11))
fig2.show()