In [1]:
import os
import pandas as pd
import numpy as np
# import validation_data_input
from sqlalchemy import create_engine,text
import plotly.express as px
import toml
import polars as pl
from pathlib import Path
import util
import psrc_theme

# to show plotly figures in quarto HTML file
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook_connected"
pio.templates.default = "simple_white+psrc_color" # set plotly template

In [2]:
# Load the validation and input configuration files from the repository's
# configuration folder. Path parts are passed separately so the lookup works on
# any OS (a backslash-separated literal only resolves on Windows and relies on
# invalid string escapes).
config_dir = Path.cwd().joinpath('..', '..', '..', '..', 'configuration')
config = toml.load(config_dir / 'validation_configuration.toml')
input_config = toml.load(config_dir / 'input_configuration.toml')

# Household, person and land use tables, converted to pandas for the analysis below
data = util.ValidationData(config, input_config, ['hh', 'person', 'land_use'])

hh = data.hh.to_pandas()
person = data.person.to_pandas()
df_parcel = data.land_use.to_pandas()

# Parcel geography is read directly from the input database (not from ValidationData).
# The table name depends on the configured base year; it comes from the project's
# own configuration file, so it is formatted into the query rather than bound.
conn = create_engine('sqlite:///../../../../inputs/db/' + input_config['db_name'])
parcel_geog_query = text(f"SELECT * FROM parcel_{input_config['base_year']}_geography")
# Use the connection as a context manager so it is released after the read
with conn.connect() as db_connection:
    parcel_geog = pd.read_sql(parcel_geog_query, con=db_connection)

In [3]:
# Attach parcel-level land use attributes to each household via its home parcel
hh = hh.merge(df_parcel, how='left', left_on='hhparcel', right_on='parcelid')

In [4]:
# Break points (12.5th, 25th, 50th and 75th percentiles among model households)
# used to split income, household density and employment density into five groups
is_model_hh = hh['source'] == 'model'
var_group = hh.loc[is_model_hh, ['hhincome', 'emptot_1', 'hh_1']].quantile([0.125, 0.25, 0.5, 0.75])

# var_group

In [5]:
# Group ages
# Bin edges are the distinct ages present in the survey records, in ascending
# order, followed by an open-ended upper edge of 999.
survey_ages = np.sort(person.loc[person['source'] == 'survey', 'pagey'].astype('int').unique())
survey_ages = np.append(survey_ages, 999)

# Create labels
# pd.cut builds right-closed intervals, so without include_lowest=True persons at
# the youngest survey age would fall outside every bin and be labelled NaN.
# NOTE(review): ages below the youngest survey age are still left unbinned (NaN).
person['age'] = pd.cut(person['pagey'], bins=survey_ages, include_lowest=True)

In [6]:
# Classify households into five groups for income, household density and
# employment density, using the model quantiles in var_group as inner bin edges
# and +/-9999999 as the outer edges.
group_labels = ['very low', 'low', 'medium', 'medium-high', 'high']
grouped_columns = [
    ('hhincome', 'hhincome_group'),
    ('hh_1', 'hh_density_group'),       # hh density groups
    ('emptot_1', 'emp_density_group'),  # employment density groups
]
for value_col, group_col in grouped_columns:
    bin_edges = [-9999999.0] + var_group[value_col].tolist() + [9999999.0]
    hh[group_col] = pd.cut(hh[value_col], bins=bin_edges, labels=group_labels)


In [7]:
# Bring household attributes onto each person record, matching on household
# number within the same data source
person = person.merge(hh, how='left', on=['hhno', 'source'])

In [8]:
# Add the geography attributes of the home parcel to each person record
person = person.merge(parcel_geog, how='left', left_on='hhparcel', right_on='ParcelID')

In [9]:
# Display labels for the person type codes found in the 'pptyp' column
ptype_cat = {
    1: "1: full time worker",
    2: "2: part time worker",
    3: "3: non-worker age 65+",
    4: "4: other non-working adult",
    5: "5: university student",
    6: "6: grade school student/child age 16+",
    7: "7: child age 5-15",
    8: "8: child age 0-4",
}
# Map codes to labels and store them as a categorical whose categories follow
# the order of the dictionary above
pptyp_dtype = pd.CategoricalDtype(ptype_cat.values())
person['pptyp_label'] = person['pptyp'].map(ptype_cat).astype(pptyp_dtype)

In [10]:
# Weighted persons (sum of psexpfac) per source, one column per ptpass value
df = person.pivot_table(values='psexpfac', index='source', columns='ptpass', aggfunc='sum')
# Total weighted persons per source, used as the denominator of the shares
df_tot = person.groupby('source')['psexpfac'].sum().reset_index()
df = df.merge(df_tot, on='source')
# Shares of persons without (ptpass 0) and with (ptpass 1) a transit pass
df['no pass'] = df[0] / df['psexpfac']
df['% owns transit pass'] = df[1] / df['psexpfac']
df[['source', '% owns transit pass']]

Unnamed: 0,source,% owns transit pass
0,model,0.21172
1,survey,0.271554


In [11]:

# Transit pass ownership share by an arbitrary grouping variable
def plot_transit_pass(df:pd.DataFrame, var:str, title_cat:str, sub_name:str):
    """Show a grouped bar chart of the share of persons owning a transit pass.

    For every combination of ``source`` and ``var`` the weighted share of
    persons with ``ptpass == 1`` is computed from ``psexpfac`` and drawn with
    one bar per source. The number of records behind each bar is available as
    hover data.

    Args:
        df: Person records; must contain the columns ``source``, ``var``,
            ``ptpass`` and ``psexpfac``.
        var: Name of the column to group by (shown on the x-axis).
        title_cat: Text appended to "Transit Pass Ownership by " in the title.
        sub_name: Label for the x-axis.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    group_cols = ['source', var]

    # Weighted persons and raw record count per source / category / pass status
    df_plot = (
        df.groupby(group_cols + ['ptpass'])
        .agg(**{'psexpfac': ('psexpfac', 'sum'),
                'sample count': ('psexpfac', 'count')})
        .reset_index()
    )

    # Share of each pass status within its source / category group
    group_total = df_plot.groupby(group_cols)['psexpfac'].transform('sum')
    df_plot['percentage'] = df_plot['psexpfac'] / group_total

    # Only show the share with a pass (ptpass==1); copy so that adding a column
    # below does not write into a slice of the aggregated frame
    df_plot = df_plot[df_plot['ptpass'] == 1].copy()

    # When every label contains a number (e.g. "1: full time worker"), order the
    # x-axis by that number. Labels that are not strings, or that do not all
    # carry a number, keep the order produced by the groupby.
    try:
        label_numeric = df_plot[var].str.extract(r'(\d+)', expand=False)
    except AttributeError:
        # .str accessor is unavailable: the grouping column is not string-like
        label_numeric = None
    if label_numeric is not None and label_numeric.notna().all():
        df_plot['label_numeric'] = label_numeric.astype(int)
        df_plot = df_plot.sort_values(by='label_numeric', kind='stable')

    # remove the old survey data if needed
    # df_plot = df_plot[df_plot['source'].isin(['model','survey'])]

    # Create the bar plot with ordered x-axis
    fig = px.bar(df_plot, x=var, y="percentage", color="source",
                 barmode="group",
                 hover_data=['sample count'],
                 title="Transit Pass Ownership by " + title_cat)

    # Update the x-axis label to use sub_name
    fig.update_layout(xaxis_title=sub_name)

    fig.show()

## Pass Ownership by Home Location

In [12]:
# Pass ownership share by home county
plot_transit_pass(person, var='CountyName', title_cat='Home County', sub_name='County')

In [13]:
# Pass ownership share by regional geography of the home parcel
plot_transit_pass(person, var='rg_proposed', title_cat='Regional Geography', sub_name='Geog')

## Pass Ownership by Person/Household Characteristics

In [14]:
# Pass ownership share by person type
plot_transit_pass(person, var='pptyp_label', title_cat='Person Type', sub_name='person type')











In [15]:
# Pass ownership share by household density group at the home location
plot_transit_pass(person, var='hh_density_group', title_cat='Household Density', sub_name='HH Density at Home')











In [16]:
# Pass ownership share by employment density group at the home location
plot_transit_pass(person, var='emp_density_group', title_cat='Employment Density', sub_name='Emp Density at Home')









