In [1]:
import polars as pl
import plotly.express as px
import toml
from pathlib import Path
import util
import psrc_theme

# Render Plotly figures through both the plotly mimetype and the
# notebook_connected renderer so they show up in the Quarto HTML file.
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook_connected"
# Default every figure to the combined "simple_white" + "psrc_color" templates.
# NOTE(review): "psrc_color" is presumably registered by `import psrc_theme` -- confirm.
pio.templates.default = "simple_white+psrc_color" # set plotly template

In [2]:
# Configuration lives four directory levels above the notebook's working
# directory. The path components are passed separately instead of as a single
# backslash-delimited string: '\.' is an invalid escape sequence in a normal
# string literal, and backslashes are only path separators on Windows.
config_dir = Path.cwd().joinpath('..', '..', '..', '..', 'configuration')

config = toml.load(config_dir / 'validation_configuration.toml')
input_config = toml.load(config_dir / 'input_configuration.toml')

# Load the person and tour tables used throughout this notebook.
data = util.ValidationData(config, input_config, ['person', 'tour'])

## number of tours per person

In [3]:
# Display labels keyed by the person-type code stored in `pptyp`.
ptype_cat = {
    1: "1: full time worker",
    2: "2: part time worker",
    3: "3: non-worker age 65+",
    4: "4: other non-working adult",
    5: "5: university student",
    6: "6: grade school student/child age 16+",
    7: "7: child age 5-15",
    8: "8: child age 0-4",
}

# Display labels keyed by the tour-purpose code stored in `pdpurp`.
pdpurp_cat = {
    1: "Work",
    2: "School",
    3: "Escort",
    4: "Personal Business",
    5: "Shop",
    6: "Meal",
    7: "Social",
}

# Person table with a categorical label column derived from `pptyp`.
person_type_label = (
    data.person['pptyp']
    .map_dict(ptype_cat)
    .alias('pptyp_label')
    .cast(pl.Categorical)
)
df_person = data.person.with_columns(person_type_label)

# Tour table with a categorical label column derived from `pdpurp`.
tour_purpose_label = (
    data.tour['pdpurp']
    .map_dict(pdpurp_cat)
    .alias('pdpurp_label')
    .cast(pl.Categorical)
)
tour_labeled = data.tour.with_columns(tour_purpose_label)

# Bring the person attributes onto every tour record.
df_tour = tour_labeled.join(
    df_person,
    on=['hhno', 'pno', 'source'],
    how='left',
)

In [4]:
# Sum of the person expansion factors for each data source.
df_person.groupby('source').agg(
    [pl.sum('psexpfac').alias('psexpfac_sum')]
)

source,psexpfac_sum
str,f64
"""survey""",3782100.0
"""survey (2017/2…",3738500.0
"""model""",4356019.0


In [5]:
# Summed tour expansion factors per source.
tour_totals = df_tour.groupby('source').agg(
    [pl.sum('toexpfac').alias('toexpfac_sum')]
)

# Summed person expansion factors per source.
person_totals = df_person.groupby('source').agg(
    [pl.sum('psexpfac').alias('psexpfac_sum')]
)

# Average tours = summed tour weights / summed person weights. The empty
# 'person' column gives the bar chart a single x position to group on.
df_plot = tour_totals.join(person_totals, on='source', how='left').with_columns(
    (pl.col('toexpfac_sum') / pl.col('psexpfac_sum')).alias('average tours'),
    pl.lit('').alias('person'),
)

fig = px.bar(
    df_plot.to_pandas(),
    x="person",
    y="average tours",
    color="source",
    barmode="group",
    title="number of tours per person",
)
fig.update_layout(
    height=400,
    width=400,
    font={'size': 11},
    xaxis={'dtick': 1, 'categoryorder': 'category ascending'},
    yaxis={'tickformat': ".3"},
)
fig.show()

## percent of tours by purpose

In [6]:
# Per source and tour purpose: the summed tour expansion factor and the number
# of tour records behind it. Both are computed in a single aggregation, so the
# frame is grouped once and no self-join is needed to attach the count (a join
# on the label column would also leave the count empty for any group whose
# purpose label is null).
df_plot = df_tour.groupby(['source', 'pdpurp_label']).agg([
    pl.sum('toexpfac').alias('toexpfac_sum'),
    pl.count('toexpfac').alias('sample count')
]).with_columns(
    # Share of each purpose within its own source.
    (pl.col('toexpfac_sum') / pl.col('toexpfac_sum').sum().over('source')).alias('percentage')
)

# Sort by source so the bar/legend order is stable across runs.
fig = px.bar(df_plot.to_pandas().sort_values(by=['source']), x="pdpurp_label", y="percentage", color="source",
             barmode="group", hover_data=['sample count'], title="tour purpose")
fig.update_layout(height=400, width=700, font=dict(size=11),
                  yaxis=dict(tickformat=".2%"))
fig.show()

## number of tours per person by segment

In [7]:
def plot_segment(df, tour_group_var, person_group_var, title_name):
    """Show a grouped bar chart of the "average tour" column, one bar per source.

    Args:
        df: frame exposing ``to_pandas()`` with "source" and "average tour"
            columns plus the column named by ``tour_group_var[1]``.
        tour_group_var: sequence of column names; the second entry is used
            as the x axis.
        person_group_var: not used by this function.
        title_name: chart title.
    """
    plot_frame = df.to_pandas()
    x_column = tour_group_var[1]

    chart = px.bar(
        plot_frame,
        x=x_column,
        y="average tour",
        color="source",
        barmode="group",
        title=title_name,
    )
    chart.update_layout(
        height=400,
        width=700,
        font={'size': 11},
        yaxis={'tickformat': ".2"},
    )
    chart.show()

# Summed tour weights for every source / purpose combination.
purpose_tour_totals = df_tour.groupby(['source', 'pdpurp_label']).agg(
    [pl.sum('toexpfac').alias('toexpfac_sum')]
)

# Summed person weights per source: the denominator shared by all purposes.
source_person_totals = df_person.groupby(['source']).agg(
    [pl.sum('psexpfac').alias('psexpfac_sum')]
)

df_plot = purpose_tour_totals.join(
    source_person_totals,
    on='source',
    how='left',
).with_columns(
    (pl.col('toexpfac_sum') / pl.col('psexpfac_sum')).alias('average tour')
)

plot_segment(
    df_plot,
    tour_group_var=['source', 'pdpurp_label'],
    person_group_var=['source'],
    title_name="number of tours per person by tour purpose",
)


In [8]:
# Tours per person within each person type: tour weights and person weights
# are both totalled by source and person type before dividing.
segment_keys = ['source', 'pptyp_label']

type_tour_totals = df_tour.groupby(['source', 'pptyp_label']).agg(
    [pl.sum('toexpfac').alias('toexpfac_sum')]
)
type_person_totals = df_person.groupby(['source', 'pptyp_label']).agg(
    [pl.sum('psexpfac').alias('psexpfac_sum')]
)

df_plot = type_tour_totals.join(
    type_person_totals,
    on=['source', 'pptyp_label'],
    how='left',
).with_columns(
    (pl.col('toexpfac_sum') / pl.col('psexpfac_sum')).alias('average tour')
)

plot_segment(
    df_plot,
    tour_group_var=segment_keys,
    person_group_var=segment_keys,
    title_name="number of tours per person by person type",
)

In [9]:
# Keep only work tours (pdpurp code 1 is labelled "Work" in pdpurp_cat).
wk_tour = df_tour.filter(pl.col('pdpurp') == 1)

# Work-tour weights divided by person weights, per source and person type.
df_plot = wk_tour.groupby(['source', 'pptyp_label']).agg([
    pl.sum('toexpfac').alias('toexpfac_sum')
]).join(
    df_person.groupby(['source', 'pptyp_label']).agg([
        pl.sum('psexpfac').alias('psexpfac_sum')
    ]),
    on=['source', 'pptyp_label'],
    how='left'
).with_columns(
    (pl.col('toexpfac_sum') / pl.col('psexpfac_sum')).alias('average tour')
)

# Reuse the shared chart helper rather than repeating its px.bar/update_layout
# calls; it draws the same grouped bars with the same size and tick format.
plot_segment(df_plot, tour_group_var=['source', 'pptyp_label'], person_group_var=['source', 'pptyp_label'],
             title_name="number of work tours per person by person type")

### Tours by Purpose and Person Type

In [10]:

def plot_by_pptyp(df_tour, person_type):
    """Chart tours per person by tour purpose for a single person type.

    Tour weights for the selected person type are totalled by source and
    purpose, then divided by that person type's total person weight for the
    same source. The person totals come from the module-level ``df_person``.

    Args:
        df_tour: tour frame with 'pptyp', 'source', 'pdpurp_label' and
            'toexpfac' columns.
        person_type: person-type code; converted with ``int()`` for the
            filter and with ``str()`` for the chart title.
    """
    type_code = int(person_type)
    is_selected_type = pl.col('pptyp') == type_code

    tour_totals = (
        df_tour.filter(is_selected_type)
        .groupby(['source', 'pdpurp_label'])
        .agg([pl.sum('toexpfac').alias('toexpfac_sum')])
    )
    person_totals = (
        df_person.filter(is_selected_type)
        .groupby(['source'])
        .agg([pl.sum('psexpfac').alias('psexpfac_sum')])
    )

    df_plot = tour_totals.join(
        person_totals,
        on=['source'],
        how='left',
    ).with_columns(
        (pl.col('toexpfac_sum') / pl.col('psexpfac_sum')).alias('average tour')
    )

    chart_title = "number of tours per person for person type " + str(person_type)
    plot_segment(
        df_plot,
        tour_group_var=['source', 'pdpurp_label'],
        person_group_var=['source'],
        title_name=chart_title,
    )


In [11]:
# Person type 1: full time worker
plot_by_pptyp(df_tour, person_type='1')

In [12]:
# Person type 2: part time worker
plot_by_pptyp(df_tour, person_type='2')

In [13]:
# Person type 3: non-worker age 65+
plot_by_pptyp(df_tour, person_type='3')

In [14]:
# Person type 4: other non-working adult
plot_by_pptyp(df_tour, person_type='4')

In [15]:
# Person type 5: university student
plot_by_pptyp(df_tour, person_type='5')

In [16]:
# Person type 6: grade school student/child age 16+
plot_by_pptyp(df_tour, person_type='6')

In [17]:
# Person type 7: child age 5-15
plot_by_pptyp(df_tour, person_type='7')

In [18]:
# Person type 8: child age 0-4
plot_by_pptyp(df_tour, person_type='8')