In [None]:
from pathlib import Path

import holoviews as hv
import polars as pl
from bokeh.io import output_notebook  # for JupyterLab, not needed for VSCode

# Configure Bokeh to render its output inline in notebook cells.
output_notebook()  # for JupyterLab, not needed for VSCode

In [None]:
# 2019 annual passenger count extracts: one CSV per direction (the "NB" and "SB" files).
dir2019 = Path(r"Q:\Data\Observed\Transit\Caltrain\2019\annual_pax_count-extracts")
boardings2019nb = pl.read_csv(
    dir2019 / "caltrain-annual_pax_count-2019-pax-NB-AMWR-total_boardings.csv",
)
boardings2019sb = pl.read_csv(
    dir2019 / "caltrain-annual_pax_count-2019-pax-SB-AMWR-total_boardings.csv",
)
# Combine both directions into a single boardings figure per station.
boardings2019 = boardings2019sb.join(
    boardings2019nb, on=["ID", "Station"], how="left"
).select(
    "ID",
    station="Station",
    # The left join leaves NB-Boardings null for any station that is absent from the
    # NB extract; null + value is null, which would wipe out that station's SB
    # boardings as well. Count a missing NB figure as 0 instead.
    avg_mid_weekday_boardings_2019=(
        pl.col("SB-Boardings") + pl.col("NB-Boardings").fill_null(0)
    ),
)
# Post-COVID ridership estimates, broken down by origin station.
ridership_dir = Path(r"Q:\Data\Observed\Transit\Caltrain\ridership")
postcovid_csv = "4. Caltrain Average Ridership Estimates - Origin Station Detail - 240607 - reformatted.csv"
# All columns are read: this cell only needs "Date", "Origin Station" and
# "Average Weekday Ridership", but a later cell also uses the
# Saturday/Sunday/Holiday columns.
boardings_postcovid = pl.read_csv(ridership_dir / postcovid_csv)

# Average the January and February 2024 estimates per station: the pre-COVID annual
# pax counts were from the Jan/Feb timeframe, so these months are the closest match.
boardings_janfeb2024 = (
    boardings_postcovid.filter(
        pl.col("Date").is_in(["January 2024", "February 2024"])
    )
    .group_by("Origin Station")
    .agg(avg_weekday_boardings_2024=pl.mean("Average Weekday Ridership"))
    .rename({"Origin Station": "station"})
)

# No `how` is passed, so this is polars' default inner join: only stations whose
# names appear in both tables are kept.
weekday_boardings_by_station = boardings2019.join(boardings_janfeb2024, on="station")

In [None]:
def add_geography_column(boardings):
    """Return `boardings` with an added `geography` label column.

    The label is derived from the `station` column:

    - "San Francisco"           -> "1. SF (4th & King)"
    - "22nd Street", "Bayshore" -> "2. SF (22nd St / Bayshore)"
    - anything else             -> "3. ex-SF"
    """
    station = pl.col("station")
    # HACK: the numeric prefixes make the labels sort in the desired order when plotting
    geography_label = (
        pl.when(station == "San Francisco")
        .then(pl.lit("1. SF (4th & King)"))
        .when(station.is_in(["22nd Street", "Bayshore"]))
        .then(pl.lit("2. SF (22nd St / Bayshore)"))
        .otherwise(pl.lit("3. ex-SF"))
    )
    return boardings.with_columns(geography=geography_label)


def group_weekday_boardings_by_geography(boardings):
    """Sum station-level boardings per geography and reshape to long format.

    `boardings` must carry the columns `ID` and `station` plus the two boardings
    columns `avg_mid_weekday_boardings_2019` and `avg_weekday_boardings_2024`.

    Returns one row per (geography, year) with the columns `geography`, `year`
    (integer) and `avg(_mid)_weekday_boardings`.
    """
    # maps each wide boardings column to the year it represents
    year_by_column = {
        "avg_mid_weekday_boardings_2019": 2019,
        "avg_weekday_boardings_2024": 2024,
    }
    totals_by_geography = (
        add_geography_column(boardings)
        .drop("ID", "station")
        .group_by("geography")
        .sum()
    )
    # The value column name hedges on purpose:
    # 2019 is mid-weekday (Tue-Thu), whereas 2024 is weekday (Mon-Fri).
    long_totals = totals_by_geography.melt(
        id_vars="geography",
        variable_name="year",
        value_name="avg(_mid)_weekday_boardings",
    )
    return long_totals.with_columns(pl.col("year").replace(year_by_column).cast(int))


def calc_geography_ridership_shares(
    boardings_grouped_for_stacking, time_col, boardings_col
):
    """Add each row's share of the total boardings within its `time_col` group.

    Parameters
    ----------
    boardings_grouped_for_stacking
        Long-format frame with a `geography` column, `time_col` and `boardings_col`.
    time_col
        Name of the column that defines the groups the shares are computed within.
    boardings_col
        Name of the column holding the boardings to be turned into shares.

    Returns
    -------
    A frame with the columns `time_col`, `geography`, `boardings_col` and
    `<boardings_col>_shares`, where the share is the row's boardings divided by
    the sum of boardings over all rows with the same `time_col` value.
    """
    boardings = pl.col(boardings_col)
    # full-system total for the row's time period
    system_total = boardings.sum().over(time_col)
    return boardings_grouped_for_stacking.select(
        time_col,
        "geography",
        boardings_col,
        (boardings / system_total).alias(f"{boardings_col}_shares"),
    )

In [None]:
# Boardings per (geography, year), then each geography's share of that year's total.
weekday_boardings_by_geo_year = group_weekday_boardings_by_geography(
    weekday_boardings_by_station
)
weekday_boardings_grouped = calc_geography_ridership_shares(
    weekday_boardings_by_geo_year,
    time_col="year",
    boardings_col="avg(_mid)_weekday_boardings",
)

In [None]:
awr_csv = ridership_dir / "2. Caltrain Monthly AWR Estimates - 240607 - reformatted.csv"
awr_df = pl.read_csv(awr_csv).with_columns(
    # Take the last space-separated token of "Date" as the year
    # (.str.to_date(format="%B %Y") did not work on this column).
    year=pl.col("Date").str.split(" ").list.last().cast(int)
)

# Keep only 2018-2023 (incomplete years are filtered out), then average per year.
is_complete_year = (pl.col("year") > 2017) & (pl.col("year") < 2024)
awr_by_year = awr_df.filter(is_complete_year).group_by("year").mean().sort("year")

awr_plot = awr_by_year.plot.line(
    x="year",
    y="Average Weekday Ridership",
    title="Caltrain full system average weekday ridership",
)
hv.save(awr_plot, "../figs/caltrain-avg_weekday_ridership.html")
awr_plot

In [None]:
# CAVEAT: the two years are measured differently, so this chart is not a
# like-for-like comparison (it is displayed but deliberately not saved):
# - the 2019 numbers are calculated from Caltrain's annual ridership count
#   (done in the January/February timeframe on a Tue-Thu day)
# - the 2024 numbers are calculated from the fare media sales-based ridership
#   estimates, averaging the estimates from January and February 2024 over Mon-Fri
weekday_boardings_by_geo_title = (
    "Caltrain average weekday boardings by Geography (only Tue-Thu for 2019)\n"
    "(DON'T USE THIS ONE: NOT comparing apples to apples)"
)
weekday_boardings_grouped.sort(by="geography").plot.bar(
    x="year",
    y="avg(_mid)_weekday_boardings",
    by="geography",
    stacked=True,
    title=weekday_boardings_by_geo_title,
)

In [None]:
# Stacked shares per year; the title spells out how each year was measured.
prepostcovid_shares_title = (
    "Caltrain average weekday boarding shares by Geography\n"
    "2019: annual pax count, conducted in Jan/Feb timeframe on a Tue-Thu day\n"
    "2024: fare media sales-based ridership estimates, averaged over Mon-Fri in Jan &Feb"
)
# sort so that the geographies stack in their numbered order
weekday_boardings_sorted_by_geo = weekday_boardings_grouped.sort(by="geography")
avg_weekday_boardings_by_geo_prepostcovid_shares_plot = (
    weekday_boardings_sorted_by_geo.plot.bar(
        x="year",
        y="avg(_mid)_weekday_boardings_shares",
        by="geography",
        stacked=True,
        title=prepostcovid_shares_title,
    )
)
hv.save(
    avg_weekday_boardings_by_geo_prepostcovid_shares_plot,
    "../figs/caltrain-avg_weekday_boardings_by_geo_prepostcovid_shares.html",
)
avg_weekday_boardings_by_geo_prepostcovid_shares_plot


In [None]:
# boardings_postcovid appears to hold one row per month and origin station (the
# Jan/Feb 2024 filter on "Date" earlier in the notebook relies on that). Average each
# station's figures over its rows first: summing the raw rows by geography would add
# up the monthly averages and inflate the totals by the number of months, so the
# result would no longer be an average day.
station_avg_boardings_by_dow = (
    boardings_postcovid.select(
        pl.col("Average Weekday Ridership").alias("weekday"),
        # row-wise mean of the Saturday, Sunday and holiday averages
        pl.concat_list(
            "Average Saturday Ridership",
            "Average Sunday Ridership",
            "Average Holiday Ridership",
        )
        .list.mean()
        .alias("weekend/holiday"),
        station=pl.col("Origin Station"),
    )
    .group_by("station")
    .mean()
)
# Sum the station averages per geography, reshape to one row per (geography, dow),
# and add each geography's share of the system total for that day type.
boardings_postcovid_by_geography_dow = calc_geography_ridership_shares(
    add_geography_column(station_avg_boardings_by_dow)
    .drop("station")
    .group_by("geography")
    .sum()
    .melt(
        id_vars="geography",
        variable_name="dow",
        value_name="avg_daily_boardings",
    )
    .sort(by="geography"),
    "dow",
    "avg_daily_boardings",
)
# Stacked bar of boardings per day type, split by geography; saved and displayed.
dow_boardings_title = (
    "Caltrain average daily boardings by DOW & Geography (Nov 2023-Apr 2024)"
)
boardings_postcovid_by_geo_dow_plot = boardings_postcovid_by_geography_dow.plot.bar(
    x="dow",
    y="avg_daily_boardings",
    by="geography",
    stacked=True,
    title=dow_boardings_title,
)
dow_boardings_html = "../figs/caltrain-boardings_postcovid_by_geo_dow.html"
hv.save(boardings_postcovid_by_geo_dow_plot, dow_boardings_html)
boardings_postcovid_by_geo_dow_plot

In [None]:
# Same chart as the absolute boardings one, but plotting each geography's share.
dow_shares_title = (
    "Caltrain average daily boardings shares by DOW & Geography (Nov 2023-Apr 2024)"
)
boardings_postcovid_by_geo_dow_shares_plot = (
    boardings_postcovid_by_geography_dow.plot.bar(
        x="dow",
        y="avg_daily_boardings_shares",
        by="geography",
        stacked=True,
        title=dow_shares_title,
    )
)
dow_shares_html = "../figs/caltrain-boardings_postcovid_by_geo_dow_shares.html"
hv.save(boardings_postcovid_by_geo_dow_shares_plot, dow_shares_html)
boardings_postcovid_by_geo_dow_shares_plot