In [None]:
%load_ext autoreload

In [None]:
import sys
from pathlib import Path

import altair as alt
import polars as pl

sys.path.append("../")
%autoreload
from altair_utils import (
    color_covid_pre_post_both,
    color_sfcore_restofsf_restofbayarea,
    color_value_transit_teal,
)
from utils import line_plot


In [None]:
# --- 2019 (pre-COVID): annual passenger count extracts, one CSV per direction ---
dir2019 = Path(r"Q:\Data\Observed\Transit\Caltrain\2019\annual_pax_count-extracts")
boardings2019nb = pl.read_csv(
    dir2019 / "caltrain-annual_pax_count-2019-pax-NB-AMWR-total_boardings.csv"
)
boardings2019sb = pl.read_csv(
    dir2019 / "caltrain-annual_pax_count-2019-pax-SB-AMWR-total_boardings.csv"
)
# The southbound table is the left side of the join, so every southbound row
# is kept; a station's 2019 figure is the sum of its two directional boardings.
# NOTE(review): a station missing from the northbound extract would get a null
# total here -- confirm both extracts list the same stations.
boardings2019_both_directions = boardings2019sb.join(
    boardings2019nb, on=["ID", "Station"], how="left", coalesce=True
)
total_boardings_2019 = pl.col("SB-Boardings") + pl.col("NB-Boardings")
boardings2019 = boardings2019_both_directions.select(
    "ID",
    station="Station",
    avg_mid_weekday_boardings_2019=total_boardings_2019,
)

# --- post-COVID: ridership estimates by origin station ---
ridership_dir = Path(r"Q:\Data\Observed\Transit\Caltrain\ridership")
boardings_postcovid = pl.read_csv(
    ridership_dir
    / "4. Caltrain Average Ridership Estimates - Origin Station Detail - 240607 - reformatted.csv",
    # columns=["Date", "Origin Station", "Average Weekday Ridership"],
)
# Only Jan/Feb 2024 is averaged, because the pre-COVID annual pax counts were
# taken in the Jan/Feb timeframe.
boardings_2024_jan_feb = (
    boardings_postcovid.filter(
        pl.col("Date").is_in(["January 2024", "February 2024"])
    )
    .group_by("Origin Station")
    .agg(avg_weekday_boardings_2024=pl.mean("Average Weekday Ridership"))
    .rename({"Origin Station": "station"})
)
# Default (inner) join on the station name: only stations present in both
# sources are kept.
# NOTE(review): station names that differ between the two sources are dropped
# silently -- confirm the spellings match.
weekday_boardings_by_station = boardings2019.join(
    boardings_2024_jan_feb, on="station"
)

In [None]:
def add_geography_column(boardings):
    """Return ``boardings`` with a ``geography`` label derived from ``station``.

    The labels are:

    * ``"SF (4th & King)"`` for the station named ``"San Francisco"``
    * ``"SF (22nd St / Bayshore)"`` for ``"22nd Street"`` and ``"Bayshore"``
    * ``"rest of Caltrain system"`` for every other station
    """
    station = pl.col("station")
    geography_label = (
        pl.when(station == "San Francisco")
        .then(pl.lit("SF (4th & King)"))
        .when(station.is_in(["22nd Street", "Bayshore"]))
        .then(pl.lit("SF (22nd St / Bayshore)"))
        .otherwise(pl.lit("rest of Caltrain system"))
    )
    return boardings.with_columns(geography=geography_label)


def unpivot_weekday_boardings(boardings, group_by_geography):
    """Reshape the wide boardings table to long form, one row per index and year.

    Parameters
    ----------
    boardings
        Polars frame with ``ID`` and ``station`` columns plus the value columns
        ``avg_mid_weekday_boardings_2019`` and ``avg_weekday_boardings_2024``.
    group_by_geography
        If truthy, stations are first labelled via ``add_geography_column``,
        ``ID``/``station`` are dropped and the remaining columns are summed per
        ``geography``; the result is then indexed by ``geography``.
        Otherwise the result is indexed by ``ID`` and ``station``.

    Returns
    -------
    Long frame with the index column(s), an integer ``year`` column
    (2019 or 2024) and the value column ``avg(_mid)_weekday_boardings``.

    Raises
    ------
    ``replace_strict`` raises if an unpivoted column name is not one of the
    two known value columns.
    """
    if group_by_geography:
        boardings = (
            add_geography_column(boardings)
            .drop("ID", "station")
            .group_by("geography")
            .sum()
        )
        index_cols = "geography"
    else:
        index_cols = ["ID", "station"]

    year_by_value_column = {
        "avg_mid_weekday_boardings_2019": 2019,
        "avg_weekday_boardings_2024": 2024,
    }
    return boardings.unpivot(
        index=index_cols,
        variable_name="year",
        # 2019: mid-weekday (Tue-Thu), whereas 2024: weekday (Mon-Fri)
        value_name="avg(_mid)_weekday_boardings",
    ).with_columns(
        # Map column names straight to integer years (as the other cells do with
        # replace_strict) instead of replacing within the string column and
        # casting afterwards; unknown column names fail loudly here.
        pl.col("year").replace_strict(year_by_value_column, return_dtype=pl.Int64)
    )


def calc_geography_ridership_shares(
    boardings_grouped_for_stacking,
    time_col,
    boardings_col,
    geography_cols=("geography",),
):
    """Add each row's share of the total boardings within its time period.

    Parameters
    ----------
    boardings_grouped_for_stacking
        Polars frame containing ``time_col``, ``boardings_col`` and every
        column named in ``geography_cols``.
    time_col
        Name of the column defining the periods; totals are computed per
        distinct value of this column.
    boardings_col
        Name of the numeric boardings column.
    geography_cols
        Iterable of column names to carry through to the output
        (default: just ``"geography"``). Immutable default so that it can
        never be shared and mutated across calls.

    Returns
    -------
    Frame with exactly the columns ``time_col``, ``*geography_cols``,
    ``boardings_col`` and ``f"{boardings_col}_shares"``, where the share is
    ``boardings_col`` divided by the sum of ``boardings_col`` over ``time_col``.
    """
    return boardings_grouped_for_stacking.with_columns(
        # denominator: total boardings across all rows of the same period
        full_sys_boardings=pl.sum(boardings_col).over(time_col)
    ).select(
        time_col,
        *geography_cols,
        boardings_col,
        (pl.col(boardings_col) / pl.col("full_sys_boardings")).alias(
            f"{boardings_col}_shares"
        ),
    )

In [None]:
# Per-station long table: one row per (year, station) with the boardings and
# that station's share of the year's total across all stations in the table.
boardings_by_station_unpivoted = unpivot_weekday_boardings(
    weekday_boardings_by_station, group_by_geography=False
)
boardings_by_station_with_shares = calc_geography_ridership_shares(
    boardings_by_station_unpivoted,
    "year",
    "avg(_mid)_weekday_boardings",
    geography_cols=["ID", "station"],
)
# Friendlier column names for the exported CSV and the charts below.
by_station_output_names = {
    "avg(_mid)_weekday_boardings": "boardings",
    "avg(_mid)_weekday_boardings_shares": "boardings geographic share",
}
weekday_boardings_by_station_long = (
    boardings_by_station_with_shares.sort("year", "ID")
    .rename(by_station_output_names)
    .with_columns(
        pl.col("boardings").round(0),
        pl.col("boardings geographic share").round(3),
    )
)
weekday_boardings_by_station_long.write_csv(
    "../output/data/caltrain-avg_weekday_boardings-by_station.csv"
)

# by_station_plot = (
#     weekday_boardings_by_station_long.plot(
#         x="station", y="avg(_mid)_weekday_boardings", by="year", ylim=(0, None)
#     ).opts(xrotation=45)
#     + weekday_boardings_by_station_long.plot(
#         x="station", y="avg(_mid)_weekday_boardings_shares", by="year", ylim=(0, None)
#     ).opts(xrotation=45)
# ).cols(1)
# hv.save(
#     by_station_plot, "../output/Links/caltrain-avg_weekday_ridership-by_station.png"
# )
# hv.save(
#     by_station_plot, "../output/Links/caltrain-avg_weekday_ridership-by_station.html"
# )
# by_station_plot


In [None]:
# Bar chart of boardings per station with the 2019 and 2024 bars overlaid
# (stack=False) at 50% opacity. The 2019 series is labelled "decrease from 2019"
# and the 2024 series "increase from 2019" in the legend.
# NOTE(review): presumably because only the non-overlapping part of each
# translucent bar shows its own colour -- confirm.
awr_by_station_chart_data = weekday_boardings_by_station_long.with_columns(
    color_legend=pl.col("year").replace_strict(
        {2019: "decrease from 2019", 2024: "increase from 2019"}
    )
)
# Stations are ordered along the x axis by their ID rather than alphabetically.
awr_by_station_x = alt.X(
    "station",
    axis=alt.Axis(labelAngle=-45),
    sort=alt.EncodingSortField("ID"),
    title=None,
)
awr_by_station_y = alt.Y(
    "boardings",
    title="Caltrain average weekday boardings 2019 vs 2024",
    stack=False,
)
awr_by_station_color = alt.Color("color_legend:O").scale(
    domain=["decrease from 2019", "increase from 2019"],
    range=color_covid_pre_post_both[:2],
)
awr_by_station_chart = (
    alt.Chart(awr_by_station_chart_data)
    .mark_bar()
    .encode(
        x=awr_by_station_x,
        y=awr_by_station_y,
        color=awr_by_station_color,
        tooltip=["station", "year", "boardings"],
        opacity=alt.value(0.5),
    )
)

# HTML export uses the container width; the PNG is rendered at 3x scale.
awr_by_station_chart.properties(width="container").save(
    "../output/Links/caltrain-avg_weekday_ridership-by_station.html"
)
awr_by_station_chart.save(
    "../output/Links/caltrain-avg_weekday_ridership-by_station.png", scale=3
)
awr_by_station_chart


In [None]:
# Same layout as the per-station boardings chart, but plotting each station's
# share of the year's total; the overlaid (stack=False) 2019 and 2024 bars are
# combined with the "multiply" blend mode.
awr_by_station_shares_chart_data = weekday_boardings_by_station_long.with_columns(
    color_legend=pl.col("year").replace_strict(
        {2019: "decrease from 2019", 2024: "increase from 2019"}
    )
)
# Stations are ordered along the x axis by their ID rather than alphabetically.
awr_by_station_shares_x = alt.X(
    "station",
    axis=alt.Axis(labelAngle=-45),
    sort=alt.EncodingSortField("ID"),
    title=None,
)
awr_by_station_shares_y = alt.Y(
    "boardings geographic share",
    stack=False,
    title="Caltrain average weekday boardings 2019 vs 2024: geographic shares",
).axis(format="%")
awr_by_station_shares_color = alt.Color("color_legend:O").scale(
    domain=["decrease from 2019", "increase from 2019"],
    range=color_covid_pre_post_both[:2],
)
awr_by_station_shares_tooltip = [
    "station",
    "year",
    alt.Tooltip("boardings geographic share", format=".1%"),
]
awr_by_station_shares_chart = (
    alt.Chart(awr_by_station_shares_chart_data)
    .mark_bar(blend="multiply")
    .encode(
        x=awr_by_station_shares_x,
        y=awr_by_station_shares_y,
        color=awr_by_station_shares_color,
        tooltip=awr_by_station_shares_tooltip,
    )
)

# HTML export uses the container width; the PNG is rendered at 3x scale.
awr_by_station_shares_chart.properties(width="container").save(
    "../output/Links/caltrain-avg_weekday_ridership-by_station-shares.html"
)
awr_by_station_shares_chart.save(
    "../output/Links/caltrain-avg_weekday_ridership-by_station-shares.png", scale=3
)
awr_by_station_shares_chart


In [None]:
# Monthly system-wide average weekday ridership (AWR) estimates, averaged per
# calendar year.
awr_monthly = pl.read_csv(
    ridership_dir / "2. Caltrain Monthly AWR Estimates - 240607 - reformatted.csv"
)
# "Date" holds strings whose last space-separated token is the year
# (parsed by hand because .str.to_date(format="%B %Y") isn't working).
year_from_date = pl.col("Date").str.split(" ").list.last().cast(int)
# filter out incomplete years: keep 2018 through 2023 only
is_complete_year = (pl.col("year") > 2017) & (pl.col("year") < 2024)
awr_df = (
    awr_monthly.with_columns(year=year_from_date)
    .filter(is_complete_year)
    .group_by("year")
    .mean()
    .sort("year")
)
awr_df.write_csv("../output/data/caltrain-avg_weekday_ridership.csv")

awr_chart = line_plot(
    awr_df,
    x="year",
    y="Average Weekday Ridership",
    color=color_value_transit_teal,
    title="Caltrain average weekday ridership (full system)",
)
# The PNG is rendered at 3x scale; the HTML export uses the container width.
awr_chart.save("../output/Links/caltrain-avg_weekday_ridership.png", scale=3)
awr_chart.properties(width="container").save(
    "../output/Links/caltrain-avg_weekday_ridership.html"
)
awr_chart


In [None]:
# DO NOT USE: NOT comparing apples to apples
# the 2019 numbers are calculated from the Caltrain's annual ridership count
# (done in the January/February timeframe on a Tue-Thu day)
# the 2024 numbers are calculated from the fare media sales-based ridership estimates,
# averaging the ridership estimates from the two months of Jan+Feb 2024 over Mon-Fri
# weekday_boardings_grouped_long.sort(by="geography").plot.bar(
#     x="year",
#     y="avg(_mid)_weekday_boardings",
#     by="geography",
#     stacked=True,
#     title=(
#         "Caltrain average weekday boardings by Geography (only Tue-Thu for 2019)\n"
#         "(DON'T USE THIS ONE: NOT comparing apples to apples)"
#     ),
# )



In [None]:
# Boardings and share of the yearly total for each geography
# (SF 4th & King / SF 22nd St + Bayshore / rest of the Caltrain system).
weekday_boardings_grouped_long = (
    calc_geography_ridership_shares(
        unpivot_weekday_boardings(
            weekday_boardings_by_station, group_by_geography=True
        ),
        "year",
        "avg(_mid)_weekday_boardings",
    )
    # group_by gives no row-order guarantee; sort so the exported CSV is
    # reproducible from run to run (as the per-station table already is).
    .sort("year", "geography")
    .rename(
        {
            "avg(_mid)_weekday_boardings": "boardings",
            "avg(_mid)_weekday_boardings_shares": "boardings share",
        }
    )
    .with_columns(
        pl.col("boardings").round(0),
        pl.col("boardings share").round(3),
        # Short year labels for the ordinal x axis. The post-COVID figures are
        # the Jan/Feb 2024 averages, so the label is '24 (it previously read '23).
        pl.col("year").replace_strict({2019: "'19", 2024: "'24"}),
    )
)
weekday_boardings_grouped_long.write_csv(
    "../output/data/caltrain-avg_weekday_boardings-shares-by_geography.csv"
)

# Line chart of each geography's share of average weekday boardings, pre- vs
# post-COVID. Data caveat (kept from the retired chart title):
#   2019: annual pax count, conducted in Jan/Feb timeframe on a Tue-Thu day
#   2024: fare media sales-based ridership estimates, averaged over Mon-Fri
#         in Jan & Feb
geo_shares_y = alt.Y(
    "boardings share",
    title="Caltrain average weekday boardings geographical shares",
).axis(format="%")
# Fixed domain so each geography always gets the same colour from the palette.
geo_shares_color = alt.Color("geography").scale(
    domain=[
        "SF (4th & King)",
        "SF (22nd St / Bayshore)",
        "rest of Caltrain system",
    ],
    range=color_sfcore_restofsf_restofbayarea,
)
geo_shares_tooltip = [
    "year",
    "geography",
    "boardings",
    alt.Tooltip("boardings share", format=".1%"),
]
avg_weekday_boardings_by_geo_prepostcovid_shares_plot = (
    alt.Chart(weekday_boardings_grouped_long)
    .mark_line()
    .encode(
        x="year:O",
        y=geo_shares_y,
        color=geo_shares_color,
        tooltip=geo_shares_tooltip,
    )
)

# The PNG is rendered at 3x scale; the HTML export uses the container width.
avg_weekday_boardings_by_geo_prepostcovid_shares_plot.save(
    "../output/Links/caltrain-avg_weekday_boardings-shares-by_geography.png",
    scale=3,
)
avg_weekday_boardings_by_geo_prepostcovid_shares_plot.properties(
    width="container"
).save(
    "../output/Links/caltrain-avg_weekday_boardings-shares-by_geography.html",
)
avg_weekday_boardings_by_geo_prepostcovid_shares_plot


In [None]:
# boardings_postcovid_by_geography_dow = calc_geography_ridership_shares(
#     add_geography_column(
#         boardings_postcovid.select(
#             pl.col("Average Weekday Ridership").alias(
#                 "weekday"
#             ),  # avg_daily_boardings for ...
#             pl.concat_list(  # avg_daily_boardings for ...
#                 "Average Saturday Ridership",
#                 "Average Sunday Ridership",
#                 "Average Holiday Ridership",
#             )
#             .list.mean()
#             .alias("weekend/holiday"),
#             station=pl.col("Origin Station"),
#         )
#     )
#     .drop("station")
#     .group_by("geography")
#     .sum()
#     .melt(
#         id_vars="geography",
#         variable_name="dow",
#         # 2019: mid-weekday (Tue-Thu), whereas 2024: weekday (Mon-Fri)
#         value_name="avg_daily_boardings",
#     )
#     .sort(by="geography"),
#     "dow",
#     "avg_daily_boardings",
# )
# boardings_postcovid_by_geo_dow_plot = boardings_postcovid_by_geography_dow.plot.bar(
#     x="dow",
#     y="avg_daily_boardings",
#     by="geography",
#     stacked=True,
#     title=("Caltrain average daily boardings by DOW & Geography (Nov 2023-Apr 2024)"),
# )
# # hv.save(
# #     boardings_postcovid_by_geo_dow_plot,
# #     "../output/Links/caltrain-boardings_postcovid_by_geo_dow.png",
# # )
# # hv.save(
# #     boardings_postcovid_by_geo_dow_plot,
# #     "../output/Links/caltrain-boardings_postcovid_by_geo_dow.html",
# # )
# boardings_postcovid_by_geo_dow_plot

In [None]:
# boardings_postcovid_by_geo_dow_shares_plot = boardings_postcovid_by_geography_dow.plot.bar(
#     x="dow",
#     y="avg_daily_boardings_shares",
#     by="geography",
#     stacked=True,
#     title=(
#         "Caltrain average daily boardings shares by DOW & Geography (Nov 2023-Apr 2024)"
#     ),
# )
# # hv.save(
# #     boardings_postcovid_by_geo_dow_shares_plot,
# #     "../output/Links/caltrain-boardings_postcovid_by_geo_dow_shares.png",
# # )
# # hv.save(
# #     boardings_postcovid_by_geo_dow_shares_plot,
# #     "../output/Links/caltrain-boardings_postcovid_by_geo_dow_shares.html",
# # )
# boardings_postcovid_by_geo_dow_shares_plot