In [None]:
from pathlib import Path

import geopandas as gpd
import polars as pl

In [None]:
def extract_matched_paths_columns(
    map_matching_gpkg_filepath,
    matched_paths_parquet_filepath,
    columns=("trip_id", "osmid", "name"),
):
    """Extract attribute columns of the matched-path layer into a parquet file.

    Reads the "matched_path_gdf" layer of the map-matching GeoPackage and
    writes the requested attribute columns (without geometry) to parquet, so
    later cells can load them without parsing the GeoPackage again.

    Parameters
    ----------
    map_matching_gpkg_filepath : path-like
        GeoPackage containing the "matched_path_gdf" layer.
    matched_paths_parquet_filepath : path-like
        Destination parquet file (overwritten if it already exists).
    columns : iterable of str, optional
        Attribute columns to extract, written in this order.
    """
    columns = list(columns)
    matched_path_df = gpd.read_file(
        map_matching_gpkg_filepath,
        layer="matched_path_gdf",
        engine="pyogrio",  # pyogrio: drastic speed up over fiona
        # only the attribute columns are written out, so skip parsing the
        # geometries and the remaining attributes altogether
        columns=columns,
        ignore_geometry=True,
    )
    # pyogrio returns columns in data-source order; reselect to pin the
    # requested order in the output file
    matched_path_df[columns].to_parquet(matched_paths_parquet_filepath)

In [None]:
# Input locations: household survey root and the OSM map-matching (conflation) results
survey_dir = Path(r"Q:\Data\Surveys\HouseholdSurveys\MTC-SFCTA2022")
map_match_dir = survey_dir.joinpath("BATS 2023 TDS Conflation/OSM_match_v2")
map_matching_gpkg_filepath = map_match_dir.joinpath("tds_conflation_results.gpkg")
# parquet extract of the matched paths' trip_id / osmid / name columns
matched_paths_parquet_filepath = map_match_dir.joinpath(
    "matched_path_gdf-cols_extract-tripid-osmid-name.parquet"
)
# Output location for the result files written by this notebook
output_dir = Path(r"Q:\Model Projects\Freeway Network Managed Lanes\results")

In [None]:
# The extraction is time consuming, so only run it when the parquet extract
# does not exist yet (i.e. on the first run); delete the parquet file to force
# a fresh extraction, e.g. after the GeoPackage has been regenerated.
if not matched_paths_parquet_filepath.exists():
    extract_matched_paths_columns(
        map_matching_gpkg_filepath, matched_paths_parquet_filepath
    )

In [None]:
# Study locations table; later cells use its "osmid" and "segment" columns
study_segments = pl.read_csv(
    source=r"Q:\Model Projects\Freeway Network Managed Lanes\data\ml_study_locations.csv"
)

In [None]:
survey_results_dir = survey_dir.joinpath("Processed_20241127/reformat_2019_rmoveonly")
trips_raw = pl.read_csv(survey_results_dir.joinpath("01-taz_spatial_join/trip.csv"))
# Attach trip_id (from the raw trips) to the weight-adjusted trips, matching on
# household / person / trip number.
# N.B. tsvid / trip_num continues regardless of day / travel_date_dow
trips = pl.read_csv(
    survey_results_dir.joinpath("04-merge_skims/adj_weights/trip.csv")
).join(
    trips_raw.select(["trip_id", "hh_id", "person_num", "trip_num"]),
    left_on=["hhno", "pno", "tsvid"],
    right_on=["hh_id", "person_num", "trip_num"],
    how="inner",
)

In [None]:
# Matched path rows (trip_id, osmid, name) joined with the trip records.
# - unique: a trip may traverse the same segment more than once (map matching
#   errors, though this can also happen in real life), so duplicates are dropped
# - is_not_null filter: SFCTA's survey processing pipeline removes some trips,
#   which leaves no trip record (hence a null trexpfac) after the left join
# (note that some of the matched paths' 'name' field is null)
matched_paths = (
    pl.read_parquet(matched_paths_parquet_filepath)
    .unique()
    .rename({"name": "map_match_name"})
    .join(trips, on="trip_id", how="left")
    .filter(pl.col("trexpfac").is_not_null())
)

In [None]:
# Long table: one row per (osmid, map_match_name, mode) holding the number of
# matched path rows (`len`) and the summed `trexpfac` (presumably the trip
# expansion weight -- confirm against the survey documentation).
matched_paths_mode_by_segment_long = matched_paths.group_by(
    "osmid", "map_match_name", "mode"
).agg(pl.len(), pl.sum("trexpfac"))
# N.B. since we're aggregating trips over EACH segment, we don't need to do calculations
# for only keeping one trip entry even if the trip traversed multiple study segments

# sort: Series.unique() gives no ordering guarantee, and the order of `modes`
# decides the order of the mode_share_* columns appended below (and hence of
# the CSV written from this table), so pin it for reproducible output
modes = matched_paths_mode_by_segment_long.get_column("mode").unique().sort()

# Wide table: one row per segment with len_<mode> / trexpfac_<mode> columns,
# the total trexpfac over all modes, and each mode's share of that total.
matched_paths_mode_by_segment_wide = (
    matched_paths_mode_by_segment_long.pivot(
        index=["osmid", "map_match_name"],
        on="mode",
        values=["len", "trexpfac"],
        sort_columns=True,
    )
    # segment/mode combinations without any trips come out of the pivot as null
    .fill_null(0)
    # build the denominator from the same `modes` that drive the numerators
    # below, so the shares always sum to 1 whatever the mode labels look like
    .with_columns(
        trexpfac_sum=pl.sum_horizontal(
            [pl.col(f"trexpfac_{mode}") for mode in modes]
        )
    )
    .with_columns(
        [
            (pl.col(f"trexpfac_{mode}") / pl.col("trexpfac_sum")).alias(
                f"mode_share_{mode}"
            )
            for mode in modes
        ]
    )
)

In [None]:
# Left join: keep every study segment row, even if no matched path uses its osmid
study_segments_mode_share = study_segments.join(
    matched_paths_mode_by_segment_wide,
    on="osmid",
    how="left",
)
study_segments_mode_share.write_csv(
    output_dir.joinpath("ml_study_locations-mode_shares.csv")
)
# display the table as the cell output
study_segments_mode_share

In [None]:
# One row per trip (grouped on all trip columns) that traverses at least one
# study segment, with the traversed segments collected into a list column.
matched_trips_on_study_segments_list = (
    matched_paths.join(study_segments, on="osmid", how="inner")
    .group_by(trips.columns)
    .agg(pl.col("segment"))
)
matched_trips_on_study_segments_list.write_parquet(
    output_dir.joinpath("trips-on_ml_study_segments-list.parquet")
)

In [None]:
# Same trips, with the segment list expanded into one 0/1 indicator column per
# segment: explode to one row per (trip, segment), one-hot encode the segment,
# then collapse back to one row per trip by taking the max of each indicator.
matched_trips_on_study_segments_dummies = (
    matched_trips_on_study_segments_list.explode("segment")
    .to_dummies(columns="segment")
    .group_by(trips.columns)
    .max()
)
matched_trips_on_study_segments_dummies.write_csv(
    output_dir.joinpath("trips-on_ml_study_segments-dummies.csv")
)