In [None]:
import sys
from pathlib import Path

import geopandas as gpd
import polars as pl

sys.path.append("../2024-downtown_today")
from downtown_today_utils import parse_geog, taz_cols_to_county

In [None]:
def extract_matched_paths_columns(
    map_matching_gpkg_filepath, matched_paths_parquet_filepath
):
    """Extract the trip_id / osmid / name columns of the matched paths to parquet.

    Reads the "matched_path_gdf" layer of the map-matching GeoPackage and
    writes only the three attribute columns used downstream to a parquet
    file, so that later runs can read the (much smaller) parquet file instead
    of the GeoPackage.

    Args:
        map_matching_gpkg_filepath: path of the GeoPackage that holds the
            "matched_path_gdf" layer.
        matched_paths_parquet_filepath: path of the parquet file to write.

    Returns:
        None; the parquet file is the only output.
    """
    extract_cols = ["trip_id", "osmid", "name"]
    matched_paths = gpd.read_file(
        map_matching_gpkg_filepath,
        layer="matched_path_gdf",
        engine="pyogrio",  # pyogrio: drastic speed up over fiona
        # the geometry and all other attributes are never written out,
        # so do not spend time reading them
        columns=extract_cols,
        ignore_geometry=True,
    )
    # re-select explicitly: the reader returns the requested columns in
    # data-source order, which is not necessarily the order requested
    matched_paths[extract_cols].to_parquet(matched_paths_parquet_filepath)

In [None]:
# Root directory of the household survey data
survey_dir = Path(r"Q:\Data\Surveys\HouseholdSurveys\MTC-SFCTA2022")

# Map matching (conflation) results: the GeoPackage input and the parquet
# extract of it that the rest of this notebook reads
map_match_dir = survey_dir / "BATS 2023 TDS Conflation" / "OSM_match_v2"
map_matching_gpkg_filepath = map_match_dir / "tds_conflation_results.gpkg"
matched_paths_parquet_filepath = map_match_dir.joinpath(
    "matched_path_gdf-cols_extract-tripid-osmid-name.parquet"
)


In [None]:
# The GeoPackage extract is time consuming, so it only runs when the parquet
# extract does not exist yet (i.e. on the first run). Delete the parquet file
# to force a fresh extract.
if not matched_paths_parquet_filepath.exists():
    extract_matched_paths_columns(
        map_matching_gpkg_filepath, matched_paths_parquet_filepath
    )

In [None]:
# TAZ-to-analysis-neighborhood layer; only the TAZ and analysis_neighborhood
# attribute columns are requested
taz_analysis_neighborhoods_gis_filepath = Path(
    r"Q:\GIS\Policy\San_Francisco\Analysis_Neighborhoods",
    "taz2454-sf_only-with_analysis_neighborhoods.gpkg",
)
taz_analysis_neighborhoods = gpd.read_file(
    taz_analysis_neighborhoods_gis_filepath,
    columns=["TAZ", "analysis_neighborhood"],
)

# TAZ shapefile, used for its TAZ and COUNTY attribute columns
taz_gis_filepath = r"Q:\GIS\Model\TAZ\SFCTA_TAZ\TAZ2454.shp"
taz_county = gpd.read_file(
    taz_gis_filepath,
    columns=["TAZ", "COUNTY"],
)

In [None]:
# Managed lanes study locations; later cells use its "segment" and "osmid"
# columns
managed_lanes_study_segments_filepath = (
    r"Q:\Model Projects\Freeway Network Managed Lanes\data\ml_study_locations.csv"
)
managed_lanes_study_segments = pl.read_csv(managed_lanes_study_segments_filepath)

In [None]:
survey_results_dir = survey_dir / "Processed_20241127" / "reformat_2019_rmoveonly"

# trip.csv from the TAZ spatial join step: only used here to look up trip_id
trips_raw = pl.read_csv(survey_results_dir / "01-taz_spatial_join" / "trip.csv")

# Attach trip_id to the adjusted-weights trip file.
# (hhno, pno, tsvid) is matched against (hh_id, person_num, trip_num) without
# any day column, because tsvid / trip_num continues regardless of
# day / travel_date_dow.
trips = pl.read_csv(
    survey_results_dir / "04-merge_skims" / "adj_weights" / "trip.csv"
).join(
    trips_raw.select(["trip_id", "hh_id", "person_num", "trip_num"]),
    left_on=["hhno", "pno", "tsvid"],
    right_on=["hh_id", "person_num", "trip_num"],
)

In [None]:
# Labels for the o_county / d_county codes; the numeric prefix presumably
# keeps the labels sorting in code order
county_labels = {
    1: "1-sf",
    2: "2-san mateo",
    3: "3-santa clara",
    4: "4-alameda",
    5: "5-contra costa",
    6: "6-solano",
    7: "7-napa",
    8: "8-sonoma",
    9: "9-marin",
}

# One row per (trip, OSM segment), with the trip attributes attached:
# - unique: some trips traverse the same segment multiple times (due to
#   errors in map matching, though one can do that in real life too)
# - drop_nulls: SFCTA's survey processing pipeline removes some trips, which
#   then have no match (and so a null trexpfac) after the left join
# (note that the 'name' field of some of the matched paths is null)
matched_paths_with_trips = (
    pl.read_parquet(matched_paths_parquet_filepath)
    .rename({"name": "map_match_name"})
    .unique(["trip_id", "osmid"])
    .join(trips, on="trip_id", how="left")
    .drop_nulls("trexpfac")
)

# Add the geography columns via the shared helpers, then swap the county
# codes for their labels (replace_strict raises on any unmapped value)
matched_paths_with_geog = parse_geog(
    matched_paths_with_trips, taz_analysis_neighborhoods
)
matched_paths = taz_cols_to_county(matched_paths_with_geog, taz_county).with_columns(
    pl.col("o_county", "d_county").replace_strict(county_labels)
)

In [None]:
def get_trips_on_segments(matched_paths, osmids):
    """Return one row per trip that traverses any of the given OSM segments.

    Args:
        matched_paths: frame with one row per (trip, OSM segment), holding at
            least the osmid, trip_id, o_geog, d_geog, dpurp, trexpfac,
            o_county and d_county columns.
        osmids: the OSM ids of the segments of interest.

    Returns:
        A frame keyed by trip_id with the first value found for each of the
        trip attribute columns listed above (except osmid).
    """
    trip_attribute_cols = [
        "o_geog",
        "d_geog",
        "dpurp",
        "trexpfac",
        "o_county",
        "d_county",
    ]
    on_requested_segments = pl.col("osmid").is_in(osmids)
    paths_on_segments = matched_paths.filter(on_requested_segments)
    # only keep one entry per trip, even if the trip traversed multiple
    # study segments
    return paths_on_segments.group_by("trip_id").agg(pl.first(*trip_attribute_cols))


def calculate_long_summary(trips, cols):
    """Summarise trips by the given column(s), in long format.

    Args:
        trips: frame of trips with a trexpfac column.
        cols: column name, or list of column names, to group by.

    Returns:
        A frame sorted by ``cols`` with one row per group and the columns:
        len (number of rows in the group), trexpfac (sum of trexpfac in the
        group) and weighted_share (the group's trexpfac sum divided by the
        trexpfac sum over all groups).
    """
    per_group = trips.group_by(cols).agg(pl.len(), pl.sum("trexpfac"))
    per_group_sorted = per_group.sort(cols)
    # at this point trexpfac holds the per-group sums, so pl.sum is the total
    share_of_total = pl.col("trexpfac") / pl.sum("trexpfac")
    return per_group_sorted.with_columns(weighted_share=share_of_total)

In [None]:
# Study segments to analyse; swap in the commented-out list below to analyse
# the alternative set of segments instead
study_segment_names = ["80-DT-EB", "80-DT-WB", "101-N-NB", "101-N-SB"]
# study_segment_names = ["80-BB-EB", "80-BB-WB"]

# OSM ids that make up the selected study segments
study_segments_osmids = managed_lanes_study_segments.filter(
    pl.col("segment").is_in(study_segment_names)
).get_column("osmid")

In [None]:
# Trips on the study segments, summarised by origin / destination geography
trips_on_study_segments = get_trips_on_segments(matched_paths, study_segments_osmids)
matched_trips_od_long = calculate_long_summary(
    trips_on_study_segments, ["o_geog", "d_geog"]
)
matched_trips_od_long

In [None]:
# Origin (rows) x destination (columns) geography: unweighted trip counts
matched_trips_od_long.pivot(
    on="d_geog", index=["o_geog"], values=["len"], sort_columns=True
)

In [None]:
# Origin (rows) x destination (columns) geography: summed trexpfac
matched_trips_od_long.pivot(
    on="d_geog", index=["o_geog"], values=["trexpfac"], sort_columns=True
)

In [None]:
# Origin (rows) x destination (columns) geography: share of total trexpfac
matched_trips_od_long.pivot(
    on="d_geog", index=["o_geog"], values=["weighted_share"], sort_columns=True
)

In [None]:
# Trips on the study segments, summarised by dpurp
trips_on_study_segments = get_trips_on_segments(matched_paths, study_segments_osmids)
matched_trips_dpurp_long = calculate_long_summary(trips_on_study_segments, "dpurp")
matched_trips_dpurp_long

In [None]:
# Trips on the study segments, summarised by origin / destination county
trips_on_study_segments = get_trips_on_segments(matched_paths, study_segments_osmids)
matched_trips_county_long = calculate_long_summary(
    trips_on_study_segments, ["o_county", "d_county"]
)
matched_trips_county_long

In [None]:
# Origin (rows) x destination (columns) county: summed trexpfac
matched_trips_county_long.pivot(
    on="d_county", index=["o_county"], values=["trexpfac"], sort_columns=True
)

In [None]:
# Origin (rows) x destination (columns) county: share of total trexpfac
matched_trips_county_long.pivot(
    on="d_county", index=["o_county"], values=["weighted_share"], sort_columns=True
)