In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt
import polars as pl


In [None]:
# TODO see if the data in "Q:\Data\Observed\Streets\Safety\SWITRS\San Francisco Data from TIMS" works for this
tims_crashes_filepath = r"C:\Users\cchow\Desktop\tmp\downtown_today\safety\TIMS-SWITRS-crashes-2014-2023.csv"

# Read only the TIMS/SWITRS columns that this notebook uses.
crashes_raw = pl.read_csv(
    tims_crashes_filepath,
    columns=[
        "ACCIDENT_YEAR",
        # "COLLISION_DATE",
        # "COLLISION_TIME",  # some times, e.g. "43" or "20" is unparseable
        "DAY_OF_WEEK",
        "NUMBER_KILLED",
        "NUMBER_INJURED",
        "PEDESTRIAN_ACCIDENT",
        "BICYCLE_ACCIDENT",
        "LATITUDE",
        "LONGITUDE",
        "POINT_X",
        "POINT_Y",
    ],
    schema_overrides={
        "COLLISION_TIME": str,
        "NUMBER_KILLED": int,
        "NUMBER_INJURED": int,
    },
)

# crashes only include crashes where injuries/deaths happened
has_casualty = (pl.col("NUMBER_INJURED") > 0) | (pl.col("NUMBER_KILLED") > 0)
is_fatal = pl.col("NUMBER_KILLED") > 0

crashes = crashes_raw.filter(has_casualty).select(
    "ACCIDENT_YEAR",
    "DAY_OF_WEEK",
    # "Y" -> True; anything else (including missing) -> False
    pl.col("PEDESTRIAN_ACCIDENT").eq("Y").fill_null(False),
    pl.col("BICYCLE_ACCIDENT").eq("Y").fill_null(False),
    severity=pl.when(is_fatal)
    .then(pl.lit("fatal"))
    .otherwise(pl.lit("injury only")),
    # use LATITUDE/LONGITUDE when present, otherwise fall back to POINT_Y/POINT_X
    LATITUDE=pl.coalesce("LATITUDE", "POINT_Y"),
    LONGITUDE=pl.coalesce("LONGITUDE", "POINT_X"),
)

# Analysis neighborhood polygons; "nhood" is renamed to the more descriptive
# "analysis_neighborhood" used throughout the rest of the notebook.
analysis_neighborhoods_filepath = (
    r"Q:\GIS\Policy\San_Francisco\Analysis_Neighborhoods\Analysis Neighborhoods_20240610.zip"
)
analysis_neighborhoods = gpd.read_file(analysis_neighborhoods_filepath).rename(
    columns={"nhood": "analysis_neighborhood"}
)


In [None]:
# Build one point per crash (x = longitude, y = latitude) while `crashes` is still
# the polars frame produced by the loading cell.
crash_points = gpd.points_from_xy(crashes["LONGITUDE"], crashes["LATITUDE"])
crashes_geo = gpd.GeoDataFrame(
    crashes.to_pandas(), geometry=crash_points, crs="EPSG:4326"
)
# Tag every crash with the analysis neighborhood polygon it lies within; the left
# join keeps crashes that fall in no neighborhood (their neighborhood is null).
# NOTE(review): assumes analysis_neighborhoods is also in EPSG:4326 -- confirm.
# NOTE: this rebinds `crashes` from a polars frame to a GeoDataFrame.
crashes = crashes_geo.sjoin(
    analysis_neighborhoods, how="left", predicate="within"
).drop(columns="index_right")
# crashes.plot(column="analysis_neighborhood")

In [None]:
# Collisions that fall outside every analysis neighborhood (e.g. on bridges within
# SF County) are excluded from this analysis. My guess is that those should NOT be
# ped/bike collisions anyway.
# The following (commented-out) code plots the collisions that are not in any
# analysis neighborhood:
# crashes.loc[crashes["analysis_neighborhood"].isnull(), "analysis_neighborhood"] = "N/A"
# crashes[
#     (crashes["LONGITUDE"] > -123)
#     & (crashes["LONGITUDE"] < -122)
#     & (crashes["LATITUDE"] > 37.5)
#     & (crashes["LATITUDE"] < 38)
# ].plot(column="analysis_neighborhood")

In [None]:
def generate_all_combos(crashes: pl.DataFrame, group_by_cols):
    """
    Ensure every combination of the group-by values has a row.

    Some groups have no crashes and are therefore missing from `crashes`. This
    builds the cross product of the unique values of each column in
    `group_by_cols`, left-joins `crashes` onto it, and fills the resulting nulls
    with 0 so that the missing groups show up as rows of 0s.
    """
    # single-row frame: each column holds the list of that column's unique values
    unique_values = crashes.select(pl.col(group_by_cols).unique().implode())
    # exploding the list columns one at a time turns that single row into the
    # cross product of the unique values
    for name in group_by_cols:
        unique_values = unique_values.explode(name)
    all_combos = unique_values
    with_counts = all_combos.join(
        crashes, on=group_by_cols, how="left", coalesce=True
    )
    return with_counts.fill_null(0)


def group_crashes(crashes: pl.DataFrame, time_col: str):
    """
    Count crashes per (time_col, analysis neighborhood, severity).

    Parameters
    ----------
    crashes
        One row per crash, with the columns `time_col`, "analysis_neighborhood",
        "severity", "PEDESTRIAN_ACCIDENT" and "BICYCLE_ACCIDENT".
    time_col
        Name of the column holding the time grouping (e.g. "covid_period").

    Returns
    -------
    A frame sorted by (time_col, "analysis_neighborhood", "severity") with the
    summed PEDESTRIAN_ACCIDENT and BICYCLE_ACCIDENT columns and the row count
    ALL_ACCIDENT. Groups without crashes are present as rows of 0s, and an extra
    severity value "combined" holds the sum over the other severities.
    """
    group_by_cols = [time_col, "analysis_neighborhood", "severity"]
    count_cols = ["PEDESTRIAN_ACCIDENT", "BICYCLE_ACCIDENT", "ALL_ACCIDENT"]
    crashes_grouped = generate_all_combos(
        crashes.group_by(group_by_cols).agg(
            pl.sum("PEDESTRIAN_ACCIDENT", "BICYCLE_ACCIDENT"),
            ALL_ACCIDENT=pl.len(),
        ),
        group_by_cols,
    )
    # also calculate number of injury only + fatal crashes.
    # Group by `time_col` (not a hard-coded column name) so the function works
    # for whichever time grouping the caller asked for, and sum only the count
    # columns so the string "severity" column is never aggregated.
    crashes_combined = (
        crashes_grouped.group_by(time_col, "analysis_neighborhood")
        .agg(pl.sum(*count_cols))
        .with_columns(severity=pl.lit("combined"))
        # restore the column order of crashes_grouped so the frames can be stacked
        .select(crashes_grouped.columns)
    )
    return pl.concat([crashes_grouped, crashes_combined]).sort(group_by_cols)


def calculate_covid_pct_change(crashes_grouped: pl.DataFrame):
    """
    Fractional change in crash counts from 2018-2019 to 2022-2023.

    Keeps only the "2018-2019" and "2022-2023" rows of `crashes_grouped` and,
    for each (analysis_neighborhood, severity), computes the change of each
    count column relative to the previous period. Returns one row per
    (analysis_neighborhood, severity) holding the three
    "*_1819_to_2223_pct_change" columns; the raw counts and "covid_period" are
    dropped.
    """
    count_cols = ["PEDESTRIAN_ACCIDENT", "BICYCLE_ACCIDENT", "ALL_ACCIDENT"]
    group_cols = ["analysis_neighborhood", "severity"]
    return (
        crashes_grouped.filter(pl.col("covid_period").is_in(["2018-2019", "2022-2023"]))
        # within each group the 2018-2019 row must come before the 2022-2023 row
        .sort(*group_cols, "covid_period")
        .with_columns(
            # calculate % change with the previous row *of the same group*.
            # Windowing with .over() means a group that lacks a 2018-2019 row
            # gets null instead of being compared against another group's row.
            pl.col(count_cols)
            .pct_change()
            .over(group_cols)
            .name.suffix("_1819_to_2223_pct_change")
        )
        # only keep the post-covid rows, which carry the computed change
        .filter(pl.col("covid_period") == "2022-2023")
        .drop("covid_period", *count_cols)
    )


def add_analysis_neighborhood_geometry(df, analysis_neighborhoods):
    """
    HOTFIX since the non-spatial calculations are done in polars: convert `df`
    to pandas and merge it back onto `analysis_neighborhoods`.

    The merge is a right join on "analysis_neighborhood", so every row of `df`
    is kept and picks up the columns of its matching neighborhood.
    """
    df_pandas = df.to_pandas()
    return analysis_neighborhoods.merge(
        df_pandas, on="analysis_neighborhood", how="right"
    )


def plot(
    nhood_crashes,
    accident_cols,
    save_filename_stem,
    suptitle=None,
    vmin=None,
    vmax=None,
):
    """
    Draw a grid of maps: one row per severity, one column per accident column.

    Parameters
    ----------
    nhood_crashes
        Frame with a "severity" column, the columns named in `accident_cols`,
        and a `.plot(ax=..., column=...)` method (e.g. a GeoDataFrame).
    accident_cols
        Names of the columns to map; one grid column is drawn for each.
    save_filename_stem
        The figure is saved to "output/Links/<save_filename_stem>.png".
    suptitle
        Title for the whole figure.
    vmin, vmax
        Passed through to each map's `.plot()` call to fix the color scale.
    """
    severities = ["injury only", "fatal", "combined"]
    # squeeze=False keeps `axes` 2-D even when there is a single accident column
    fig, axes = plt.subplots(
        len(severities), len(accident_cols), figsize=(10, 10), squeeze=False
    )
    for i, severity in enumerate(severities):
        severity_rows = nhood_crashes.loc[nhood_crashes["severity"] == severity]
        for j, accident_col in enumerate(accident_cols):
            severity_rows.plot(
                ax=axes[i, j], column=accident_col, legend=True, vmin=vmin, vmax=vmax
            )
    fig.suptitle(suptitle)
    # apply the layout BEFORE saving so the PNG matches what is displayed
    fig.tight_layout()
    fig.savefig(f"output/Links/{save_filename_stem}.png")
    plt.show()


In [None]:
# HOTFIX cast to pl then back to gpd as I'm more comfortable with pl syntax
crashes_grouped = (
    # group into analysis neighborhoods
    group_crashes(
        # HOTFIX remove geometry col to cast to polars
        pl.from_pandas(crashes.drop(columns="geometry"))
        .with_columns(
            # group years into 2 year periods around COVID
            # (doing 2 year periods to get better statistics with the larger sums);
            # years matching none of the branches get a null covid_period
            covid_period=pl.when(pl.col("ACCIDENT_YEAR").is_in([2018, 2019]))
            .then(pl.lit("2018-2019"))
            .when(pl.col("ACCIDENT_YEAR").is_in([2020, 2021]))
            .then(pl.lit("2020-2021"))
            .when(pl.col("ACCIDENT_YEAR").is_in([2022, 2023]))
            .then(pl.lit("2022-2023"))
        )
        # drop the years not around COVID, and locations not in analysis neighborhoods.
        # Only these two columns are checked, so a crash is not silently discarded
        # because of a null in a column this analysis never reads.
        .drop_nulls(["covid_period", "analysis_neighborhood"]),
        "covid_period",
    )
)
# nhood = shorthand for analysis neighborhood
crashes_nhood_pct_change = calculate_covid_pct_change(crashes_grouped)
# HOTFIX add back geometry column
crashes_grouped = add_analysis_neighborhood_geometry(
    crashes_grouped, analysis_neighborhoods
)
# NOTE: the name is spelled "crashed_..." (not "crashes_..."); it is kept as is
# because the export and plotting cells further down refer to it by this name.
crashed_nhood_pct_change = add_analysis_neighborhood_geometry(
    crashes_nhood_pct_change, analysis_neighborhoods
)


In [None]:
# Export both result tables (with neighborhood geometry) as GeoPackages.
for gdf, gpkg_path in (
    (crashes_grouped, "output/data/crashes.gpkg"),
    (crashed_nhood_pct_change, "output/data/crashes-pct_change.gpkg"),
):
    gdf.to_file(gpkg_path)


In [None]:
# One figure of crash counts per 2-year period. The three periods only differ in
# the covid_period filter, the output filename suffix, and the year range in the
# title, so they are drawn in a single loop instead of three copy-pasted cells.
for period, filename_suffix in [
    ("2018-2019", "1819"),
    ("2020-2021", "2021"),
    ("2022-2023", "2223"),
]:
    plot(
        crashes_grouped.loc[crashes_grouped["covid_period"] == period],
        ["PEDESTRIAN_ACCIDENT", "BICYCLE_ACCIDENT", "ALL_ACCIDENT"],
        f"crashes-by_analysis_neighborhood-{filename_suffix}",
        suptitle=(
            # "2018-2019" -> "2018/2019"
            f"{period.replace('-', '/')} crashes\n"
            "rows = injury only, fatal, combined\ncols = ped, bike, all"
        ),
    )


In [None]:
# Map the 2018-2019 -> 2022-2023 change of each count column. The color scale is
# clamped to [-1, 1] via vmin/vmax.
pct_change_cols = [
    f"{count_col}_1819_to_2223_pct_change"
    for count_col in ("PEDESTRIAN_ACCIDENT", "BICYCLE_ACCIDENT", "ALL_ACCIDENT")
]
pct_change_title = "\n".join(
    [
        "pct change 18/19 vs 22/23",
        "rows = injury only, fatal, combined",
        "cols = ped, bike, all",
        " values capped at (-1, 1)",
        "holes = NaN = 0 to 0",
    ]
)
plot(
    crashed_nhood_pct_change,
    pct_change_cols,
    "crashes-by_analysis_neighborhood-pct_change_1819v2223",
    suptitle=pct_change_title,
    vmin=-1,
    vmax=1,
)
