In [None]:
from pathlib import Path

import hvplot.polars
import polars as pl

In [None]:
# not much change between 18-19 and 21-22, so just plot 19 vs 22
# ACS 1-year is not available for 2020
years = [2019, 2022]  # [2018, 2019, 2021, 2022]


def find_acs_csv(year):
    """Return the path of the C03002 ACS 1-year CSV for ``year``.

    The year's table directory is searched for ``*.csv`` files. The matches
    are sorted so the same file is chosen on every run when the directory
    holds more than one CSV.

    Raises
    ------
    FileNotFoundError
        If the directory contains no CSV file (the original ``next()`` call
        surfaced this case as a bare ``StopIteration`` with no message).
    """
    table_dir = Path(
        r"Q:\Data\Surveys\Census\ACS\1yr"
        rf"\{year}\C03002-HispanicOrLatinoOriginByRace-BayArea_Counties-SF_PUMAs"
    )
    csv_paths = sorted(table_dir.glob("*.csv"))
    if not csv_paths:
        raise FileNotFoundError(f"no CSV file found in {table_dir}")
    return csv_paths[0]


# year -> path of that year's CSV
filepaths = {y: find_acs_csv(y) for y in years}
# year -> raw DataFrame exactly as read from the CSV (parsed in a later cell)
dfs = {y: pl.read_csv(filepaths[y]) for y in years}


In [None]:
def parse_raceethnicity_csv(df, year):
    """Parse one year's C03002 ACS 1yr CSV into long-format population shares.

    Parameters
    ----------
    df : pl.DataFrame
        The DataFrame from ``pl.read_csv()`` of a single year's C03002 CSV.
        It must have a "Label (Grouping)" column (including a "Total:" row)
        and string-typed "<name> County, California!!Estimate" columns whose
        values may contain thousands separators.
    year : int
        Year of the table; written to the ``year`` column of the result.

    Returns
    -------
    pl.DataFrame
        One row per (race/ethnicity, geography) with the columns
        "race/ethnicity", "year", "geography" ("San Francisco" or
        "Bay Area") and "population share" (estimate divided by the
        geography's "Total:" estimate).
    """
    label_col = "Label (Grouping)"
    # not using PUMAs because PUMAs changed between 2010s and 2020s
    # pl.col("^San Francisco County.*!!.*$")  # for PUMAs
    county_estimates = "^.* County, California!!Estimate$"
    return (
        df.with_columns(
            # labels are indented in the CSV; drop the leading whitespace
            pl.col(label_col).str.strip_chars_start(),
        )
        .filter(
            # NOTE(review): presumably subtotal / sub-detail rows that would
            # double count other rows -- confirm against the C03002 layout
            ~pl.col(label_col).is_in(
                [
                    "Not Hispanic or Latino:",
                    "Two races including Some other race",
                    "Two races excluding Some other race, and three or more races",
                ]
            )
        )
        .with_columns(
            # estimates are strings such as "1,234"; make them integers
            pl.col(county_estimates).str.replace_all(",", "").cast(int),
        )
        .with_columns(
            # Sum BEFORE renaming the San Francisco column: once renamed it no
            # longer matches the county regex and would silently be left out
            # of the Bay Area total.
            pl.sum_horizontal(county_estimates).alias("Bay Area"),
        )
        .rename(
            {
                label_col: "race/ethnicity",
                "San Francisco County, California!!Estimate": "San Francisco",
            }
        )
        .select(
            "race/ethnicity",
            # divide every row by the single "Total:" row of the same column
            pl.col("San Francisco", "Bay Area")
            / pl.col("San Francisco", "Bay Area").filter(
                pl.col("race/ethnicity") == "Total:"
            ),
            year=pl.lit(year),
        )
        .filter(pl.col("race/ethnicity") != "Total:")
        .melt(
            id_vars=["race/ethnicity", "year"],
            value_vars=["San Francisco", "Bay Area"],
            variable_name="geography",
            value_name="population share",
        )
    )


def plot_geography(df, geography, legend=True, frame_width=100, frame_height=200):
    """Stacked bar chart of population share by race/ethnicity for one geography.

    Parameters
    ----------
    df : pl.DataFrame
        Long-format frame with the columns "geography", "year",
        "race/ethnicity" and "population share".
    geography : str
        Value of the "geography" column to plot; also used as the plot title.
    legend : bool, default True
        Passed through to the plotting call to show or hide the legend.
    frame_width, frame_height : int, default 100 and 200
        Passed through to the plotting call as the plot frame size.

    Returns
    -------
    The object returned by ``df.plot.bar`` (provided by hvplot through the
    ``hvplot.polars`` import).
    """
    geography_shares = df.filter(pl.col("geography") == geography)
    return geography_shares.plot.bar(
        x="year",
        y="population share",
        by="race/ethnicity",
        stacked=True,
        title=geography,
        frame_width=frame_width,
        frame_height=frame_height,
        legend=legend,
    )

In [None]:
# parse every year's table, then stack the per-year frames into one
parsed_by_year = [parse_raceethnicity_csv(dfs[y], y) for y in years]
df = pl.concat(parsed_by_year)

In [None]:
# one panel per geography; the San Francisco panel hides its legend so the
# combined figure only shows the legend once (on the Bay Area panel)
sf_plot = plot_geography(df, "San Francisco", legend=False)
bayarea_plot = plot_geography(df, "Bay Area")
plot = sf_plot + bayarea_plot
# make sure the output directory exists so saving works on a fresh checkout
Path("figs").mkdir(parents=True, exist_ok=True)
hvplot.save(plot, "figs/race_ethnicity.png")
plot