In [2]:
%load_ext autoreload

In [3]:
from pathlib import Path

import altair as alt
import polars as pl

%autoreload
import altair_utils  # noqa

In [4]:
# not much change between 18-19 and 21-22, so a before/after comparison
# only needs one year from each period
# ACS 1-year is not available for 2020
years = [2018, 2019, 2021, 2022, 2023]


def _find_c03002_csv(year):
    """
    return the path of the first CSV found in the C03002 table directory
    of the given ACS 1-year `year`

    raises FileNotFoundError (instead of a bare StopIteration) if the
    directory is missing or holds no CSV
    """
    # the table directory is named without the "-SF_PUMAs" suffix from 2023 on
    table_dirname = (
        "C03002-HispanicOrLatinoOriginByRace-BayArea_Counties-SF_PUMAs"
        if (year < 2023)
        else "C03002-HispanicOrLatinoOriginByRace-BayArea_Counties"
    )
    table_dir = Path(rf"Q:\Data\Surveys\Census\ACS\1yr\{year}") / table_dirname
    csv_path = next(table_dir.glob("*.csv"), None)
    if csv_path is None:
        raise FileNotFoundError(f"no C03002 CSV found in {table_dir}")
    return csv_path


# year -> path of that year's CSV
filepaths = {y: _find_c03002_csv(y) for y in years}
# year -> raw (unparsed) table
dfs = {y: pl.read_csv(filepaths[y]) for y in years}

In [5]:
def parse_raceethnicity_csv(df, year):
    """
    parse the DataFrame from pl.read_csv() of a single year's C03002 ACS 1yr CSV

    Parameters
    ----------
    df : pl.DataFrame
        raw table with a "Label (Grouping)" column and one
        "<name> County, California!!Estimate" column per county; the
        estimates are read as strings that may contain "," separators
    year : int
        survey year, stored as is in the output's "year" column

    Returns
    -------
    pl.DataFrame
        long table with columns "race/ethnicity", "year", "geography"
        ("San Francisco" or "Bay Area") and "population share", i.e. each
        category's estimate divided by the geography's "Total:" estimate
    """
    county_estimates = "^.* County, California!!Estimate$"
    return (
        df.with_columns(
            # remove the leading whitespace in front of the labels
            pl.col("Label (Grouping)").str.strip_chars_start(),
        )
        .filter(
            # drop the "Not Hispanic or Latino:" subtotal and the two
            # "Two races ..." rows (presumably sub-breakdowns of
            # "Two or more races:" -- verify against the table layout)
            ~pl.col("Label (Grouping)").is_in(
                [
                    "Not Hispanic or Latino:",
                    "Two races including Some other race",
                    "Two races excluding Some other race, and three or more races",
                ]
            )
        )
        .with_columns(
            # not using PUMAs because PUMAs changed between 2010s and 2020s
            # pl.col("^San Francisco County.*!!.*$")  # for PUMAs
            pl.col(county_estimates).str.replace_all(",", "").cast(int),
        )
        .with_columns(
            # sum BEFORE renaming the San Francisco column: once renamed, it
            # no longer matches the county pattern and would be silently
            # left out of the Bay Area total
            pl.sum_horizontal(county_estimates).alias("Bay Area"),
        )
        .rename(
            {
                "Label (Grouping)": "race/ethnicity",
                "San Francisco County, California!!Estimate": "San Francisco",
            }
        )
        .select(
            pl.col("race/ethnicity").replace("Two or more races:", "Two or more races"),
            (
                # share = estimate / estimate of the "Total:" row
                pl.col("San Francisco", "Bay Area")
                / pl.col("San Francisco", "Bay Area").filter(
                    pl.col("race/ethnicity") == "Total:"
                )
            ),
            year=pl.lit(year),
        )
        # the total was only needed as the denominator
        .filter(pl.col("race/ethnicity") != "Total:")
        .unpivot(
            index=["race/ethnicity", "year"],
            on=["San Francisco", "Bay Area"],
            variable_name="geography",
            value_name="population share",
        )
    )


def group_raceeth(df):
    """
    merge race/ethnicity categories whose population shares are too low
    to visualize on their own:
    - "Asian alone" and "Native Hawaiian and Other Pacific Islander alone"
      become "Asian or Pacific Islander alone"
    - "American Indian and Alaska Native alone" and "Some other race alone"
      become "American Indian, Alaska Native, or some other race alone"

    all other categories are kept as is. population shares are summed within
    each (race/ethnicity, year, geography), and the rows are returned sorted
    by year, geography, race/ethnicity so that the output (and any file
    written from it) has the same row order on every run
    """
    merged_labels = {
        "Asian alone": "Asian or Pacific Islander alone",
        "Native Hawaiian and Other Pacific Islander alone": (
            "Asian or Pacific Islander alone"
        ),
        "American Indian and Alaska Native alone": (
            "American Indian, Alaska Native, or some other race alone"
        ),
        "Some other race alone": (
            "American Indian, Alaska Native, or some other race alone"
        ),
    }
    return (
        # labels that are not keys of merged_labels are left unchanged
        df.with_columns(pl.col("race/ethnicity").replace(merged_labels))
        .group_by("race/ethnicity", "year", "geography")
        .agg(pl.col("population share").sum())
        # group_by does not guarantee a row order; make it deterministic
        .sort("year", "geography", "race/ethnicity")
    )

In [6]:
# parse every year's raw table, stack the results, then merge the small categories
df = group_raceeth(
    pl.concat([parse_raceethnicity_csv(raw, year) for year, raw in dfs.items()])
)
df.write_csv("output/data/race_ethnicity.csv")

In [None]:
(
    # population share over all loaded years:
    # one facet column per geography, one line per race/ethnicity
    alt.Chart(
        # x axis uses a date column set to Jan 1 of each year
        df.with_columns(date=pl.date("year", 1, 1))
    )
    .mark_line()
    .encode(
        x=alt.X("date", title="year"),
        y=alt.Y("population share", axis=alt.Axis(format="%")),
        color=alt.Color("race/ethnicity"),
        column=alt.Column("geography", sort="descending"),
        tooltip=[
            "year",
            "geography",
            "race/ethnicity",
            alt.Tooltip("population share", format="%"),
        ],
    )
    .interactive()
)

In [None]:
# the two years compared in the side-by-side chart
COMPARISON_YEARS = [2019, 2023]


def _share_change_chart(geography, title=None):
    """
    line chart of the population share of each race/ethnicity in
    `geography`, restricted to COMPARISON_YEARS

    Parameters
    ----------
    geography : str
        value of the "geography" column to plot; also used as the y axis title
    title : str, optional
        chart title; no title is set when None

    Returns
    -------
    altair chart (interactive), ready to be concatenated with others
    """
    title_kwargs = {} if title is None else {"title": title}
    return (
        alt.Chart(
            df.filter(
                pl.col("year").is_in(COMPARISON_YEARS)
                & (pl.col("geography") == geography)
            ),
            **title_kwargs,
        )
        .mark_line()
        .encode(
            # ordinal x axis: only the compared years appear as ticks
            x=alt.X("year:O"),
            y=alt.Y("population share", title=geography).axis(format="%"),
            color="race/ethnicity",
            tooltip=[
                "year",
                "geography",
                "race/ethnicity",
                alt.Tooltip("population share", format="%"),
            ],
        )
        .interactive()
    )


# San Francisco (left, carrying the overall title) next to the Bay Area (right)
chart = _share_change_chart(
    "San Francisco", title="race/ethnicity population shares"
) | _share_change_chart("Bay Area")
chart.save("output/Links/race_ethnicity.png")
chart.save("output/Links/race_ethnicity.html")  # .properties(width="container")
chart