In [None]:
%load_ext autoreload

In [None]:
from pathlib import Path

import altair as alt
import pandas as pd
import polars as pl

%autoreload
from utils import add_normalization_col


In [None]:
# Input/output directory and the population-estimate CSV read from it.
# NOTE: `dir` shadows the Python builtin of the same name; the name is kept
# as-is because it is referenced again when the chart is saved.
dir = Path("Q:/Data/Surveys/Census/population_and_housing_unit/parsed")
csv_filepath = dir.joinpath("population_estimate-as_of_July1.csv")

In [None]:
def to_long(wide_df, value_name):
    """Reshape a wide frame into long format.

    Every column other than `geography` is melted: its name goes into a
    `year` column (cast to integer) and its values into `value_name`.

    Args:
        wide_df: polars DataFrame with a `geography` column; the remaining
            column names must be castable to int.
        value_name: name given to the melted value column.

    Returns:
        polars DataFrame with columns `geography`, `year`, `value_name`.
    """
    melted = wide_df.melt(
        id_vars="geography",
        variable_name="year",
        value_name=value_name,
    )
    # melted column names arrive as strings; make the year numeric
    year_as_int = pl.col("year").cast(int)
    return melted.with_columns(year_as_int)


def to_pandas_for_altair(long_df):
    """Add a `date` column (July 1 of each `year`) and convert to pandas.

    Altair needs a date(time) value to draw a temporal year axis, and the
    frame is handed to it as a pandas DataFrame.

    Args:
        long_df: polars DataFrame with an integer `year` column.

    Returns:
        pandas DataFrame with all original columns plus `date`.
    """
    july_first = pl.date(pl.col("year"), 7, 1)
    dated = long_df.with_columns(date=july_first)
    return dated.to_pandas()


def transpose(wide_df):
    """Transpose a wide frame so each geography becomes a column.

    The `geography` column is dropped before transposing; the former column
    names end up in a `year` column (cast to integer), and the transposed
    columns are renamed from polars' default `column_<i>` to the geography
    value of row `i`.

    Unlike a fixed two-row rename, this handles any number of geography
    rows; the result for a two-row frame is unchanged.

    Args:
        wide_df: polars DataFrame with a `geography` column; the remaining
            column names must be castable to int.

    Returns:
        polars DataFrame with a `year` column plus one column per geography.
    """
    geographies = wide_df.get_column("geography").to_list()
    # transpose() names its output columns column_0, column_1, ... in row order
    new_names = {f"column_{i}": name for i, name in enumerate(geographies)}
    return (
        wide_df.drop("geography")
        .transpose(include_header=True, header_name="year")
        .rename(new_names)
        .with_columns(pl.col("year").cast(int))
    )


def plot_altair(long_df, y_column, y_title, y_show_zero=True):
    """Plot `y_column` over time as one line per geography.

    A grey vertical rule is layered on top at 2020-01-01.

    Args:
        long_df: long-format polars DataFrame with `year`, `geography` and
            `y_column` columns.
        y_column: name of the column drawn on the y axis.
        y_title: title of the y axis.
        y_show_zero: passed to the y scale's `zero` setting.

    Returns:
        Layered Altair chart: the line chart plus the vertical rule.
    """
    y_field = f"{y_column}:Q"

    # x uses 'date', not 'year': the data are as of July 1 and plotted as such
    lines = alt.Chart(to_pandas_for_altair(long_df)).mark_line()
    lines = lines.encode(
        alt.X("date:T"),
        alt.Y(y_field).title(y_title).scale(zero=y_show_zero),
        color=alt.Color("geography:N"),
        tooltip=["year:Q", "geography:N", y_field],
    )

    # Vertical bar at 2020, because 'data from separate vintages should not
    # be combined' (US Census), and each vintage runs for the full decade
    # (e.g. 2010-2019).
    vintage_break = pd.DataFrame({"date": ["2020-01-01"], "color": ["grey"]})
    rule = alt.Chart(vintage_break).mark_rule()
    rule = rule.encode(
        # same temporal x encoding as the lines so the layers share the axis
        alt.X("date:T"),
        # scale=None: use the literal colour value stored in the data
        color=alt.Color("color", scale=None),
    )
    return lines + rule

In [None]:
# Read the wide table from CSV, then reshape it to long format with the
# values stored under the "population" column.
wide_df = pl.read_csv(csv_filepath)
value_name_short = "population"
long_df = to_long(wide_df, value_name=value_name_short)


In [None]:
# Per-geography fractional change of the 2023 estimate relative to 2019 and
# to 2020, computed as 1 - later / earlier.
# NOTE: with this formula a POSITIVE value means the 2023 estimate is LOWER
# than the baseline year's (i.e. a decline), despite the "change" label.
wide_df.select(
    "geography",
    # CAUTION: US Census says do not compare values across vintages,
    # so comparing 2023 with 2019 is inappropriate
    (1 - pl.col("2023") / pl.col("2019")).alias("change 2019 to 2023"),
    (1 - pl.col("2023") / pl.col("2020")).alias("change 2020 to 2023"),
)
# For SF, present it as a 7-8% decrease from pre-COVID population

In [None]:
# Two charts side by side: the raw estimates on the left and, on the right,
# the column produced by `add_normalization_col` (presumably the estimates
# scaled relative to their 2019 value -- confirm against utils).
absolute_chart = plot_altair(
    long_df, value_name_short, "population estimate (as of July 1)"
)
normalized_df = add_normalization_col(
    long_df,
    "year",
    value_name_short,
    f"{value_name_short}-normalized_to_2019",
    norm_x_value=2019,
)
normalized_chart = plot_altair(
    normalized_df,
    f"{value_name_short}-normalized_to_2019",
    "population estimate (as of July 1, normalized to 2019)",
    # the normalized y axis is not forced to include zero
    y_show_zero=False,
)
chart = absolute_chart | normalized_chart
chart.save(dir / "population_estimate-as_of_July1-2010-2023.png")
# last expression of the cell: displays the interactive version of the chart
chart.interactive()
