In [1]:
%load_ext autoreload

In [2]:
from pathlib import Path

import altair as alt
import pandas as pd
import polars as pl

%autoreload
from altair_utils import color_sf_bayarea
from utils import add_normalization_col

In [3]:
# Folder with the parsed Census population / housing unit estimate files.
# NOTE: the name `dir` shadows the Python builtin `dir()`; it is kept unchanged.
dir = Path("Q:/Data/Surveys/Census/population_and_housing_unit/parsed")
# Population estimates CSV (per its file name: values as of July 1).
csv_filepath = dir.joinpath("population_estimate-as_of_July1.csv")

In [4]:
def to_long(wide_df, value_name):
    """
    Reshape a wide table into long format.

    The "geography" column is kept as the identifier. The names of all other
    columns go into a "year" column (cast to int) and their values go into a
    column called `value_name`.
    """
    year_as_int = pl.col("year").cast(int)
    unpivoted = wide_df.unpivot(
        index="geography",
        variable_name="year",
        value_name=value_name,
    )
    return unpivoted.with_columns(year_as_int)


def transpose(wide_df):
    """
    Flip a wide table so that years become rows and geographies become columns.

    The input is expected to have a "geography" column plus one column per
    year. The result has an integer "year" column followed by one column per
    row of the input, named after that row's "geography" value.

    Works for any number of geographies (rows), not just two.
    """
    geographies = wide_df.get_column("geography").to_list()
    return (
        wide_df.drop("geography")
        .transpose(include_header=True, header_name="year")
        # the transposed value columns arrive as column_0, column_1, ...;
        # give each one the name of the geography it came from
        .rename({f"column_{i}": name for i, name in enumerate(geographies)})
        .with_columns(pl.col("year").cast(int))
    )


def plot_altair(long_df, y_column, y_title, color):
    """
    Line chart of `y_column` over time, layered with dashed grey vertical
    rules at the start of 2010 and of 2020.

    `long_df` must provide a "date" field for the x axis, plus the "year",
    "geography" and `y_column` fields shown in the tooltip. `color` is passed
    straight through to the line layer's color encoding.
    """
    hover_fields = ["year:Q", "geography:N", f"{y_column}:Q"]
    line_layer = (
        alt.Chart(long_df)
        .mark_line()
        .encode(
            x=alt.X("date:T", title="year"),
            y=alt.Y(f"{y_column}:Q", title=y_title),
            color=color,
            tooltip=hover_fields,
        )
    )

    # vertical bars at 2010 and 2020, because:
    # 'data from separate vintages should not be combined' (US Census),
    # and each vintage runs for the full decade (e.g. 2010-2019)
    vintage_starts = pd.DataFrame({"date": ["2010-01-01", "2020-01-01"]})
    rule_layer = (
        alt.Chart(vintage_starts)
        .mark_rule(strokeDash=[3, 2], strokeWidth=0.5, color="grey")
        # x is 'date', not 'year', as the data are as of July 1 and plotted as such
        .encode(x=alt.X("date:T", title="year"))
    )
    return line_layer + rule_layer


def population_line_plot_normalized(
    df, y, color, title=None, norm_x_value=2019, mark_line_args=None, chart_args=None
):
    """
    modified from utils.line_plot_normalized()

    Plot `y` over time relative to its value in the year `norm_x_value`.

    x = "year" when adding the normalization column
    x = "date" when plotting (July 1 of each year)

    Parameters
    ----------
    df
        Table handed to `add_normalization_col`; needs "year" and `y` columns,
        and "geography" for the tooltip. Presumably a polars DataFrame (the
        result is processed with polars methods) -- confirm against utils.
    y
        Name of the column to normalize and plot.
    color
        Passed through to the line layer's color encoding.
    title
        Optional y axis title; " (% of <norm_x_value> value)" is appended.
        When None, the normalized column name is used as the title.
    norm_x_value
        Year whose value is the normalization base.
    mark_line_args
        Optional dict of extra keyword arguments for `mark_line()`.
    chart_args
        Optional dict of extra keyword arguments for `encode()`.

    Returns
    -------
    (normalized_df, chart): the normalized table (sorted by year, with a
    "date" column added) and the interactive layered chart.
    """
    # None instead of `{}` defaults, so no dict object is shared between calls
    mark_line_args = {} if mark_line_args is None else mark_line_args
    chart_args = {} if chart_args is None else chart_args

    # July 1 of the (integer) "year" column as a date; used for the data and
    # for the vertical marker at the normalization year
    july1_date = (pl.col("year").cast(str) + "-07-01").str.to_date()

    normalized_y_col_name = f"{y} (% of {norm_x_value} value)"
    normalized_df = (
        add_normalization_col(
            df, "year", y, normalized_y_col_name, norm_x_value=norm_x_value
        )
        .sort("year")
        .with_columns(date=july1_date)
    )
    if title is None:
        title = normalized_y_col_name
    else:
        title = f"{title} (% of {norm_x_value} value)"

    chart = (
        alt.Chart(normalized_df)
        .mark_line(**mark_line_args)
        .encode(
            alt.X("date:T", title="year"),
            y=alt.Y(normalized_y_col_name, title=title)
            .scale(zero=False)
            .axis(format="%"),
            color=color,
            tooltip=[
                "year:Q",
                "geography:N",
                f"{y}:Q",
            ],
            **chart_args,
        )
    )
    # thin dashed bar at the start of 2020
    vintage_rule = (
        alt.Chart(pd.DataFrame({"date": ["2020-01-01"]}))
        .mark_rule(strokeDash=[3, 2], strokeWidth=0.5, color="grey")
        .encode(
            # 'date', not 'year', as the data are as of July 1 and plotted as such
            alt.X("date:T"),
        )
    )
    # thicker translucent vertical bar at July 1 of the normalization year
    norm_year_rule = (
        alt.Chart(pl.DataFrame({"year": [norm_x_value]}).with_columns(date=july1_date))
        .mark_rule(color="grey", opacity=0.5, strokeWidth=2)
        .encode(x=alt.X("date:T", title="year"))
    )
    # horizontal bar at 1 on the normalized axis
    baseline_rule = (
        alt.Chart(pl.DataFrame({normalized_y_col_name: [1]}))
        .mark_rule(color="grey", opacity=0.5, strokeWidth=2)
        .encode(y=normalized_y_col_name)
    )
    return normalized_df, (
        chart + vintage_rule + norm_year_rule + baseline_rule
    ).interactive()

In [5]:
# Name given to the value column of the long table
value_name_short = "population"
# Table as stored in the CSV, then unpivoted by to_long() into one row per
# geography and year
wide_df = pl.read_csv(csv_filepath)
long_df = to_long(wide_df, value_name=value_name_short)

In [None]:
wide_df.select(
    "geography",
    # Relative change = later / earlier - 1, so a negative value is a decrease
    # (previously computed as 1 - later / earlier, which showed a decrease as
    # a positive "change").
    # CAUTION: US Census says do not compare values across vintages,
    # so comparing 2023 with 2019 is inappropriate
    (pl.col("2023") / pl.col("2019") - 1).alias("change 2019 to 2023"),
    (pl.col("2023") / pl.col("2020") - 1).alias("change 2020 to 2023"),
)
# For SF, present it as a 7-8% decrease from pre-COVID population
# (shown in this table as a negative change)

In [None]:
# San Francisco only: plotted together with the Bay Area, the SF values
# would be impossible to read
output_df = long_df.filter(
    pl.col("geography") == "San Francisco",
    pl.col("year") > 2007,
).with_columns(
    # July 1 of each year, used as the x axis value
    (pl.col("year").cast(str) + "-07-01").str.to_date().alias("date")
)
chart = plot_altair(
    long_df=output_df,
    y_column=value_name_short,
    y_title="San Francisco population estimate (as of July 1)",
    color=alt.value(color_sf_bayarea[0]),
)
# export the plotted data, a PNG, and an HTML version sized to its container
output_df.write_csv("output/data/population.csv")
chart.save("output/Links/population.png", scale_factor=3)
chart.properties(width="container").save("output/Links/population.html")
chart

In [None]:
# Population from 2019 onwards for both geographies, relative to the 2019 value
df_2019norm, chart_2019norm = population_line_plot_normalized(
    df=long_df.filter(pl.col("year") >= 2019),
    y=value_name_short,
    color=alt.Color(
        "geography:N",
        scale=alt.Scale(domain=["San Francisco", "Bay Area"], range=color_sf_bayarea),
    ),
    norm_x_value=2019,
)
# export the normalized data, a PNG, and an HTML version sized to its container
df_2019norm.write_csv("output/data/population-2019norm.csv")
chart_2019norm.save("output/Links/population-2019norm.png", scale_factor=3)
chart_2019norm.properties(width="container").save(
    "output/Links/population-2019norm.html"
)
chart_2019norm