In [None]:
from pathlib import Path
import hvplot.polars

import polars as pl
import numpy as np

hvplot.extension("plotly")  # render with the plotly backend because bokeh PNG export is broken


In [None]:
# CES employment numbers are by place-of-work
FIRST_YEAR = 2014
LAST_YEAR = 2022  # county level numbers are only available up to 2022
# np.arange excludes its stop value, hence LAST_YEAR + 1
years = np.arange(FIRST_YEAR, LAST_YEAR + 1)


In [None]:
def read_ces(filepath, years):
    """Load the CES CSV and keep only the requested years.

    Args:
        filepath: location of the CSV file, passed straight to ``pl.read_csv``.
        years: collection of year values; rows whose "Year" is not in it are
            dropped.

    Returns:
        A polars DataFrame with leading whitespace stripped from
        "Industry Title" and rows restricted to ``years``.
    """
    raw = pl.read_csv(filepath)
    # Remove leading whitespace so titles can be matched by exact name later.
    industry_title = pl.col("Industry Title").str.strip_chars_start()
    in_requested_years = pl.col("Year").is_in(years)
    return raw.with_columns(industry_title).filter(in_requested_years)


def parse_to_geographies(df):
    """Reduce county-level rows to two geographies: SF and the 9-county Bay Area.

    The "Bay Area" rows are the sum of "Current Employment" over the nine
    counties listed below (San Francisco included) for each
    Year/Month/Date/Series Code/Industry Title combination. The
    "San Francisco" rows are the San Francisco County rows passed through
    unchanged.

    Args:
        df: frame with "Area Name", "Year", "Month", "Date", "Series Code",
            "Industry Title" and "Current Employment" columns.

    Returns:
        The two results stacked vertically, with a "geography" column holding
        "Bay Area" or "San Francisco" in place of "Area Name".
    """
    sf_county = "San Francisco County"
    bay_area_counties = {
        sf_county,
        "San Mateo County",
        "Santa Clara County",
        "Alameda County",
        "Contra Costa County",
        "Solano County",
        "Napa County",
        "Sonoma County",
        "Marin County",
    }
    # Columns identifying one observation; shared by both outputs so the two
    # frames line up column-for-column when concatenated.
    key_cols = ["Year", "Month", "Date", "Series Code", "Industry Title"]
    area_name = pl.col("Area Name")

    bay_area_rows = (
        df.filter(area_name.is_in(bay_area_counties))
        .group_by(key_cols)
        .agg(pl.sum("Current Employment"))
        .with_columns(geography=pl.lit("Bay Area"))
    )
    sf_rows = df.filter(area_name == sf_county).select(
        *key_cols,
        "Current Employment",
        geography=pl.lit("San Francisco"),
    )
    return pl.concat([bay_area_rows, sf_rows])


def annual_average(df):
    """Average "Current Employment" over the monthly rows of each year.

    Rows are grouped by Year, Series Code, Industry Title and the geography
    column: "geography" if present (the name after parse_to_geographies()),
    otherwise the original "Area Name" column.

    Returns:
        One row per group, with "Current Employment" holding the group mean.
    """
    geo_col = "geography" if "geography" in df.columns else "Area Name"
    group_keys = ["Year", "Series Code", "Industry Title", geo_col]
    return df.group_by(group_keys).agg(pl.col("Current Employment").mean())


def parse_industries(df):
    """Collapse the data to a fixed list of industries plus an "other" bucket.

    Only rows whose "Industry Title" is in ``industries`` or
    ``industries_other`` are kept. Titles in ``industries_other`` are
    relabelled "other", then "Current Employment" is summed per
    Year / Industry Title / geography.

    Args:
        df: frame with "Year", "Industry Title", "geography" and
            "Current Employment" columns.

    Returns:
        One row per (Year, Industry Title, geography), sorted by Year,
        geography and Industry Title.
    """
    # industries + industries_other: a set of non-overlapping industries
    industries = [
        "Government",
        "Health Care and Social Assistance",
        "Finance and Insurance",
        "Real Estate and Rental and Leasing",
        "Trade, Transportation, and Utilities",
        "Leisure and Hospitality",
        "Information",
        "Professional and Business Services",
    ]
    industries_other = [
        "Goods Producing",
        "Total Farm",
        "Private Educational Services",
        "Other Services",
    ]
    return (
        df.filter(pl.col("Industry Title").is_in(industries + industries_other))
        .with_columns(
            pl.when(pl.col("Industry Title").is_in(industries))
            .then(pl.col("Industry Title"))
            .otherwise(pl.lit("other"))
            # Name the result explicitly so it replaces the original column
            # rather than relying on the name inferred from the `then` branch.
            .alias("Industry Title")
        )
        .group_by("Year", "Industry Title", "geography")
        .agg(pl.sum("Current Employment"))
        # group_by does not guarantee group order; sorting on every key makes
        # the row order (and so the stacking order in plots) reproducible.
        .sort("Year", "geography", "Industry Title")
    )


def add_industry_share_col(df):
    """Add an "industry_share" column: each row's share of its group total.

    The share is the row's "Current Employment" divided by the sum of
    "Current Employment" over all rows with the same Year and geography.
    """
    employment = pl.col("Current Employment")
    group_total = employment.sum().over("Year", "geography")
    return df.with_columns((employment / group_total).alias("industry_share"))


def plot_industry_shares(df, geography, title):
    """Plot one geography's industry shares as a stacked bar chart by year.

    Args:
        df: frame with "geography", "Year", "Industry Title" and
            "industry_share" columns.
        geography: value of the "geography" column to plot.
        title: chart title.

    Returns:
        The hvPlot bar chart object.
    """
    # Use the `hvplot` namespace registered by `import hvplot.polars` instead of
    # `DataFrame.plot`: since polars 1.6 `.plot` is backed by Altair, which does
    # not take hvPlot keywords such as `by` and `stacked`.
    return df.filter(pl.col("geography") == geography).hvplot.bar(
        x="Year", y="industry_share", by="Industry Title", stacked=True, title=title
    )


In [None]:
filepath = r"Q:\Data\Surveys\EDD\ces_2014-2024_monthly_2024612.csv"
# Run the pipeline one stage at a time so each intermediate frame can be inspected.
ces_monthly = read_ces(filepath, years)
ces_by_geography = parse_to_geographies(ces_monthly)
ces_annual = annual_average(ces_by_geography)
df = parse_industries(ces_annual)

In [None]:
# Industry shares of total employment, per year and geography.
shares_df = add_industry_share_col(df)
plot_industry_shares(
    shares_df,
    geography="San Francisco",
    title="Industry shares in San Francisco",
)


In [None]:
plot_industry_shares(
    shares_df,
    geography="Bay Area",
    title="Industry shares in Bay Area",
)

In [None]:
# Note: the Information sector's share grew in the Bay Area but shrank in San Francisco (compare the two charts above)

In [None]:
# Total employment per year and geography. Aggregate "Current Employment"
# explicitly rather than summing every column: "Industry Title" is a string
# column with no meaningful sum, so it should not be aggregated at all.
annual_total_df = df.group_by("Year", "geography").agg(pl.sum("Current Employment"))

In [None]:
# Use the `hvplot` namespace registered by `import hvplot.polars`: since
# polars 1.6 `DataFrame.plot` is backed by Altair, which does not take hvPlot
# keywords such as `by`.
annual_total_df.sort("Year").hvplot.line(
    x="Year",
    y="Current Employment",
    by="geography",
    title="total employment",
)


In [None]:
# Index each geography's total employment to its own 2019 level (2019 == 1.0).
# Only the join key and the baseline value are kept on the right-hand side so
# the join does not add a redundant "Year_right" column.
baseline_2019 = annual_total_df.filter(pl.col("Year") == 2019).select(
    "geography",
    pl.col("Current Employment").alias("employment-2019"),
)
(
    annual_total_df.join(baseline_2019, on="geography", how="left")
    .with_columns(
        employment_normalized_to_2019=(
            pl.col("Current Employment") / pl.col("employment-2019")
        )
    )
    .sort("Year")
    # `.hvplot` (registered by `import hvplot.polars`) instead of `.plot`:
    # since polars 1.6 `DataFrame.plot` is backed by Altair, not hvPlot.
    .hvplot.line(
        x="Year",
        y="employment_normalized_to_2019",
        by="geography",
        title="total employment (normalized to 2019)",
    )
)
