In [None]:
%load_ext autoreload

In [None]:
import hvplot.polars
import numpy as np
import polars as pl
from bokeh.io import output_notebook

%autoreload
from utils import (
    add_industry_share_col,
    line_plot,
    line_plot_normalized,
    paired_line_plots,
    stacked_bar_plot,
)

hvplot.extension("bokeh")
output_notebook()

In [None]:
def read_ces(filepath, years):
    """Load one CES CSV and reduce it to annual employment by industry.

    Stages, in order:
      1. `_read_ces`: read the file, keep only rows whose Year is in `years`.
      2. `parse_to_geographies`: collapse counties to "Bay Area" / "San Francisco".
      3. `annual_average`: mean employment per year.
      4. `parse_ces_industries`: keep the selected industries and total them.
    """
    monthly_raw = _read_ces(filepath, years)
    monthly_by_geography = parse_to_geographies(monthly_raw)
    annual_by_geography = annual_average(monthly_by_geography)
    return parse_ces_industries(annual_by_geography)


def _read_ces(filepath, years):
    """Read a raw CES CSV and normalise it for the rest of the pipeline.

    - renames "Current Employment" -> "employment" and "Industry Title" -> "industry"
    - keeps only rows whose "Year" is in `years`
    - strips leading whitespace from the industry titles
    """
    raw = pl.read_csv(filepath)
    renamed = raw.rename(
        {"Current Employment": "employment", "Industry Title": "industry"}
    )
    in_requested_years = pl.col("Year").is_in(years)
    return renamed.filter(in_requested_years).with_columns(
        pl.col("industry").str.strip_chars_start()
    )


def parse_to_geographies(df):
    """Reduce county rows to two geographies: San Francisco and the 9-county Bay Area.

    Bay Area rows are the employment sum over the nine counties for each
    (Year, Month, Date, Series Code, industry) combination. San Francisco rows
    are the "San Francisco County" rows as-is, so that county contributes to
    both geographies. The result stacks both, labelled by a `geography` column.
    """
    key_cols = ["Year", "Month", "Date", "Series Code", "industry"]
    bay_area_counties = {
        "San Francisco County",
        "San Mateo County",
        "Santa Clara County",
        "Alameda County",
        "Contra Costa County",
        "Solano County",
        "Napa County",
        "Sonoma County",
        "Marin County",
    }
    in_bay_area = pl.col("Area Name").is_in(bay_area_counties)
    in_sf = pl.col("Area Name") == "San Francisco County"

    bay_area_df = (
        df.filter(in_bay_area)
        .group_by(*key_cols)
        .agg(pl.sum("employment"))
        .with_columns(geography=pl.lit("Bay Area"))
    )
    # select() fixes the column order so it lines up with bay_area_df for concat
    sf_df = df.filter(in_sf).select(
        *key_cols,
        "employment",
        geography=pl.lit("San Francisco"),
    )
    return pl.concat([bay_area_df, sf_df])


def annual_average(df):
    """Mean employment per (Year, Series Code, industry, geography).

    The mean is taken over whatever rows each group contains (one per month in
    the monthly CES data). Usable before or after parse_to_geographies(): the
    geography key is "geography" when that column exists, otherwise the
    original "Area Name" column.
    """
    geo_col = "geography" if "geography" in df.columns else "Area Name"
    yearly_groups = df.group_by("Year", "Series Code", "industry", geo_col)
    return yearly_groups.agg(pl.mean("employment"))


def parse_ces_industries(df):
    """Keep the selected CES industries and total employment for each.

    Rows whose `industry` is not in the list below are dropped; the remainder
    is summed per (Year, industry, geography) and sorted by Year, geography.
    The list is meant to be a set of non-overlapping industries.
    """
    kept_industries = [  # trailing comments are the NAICS codes
        "Goods Producing",  # 16/10-1
        "Trade, Transportation, and Utilities",  # 40/10-21
        "Information",  # 50/10-22
        "Financial Activities",  # 55/10-23
        "Professional and Business Services",  # 60/10-24
        "Private Education and Health Services",  # 65/10-25
        "Leisure and Hospitality",  # 70/10-26
        "Other Services",  # 80/10-27
        "Government",  # 90
        "Total Farm",  # Farm is NOT in QCEW
    ]
    is_kept = pl.col("industry").is_in(kept_industries)
    totals = (
        df.filter(is_kept)
        .group_by("Year", "industry", "geography")
        .agg(pl.sum("employment"))
    )
    return totals.sort("Year", "geography")

In [None]:
# CES/QCEW employment numbers are by place-of-work
# 1990 through 2022 inclusive (np.arange excludes the stop value).
# NOTE(review): `years` is not referenced by the other cells visible in this
# notebook; the loading cell builds its own per-file year ranges.
years = np.arange(1990, 2023)  # county level numbers are only available up to 2022
# The years passed to the stacked-bar industry-share plots.
industry_shares_years = [2019, 2022]

In [None]:
# (CES file, years to keep from that file); the three ranges together
# cover 1990 through 2022.
ces_files = [
    (
        r"Q:\Data\Surveys\CA-EDD\CES\ces_1990-2001_monthly_2024627.csv",
        np.arange(1990, 2002),
    ),
    (
        r"Q:\Data\Surveys\CA-EDD\CES\ces_2002-2013_monthly_2024627.csv",
        np.arange(2002, 2014),
    ),
    (
        r"Q:\Data\Surveys\CA-EDD\CES\ces_2014-2024_monthly_2024627.csv",
        np.arange(2014, 2023),
    ),
]
df = pl.concat([read_ces(path, file_years) for path, file_years in ces_files])


In [None]:
# Total employment per year and geography, summed over industries.
# Aggregate `employment` explicitly rather than calling a blanket `.sum()`:
# the blanket form also sums the string `industry` column, which then has to
# be dropped again.
annual_total_df = (
    df.group_by("Year", "geography")
    .agg(pl.sum("employment"))
    .sort("geography", "Year")
)
# Two panels: absolute totals, and totals normalized at Year == 2019.
employment_plot = line_plot(
    annual_total_df,
    "Year",
    "employment",
    "geography",
    "total employment",
    frame_width=300,
    legend=False,
) + line_plot_normalized(
    annual_total_df,
    "Year",
    "employment",
    "geography",
    "total employment (normalized)",
    norm_x_value=2019,
    frame_width=300,
)
annual_total_df.write_csv("output/data/employment.csv")
hvplot.save(employment_plot, "output/Links/employment.html")
hvplot.save(employment_plot, "output/Links/employment.png")
employment_plot

In [None]:
# Long table of employment by industry with the share column added by
# add_industry_share_col (presumably "employment-industry_share", which the
# later cells read -- confirm in utils), sorted for a stable CSV export.
shares_df = add_industry_share_col(df, "employment").sort(
    ["Year", "geography", "industry"]
)
shares_df.write_csv("output/data/employment-by_industry.csv")

In [None]:
# Long -> wide: one row per (industry, geography), one column per Year holding
# that year's industry share. The year values become string column names
# (the next cell reads them as "2019" / "2022").
# NOTE(review): newer Polars releases renamed pivot's `columns=` keyword to
# `on=` -- confirm against the pinned Polars version before changing.
shares_wide_df = shares_df.pivot(
    columns="Year", index=["industry", "geography"], values="employment-industry_share"
)
shares_wide_df

In [None]:
# Change in each industry's share from 2019 to 2022, multiplied by 100 and
# rounded to one decimal place.
share_change_19to22 = (
    ((pl.col("2022") - pl.col("2019")) * 100).round(1).alias("percent_diff_19to22")
)
shares_covid_diff = shares_wide_df.select(
    "geography",
    "industry",
    share_change_19to22,
).sort("geography", "percent_diff_19to22")
# NOTE(review): the other CSV exports in this notebook go to output/data/;
# confirm that output/csvs/ is the intended directory for this one.
shares_covid_diff.write_csv(
    "output/csvs/employment-by_industry-percent_diff_19to22.csv"
)
shares_covid_diff

In [None]:
# Stacked bars of industry shares, one panel per geography.
# no clear trends over the past decade, so we'll just show 2019 vs 2022
# (industry_shares_years). legend=False is passed only for the San Francisco
# panel.
share_col = "employment-industry_share"
bar_kwargs = {"frame_width": 100}
employment_by_industry_plot_sf = stacked_bar_plot(
    shares_df,
    share_col,
    "San Francisco",
    "industry",
    "San Francisco employment: industry shares",
    industry_shares_years,
    **bar_kwargs,
    legend=False,
)
employment_by_industry_plot_bayarea = stacked_bar_plot(
    shares_df,
    share_col,
    "Bay Area",
    "industry",
    "Bay Area employment: industry shares",
    industry_shares_years,
    **bar_kwargs,
)
employment_by_industry_plot = (
    employment_by_industry_plot_sf + employment_by_industry_plot_bayarea
)
for out_path in (
    "output/Links/employment-by_industry-bar.html",
    "output/Links/employment-by_industry-bar.png",
):
    hvplot.save(employment_by_industry_plot, out_path)
employment_by_industry_plot

In [None]:
# Industry share over time, one line per industry, y-axis starting at 0.
plot = paired_line_plots(
    shares_df,
    "Year",
    "employment-industry_share",
    "industry",
    ymin=0,
    frame_width=200,
)
for out_path in (
    "output/Links/employment-by_industry.html",
    "output/Links/employment-by_industry.png",
):
    hvplot.save(plot, out_path)
plot
