In [1]:
%load_ext autoreload

In [2]:
# import hvplot.polars
import altair as alt
import numpy as np
import polars as pl

# from bokeh.io import output_notebook

%autoreload
from altair_utils import color_sf_bayarea
from utils import (
    add_industry_share_col,
    line_plot,
    line_plot_normalized,
)

# hvplot.extension("bokeh")
# output_notebook()

In [3]:
def read_ces(filepath, years):
    """Load one CES monthly CSV and reduce it to annual employment by industry.

    Runs the stages in order: `_read_ces` (read the file and keep rows in
    `years`), `parse_to_geographies`, `annual_average`, and finally
    `parse_ces_industries`, whose result is returned.
    """
    monthly_raw = _read_ces(filepath, years)
    monthly_by_geography = parse_to_geographies(monthly_raw)
    annual_by_geography = annual_average(monthly_by_geography)
    return parse_ces_industries(annual_by_geography)


def _read_ces(filepath, years):
    """Read a CES CSV, keep the requested years, and shorten the column names.

    Leading whitespace is stripped from "Industry Title" first; rows are then
    filtered to those whose "Year" is in `years`; finally "Current Employment",
    "Industry Title" and "Year" are renamed to "employment", "industry" and
    "year". All other columns keep their original names.
    """
    column_renames = {
        "Current Employment": "employment",
        "Industry Title": "industry",
        "Year": "year",
    }
    raw = pl.read_csv(filepath)
    trimmed = raw.with_columns(pl.col("Industry Title").str.strip_chars_start())
    in_requested_years = trimmed.filter(pl.col("Year").is_in(years))
    return in_requested_years.rename(column_renames)


def parse_to_geographies(df):
    """Reduce county-level rows to two geographies: "Bay Area" and "San Francisco".

    "Bay Area" rows are the sum of "employment" over the nine counties listed
    below, per (year, Month, Date, Series Code, industry). "San Francisco" rows
    are the San Francisco County rows passed through unchanged. The two frames
    are stacked, with a "geography" column telling them apart.
    """
    bay_area_counties = {
        "San Francisco County",
        "San Mateo County",
        "Santa Clara County",
        "Alameda County",
        "Contra Costa County",
        "Solano County",
        "Napa County",
        "Sonoma County",
        "Marin County",
    }
    # Shared by both frames so their column order matches for pl.concat.
    key_cols = ("year", "Month", "Date", "Series Code", "industry")

    in_bay_area = pl.col("Area Name").is_in(bay_area_counties)
    in_sf = pl.col("Area Name") == "San Francisco County"

    bay_area_df = (
        df.filter(in_bay_area)
        .group_by(*key_cols)
        .agg(pl.sum("employment"))
        .with_columns(geography=pl.lit("Bay Area"))
    )
    sf_df = df.filter(in_sf).select(
        *key_cols,
        "employment",
        geography=pl.lit("San Francisco"),
    )
    return pl.concat([bay_area_df, sf_df])


def annual_average(df):
    """Average monthly "employment" into one value per year.

    Groups by year, "Series Code", industry and the geography column, which is
    "geography" when the frame has been through parse_to_geographies() and the
    original "Area Name" column otherwise.
    """
    geo_col = "geography" if "geography" in df.columns else "Area Name"
    mean_employment = pl.mean("employment")
    return df.group_by("year", "Series Code", "industry", geo_col).agg(mean_employment)


def parse_ces_industries(df):
    """Keep a fixed list of non-overlapping CES industries and total their employment.

    Rows whose "industry" is not in the list below are dropped; the rest are
    summed per (year, industry, geography) and sorted by year, then geography.
    """
    kept_industries = [  # trailing comments are the NAICS codes
        "Goods Producing",  # 16/10-1
        "Trade, Transportation, and Utilities",  # 40/10-21
        "Information",  # 50/10-22
        "Financial Activities",  # 55/10-23
        "Professional and Business Services",  # 60/10-24
        "Private Education and Health Services",  # 65/10-25
        # Not used, since they are splits of 65/10-25 above:
        #   "Health Care and Social Assistance"
        #   "Private Educational Services"
        "Leisure and Hospitality",  # 70/10-26
        "Other Services",  # 80/10-27
        "Government",  # 90
        "Total Farm",  # Farm is NOT in QCEW
    ]
    selected = df.filter(pl.col("industry").is_in(kept_industries))
    totals = selected.group_by("year", "industry", "geography").agg(
        pl.sum("employment")
    )
    return totals.sort("year", "geography")


def group_industry(df):
    """Collapse CES industries into broader groups and re-total employment.

    Each industry named in `industry_groups` below is relabelled with its
    group name; any industry not named there keeps its own label. "employment"
    is then summed per (industry, year, geography).

    NOTE(review): presumably the grouping exists to keep the number of chart
    series manageable -- confirm the intended rationale.
    """
    industry_groups = {
        "Professional, Business, Financial Services/Activities": (
            "Financial Activities",
            "Professional and Business Services",
        ),
        "Trade, Transportation, Utilities, Goods Producing, and Farm": (
            "Goods Producing",
            "Total Farm",
            "Trade, Transportation, and Utilities",
        ),
        "Government, Private Educational, Health, and Other Services": (
            "Government",
            "Private Education and Health Services",
            "Other Services",
        ),
    }
    # Invert to {original industry: group label} for a single lookup.
    group_of = {
        industry: group
        for group, members in industry_groups.items()
        for industry in members
    }
    return (
        # Expr.replace leaves values that are not keys of the mapping unchanged.
        df.with_columns(pl.col("industry").replace(group_of))
        .group_by("industry", "year", "geography")
        .agg(pl.sum("employment"))
    )

In [4]:
# CES/QCEW employment numbers are by place-of-work
# NOTE(review): `years` is not referenced by any later cell shown in this
# notebook, and the load cell reads data through 2023 (np.arange(2014, 2024)),
# so confirm whether this variable and the "up to 2022" note are still current.
years = np.arange(2008, 2023)  # county level numbers are only available up to 2022

In [5]:
# Annual employment by industry and geography, stacked from the CES extracts.
# Each entry pairs a source file with the years to keep from it.
df = pl.concat(
    [
        read_ces(ces_path, ces_years)
        for ces_path, ces_years in (
            # Currently excluded (1990-2001 extract):
            # (
            #     r"Q:\Data\Surveys\CA-EDD\CurrentEmploymentStats\ces_1990-2001_monthly_2024627.csv",
            #     np.arange(1990, 2002),
            # ),
            (
                r"Q:\Data\Surveys\CA-EDD\CurrentEmploymentStats\ces_2002-2013_monthly_2024627.csv",
                np.arange(2008, 2014),
            ),
            (
                r"Q:\Data\Surveys\CA-EDD\CurrentEmploymentStats\ces_2014-2024_monthly_20241022.csv",
                np.arange(2014, 2024),
            ),
        )
    ]
)

In [6]:
# Total employment across all industries, per year and geography.
# "employment" is aggregated explicitly instead of calling .sum() on every
# non-key column (which also tried to sum the string "industry" column and
# then had to drop it again).
annual_total_df = (
    df.group_by("year", "geography")
    .agg(pl.sum("employment"))
    .sort("geography", "year")
)
annual_total_df.write_csv("output/data/employment.csv")
# if want to start from this step directly:
# annual_total_df = pl.read_csv("output/data/employment.csv")

In [None]:
# San Francisco total employment by year, drawn in the first SF/Bay Area color.
employment_chart = line_plot(
    annual_total_df.filter(pl.col("geography") == "San Francisco"),
    x="year",
    y="employment",
    color=alt.value(color_sf_bayarea[0]),
    title="San Francisco total employment",
    tooltip=["year", "geography", "employment"],
)
# Altair's documented keyword for PNG resolution is `scale_factor`; the
# previous `scale=3` is not a documented `save` argument.
employment_chart.save("output/Links/employment.png", scale_factor=3)
# The HTML copy is saved with width="container"; the chart displayed below
# keeps its original width.
employment_chart.properties(width="container").save("output/Links/employment.html")
employment_chart


In [None]:
# Total employment from 2019 onward for both geographies, passed to
# line_plot_normalized with norm_x_value=2019. The helper returns both the
# resulting frame and the chart.
employment_2019norm_df, employment_2019norm_chart = line_plot_normalized(
    annual_total_df.filter(pl.col("year") > 2018),
    x="year",
    y="employment",
    color=alt.Color("geography").scale(
        domain=["San Francisco", "Bay Area"],
        range=color_sf_bayarea,
    ),
    title="total employment",
    tooltip=["year", "geography", "employment"],
    norm_x_value=2019,
)
employment_2019norm_df.write_csv("output/data/employment-2019norm.csv")
# Altair's documented keyword for PNG resolution is `scale_factor`; the
# previous `scale=3` is not a documented `save` argument.
employment_2019norm_chart.save(
    "output/Links/employment-2019norm.png", scale_factor=3
)
employment_2019norm_chart.properties(width="container").save(
    "output/Links/employment-2019norm.html"
)
employment_2019norm_chart


In [9]:
# Long format: one row per (year, geography, grouped industry), with the
# "employment-industry_share" column added by add_industry_share_col.
shares_df = add_industry_share_col(group_industry(df), "employment")
shares_df = shares_df.sort("year", "geography", "industry")

# Wide format: one column per year. Built before the rounding below, so it
# holds the unrounded shares.
shares_wide_df = shares_df.pivot(
    values="employment-industry_share",
    index=["industry", "geography"],
    on="year",
)

# Tidy the long table for export: integer employment, shares to 3 decimals.
shares_df = shares_df.with_columns(
    pl.col("employment").cast(int),
    pl.col("employment-industry_share").round(3),
)
shares_df.write_csv("output/data/employment-by_industry.csv")

In [None]:
# Change in each industry's share between 2019 and 2023, times 100 and rounded
# to one decimal. NOTE(review): if the shares are fractions of 1, this is a
# difference in percentage points rather than a percent change -- confirm.
shares_covid_diff = shares_wide_df.select(
    "geography",
    "industry",
    percent_diff_19to23=pl.col("2023").sub(pl.col("2019")).mul(100).round(1),
)
shares_covid_diff = shares_covid_diff.sort("geography", "percent_diff_19to23")
shares_covid_diff.write_csv(
    "output/data/employment-by_industry-percent_diff_19to23.csv"
)
shares_covid_diff

In [None]:
def _industry_share_panel(shares, geography, **chart_kwargs):
    """Build one interactive line chart of industry shares for `geography`.

    `shares` is filtered to years after 2017 and to the given geography, and a
    `date` column (January 1 of each year) is added for the x axis. The y-axis
    title is the geography name. Extra keyword arguments (e.g. `title`) are
    passed through to `alt.Chart`.
    """
    panel_df = shares.filter(
        (pl.col("year") > 2017) & (pl.col("geography") == geography)
    ).with_columns(date=pl.date(pl.col("year"), 1, 1))
    return (
        alt.Chart(panel_df, **chart_kwargs)
        .mark_line()
        .encode(
            x=alt.X("date", title="year"),
            y=alt.Y("employment-industry_share", title=geography).axis(format="%"),
            color="industry",
            tooltip=[
                "year",
                "geography",
                "industry",
                "employment",
                alt.Tooltip("employment-industry_share", format="%"),
            ],
        )
        .interactive()
    )


# San Francisco and Bay Area side by side on a shared y scale. Only the left
# panel carries the overall title, as before.
shares_chart = (
    _industry_share_panel(
        shares_df, "San Francisco", title="employment: industry shares"
    )
    | _industry_share_panel(shares_df, "Bay Area")
).resolve_scale(y="shared")

# Altair's documented keyword for PNG resolution is `scale_factor`; the
# previous `scale=3` is not a documented `save` argument.
shares_chart.save("output/Links/employment-by_industry.png", scale_factor=3)
shares_chart.save(
    "output/Links/employment-by_industry.html"
)  # .properties(width="container")
shares_chart


In [None]:
# Industry shares in 2019 vs 2023, one column of panels per geography.
# The x axis uses "year" as an ordinal field, so the previously computed
# `date` column was never read by any encoding and is no longer added.
# NOTE(review): this rebinds `shares_chart`, replacing the side-by-side chart
# built in the previous cell; consider a distinct name if both are needed.
shares_chart = (
    alt.Chart(shares_df.filter(pl.col("year").is_in({2019, 2023})))
    .mark_line()
    .encode(
        x=alt.X("year:O"),
        y=alt.Y("employment-industry_share").axis(format="%"),
        column=alt.Column("geography", sort="descending"),
        color="industry",
        tooltip=[
            "year",
            "geography",
            "industry",
            "employment",
            alt.Tooltip("employment-industry_share", format="%"),
        ],
    )
    .interactive()
)
shares_chart
