In [None]:
%load_ext autoreload

In [None]:
from pathlib import Path
import hvplot.polars

import polars as pl
import numpy as np

%autoreload
from utils import (
    line_plot,
    line_plot_normalized,
    add_industry_share_col,
    stacked_bar_plot,
)

# Render hvplot figures with the plotly backend: bokeh PNG export is broken.
hvplot.extension("plotly")

In [None]:
# Do NOT use QCEW for employment numbers:
# 'The QCEW data are not the official estimates from the California EDD. The
# official series is available through the Current Employment Statistics (CES)
# program on this website at Employment by Industry Data.' (CA EDD)

In [None]:
# CES/QCEW employment numbers are by place-of-work
# Analysis window: every year from first_year through last_year, inclusive.
first_year, last_year = 2013, 2022
years = np.arange(first_year, last_year + 1)
# The two years compared in the industry-share stacked bar charts.
industry_shares_years = [2019, 2022]

In [None]:
def read_qcew(filepaths, years):
    """Read several QCEW CSV files and stack them into one DataFrame.

    Each file is reduced to its annual rows for `years` (see `_read_qcew`),
    collapsed to the "Bay Area" / "San Francisco" geographies (see
    `parse_to_geographies`), and the per-file results are concatenated.

    Parameters
    ----------
    filepaths : iterable of paths to the QCEW CSV files
    years : collection of years to keep
    """
    per_file_frames = []
    for path in filepaths:
        annual_rows = _read_qcew(path, years)
        per_file_frames.append(parse_to_geographies(annual_rows))
    return pl.concat(per_file_frames)


def _read_qcew(filepath, years):
    """Read one QCEW CSV and keep only the annual rows for `years`.

    The period column is looked up as "Time Period" first; if the file has no
    such column, "Quarter" is tried instead. Only rows whose period value is
    "Annual" are kept, and the period column is dropped from the result.
    """
    period_col = "Time Period"
    try:
        raw = _read_qcew_csv(filepath, period_col)
    except pl.exceptions.ColumnNotFoundError:
        # This file does not have a "Time Period" column; retry with "Quarter".
        period_col = "Quarter"
        raw = _read_qcew_csv(filepath, period_col)

    is_requested_year = pl.col("Year").is_in(years)
    is_annual_row = pl.col(period_col) == "Annual"
    annual = raw.filter(is_requested_year & is_annual_row)
    return annual.drop(period_col)


def _read_qcew_csv(filepath, time_period_col):
    """Load the needed columns of one QCEW CSV file.

    `time_period_col` is the name of the period column in this particular
    file. "Industry Name" is renamed to "industry" in the returned DataFrame.
    """
    wanted_cols = [
        # these two will be parsed away and dropped
        "Area Name",
        time_period_col,
        *qcew_full_summary_group_by_cols_raw,
        *qcew_group_by_sum_cols,
    ]
    raw = pl.read_csv(
        filepath,
        columns=wanted_cols,
        # read NAICS codes as text rather than letting polars infer a dtype
        schema_overrides={"NAICS Code": str},
    )
    return raw.rename({"Industry Name": "industry"})


# Key columns as they are named in the raw CSV files.
qcew_full_summary_group_by_cols_raw = [
    "Year",
    "Ownership",
    "NAICS Level",
    "NAICS Code",
    "Industry Name",
]
# The same key columns after "Industry Name" has been renamed to "industry"
# on read; derived from the raw list so the two cannot drift apart.
qcew_full_summary_group_by_cols = [
    "industry" if col == "Industry Name" else col
    for col in qcew_full_summary_group_by_cols_raw
]
# Do NOT use QCEW for employment numbers, use CES instead. See note above.
# Columns that get summed when rows are aggregated.
qcew_group_by_sum_cols = ["Establishments"]


def parse_to_geographies(df):
    """Reduce county-level rows to two geographies: SF and the 9-county Bay Area.

    Bay Area rows are the per-key sums over the nine counties; San Francisco
    rows are the San Francisco County rows passed through unchanged. Both
    parts get a `geography` column and are stacked into a single DataFrame.
    """
    sf_county = "San Francisco County"
    nine_counties = {
        sf_county,
        "San Mateo County",
        "Santa Clara County",
        "Alameda County",
        "Contra Costa County",
        "Solano County",
        "Napa County",
        "Sonoma County",
        "Marin County",
    }
    key_cols = qcew_full_summary_group_by_cols
    sum_cols = qcew_group_by_sum_cols

    # Bay Area: sum the nine counties for each key combination.
    bay_area_df = (
        df.filter(pl.col("Area Name").is_in(nine_counties))
        .group_by(key_cols)
        .agg(pl.sum(sum_cols))
        .with_columns(geography=pl.lit("Bay Area"))
    )
    # San Francisco: a single county, so no aggregation is needed; selecting
    # the same columns in the same order keeps the two frames stackable.
    sf_df = df.filter(pl.col("Area Name") == sf_county).select(
        key_cols + sum_cols,
        geography=pl.lit("San Francisco"),
    )
    return pl.concat([bay_area_df, sf_df])


# Row filters for the all-industry, all-ownership totals.
filter_industry_total = pl.col("NAICS Code").eq("10")  # Total, All Industries
filter_ownership_total = pl.col("Ownership").eq("Total Covered")


def parse_qcew_industries(df):
    """Aggregate establishments into a set of non-overlapping industry groups.

    Rows owned by Federal/State/Local Government are labelled "Government".
    Remaining rows keep their industry name only if it is one of the names
    listed below ("Education and Health Services" is relabelled "Private
    Education and Health Services"); all other rows are dropped.
    Establishments are then summed per Year / industry / geography and the
    result is sorted by Year and geography.
    """
    # industries + industries_other: a set of non-overlapping industries
    industries_kept_as_is = [  # comments are the NAICS codes
        "Goods-Producing",  # 16/10-1
        "Trade, Transportation, and Utilities",  # 40/10-21
        "Information",  # 50/10-22
        "Financial Activities",  # 55/10-23
        "Professional and Business Services",  # 60/10-24
        # since I filter out all Govt owned establishments/employment,
        # rename 'Education ...' to 'Private Education ...' later
        "Education and Health Services",  # 65/10-25
        "Leisure and Hospitality",  # 70/10-26
        "Other Services",  # 80/10-27
        "Unclassified",  # 10-29
        # Farm: NOT in QCEW
        # Government is not an 'Industry' in QCEW, parse out using the Ownership column
    ]
    government_ownerships = [
        "Federal Government",
        "State Government",
        "Local Government",
    ]
    is_government = pl.col("Ownership").is_in(government_ownerships)
    is_kept_industry = pl.col("industry").is_in(industries_kept_as_is)

    # NOTE(review): every government-owned row in df is summed into
    # "Government", whatever its NAICS level (a `NAICS Level == 2` filter was
    # previously left commented out here), and the second branch does not
    # check Ownership == "Private" explicitly. Confirm against the data that
    # neither leads to double counting.
    industry_label = (
        pl.when(is_government)
        .then(pl.lit("Government"))
        # Ownership == "Private":
        .when(is_kept_industry)
        .then(pl.col("industry"))  # keep the name
        .otherwise(pl.lit(None))
        .replace(
            {"Education and Health Services": "Private Education and Health Services"}
        )
    )
    labelled = df.with_columns(industry=industry_label)
    return (
        labelled.drop_nulls("industry")  # rows that matched neither branch
        .group_by("Year", "industry", "geography")
        .agg(pl.sum("Establishments"))
        .sort("Year", "geography")
    )

In [None]:
# Folder holding the QCEW CSV extracts.
# Named `data_dir` rather than `dir` so the builtin `dir()` is not shadowed
# for the rest of the kernel session.
# NOTE(review): machine-specific absolute path; point this at your own copy
# of the data before running.
data_dir = Path(r"C:\Users\cchow\Desktop\tmp\downtown_today")
filepaths = [
    data_dir / "qcew_2012-2015.csv",
    data_dir / "qcew_2016-2019.csv",
    data_dir / "qcew-2020-2023q3.csv",
]
# Annual rows for `years`, already collapsed to San Francisco / Bay Area.
df = read_qcew(filepaths, years)

In [None]:
# Total establishments (all industries, all ownerships) per year and geography.
annual_total_df = (
    df.filter(filter_industry_total & filter_ownership_total)
    .group_by("Year", "geography")
    .agg(pl.sum(qcew_group_by_sum_cols))
    # group_by does not guarantee the order of its output rows; sort so the
    # frame handed to the line plots is chronological and identical on
    # every run.
    .sort("Year", "geography")
)
# Left: absolute counts. Right: the same series normalized at 2019.
(
    line_plot(
        annual_total_df,
        "Year",
        "Establishments",
        "geography",
        "Establishments",
        frame_width=300,
    )
    + line_plot_normalized(
        annual_total_df,
        "Year",
        "Establishments",
        "geography",
        "Establishments (normalized)",
        norm_x_value=2019,
        frame_width=300,
    )
)

In [None]:
# Establishments per industry group, then each group's share column.
industries_df = parse_qcew_industries(df)
shares_df = add_industry_share_col(industries_df, "Establishments")

In [None]:
# Industry shares of establishments in San Francisco.
sf_industry_shares_plot = stacked_bar_plot(
    shares_df,
    "Establishments-industry_share",
    "San Francisco",
    "industry",
    "San Francisco establishments: industry shares",
    # no clear trends over the past decade, so we'll just show 2019 vs 2022
    industry_shares_years,
    frame_width=300,
)
sf_industry_shares_plot


In [None]:
# Industry shares of establishments in the Bay Area.
bay_area_industry_shares_plot = stacked_bar_plot(
    shares_df,
    "Establishments-industry_share",
    "Bay Area",
    "industry",
    "Bay Area establishments: industry shares",
    # no clear trends over the past decade, so we'll just show 2019 vs 2022
    industry_shares_years,
    frame_width=300,
)
bay_area_industry_shares_plot
