In [111]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [112]:
# import hvplot.polars
import altair as alt
import numpy as np
import polars as pl

# from bokeh.io import output_notebook

%autoreload
from altair_utils import color_sf_bayarea
from utils import (
    add_industry_share_col,
    line_plot,
    line_plot_normalized,
)

# hvplot.extension("bokeh")
# output_notebook()

In [134]:
def read_ces(filepath, years):
    """Read one CES CSV and run it through the full preparation pipeline.

    Steps, in order: read/filter/rename (`_read_ces`), collapse counties into
    the two study geographies (`parse_to_geographies`), average the monthly
    values per year (`annual_average`), then keep and sum the selected
    industries (`parse_ces_industries`).

    Parameters
    ----------
    filepath : path of the CSV passed to `pl.read_csv`.
    years : collection of years to keep (matched against the "Year" column).
    """
    monthly_df = _read_ces(filepath, years)
    by_geography_df = parse_to_geographies(monthly_df)
    annual_df = annual_average(by_geography_df)
    return parse_ces_industries(annual_df)


def _read_ces(filepath, years):
    """Read a CES CSV, keep the requested years, and shorten column names.

    Leading characters are stripped from "Industry Title" with
    `str.strip_chars_start()` before filtering, and the three columns used
    downstream are renamed to `employment`, `industry` and `year`.
    """
    column_renames = {
        "Current Employment": "employment",
        "Industry Title": "industry",
        "Year": "year",
    }
    raw_df = pl.read_csv(filepath)
    trimmed_df = raw_df.with_columns(pl.col("Industry Title").str.strip_chars_start())
    # filter on the original "Year" name; the rename happens last
    selected_df = trimmed_df.filter(pl.col("Year").is_in(years))
    return selected_df.rename(column_renames)


def parse_to_geographies(df):
    """Reduce county rows to two geographies: "Bay Area" and "San Francisco".

    The Bay Area rows are the sum of `employment` over the nine listed counties
    for each (year, Month, Date, Series Code, industry); the San Francisco rows
    are the San Francisco County rows as-is. Both parts are stacked into one
    frame with a new `geography` column.
    """
    key_cols = ["year", "Month", "Date", "Series Code", "industry"]
    nine_counties = {
        "San Francisco County",
        "San Mateo County",
        "Santa Clara County",
        "Alameda County",
        "Contra Costa County",
        "Solano County",
        "Napa County",
        "Sonoma County",
        "Marin County",
    }
    area_name = pl.col("Area Name")

    bay_area_df = (
        df.filter(area_name.is_in(nine_counties))
        .group_by(key_cols)
        .agg(pl.sum("employment"))
        .with_columns(geography=pl.lit("Bay Area"))
    )
    # San Francisco County is included in the Bay Area sum above as well
    sf_df = df.filter(area_name == "San Francisco County").select(
        *key_cols,
        "employment",
        geography=pl.lit("San Francisco"),
    )
    return pl.concat([bay_area_df, sf_df])


def annual_average(df):
    """Mean `employment` per year, series code, industry and geography.

    The mean is taken over whatever rows share those keys (the monthly rows of
    each year). Works on both the output of `parse_to_geographies()`
    (`geography` column) and the unparsed data (`Area Name` column).
    """
    geo_col = "geography" if "geography" in df.columns else "Area Name"
    group_keys = ("year", "Series Code", "industry", geo_col)
    return df.group_by(*group_keys).agg(pl.mean("employment"))


def parse_ces_industries(df):
    """Keep the selected CES industries and sum employment for each.

    Rows whose `industry` is not in the list below are dropped; the rest are
    summed per (year, industry, geography) and sorted by year, then geography.
    Expects a `geography` column, i.e. data that went through
    `parse_to_geographies()`.
    """
    # intended as a set of non-overlapping industries; comments are the NAICS codes
    kept_industries = [
        "Goods Producing",  # 16/10-1
        "Trade, Transportation, and Utilities",  # 40/10-21
        "Information",  # 50/10-22
        "Financial Activities",  # 55/10-23
        "Professional and Business Services",  # 60/10-24
        "Private Education and Health Services",  # 65/10-25
        # "Health Care and Social Assistance",  # split from 65/10-25
        # "Private Educational Services",  # split from 65/10-25
        "Leisure and Hospitality",  # 70/10-26
        "Other Services",  # 80/10-27
        "Government",  # 90
        "Total Farm",  # Farm is NOT in QCEW
    ]
    selected_df = df.filter(pl.col("industry").is_in(kept_industries))
    summed_df = selected_df.group_by("year", "industry", "geography").agg(
        pl.sum("employment")
    )
    return summed_df.sort("year", "geography")


def group_industry(df):
    """Merge CES industries into broader groups and re-sum employment.

    Groupings applied to the `industry` column:
    - Financial Activities + Professional and Business Services
    - Goods Producing + Total Farm + Trade, Transportation, and Utilities
    - Government + Private Education and Health Services + Other Services
    Any other industry keeps its original label. Employment is then summed
    per (industry, year, geography).
    """
    industry = pl.col("industry")
    is_prof_fin = industry.is_in(
        {"Financial Activities", "Professional and Business Services"}
    )
    is_trade_goods = industry.is_in(
        {
            "Goods Producing",
            "Total Farm",
            "Trade, Transportation, and Utilities",
        }
    )
    is_gov_edu_other = industry.is_in(
        {
            "Government",
            "Private Education and Health Services",
            "Other Services",
        }
    )
    grouped_label = (
        pl.when(is_prof_fin)
        .then(pl.lit("Professional, Business, Financial Services/Activities"))
        .when(is_trade_goods)
        .then(pl.lit("Trade, Transportation, Utilities, Goods Producing, and Farm"))
        .when(is_gov_edu_other)
        .then(pl.lit("Government, Private Educational, Health, and Other Services"))
        .otherwise(industry)
        .alias("industry")
    )
    relabeled_df = df.with_columns(grouped_label)
    return relabeled_df.group_by("industry", "year", "geography").agg(
        pl.sum("employment")
    )

In [137]:
# CES/QCEW employment numbers are by place-of-work
# NOTE(review): `years` is not referenced by the other cells shown in this
# notebook; the read_ces() calls in the next cell pass their own ranges
# (2008-2013 and 2014-2023), which extend one year past this range. Confirm
# whether the "up to 2022" limit below still applies, or remove this variable.
years = np.arange(2008, 2023)  # county level numbers are only available up to 2022

In [138]:
# (CES file, years to keep from it) — one entry per source file
ces_sources = [
    # (
    #     r"Q:\Data\Surveys\CA-EDD\CurrentEmploymentStats\ces_1990-2001_monthly_2024627.csv",
    #     np.arange(1990, 2002),
    # ),
    (
        r"Q:\Data\Surveys\CA-EDD\CurrentEmploymentStats\ces_2002-2013_monthly_2024627.csv",
        np.arange(2008, 2014),
    ),
    (
        r"Q:\Data\Surveys\CA-EDD\CurrentEmploymentStats\ces_2014-2024_monthly_20241022.csv",
        np.arange(2014, 2024),
    ),
]
# stack the prepared frames from each file, in the order listed above
df = pl.concat([read_ces(ces_path, ces_years) for ces_path, ces_years in ces_sources])

In [139]:
# Total employment per (year, geography), summed across the industries in `df`.
# `employment` is aggregated explicitly instead of calling `.sum()` on every
# non-key column: the string `industry` column has no meaningful sum and had to
# be dropped afterwards. The result has the same columns as before:
# year, geography, employment.
annual_total_df = (
    df.group_by("year", "geography")
    .agg(pl.sum("employment"))
    .sort("geography", "year")
)
annual_total_df.write_csv("output/data/employment.csv")
# if want to start from this step directly:
# annual_total_df = pl.read_csv("output/data/employment.csv")

In [140]:
# San Francisco only: total employment by year, saved as PNG and as a
# container-width HTML, then displayed.
sf_total_df = annual_total_df.filter(pl.col("geography") == "San Francisco")
employment_chart = line_plot(
    sf_total_df,
    x="year",
    y="employment",
    color=alt.value(color_sf_bayarea[0]),
    title="San Francisco total employment",
    tooltip=["year", "geography", "employment"],
)
employment_chart.save("output/Links/employment.png")
employment_html_chart = employment_chart.properties(width="container")
employment_html_chart.save("output/Links/employment.html")
employment_chart


In [141]:
# Total employment from 2019 onward for both geographies, passed to
# line_plot_normalized() with norm_x_value=2019.
from_2019_df = annual_total_df.filter(pl.col("year") > 2018)
geography_color = alt.Color("geography").scale(
    domain=["San Francisco", "Bay Area"],
    range=color_sf_bayarea,
)
employment_2019norm_df, employment_2019norm_chart = line_plot_normalized(
    from_2019_df,
    x="year",
    y="employment",
    color=geography_color,
    title="total employment",
    tooltip=["year", "geography", "employment"],
    norm_x_value=2019,
)
employment_2019norm_df.write_csv("output/data/employment-2019norm.csv")
employment_2019norm_chart.save("output/Links/employment-2019norm.png")
employment_2019norm_html_chart = employment_2019norm_chart.properties(
    width="container"
)
employment_2019norm_html_chart.save("output/Links/employment-2019norm.html")
employment_2019norm_chart


In [142]:
# Industry shares at full precision; used for the wide (pivoted) table so that
# differences between years are computed on unrounded values.
shares_full_precision_df = add_industry_share_col(
    group_industry(df), "employment"
).sort("year", "geography", "industry")
shares_wide_df = shares_full_precision_df.pivot(
    on="year", index=["industry", "geography"], values="employment-industry_share"
)
# Presentation copy written to CSV and used by the charts below.
# `employment` is a float (sum of annual means), so round to the nearest whole
# job before casting: a bare cast to int truncates (e.g. 1234.9 -> 1234).
shares_df = shares_full_precision_df.with_columns(
    pl.col("employment").round(0).cast(int),
    pl.col("employment-industry_share").round(3),
)
shares_df.write_csv("output/data/employment-by_industry.csv")

In [143]:
# Wide table: one row per (industry, geography), one column per year, holding
# the `employment-industry_share` values pivoted in the previous cell.
shares_wide_df

industry,geography,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""Government, Private Educationa…","""Bay Area""",0.309724,0.327049,0.328984,0.324401,0.318473,0.314753,0.31176,0.30764,0.305794,0.306255,0.303883,0.302551,0.308806,0.306708,0.302728,0.312746
"""Information""","""Bay Area""",0.034755,0.03557,0.036099,0.037955,0.039245,0.04007,0.042211,0.044697,0.046642,0.050154,0.053204,0.057261,0.064682,0.065483,0.065713,0.06081
"""Leisure and Hospitality""","""Bay Area""",0.099454,0.101153,0.102994,0.104641,0.106387,0.107879,0.109011,0.109239,0.109263,0.109393,0.109459,0.109762,0.079326,0.082582,0.095449,0.100151
"""Professional, Business, Financ…","""Bay Area""",0.230405,0.224831,0.226362,0.229965,0.235233,0.238181,0.239481,0.241073,0.24187,0.241793,0.240278,0.242014,0.254072,0.253334,0.249978,0.242942
"""Trade, Transportation, Utiliti…","""Bay Area""",0.325663,0.311397,0.30556,0.303038,0.300661,0.299116,0.297537,0.297351,0.29643,0.292405,0.293177,0.288412,0.293114,0.291894,0.286131,0.283352
"""Government, Private Educationa…","""San Francisco""",0.334528,0.346305,0.353127,0.343093,0.330766,0.321901,0.312833,0.301971,0.296994,0.296126,0.29407,0.289616,0.303781,0.312013,0.30076,0.31234
"""Information""","""San Francisco""",0.03472,0.035843,0.035831,0.040774,0.043483,0.042376,0.044715,0.052196,0.055445,0.059767,0.062641,0.068843,0.078356,0.083297,0.089491,0.085895
"""Leisure and Hospitality""","""San Francisco""",0.139421,0.139763,0.141254,0.142663,0.144388,0.141944,0.142406,0.139013,0.138251,0.134673,0.13401,0.133414,0.084865,0.081484,0.100228,0.11022
"""Professional, Business, Financ…","""San Francisco""",0.31997,0.31491,0.313586,0.320494,0.32781,0.339296,0.345642,0.35102,0.350698,0.34892,0.347246,0.347513,0.375042,0.374197,0.369419,0.353673
"""Trade, Transportation, Utiliti…","""San Francisco""",0.171361,0.163179,0.156203,0.152976,0.153553,0.154482,0.154404,0.155799,0.158612,0.160514,0.162032,0.160615,0.157956,0.149009,0.140102,0.137873


In [144]:
# Change in industry share between 2019 and 2023: the difference of the two
# share columns times 100, i.e. a percentage-point change, rounded to 1 decimal.
share_change_19to23 = (pl.col("2023") - pl.col("2019")) * 100
shares_covid_diff = shares_wide_df.select(
    "geography",
    "industry",
    share_change_19to23.round(1).alias("percent_diff_19to23"),
)
shares_covid_diff = shares_covid_diff.sort("geography", "percent_diff_19to23")
shares_covid_diff.write_csv(
    "output/data/employment-by_industry-percent_diff_19to23.csv"
)
shares_covid_diff

geography,industry,percent_diff_19to23
str,str,f64
"""Bay Area""","""Leisure and Hospitality""",-1.0
"""Bay Area""","""Trade, Transportation, Utiliti…",-0.5
"""Bay Area""","""Professional, Business, Financ…",0.1
"""Bay Area""","""Information""",0.4
"""Bay Area""","""Government, Private Educationa…",1.0
"""San Francisco""","""Leisure and Hospitality""",-2.3
"""San Francisco""","""Trade, Transportation, Utiliti…",-2.3
"""San Francisco""","""Professional, Business, Financ…",0.6
"""San Francisco""","""Information""",1.7
"""San Francisco""","""Government, Private Educationa…",2.3


In [156]:
def _industry_share_panel(shares, geography, chart_title=None):
    """Build one line-chart panel of industry employment shares.

    Parameters
    ----------
    shares : frame with `year`, `geography`, `industry`, `employment` and
        `employment-industry_share` columns (here: `shares_df`).
    geography : value of the `geography` column to plot; also used as the
        y-axis title.
    chart_title : optional chart title; omitted entirely when None.

    Returns
    -------
    An interactive Altair line chart for years after 2017, one line per
    industry.
    """
    panel_df = shares.filter(
        (pl.col("year") > 2017) & (pl.col("geography") == geography)
    ).with_columns(date=pl.date(pl.col("year"), 1, 1))  # Jan 1 of each year for the x axis
    # Only pass `title` when one was requested, so an untitled panel is
    # constructed exactly like `alt.Chart(data)`.
    chart_kwargs = {} if chart_title is None else {"title": chart_title}
    return (
        alt.Chart(panel_df, **chart_kwargs)
        .mark_line()
        .encode(
            x=alt.X("date", title="year"),
            y=alt.Y("employment-industry_share", title=geography).axis(format="%"),
            color="industry",
            tooltip=[
                "year",
                "geography",
                "industry",
                "employment",
                alt.Tooltip("employment-industry_share", format="%"),
            ],
        )
        .interactive()
    )


# San Francisco (left, carries the overall title) next to the Bay Area (right)
shares_chart = _industry_share_panel(
    shares_df, "San Francisco", chart_title="employment: industry shares"
) | _industry_share_panel(shares_df, "Bay Area")

shares_chart.save("output/Links/employment-by_industry.png")
shares_chart.save(
    "output/Links/employment-by_industry.html"
)  # .properties(width="container")
shares_chart


In [146]:
# Industry shares in 2019 vs. 2023, one column of panels per geography.
# NOTE: this rebinds `shares_chart`, the same name used for the two-panel
# chart that is saved to output/Links/employment-by_industry.*
shares_19_23_df = shares_df.filter(pl.col("year").is_in({2019, 2023})).with_columns(
    date=pl.date(pl.col("year"), 1, 1)
)
shares_19_23_tooltip = [
    "year",
    "geography",
    "industry",
    "employment",
    alt.Tooltip("employment-industry_share", format="%"),
]
shares_19_23_lines = alt.Chart(shares_19_23_df).mark_line()
shares_chart = shares_19_23_lines.encode(
    x=alt.X("year:O"),
    y=alt.Y("employment-industry_share").axis(format="%"),
    column=alt.Column("geography", sort="descending"),
    color="industry",
    tooltip=shares_19_23_tooltip,
).interactive()
shares_chart
