In [1]:
%load_ext autoreload

In [2]:
from pathlib import Path

import altair as alt
import geopandas as gpd
import polars as pl

%autoreload
from altair_utils import color_value_transit_teal
from downtown_today_utils import parse_geog

In [3]:
# Read the TAZ GeoPackage and keep only the `TAZ` and `analysis_neighborhood`
# columns; the result is handed to `parse_geog` when the person tables are built.
taz_analysis_neighborhoods_gis_filepath = r"Q:\GIS\Policy\San_Francisco\Analysis_Neighborhoods\taz2454-sf_only-with_analysis_neighborhoods.gpkg"
taz_lookup_columns = ["TAZ", "analysis_neighborhood"]
taz_analysis_neighborhoods = gpd.read_file(
    taz_analysis_neighborhoods_gis_filepath
)[taz_lookup_columns]

In [10]:
def _load_adult_persons(reweighted_dat_path, person_csv_path):
    """Load one survey's adult person records with their telework frequency.

    Reads the space-separated reweighted person file, inner-joins the
    `telework_freq` column from the deliverable `person.csv` on
    (household number, person number), keeps rows with `pagey` > 17, and
    passes the result through `parse_geog` together with the TAZ lookup.

    Parameters
    ----------
    reweighted_dat_path : str
        Path to the space-separated `.dat` file providing `hhno`, `pno`,
        `pagey`, `pwtaz` and `psexpfac`.
    person_csv_path : str
        Path to the deliverable `person.csv` providing `hh_id`,
        `person_num` and `telework_freq`.

    Returns
    -------
    Whatever `parse_geog` returns for the joined, adults-only table
    (used downstream as a polars DataFrame with a `pw_geog` column).
    """
    reweighted_persons = pl.read_csv(
        reweighted_dat_path,
        separator=" ",
        columns=["hhno", "pno", "pagey", "pwtaz", "psexpfac"],
    )
    telework_responses = pl.read_csv(
        person_csv_path,
        columns=["hh_id", "person_num", "telework_freq"],
    )
    adults = reweighted_persons.join(
        telework_responses,
        left_on=["hhno", "pno"],
        right_on=["hh_id", "person_num"],
    ).filter(pl.col("pagey") > 17)
    return parse_geog(adults, taz_analysis_neighborhoods)


# Pre-COVID survey. The age filter is just a safeguard here: per the original
# note, this survey is adults only.
person_2018 = _load_adult_persons(
    r"Q:\Data\Surveys\HouseholdSurveys\MTC-SFCTA2018\Processing_20211018\v01\04-merge_skims\adj_weights\survey2018_precx_rewt_base2019.dat",
    r"Q:\Data\Surveys\HouseholdSurveys\MTC-SFCTA2018\Deliverable_20211018\person.csv",
)
# Post-COVID survey, restricted to adults to be consistent with the pre-COVID one.
# Original note: "use nokids weight to be more consistent with 2018".
# NOTE(review): the file name below does not mention "nokids" -- confirm that
# this is the intended weight file.
person_2022 = _load_adult_persons(
    r"Q:\Data\Surveys\HouseholdSurveys\MTC-SFCTA2022\Processed_20241127\reformat_2019_rmoveonly\04-merge_skims\adj_weights\survey2023_precx_rewt_base2023.dat",
    r"Q:\Data\Surveys\HouseholdSurveys\MTC-SFCTA2022\Deliverable_20241127\person.csv",
)

In [11]:
def plot_dist(df, df_name, col):
    """Tabulate `col` and draw a bar chart of its weighted distribution.

    Parameters
    ----------
    df : polars DataFrame
        Must contain `col` and a `psexpfac` column (presumably the person
        expansion factor / survey weight -- confirm).
    df_name : str
        Used as the chart title.
    col : str
        Name of the column whose distribution is tabulated.

    Returns
    -------
    tuple
        `(distribution, chart)`. `distribution` has one row per group of
        `col`, sorted by `col`, with `weighted` (sum of `psexpfac`) and
        `unweighted` (row count). `chart` is an Altair bar chart of
        `weighted` by `col`, built from the rows that contain no nulls.
    """
    distribution = (
        df.group_by(col)
        .agg(
            weighted=pl.col("psexpfac").sum(),
            unweighted=pl.len(),
        )
        .sort(col)
    )
    # Rows with nulls stay in the returned table but are left out of the chart.
    bars = alt.Chart(distribution.drop_nulls()).mark_bar()
    chart = bars.encode(
        x=alt.X(f"{col}:O", title=None),
        # The y axis is titled with the column name, not "weighted".
        y=alt.Y("weighted", title=col),
        color=color_value_transit_teal,
    ).properties(title=f"{df_name}")
    # NOTE: a companion bar chart of the unweighted counts was sketched here
    # previously; it is not produced.
    return distribution, chart


def plot_dists(df2018, df2022, df_name, col):
    """Compare the weighted distribution of `col` between the two surveys.

    Runs `plot_dist` on each input, displays both distribution tables,
    displays a stacked table whose shares include the null group (to read
    off the non-response rate), prints the column sums of both tables, and
    returns the null-free shares plus a side-by-side chart.

    Relies on `display` being available, as it is inside IPython/Jupyter.

    Parameters
    ----------
    df2018, df2022 : polars DataFrame
        Person tables for the two surveys; each needs `col` and `psexpfac`.
    df_name : str
        Chart title prefix; " preCOVID" / " postCOVID" is appended.
    col : str
        Name of the column whose distribution is compared.

    Returns
    -------
    tuple
        `(shares, chart)`. `shares` stacks both distributions with a `year`
        column and a `"<col>-share"` column (weighted share within year),
        computed after dropping rows that contain nulls. `chart` places the
        two bar charts side by side with a shared y scale.
    """
    dist_pre, chart_pre = plot_dist(df2018, f"{df_name} preCOVID", col)
    dist_post, chart_post = plot_dist(df2022, f"{df_name} postCOVID", col)
    display(dist_pre)
    display(dist_post)

    # Share of the weighted total within each year; reused for both tables below.
    share_within_year = (
        pl.col("weighted") / pl.col("weighted").sum().over("year")
    ).alias(f"{col}-share")

    # The frames are tagged 2019 / 2023 rather than 2018 / 2022 -- presumably
    # the weighting base years of the input files; confirm.
    stacked = pl.concat(
        [
            dist_pre.with_columns(year=pl.lit(2019)),
            dist_post.with_columns(year=pl.lit(2023)),
        ]
    )

    print("look here to get null shares (non-response rate):")
    display(stacked.with_columns(share_within_year))

    # Returned shares are computed after dropping rows that contain nulls.
    shares_without_nulls = stacked.drop_nulls().with_columns(share_within_year)
    print("sums 2018/2022", dist_pre.sum(), dist_post.sum())

    side_by_side = (chart_pre | chart_post).resolve_scale(y="shared")
    return shares_without_nulls, side_by_side

In [12]:
def parse_telework_freq(person, year):
    """Add a `telework_freq_days_per_week` column recoded from `telework_freq`.

    The numeric `telework_freq` codes are first translated to a
    days-per-week label using the code table for the given survey year,
    then the two lowest categories ("0 (never)" and "0-1 (<1)") are merged
    into "0-1". Code 995 becomes null in both years.

    The code-to-label tables are taken as given; verify them against the
    survey codebooks before changing them.

    Parameters
    ----------
    person : polars DataFrame
        Must contain a `telework_freq` column holding the survey codes.
    year : int
        Survey year selecting the code table; 2018 or 2022.

    Returns
    -------
    polars DataFrame
        `person` with the extra `telework_freq_days_per_week` column.

    Raises
    ------
    NotImplementedError
        If `year` is neither 2018 nor 2022.
    """
    # Validate first so an unsupported year fails with a useful message.
    if year == 2018:
        code_to_days = {
            1: "5+",
            2: "5+",
            3: "4",
            4: "2-3",
            5: "1",
            6: "0-1 (<1)",
            7: "0-1 (<1)",
            8: "0 (never)",
            995: None,
        }
    elif year == 2022:
        code_to_days = {
            1: "5+",
            2: "5+",
            3: "4",
            4: "2-3",
            5: "2-3",
            6: "1",
            7: "0-1 (<1)",
            8: "0-1 (<1)",
            996: "0 (never)",
            995: None,
        }
    else:
        raise NotImplementedError(
            "telework_freq recoding is only defined for survey years "
            f"2018 and 2022, got {year!r}"
        )

    # Further simplifying/merging of categories: only the two lowest ones
    # change; the rest are listed to show the complete final set.
    merged_categories = {
        "0 (never)": "0-1",
        "0-1 (<1)": "0-1",
        "1": "1",
        "2-3": "2-3",
        "4": "4",
        "5+": "5+",
    }
    return person.with_columns(
        # replace_strict is used on purpose: a non-null code missing from the
        # table is an error rather than being passed through silently.
        telework_freq_days_per_week=pl.col("telework_freq")
        .replace_strict(code_to_days)
        .replace(merged_categories)
    )

In [None]:
# Telework-frequency distribution for persons whose `pw_geog` is the northeast
# core, compared between the two surveys.
works_in_ne_core = pl.col("pw_geog") == "1. northeast core"
telecommute_ne_core_df, telecommute_ne_core_chart = plot_dists(
    parse_telework_freq(person_2018.filter(works_in_ne_core), 2018),
    parse_telework_freq(person_2022.filter(works_in_ne_core), 2022),
    "persons working in NE core",
    "telework_freq_days_per_week",
)
telecommute_ne_core_chart

In [14]:
# Save the NE-core telework shares. The output folder is created first so the
# notebook also runs from a fresh checkout where `output/` does not exist yet.
telework_csv_path = Path("output/telework_freq-work_in_ne_core.csv")
telework_csv_path.parent.mkdir(parents=True, exist_ok=True)
telecommute_ne_core_df.write_csv(telework_csv_path)

In [15]:
# analysis for people working anywhere in SF (northeast core + rest of SF); currently disabled
# plot_dists(
#     parse_telework_freq(person_2018.filter((pl.col("pw_geog") == "2. rest of SF") | (pl.col("pw_geog") == "1. northeast core")), 2018),
#     parse_telework_freq(person_2022.filter((pl.col("pw_geog") == "2. rest of SF") | (pl.col("pw_geog") == "1. northeast core")), 2022),
#     "persons working in entirety of SF",
#     "telework_freq_days_per_week"
# )

In [None]:
# Weighted totals by year, one facet column per telework-frequency category.
alt.Chart(telecommute_ne_core_df).mark_bar().encode(
    x=alt.X("year:O"),
    y=alt.Y("weighted"),
    color=color_value_transit_teal,
    column=alt.Column("telework_freq_days_per_week"),
)

In [None]:
# Within-year shares by year, one facet column per telework-frequency category.
alt.Chart(telecommute_ne_core_df).mark_bar().encode(
    x=alt.X("year:O"),
    y=alt.Y("telework_freq_days_per_week-share"),
    color=color_value_transit_teal,
    column=alt.Column("telework_freq_days_per_week"),
)