## worker types

1. **work from home**: workers whose home parcel and work parcel are the same (`hhparcel==pwpcl`)
2. **teleworker**: workers whose work location is outside the home parcel and who work at home 3 hours or longer (includes workers with a missing work location, `work_loc==-1`)
3. **commuter**: workers whose work location is outside the home parcel and who work at home less than 3 hours (includes workers with a missing work location, `work_loc==-1`)
4. **non-worker**: not employed
5. ~~**no work location**: employed workers that have no work location record (only in survey data, not model results)~~

- code:

In [1]:
import os
import pandas as pd
import numpy as np
import math
#import pyodbc
from pathlib import Path
import plotly.express as px
from plotly.subplots import make_subplots
import re

# to show plotly figures in quarto HTML file
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook_connected"

In [2]:
# ---- input locations ----
# Earlier survey inputs, kept for reference:
#survey_path = Path('R:/e2projects_two/SoundCast/Inputs/dev/base_year/2018/survey')
#survey_path = Path(r'R:\e2projects_two\2023_base_year\2023_survey\daysim_format\skims_attached')
# telecommute version of survey data
#rsg_survey_path = Path('R:/e2projects_two/SoundCast/Inputs/dev/base_year/2018/survey/RSG_version')
rsg_survey_path = Path(r'R:\e2projects_two\2023_base_year\2023_survey\daysim_format\cleaned\skims_attached')
# model output
# Raw string: the path contains "\W" and "\s", which are invalid escape
# sequences in a normal literal (they only worked because Python passes
# unknown escapes through, with a warning). The resulting path is unchanged.
model_path = Path(r'//modelstation3/c$\Workspace\sc_2023/soundcast/outputs/daysim')
# model_path = Path(r'R:\e2projects_two\2023_base_year\2017_2019_survey\test\cleaned\skims_attached')
# county_taz = pd.read_csv(r'R:\e2projects_two\SoundCast\releases\v2.1.1\soundcast-2.1.1\scripts\summarize\inputs\county_taz.csv')


# ---- read survey data and model results ----
# All inputs are tab-separated files with the same base names in both folders.

# survey data
df_household = pd.read_csv(rsg_survey_path/'_household.tsv', sep='\t')
df_person = pd.read_csv(rsg_survey_path/'_person.tsv', sep='\t')
df_person_day = pd.read_csv(rsg_survey_path/'_person_day.tsv', sep='\t')
df_tour = pd.read_csv(rsg_survey_path/'_tour.tsv', sep='\t')
# df_trip = pd.read_csv(rsg_survey_path/'_trip.tsv', sep='\t')

# model data
df_model_household = pd.read_csv(model_path/'_household.tsv', sep='\t')
df_model_person = pd.read_csv(model_path/'_person.tsv', sep='\t')
df_model_person_day = pd.read_csv(model_path/'_person_day.tsv', sep='\t')
df_model_tour = pd.read_csv(model_path/'_tour.tsv', sep='\t')
# # df_model_trip = pd.read_csv(model_path/'_trip.tsv', sep='\t')

In [3]:
# columns used in analysis
household_list = ['hhno', 'hhparcel']
# Person columns pulled into the person-day table. 'worker_type' must exist on
# both the survey and the model person tables before they are subset with this
# list. (An earlier definition that used 'id' instead of 'worker_type' was
# immediately overwritten and has been removed.)
person_list = ['hhno', 'pno', 'pwtyp', 'pwpcl', 'puwmode', 'pwautime', 'pwaudist','worker_type']
# Person-day columns: keys, work-at-home hours, per-purpose tour counts and
# the person-day expansion factor.
person_day_list = ['hhno', 'pno', 'day', 'wkathome',
                   'wktours', 'sctours', 'estours', 'pbtours', 'shtours', 'mltours', 'sotours',
                   'retours', 'metours', 'pdexpfac']


In [4]:
# Create a placeholder worker-type column for the model person table so it can
# be subset with person_list. The placeholder is the *string* '-1': the column
# later receives string labels ('commuter', 'wfh', 'telecommuter'), and writing
# strings into an integer column is deprecated in pandas. It also matches the
# '-1' label that the tour-rate plot filters on.
# NOTE(review): assumes the survey file encodes unassigned worker types as the
# string '-1' as well; confirm against the survey data.
df_model_person['worker_type'] = '-1'

In [5]:
# Attach person and household attributes to every person-day record.

# ---- survey person-day records ----
df1 = (
    df_person_day[person_day_list]
    .merge(df_person[person_list], on=['hhno','pno'], how='left')
    .merge(df_household[household_list], on=['hhno'], how='left')
)

# ---- model person-day records (joined the same way) ----
df2 = (
    df_model_person_day[person_day_list]
    .merge(df_model_person[person_list], on=['hhno','pno'], how='left')
    .merge(df_model_household[household_list], on=['hhno'], how='left')
)

# ---- tag each dataset so they can be stacked and compared ----
df1['source'] = "survey data"
df2['source'] = "model results"

# Unweighted survey: same records as the survey, every person-day weighted 1.
df3 = df1.copy()
df3['pdexpfac'] = 1
df3['source'] = "unweighted survey"

# Worker type for model records, derived from person-day information.
# Later assignments override earlier ones: every paid worker (pwtyp 1 or 2)
# starts as a commuter, then becomes 'wfh' when home and work parcel coincide,
# or 'telecommuter' when the parcels differ and any time was worked at home.
# NOTE(review): the threshold here is wkathome > 0, whereas the notebook text
# describes teleworkers as working at home 3 hours or longer; confirm which
# definition is intended.
paid_worker = df2['pwtyp'].isin([1,2])
works_on_home_parcel = df2['hhparcel'] == df2['pwpcl']
worked_at_home = df2['wkathome'] > 0

df2.loc[paid_worker, 'worker_type'] = 'commuter'
df2.loc[paid_worker & works_on_home_parcel, 'worker_type'] = 'wfh'
df2.loc[paid_worker & ~works_on_home_parcel & worked_at_home, 'worker_type'] = 'telecommuter'

In [6]:
# Stack survey, model and unweighted-survey person days into one table.
df_full_person_day = pd.concat([df1, df2, df3])

# (legacy) worker grouping used to be computed here:
# df_full_person_day['worker_type'] = df_full_person_day.apply(lambda x: group_worker(x['hhparcel'],x['pwpcl'],x['pwtyp'],x['wkathome']),axis=1)

# Work-at-home hours as a whole-hour bin: negative values become 0, values in
# [0, 10) are floored, and everything else (10 or more, or not comparable such
# as NaN) is capped at 10.
_wkathome = df_full_person_day['wkathome'].to_numpy()
df_full_person_day['wkathome_int'] = np.where(
    _wkathome < 0.0,
    0.0,
    np.where(_wkathome < 10.0, np.floor(_wkathome), 10.0),
)

# Text label for the bin: "0".."9", with the capped bin shown as "10+".
df_full_person_day['wkathome_hour'] = df_full_person_day['wkathome_int'].map(
    lambda hours: "10+" if hours >= 10.0 else str(int(hours))
)

# Person days of paid workers only (pwtyp 0 is excluded).
df_full_person_day_workers = df_full_person_day[df_full_person_day['pwtyp'] != 0].copy()

In [7]:
# tour data
def create_df(survey_df, model_df, col_list, expfac, purpose_list, dpurp_var='pdpurp'):
    """Stack survey, unweighted-survey and model records into one labelled frame.

    Parameters
    ----------
    survey_df, model_df : DataFrame
        Survey and model tables; only ``col_list`` columns are kept.
    col_list : list
        Columns to carry over from both tables.
    expfac : str
        Expansion-factor column; set to 1 in the unweighted-survey copy.
    purpose_list : mapping
        Lookup passed to ``Series.map`` to translate ``dpurp_var`` codes.
    dpurp_var : str, default 'pdpurp'
        Column holding the purpose code.

    Returns
    -------
    DataFrame
        Rows in the order survey, unweighted survey, model (original indexes
        kept), with added ``source`` and ``tour_purpose`` columns. Codes that
        are missing from ``purpose_list`` map to NaN.
    """
    def _labelled(frame, label, weight=None):
        # Work on a copy so the caller's frame is never modified.
        subset = frame[col_list].copy()
        if weight is not None:
            subset[expfac] = weight
        subset['source'] = label
        return subset

    stacked = pd.concat([
        _labelled(survey_df, "survey data"),
        _labelled(survey_df, "unweighted survey", weight=1),
        _labelled(model_df, "model results"),
    ])
    stacked['tour_purpose'] = stacked[dpurp_var].map(purpose_list)
    return stacked


# FIXME: check dictionary
# Purpose code -> label; the labels reuse the person-day tour-count column names.
purpose_dict = {
    1: 'wktours', 2: 'sctours', 3: 'estours',
    4: 'pbtours', 5: 'shtours', 6: 'mltours',
    7: 'sotours', 8: 'retours', 9: 'metours',
}

# Stack survey / unweighted survey / model tours.
tour_list = ['hhno', 'pno', 'day', 'tour', 'pdpurp', 'tautodist', 'toexpfac']
tour = create_df(df_tour, df_model_tour, tour_list, 'toexpfac', purpose_dict)

# Bring in worker type (and the other person-day attributes) for each tour.
tour = tour.merge(
    df_full_person_day,
    on=['hhno', 'pno', 'day', 'source'],
    how='left',
)

# Distance bins: 2-mile bins up to 40 miles, each labelled by its lower edge.
# NOTE(review): pd.cut intervals are right-closed by default, so a distance of
# exactly 0 and anything above max_bin get no bin (NaN).
max_bin = 40
bin_size = 2
bin_edges = np.arange(0, max_bin + bin_size, bin_size)
tour['dist_bins'] = pd.cut(
    tour['tautodist'],
    bins=bin_edges,
    labels=[str(lower_edge) for lower_edge in bin_edges[:-1]],
)

## worker counts

- only paid workers are included: `'pwtyp' != 0` (worker type: 0=non-worker, 1=full-time worker, 2=part-time worker)
- commuters are workers with teleworking hours less than 3 hours

In [8]:
# Inspect which columns are available on the worker person-day table.
df_full_person_day_workers.columns

Index(['hhno', 'pno', 'day', 'wkathome', 'wktours', 'sctours', 'estours',
       'pbtours', 'shtours', 'mltours', 'sotours', 'retours', 'metours',
       'pdexpfac', 'pwtyp', 'pwpcl', 'puwmode', 'pwautime', 'pwaudist',
       'worker_type', 'hhparcel', 'source', 'wkathome_int', 'wkathome_hour'],
      dtype='object')

In [9]:
# Weighted worker person-days per source and worker type.
df_worker_count = (
    df_full_person_day_workers
    .groupby(['source','worker_type'])['pdexpfac']
    .sum()
    .reset_index()
)
# Share of each worker type within its source.
df_worker_count['percent'] = (
    df_worker_count['pdexpfac']
    / df_worker_count.groupby('source')['pdexpfac'].transform('sum')
)

fig = px.bar(
    df_worker_count.sort_values(by=['source']),
    x="worker_type",
    y="percent",
    color="source",
    custom_data=['pdexpfac'],
    barmode="group",
    template="simple_white",
    title="workers by worker type",
)
# Hover shows the share plus the underlying weighted count.
fig.update_traces(
    hovertemplate="share of workers: %{y:.2%}<br>"
                  "worker counts: %{customdata[0]:.0f}",
)
fig.update_layout(
    height=400,
    width=700,
    font=dict(size=11),
    yaxis_tickformat='.2%',
)
fig.show()

In [10]:
# Weighted worker person-days per source, worker type and pwtyp.
df_worker_count = (
    df_full_person_day_workers
    .groupby(['source','worker_type','pwtyp'])['pdexpfac']
    .sum()
    .reset_index()
)
# Share of each pwtyp within a (source, worker type) pair.
df_worker_count['percent'] = (
    df_worker_count['pdexpfac']
    / df_worker_count.groupby(['source','worker_type'])['pdexpfac'].transform('sum')
)

fig = px.bar(
    df_worker_count.sort_values(by=['source']),
    x='pwtyp',
    y="percent",
    color="source",
    facet_col="worker_type",
    custom_data=['pdexpfac'],
    barmode="group",
    template="simple_white",
    title="Share of full-time/part-time workers by worker type",
)
# Facet titles: keep only the value after "worker_type=".
fig.for_each_annotation(lambda note: note.update(text=note.text.split("=")[-1]))
fig.update_traces(
    hovertemplate="share of workers: %{y:.2%}<br>"
                  "worker counts: %{customdata[0]:.0f}",
)
fig.update_layout(
    height=400,
    width=700,
    font=dict(size=11),
    yaxis_tickformat='.2%',
)
fig.show()

### worker counts by telework hours

- group workers by number of teleworking hours (10+ hours are represented as 10 hrs)

In [11]:
# Weighted worker person-days in each telework-hour bin, per source.
df_hour_count = (
    df_full_person_day_workers
    .groupby(['source','wkathome_int', 'wkathome_hour'])['pdexpfac']
    .sum()
    .reset_index()
)
# Share of each hour bin within its source.
df_hour_count['percent'] = (
    df_hour_count['pdexpfac']
    / df_hour_count.groupby('source')['pdexpfac'].transform('sum')
)

fig = px.bar(
    df_hour_count.sort_values(by=['source','wkathome_int']),
    x="wkathome_hour",
    y="percent",
    color="source",
    barmode="group",
    template="simple_white",
    title="share of workers by telework hour",
)
fig.update_layout(
    height=350,
    width=700,
    font=dict(size=11),
    xaxis={'dtick': 1},
    yaxis_tickformat='.2%',
)
# Hour labels are strings, so pin their order explicitly ("10+" last).
fig.update_xaxes(
    categoryorder='array',
    categoryarray=[str(hour) for hour in range(10)] + ['10+'],
)
fig.show()

## work tours

In [12]:
# Telecommuters: distribution of the number of work tours, per source.
_df = (
    df_full_person_day_workers
    .loc[lambda d: d['worker_type'] == "telecommuter"]
    .groupby(['source','wktours'])['pdexpfac']
    .sum()
    .reset_index()
)
_df['percent'] = _df['pdexpfac'] / _df.groupby('source')['pdexpfac'].transform('sum')

fig = px.bar(
    _df,
    x="wktours",
    y="percent",
    color="source",
    barmode="group",
    template="simple_white",
    title="teleworkers: number of work tours",
)
fig.update_layout(height=350, width=700, font=dict(size=11), yaxis_tickformat='.2%')
fig.show()


In [13]:
# _df = df_full_person_day_workers[df_full_person_day_workers['worker_type']=="teleworker"].groupby(['source','pwtyp','wktours'])['pdexpfac'].sum().reset_index()
# _df['percent'] = _df.groupby(['source','pwtyp'], group_keys=False)['pdexpfac'].apply(lambda x: x / float(x.sum()))

# fig = px.bar(_df, x='wktours', y="percent", custom_data=['pdexpfac'],
#              facet_col="pwtyp",color="source",
#              barmode="group",template="simple_white",
#              title="teleworkers: number of work tours by full-/part-time workers"
#             )
# # fig.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
# fig.update_traces(hovertemplate="share of workers: %{y:.2%}<br>" +
#                                 "worker counts: %{customdata[0]:.0f}",)
# fig.update_layout(height=400, width=700, font=dict(size=11),
#                   yaxis_tickformat = '.2%')
# fig.show()

In [14]:
# Commuters: distribution of the number of work tours, per source.
_df = (
    df_full_person_day_workers
    .loc[lambda d: d['worker_type'] == "commuter"]
    .groupby(['source','wktours'])['pdexpfac']
    .sum()
    .reset_index()
)
_df['percent'] = _df['pdexpfac'] / _df.groupby('source')['pdexpfac'].transform('sum')

fig = px.bar(
    _df,
    x="wktours",
    y="percent",
    color="source",
    barmode="group",
    template="simple_white",
    title="commuters: number of work tours",
)
fig.update_layout(height=350, width=700, font=dict(size=11), yaxis_tickformat='.2%')
fig.show()

In [15]:
# Commuters: number of work tours, split by pwtyp; shares are computed within
# each (source, pwtyp) pair.
_df = (
    df_full_person_day_workers
    .loc[lambda d: d['worker_type'] == "commuter"]
    .groupby(['source','pwtyp','wktours'])['pdexpfac']
    .sum()
    .reset_index()
)
_df['percent'] = _df['pdexpfac'] / _df.groupby(['source','pwtyp'])['pdexpfac'].transform('sum')

fig = px.bar(
    _df,
    x='wktours',
    y="percent",
    color="source",
    facet_col="pwtyp",
    custom_data=['pdexpfac'],
    barmode="group",
    template="simple_white",
    title="commuter: number of work tours by full-/part-time workers",
)
fig.update_traces(
    hovertemplate="share of workers: %{y:.2%}<br>"
                  "worker counts: %{customdata[0]:.0f}",
)
fig.update_layout(height=400, width=700, font=dict(size=11), yaxis_tickformat='.2%')
fig.show()

In [16]:
# Work-from-home workers: distribution of the number of work tours, per source.
_df = (
    df_full_person_day_workers
    .loc[lambda d: d['worker_type'] == "wfh"]
    .groupby(['source','wktours'])['pdexpfac']
    .sum()
    .reset_index()
)
_df['percent'] = _df['pdexpfac'] / _df.groupby('source')['pdexpfac'].transform('sum')

fig = px.bar(
    _df,
    x="wktours",
    y="percent",
    color="source",
    barmode="group",
    template="simple_white",
    title="work from home workers: number of work tours",
)
fig.update_layout(height=350, width=700, font=dict(size=11), yaxis_tickformat='.2%')
fig.show()

In [17]:
# Work-from-home workers: number of work tours, split by pwtyp; shares are
# computed within each (source, pwtyp) pair.
_df = (
    df_full_person_day_workers
    .loc[lambda d: d['worker_type'] == "wfh"]
    .groupby(['source','pwtyp','wktours'])['pdexpfac']
    .sum()
    .reset_index()
)
_df['percent'] = _df['pdexpfac'] / _df.groupby(['source','pwtyp'])['pdexpfac'].transform('sum')

fig = px.bar(
    _df,
    x='wktours',
    y="percent",
    color="source",
    facet_col="pwtyp",
    custom_data=['pdexpfac'],
    barmode="group",
    template="simple_white",
    title="work from home: number of work tours by full-/part-time workers",
)
fig.update_traces(
    hovertemplate="share of workers: %{y:.2%}<br>"
                  "worker counts: %{customdata[0]:.0f}",
)
fig.update_layout(height=400, width=700, font=dict(size=11), yaxis_tickformat='.2%')
fig.show()

In [18]:
# Numerator: weighted worker person-days with at least one work tour, per
# source and telework-hour bin.
_df = (
    df_full_person_day_workers
    .loc[lambda d: d['wktours'] > 0]
    .groupby(['source','wkathome_int', 'wkathome_hour'])['pdexpfac']
    .sum()
    .reset_index()
)

# Denominator: all worker person-days in the same bin, taken from df_hour_count.
_df2 = (
    df_hour_count[['source', 'wkathome_int', 'wkathome_hour', 'pdexpfac']]
    .rename(columns={'pdexpfac': 'total_workers'})
)
_df = _df.merge(_df2, on=['source', 'wkathome_int', 'wkathome_hour'], how='left')
_df['percent'] = _df['pdexpfac'] / _df['total_workers']

fig = px.bar(
    _df.sort_values(by=['source','wkathome_int']),
    x="wkathome_hour",
    y="percent",
    color="source",
    barmode="group",
    template="simple_white",
    title="share of people making 1+ work tours by telework hours",
)
fig.update_layout(
    height=300,
    width=700,
    font=dict(size=11),
    xaxis={'dtick': 1},
    yaxis_tickformat='.2%',
)
# Hour labels are strings, so pin their order explicitly ("10+" last).
fig.update_xaxes(
    categoryorder='array',
    categoryarray=[str(hour) for hour in range(10)] + ['10+'],
)
fig.show()

## Tour rates by destination purpose for each worker type



In [19]:
# Denominator: weighted worker person-days per source and worker type.
df_person_count = (
    df_full_person_day_workers
    .groupby(['source','worker_type'])['pdexpfac']
    .sum()
    .reset_index()
)

# Numerator: weighted tours per source, worker type and tour purpose; the tour
# rate is tours divided by person-days of the same source and worker type.
df_tour_rate = (
    tour
    .groupby(['source','worker_type', 'tour_purpose'])['toexpfac']
    .sum()
    .reset_index()
    .merge(df_person_count, on=['source', 'worker_type'], how='left')
    .assign(tour_rate=lambda d: d['toexpfac'] / d['pdexpfac'])
)

def plot_tour_rate(df, worker_type):
    """Show a grouped bar chart of tour rates by purpose for one worker type.

    Parameters
    ----------
    df : DataFrame
        Needs ``worker_type``, ``tour_purpose``, ``tour_rate`` and ``source``
        columns.
    worker_type : str
        Value of ``worker_type`` to plot; also used as the title prefix.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    selected = df[df['worker_type'] == worker_type]
    chart = px.bar(
        selected,
        x="tour_purpose",
        y="tour_rate",
        color="source",
        barmode="group",
        template="simple_white",
        title=worker_type + "s: tour rates by destination purpose",
    )
    chart.update_layout(
        height=300,
        width=700,
        font=dict(size=11),
        yaxis_tickformat='.2f',
    )
    chart.show()


In [20]:
# Copy the tour-rate table to the system clipboard.
# NOTE(review): this needs a clipboard backend and is likely to fail when the
# notebook is rendered headlessly; consider removing it or writing to a file.
df_tour_rate.to_clipboard()

In [21]:
# Tour rates by destination purpose for commuters.
plot_tour_rate(df_tour_rate,"commuter")

In [22]:
# Tour rates for commuters who made no work tours on the day.
# Cell-specific names are used instead of df1/df2: those names hold the merged
# survey and model person-day tables that are stacked into df_full_person_day,
# and overwriting them here would corrupt a later re-run of that cell.

# weighted tours by purpose for commuters with zero work tours
no_work_tour_counts = (
    tour
    .loc[(tour['worker_type']=="commuter") & (tour['wktours']==0)]
    .groupby(['source','worker_type', 'tour_purpose'])['toexpfac']
    .sum()
    .reset_index()
)
# weighted person-days for the same group (the denominator)
no_work_person_counts = (
    df_full_person_day
    .loc[(df_full_person_day['worker_type']=="commuter") & (df_full_person_day['wktours']==0)]
    .groupby(['source','worker_type'])['pdexpfac']
    .sum()
    .reset_index()
)

no_work_tour_rate = no_work_tour_counts.merge(
    no_work_person_counts, on=['source', 'worker_type'], how='left')
no_work_tour_rate['tour_rate'] = no_work_tour_rate['toexpfac']/no_work_tour_rate['pdexpfac']

fig = px.bar(no_work_tour_rate, x="tour_purpose", y="tour_rate", color="source",
             barmode="group",template="simple_white",
             title= "commuter with no work tours: tour rates by destination purpose")
fig.update_layout(height=300, width=700, font=dict(size=11),
                  yaxis_tickformat = '.2f')
fig.show()

In [23]:
# Tour rates for commuters whose work parcel is missing (pwpcl == -1).
# Cell-specific names are used instead of df1/df2: those names hold the merged
# survey and model person-day tables that are stacked into df_full_person_day,
# and overwriting them here would corrupt a later re-run of that cell.

# weighted tours by purpose for commuters without a work parcel
missing_loc_tour_counts = (
    tour
    .loc[(tour['worker_type']=="commuter") & (tour['pwpcl']==-1)]
    .groupby(['source','worker_type', 'tour_purpose'])['toexpfac']
    .sum()
    .reset_index()
)
# weighted person-days for the same group (the denominator)
missing_loc_person_counts = (
    df_full_person_day
    .loc[(df_full_person_day['worker_type']=="commuter") & (df_full_person_day['pwpcl']==-1)]
    .groupby(['source','worker_type'])['pdexpfac']
    .sum()
    .reset_index()
)

missing_loc_tour_rate = missing_loc_tour_counts.merge(
    missing_loc_person_counts, on=['source', 'worker_type'], how='left')
missing_loc_tour_rate['tour_rate'] = missing_loc_tour_rate['toexpfac']/missing_loc_tour_rate['pdexpfac']

# The colour sequence starts at the second D3 colour.
# NOTE(review): presumably so the survey sources keep the colours they have in
# the three-source charts when model results have no such records; confirm.
fig = px.bar(missing_loc_tour_rate, x="tour_purpose", y="tour_rate", color="source",
             barmode="group",template="simple_white",
             color_discrete_sequence=px.colors.qualitative.D3[1:3],
             title= "commuters with missing work location: tour rates by destination purpose")
fig.update_layout(height=300, width=700, font=dict(size=11),
                  yaxis_tickformat = '.2f')
fig.show()

In [24]:
# Tour rates by destination purpose for telecommuters.
plot_tour_rate(df_tour_rate,"telecommuter")

In [25]:
# Tour rates by destination purpose for work-from-home workers.
plot_tour_rate(df_tour_rate,"wfh")

In [26]:
# Tour rates for records whose worker_type is the string '-1'.
# NOTE(review): presumably the "no worker type assigned" label; rows that
# carry a numeric -1 instead of the string will not match this filter.
plot_tour_rate(df_tour_rate,'-1')

In [27]:
## teleworker: telework hour threshold
#
# def tele_hour_worker_count(df):
#
#     tele_workers = pd.DataFrame()
#
#     for hour in [1,2,3,4,5,6,7,8]:
#         df['worker_type_hour'] = df.apply(lambda x: group_worker(x['hhparcel'],x['pwpcl'],x['pwtyp'],x['wkathome'],hour),axis=1)
#
#         df_telework_count = df.groupby(['source','worker_type_hour'])['pdexpfac']. sum().reset_index()
#         df_telework_count['percent'] = df_telework_count.groupby(['source'], group_keys=False)['pdexpfac']. \
#             apply(lambda x: 100 * x / float(x.sum()))
#         df_telework_count['tele_hours'] = hour
#         df_telework_count = df_telework_count.loc[df_telework_count['worker_type_hour'].isin(["teleworker","commuter"])]
#
#         tele_workers = pd.concat([tele_workers, df_telework_count])
#
#     return tele_workers
#
# df_telework_count_hour = tele_hour_worker_count(df_full_person_day)

In [28]:
# df = df_telework_count_hour.loc[df_telework_count_hour['worker_type_hour']=='teleworker']
#
# fig = px.line(df.sort_values(by=['source','worker_type_hour','tele_hours']), x="tele_hours", y="percent", color="source",
#              # facet_col='source',
#              template="simple_white",
#              title="percentage of teleworkers by telework hour thresholds")
# fig.update_layout(height=400, width=600, font=dict(size=11))
# fig.update_yaxes(showgrid=True, dtick=1)
# fig.show()

In [29]:
### teleworker work tour rates by telework hour thresholds
#
# tour_hour = tour.copy()
# tele_rates_hours = pd.DataFrame()
#
# # group workers
# for hour in [1,2,3,4,5,6,7,8]:
#     tour_hour['worker_type_hour'] = tour_hour.apply(lambda x: group_worker(x['hhparcel'],x['pwpcl'],x['pwtyp'],x['wkathome'],hour),axis=1)
#
#     # tour counts by worker type and tour purpose
#     _df = tour_hour.groupby(['source','worker_type_hour', 'tour_purpose'])['toexpfac'].sum().reset_index()
#     _df['tele_hours'] = hour
#     _df = _df.loc[_df['worker_type_hour'].isin(["teleworker","commuter"])]
#
#     tele_rates_hours = pd.concat([tele_rates_hours,_df])
#
#
# # get person counts
# tele_rates_hours = tele_rates_hours.merge(df_telework_count_hour, on=['source', 'worker_type_hour', 'tele_hours'], how='left')
# tele_rates_hours['tour_rate'] = tele_rates_hours['toexpfac']/tele_rates_hours['pdexpfac']
#
# # tele_rates_hours

In [30]:
# work_tele_rates_hours = tele_rates_hours.loc[(tele_rates_hours['tour_purpose']=='wktours') & (tele_rates_hours['worker_type_hour']=='teleworker')]
# fig = px.line(work_tele_rates_hours.sort_values(by=['source','worker_type_hour','tele_hours']), x="tele_hours", y="tour_rate", color="source",
#               # facet_col='source',
#               template="simple_white",
#               title="teleworker work tour rates by different teleworking hour thresholds")
# fig.update_layout(height=400, width=600, font=dict(size=11))
# fig.update_yaxes(showgrid=True, dtick=0.025)
# fig.show()

## Tour distances by purpose

In [31]:
# Weighted tours per source, worker type, purpose and distance bin.
# Only 'toexpfac' is aggregated: the previous form summed every column of
# `tour` (including string columns) and then discarded all but one.
# observed=False is passed explicitly because 'dist_bins' is categorical;
# it keeps empty bins in the result (as zeros) and avoids depending on the
# deprecated implicit default.
df_tour_distance = (
    tour
    .groupby(['source', 'worker_type', 'tour_purpose', 'dist_bins'], observed=False)['toexpfac']
    .sum()
    .reset_index()
)

# Share of each distance bin within a (purpose, worker type, source) group;
# groups with a zero total yield NaN.
df_tour_distance['percent'] = (
    df_tour_distance['toexpfac']
    / df_tour_distance.groupby(['tour_purpose', 'worker_type', 'source'])['toexpfac'].transform('sum')
)

In [32]:
def plot_tour_distance(df, dpurp, worker_type_list):
    """Show tour-distance distributions for one purpose, one panel per source.

    Parameters
    ----------
    df : DataFrame
        Needs ``tour_purpose``, ``worker_type``, ``dist_bins``, ``percent``
        and ``source`` columns.
    dpurp : str
        Value of ``tour_purpose`` to plot; also used as the title prefix.
    worker_type_list : list
        Worker types to draw, one line colour each.

    The figure is displayed with ``show()``; nothing is returned.
    """
    purpose_match = df['tour_purpose'] == dpurp
    type_match = df['worker_type'].isin(worker_type_list)
    selected = df[purpose_match & type_match].copy()

    chart = px.line(
        selected,
        x='dist_bins',
        y="percent",
        color="worker_type",
        facet_col='source',
        template="simple_white",
        title=dpurp + " tour distance",
    )

    # Facet titles: keep only the value after "source=".
    chart.for_each_annotation(lambda note: note.update(text=note.text.split("=")[-1]))
    chart.update_layout(
        height=400,
        width=700,
        font=dict(size=11),
        yaxis_tickformat='.2%',
    )
    chart.show()


# Distance distribution of 'wktours' tours for the three worker types.
plot_tour_distance(df_tour_distance, "wktours",["wfh", "commuter", "telecommuter"])

In [33]:
# Distance distribution of 'sctours' tours for the three worker types.
plot_tour_distance(df_tour_distance, "sctours", ["wfh", "commuter", "telecommuter"])

In [34]:
# Distance distribution of 'estours' tours for the three worker types.
plot_tour_distance(df_tour_distance, "estours", ["wfh", "commuter", "telecommuter"])

In [35]:
# Distance distribution of 'pbtours' tours for the three worker types.
plot_tour_distance(df_tour_distance, "pbtours", ["wfh", "commuter", "telecommuter"])

In [36]:
# Distance distribution of 'shtours' tours for the three worker types.
plot_tour_distance(df_tour_distance, "shtours", ["wfh", "commuter", "telecommuter"])

In [37]:
# Distance distribution of 'mltours' tours for the three worker types.
plot_tour_distance(df_tour_distance, "mltours", ["wfh", "commuter", "telecommuter"])

In [38]:
# Distance distribution of 'sotours' tours for the three worker types.
plot_tour_distance(df_tour_distance, "sotours", ["wfh", "commuter", "telecommuter"])