## Produce a daily origin-destination trip matrix for NYC resident census block groups (CBGs)
## Include time-away-from-home buckets and various device counts

##### Output is a master Excel table, indexed by origin NYC borough and day, with trip counts by destination, time-away-from-home buckets, and device-behavior counts

In [1]:
import pandas as pd
import numpy as np
import s3fs
import os
import time

In [2]:
from geo import stco,sub
from safegraph_py_functions import safegraph_py_functions as sgpy

In [3]:
# Load the dotenv IPython extension and run it (presumably reads a local .env
# file into the process environment -- confirm against python-dotenv docs).
%load_ext dotenv
%dotenv
# Pull the storage credentials from the environment instead of hard-coding
# them; os.getenv returns None when a variable is not set.
myAccessKey = os.getenv('myAccessKey')
mySecretKey = os.getenv('mySecretKey')

# Wall-clock start time; the last cell subtracts it to report total run time.
start = time.time()

In [4]:
# Remove pandas' column/row display limits so frames are shown in full.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [5]:
# Load the table of NYC origin census block groups; it is merged against
# each daily file on its "orig_cbg" column to keep only NYC origins.
cbg_nyc = pd.read_csv('../data/nyc_cbg.csv')

In [6]:
# Build the S3-compatible filesystem handle used to open the SafeGraph (SG)
# daily files, authenticating with the key/secret read from the environment.
fs = s3fs.S3FileSystem(
    profile="safegraphws",
    key=myAccessKey,
    secret=mySecretKey,
    client_kwargs={
        'endpoint_url': 'https://s3.wasabisys.com',
        'region_name': 'us-east-1',
    },
)

## RUNNING FOR 1 MONTH ONLY, Y-o-Y COMPARISON

In [7]:
# Date configuration: one test month compared across two different years.
month = "02"
years = ["2019", "2020"]
# Zero-padded day-of-month labels "01".."31"; position 0 holds "01".
dayList = [f"{day:02d}" for day in range(1, 32)]
# Default index window into dayList: d1 is the first index used, d2 is the
# exclusive upper bound passed to range().
d1, d2 = 27, 29

# If running all days and months - replace range and adjust for loop to run through lists below:
#monthList =["01","02","03","04","05","06","07","08"]
#dayNumList =[31, 29, 31, 30, 31, 30, 31,31] 

In [10]:
## Iterate and create pivot for home county to destination county (in region + outside)
# For each requested day: read the daily social-distancing file, keep only rows
# whose origin CBG appears in cbg_nyc, expand the destination JSON column, and
# collect a trimmed frame. The collected frames are concatenated and pivoted
# in the next cell.
frames = []
for y in years:
    # Exclusive upper index into dayList: February ends on the 28th in 2019
    # and on the 29th in 2020.
    d2 = 28 if y == '2019' else 29
    for i in range(d1, d2):
        with fs.open(f'sg-c19-response/social-distancing/v2/{y}/{month}/{dayList[i]}/{y}-{month}-{dayList[i]}-social-distancing.csv.gz','rb') as f:
            print(f'{y}-{month}-{dayList[i]}')
            # read SG's file
            df = pd.read_csv(f, escapechar='\\', compression='gzip')
            # filter NYC's Origin CBGs (inner join drops every non-NYC origin)
            df = pd.merge(cbg_nyc, df, left_on="orig_cbg", right_on="origin_census_block_group", how="inner")

            # unpack json destination cbgs
            # (presumably one row per origin/destination pair -- confirm in sgpy)
            df = sgpy.unpack_json_and_merge(df, json_column='destination_cbgs', key_col_name='destination_cbg', value_col_name='dest_cbg_count')

            ## Make new columns
            df['orig_cbg'] = df['orig_cbg'].apply(str)  # clean origin cbg
            df['date_y-m-d'] = df['date_range_start'].str[:10]
            # separate home trips from other trips for later aggregation;
            # a vectorized comparison replaces the row-wise apply, which is
            # slow and misbehaves when the frame is empty
            df['is_home'] = df['orig_cbg'] == df['destination_cbg']
            # id destinations by county fips (first 5 characters of the CBG id)
            df['dest_stco_all'] = df['destination_cbg'].str[:5]
            # id 31cr counties and all others outside: anything not listed in
            # stco is relabelled 'O31CR'
            df['dest_stco_reg'] = df['dest_stco_all']
            df.loc[~df['dest_stco_reg'].isin(stco), 'dest_stco_reg'] = 'O31CR'
            # look each county up in sub; unmatched counties also become 'O31CR'
            df['dest_sub'] = df['dest_stco_all'].map(sub).fillna('O31CR')

            # Make new table with select columns
            dff = df[['date_y-m-d','orig_stco','dest_cbg_count','dest_stco_reg','dest_sub','is_home']]
            frames.append(dff)

2019-02-28
2020-02-28
2020-02-29


In [None]:
# Stack the per-day frames, then sum destination counts per (date, origin
# county) row, split into columns by destination region and home/non-home.
# margins=True appends the 'All' totals row and column.
# aggfunc is passed as the string 'sum': handing np.sum to pandas is
# deprecated in recent releases, and the string keeps the same behavior.
df_dest = pd.concat(frames)
df_dest = pd.pivot_table(
    df_dest,
    values=['dest_cbg_count'],
    index=['date_y-m-d', 'orig_stco'],
    columns=['dest_stco_reg', 'is_home'],
    aggfunc='sum',
    fill_value=0,
    margins=True,
)

In [None]:
# Preview the first 10 rows of the destination pivot.
df_dest.head(10)

In [None]:
#Run an iteration to unpack the bucketed time away from home

In [None]:
# For each requested day: read the daily social-distancing file, keep NYC
# origins, expand the bucketed time-away-from-home JSON column, and collect a
# trimmed frame for the pivot in the next cell.
frames = []
for y in years:
    # Reset the exclusive upper index for every year. Without this the loop
    # reuses whatever d2 an earlier cell left behind (29) and requests
    # 2019-02-29, a date that does not exist.
    d2 = 28 if y == '2019' else 29
    for i in range(d1, d2):
        with fs.open(f'sg-c19-response/social-distancing/v2/{y}/{month}/{dayList[i]}/{y}-{month}-{dayList[i]}-social-distancing.csv.gz','rb') as f:
            print(f'{y}-{month}-{dayList[i]}')
            # read SG's file
            df = pd.read_csv(f, escapechar='\\', compression='gzip')
            # filter NYC's Origin CBGs (inner join drops every non-NYC origin)
            df = pd.merge(cbg_nyc, df, left_on="orig_cbg", right_on="origin_census_block_group", how="inner")

            # unpack json bucketed time away from home
            # (presumably one row per origin/bucket pair -- confirm in sgpy)
            df = sgpy.unpack_json_and_merge(df, json_column='bucketed_away_from_home_time', key_col_name='away_from_home_time', value_col_name='away_count')

            ## Make new columns
            df['date_y-m-d'] = df['date_range_start'].str[:10]
            # Make new table with select columns
            dff = df[['date_y-m-d','orig_stco','away_from_home_time','away_count']]
            frames.append(dff)

In [None]:
# Stack the per-day frames, then sum away counts per (date, origin county)
# row with one column per time-away-from-home bucket. margins=True appends
# the 'All' totals row and column.
# aggfunc is passed as the string 'sum': handing np.sum to pandas is
# deprecated in recent releases, and the string keeps the same behavior.
df_away = pd.concat(frames)
df_away = pd.pivot_table(
    df_away,
    values=['away_count'],
    index=['date_y-m-d', 'orig_stco'],
    columns=['away_from_home_time'],
    aggfunc='sum',
    fill_value=0,
    margins=True,
)

In [None]:
# Preview the first 10 rows of the time-away-from-home pivot.
df_away.head(10)

In [None]:
# Run another iteration for just counts of devices
#'device_count','completely_home_device_count','part_time_work_behavior_devices','full_time_work_behavior_devices',

In [None]:
# For each requested day: read the daily social-distancing file, keep NYC
# origins, and collect only the device-count columns for the groupby in the
# next cell.
frames = []
for y in years:
    # Exclusive upper index into dayList: February ends on the 28th in 2019
    # and on the 29th in 2020. (The previous text here, `d2 = if y == '2019'`,
    # was a syntax error.)
    d2 = 28 if y == '2019' else 29
    for i in range(d1, d2):
        with fs.open(f'sg-c19-response/social-distancing/v2/{y}/{month}/{dayList[i]}/{y}-{month}-{dayList[i]}-social-distancing.csv.gz','rb') as f:
            print(f'{y}-{month}-{dayList[i]}')
            # read SG's file
            df = pd.read_csv(f, escapechar='\\', compression='gzip')
            # filter NYC's Origin CBGs (inner join drops every non-NYC origin)
            df = pd.merge(cbg_nyc, df, left_on="orig_cbg", right_on="origin_census_block_group", how="inner")

            ## Make new columns
            df['date_y-m-d'] = df['date_range_start'].str[:10]
            # Make new table with select columns
            dff = df[['date_y-m-d','orig_stco','device_count','completely_home_device_count','part_time_work_behavior_devices','full_time_work_behavior_devices']]
            frames.append(dff)

In [None]:
# Stack the per-day frames and total each device-count column per
# (date, origin county).
# The aggregation is named by the string 'sum': handing np.sum to pandas is
# deprecated in recent releases, and the string keeps the same behavior.
df_device = pd.concat(frames)
df_device = df_device.groupby(['date_y-m-d', 'orig_stco']).agg({
    'device_count': 'sum',
    'completely_home_device_count': 'sum',
    'part_time_work_behavior_devices': 'sum',
    'full_time_work_behavior_devices': 'sum',
})

In [None]:
#Run a concat of all three tables to make master table with origin/day index

In [None]:
# Place the device, destination and away-time tables side by side (aligned on
# their shared index), then move the index back into ordinary columns.
df_master = pd.concat([df_device, df_dest, df_away], axis=1).reset_index()

In [None]:
# Write the master table to Excel. Create the output folder first so a
# missing directory does not fail the export after the long download loops.
os.makedirs('output', exist_ok=True)
df_master.to_excel(f'output/{month}-_bigtable.xlsx')


In [None]:
# Report total wall-clock run time, measured from the `start` timestamp
# captured in the setup cell.
end = time.time()
elapsed = end - start
print(f'Run time - {elapsed} seconds')