## Produce a daily origin-destination trip matrix for NYC resident census block groups (CBGs)
## Include time-away-from-home buckets and various device counts

##### Output is a master Excel table for NYC origin boroughs by day, with destinations, time away from home, and device behavior

In [1]:
import pandas as pd
import numpy as np
import s3fs
import os
import time

In [2]:
from geo import stco,sub
from safegraph_py_functions import safegraph_py_functions as sgpy

In [3]:
# Load the dotenv IPython extension and pull variables from a .env file into the environment
%load_ext dotenv
%dotenv
# Credentials come from the environment so they never appear in the notebook;
# os.getenv returns None when a variable is not set.
myAccessKey = os.getenv('myAccessKey')
mySecretKey = os.getenv('mySecretKey')

# Wall-clock start; the final cell subtracts this to report total run time
start = time.time()

In [4]:
# Lift pandas' display limits so wide and long frames are shown without truncation
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [5]:
# Load the NYC origin census block groups; its orig_cbg column is the key
# used to filter the SafeGraph files in the loops below
cbg_nyc = pd.read_csv('../data/nyc_cbg.csv')

In [6]:
# Open an S3-compatible filesystem against the Wasabi endpoint, authenticating
# with the SafeGraph key and secret read from the environment
wasabi_client_kwargs = {
    'endpoint_url': 'https://s3.wasabisys.com',
    'region_name': 'us-east-1',
}
fs = s3fs.S3FileSystem(
    profile="safegraphws",
    key=myAccessKey,
    secret=mySecretKey,
    client_kwargs=wasabi_client_kwargs,
)

## TESTING FOR 2ND WEEK OF MARCH ONLY

In [7]:
# Date parameters for this run: a single test month compared across two years
month = "03"
years = ["2019", "2020"]
# Zero-padded day-of-month strings "01".."31"; dayList[i] is calendar day i+1
dayList = [f"{day_number:02d}" for day_number in range(1, 32)]

# To run all days and months, replace the fixed range in the loops below
# and iterate over lists like these instead:
#monthList =["01","02","03","04","05","06","07","08"]
#dayNumList =[31, 29, 31, 30, 31, 30, 31,31]

In [8]:
## Collect per-day trip records from each NYC origin to its destination county.
## Destinations in the `stco` list keep their county FIPS code; everything else
## is labelled 'O31CR'. The pivot itself is built in the next cell.
frames = []
for y in years:
    # dayList is zero-based, so indices 7..13 select calendar days 08..14
    for i in range(7,14):
        with fs.open(f'sg-c19-response/social-distancing/v2/{y}/{month}/{dayList[i]}/{y}-{month}-{dayList[i]}-social-distancing.csv.gz','rb') as f:
            print(f'{y}-{month}-{dayList[i]}')
            # read SG's file
            df = pd.read_csv(f, escapechar='\\', compression='gzip')
        # The remote handle is closed here; the rest works on the in-memory frame.

        # keep only rows whose origin CBG appears in the NYC list
        df = pd.merge(cbg_nyc, df, left_on="orig_cbg", right_on="origin_census_block_group", how="inner")

        # unpack the destination_cbgs JSON into destination_cbg / dest_cbg_count columns
        df = sgpy.unpack_json_and_merge(df, json_column='destination_cbgs', key_col_name='destination_cbg', value_col_name='dest_cbg_count')

        ## Make new columns
        df['orig_cbg'] = df['orig_cbg'].apply(str) #clean origin cbg so it compares against the string destination ids
        df['date_y-m-d'] = df['date_range_start'].str[:10]
        # Separate home trips from other trips for later aggregation.
        # Column-wise comparison gives the same booleans as a row-by-row apply,
        # without a Python call per row, and also works on an empty frame.
        df['is_home'] = df['orig_cbg'] == df['destination_cbg']
        # id destinations by county fips (first five characters of the CBG id)
        df['dest_stco_all'] = df['destination_cbg'].str[:5]
        # id 31cr counties and all others outside
        df['dest_stco_reg'] = df['dest_stco_all']
        df.loc[~df['dest_stco_reg'].isin(stco),'dest_stco_reg'] = 'O31CR'
        # label from the `sub` lookup; counties missing from it fall back to 'O31CR'
        df['dest_sub'] = df['dest_stco_all'].map(sub).fillna('O31CR')

        # Make new table with select columns
        dff = df[['date_y-m-d','orig_stco','dest_cbg_count','dest_stco_reg','dest_sub','is_home']]
        frames.append(dff)

2019-03-08
2019-03-09
2019-03-10
2019-03-11
2019-03-12
2019-03-13
2019-03-14
2020-03-08
2020-03-09
2020-03-10
2020-03-11
2020-03-12
2020-03-13
2020-03-14


In [9]:
# Stack the daily frames, then sum trip counts into a (date, origin county) table
# with one column per (destination county, is_home) pair.
df_dest = pd.concat(frames)
df_dest = pd.pivot_table(
    df_dest,
    values=['dest_cbg_count'],
    index=['date_y-m-d','orig_stco'],
    columns=['dest_stco_reg','is_home'],
    aggfunc='sum',   # string alias; passing np.sum raises a FutureWarning in recent pandas
    fill_value=0,    # combinations with no trips show 0 instead of NaN
    margins=True,    # adds the 'All' total row and column
)

In [10]:
# Preview the first ten rows of the destination pivot
df_dest.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count
Unnamed: 0_level_1,dest_stco_reg,09001,09005,09009,34003,34013,34017,34019,34021,34023,34025,34027,34029,34031,34035,34037,34039,34041,36005,36005,36027,36047,36047,36059,36061,36061,36071,36079,36081,36081,36085,36085,36087,36103,36105,36111,36119,O31CR,All
Unnamed: 0_level_2,is_home,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,False,True,False,True,False,False,False,False,False,False,Unnamed: 39_level_2
date_y-m-d,orig_stco,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3,Unnamed: 23_level_3,Unnamed: 24_level_3,Unnamed: 25_level_3,Unnamed: 26_level_3,Unnamed: 27_level_3,Unnamed: 28_level_3,Unnamed: 29_level_3,Unnamed: 30_level_3,Unnamed: 31_level_3,Unnamed: 32_level_3,Unnamed: 33_level_3,Unnamed: 34_level_3,Unnamed: 35_level_3,Unnamed: 36_level_3,Unnamed: 37_level_3,Unnamed: 38_level_3,Unnamed: 39_level_3
2019-03-08,36005,371,15,109,693,381,435,8,40,228,48,68,32,176,25,16,146,7,75996,80345,115,6820,0,604,27654,0,127,41,4394,0,173,0,223,409,26,37,6023,3714,209499
2019-03-08,36047,290,23,158,460,747,889,31,95,602,219,188,114,181,73,39,328,34,6833,0,72,142864,125949,2505,34880,0,141,17,13872,0,1651,0,171,914,50,88,626,11740,346844
2019-03-08,36061,371,61,122,876,661,997,33,66,254,222,110,66,168,75,25,223,12,7813,0,145,6297,0,939,89458,70337,185,58,5208,0,327,0,207,755,60,106,1369,9564,197170
2019-03-08,36081,332,23,114,587,471,578,12,31,252,91,75,29,117,57,18,208,15,4791,0,83,18220,0,13223,32760,0,134,30,122702,121053,392,0,102,1995,52,49,1024,6218,325838
2019-03-08,36085,15,1,9,214,325,508,18,47,967,477,74,102,67,92,14,486,11,568,0,11,6849,0,221,5309,0,23,1,990,0,38257,32467,29,72,12,16,77,1822,90151
2019-03-09,36005,483,26,179,945,451,556,10,47,239,74,77,39,266,22,20,184,17,64977,76178,145,5703,0,635,18837,0,274,71,3869,0,221,0,359,521,30,74,6604,4440,186573
2019-03-09,36047,331,54,244,592,806,734,42,122,642,473,194,179,206,88,57,402,27,6035,0,142,121515,120509,2960,21305,0,222,38,12508,0,2053,0,213,1266,89,169,857,13093,308167
2019-03-09,36061,607,144,164,1401,777,1055,53,121,394,389,259,131,278,127,90,279,23,7021,0,274,6216,0,1452,74366,65955,265,113,5066,0,411,0,275,1393,89,261,1630,11251,182330
2019-03-09,36081,391,34,202,816,483,654,19,66,375,128,139,76,171,82,89,328,20,3792,0,98,13857,0,14211,19246,0,256,51,110737,116045,408,0,173,2660,86,87,1093,7623,294496
2019-03-09,36085,35,1,5,271,319,396,22,64,1529,1009,91,270,58,163,39,557,8,445,0,18,5261,0,308,2946,0,52,1,951,0,33546,31471,41,150,24,22,96,2458,82627


In [11]:
#Run an iteration to unpack the bucketed time away from home

In [22]:
# Second pass over the same daily files: collect the bucketed
# time-away-from-home counts for each NYC origin.
frames = []
away_cols = ['date_y-m-d','orig_stco','away_from_home_time','away_count']
# indices 7..13 of dayList are calendar days 08..14
for y, i in [(yr, idx) for yr in years for idx in range(7,14)]:
    with fs.open(f'sg-c19-response/social-distancing/v2/{y}/{month}/{dayList[i]}/{y}-{month}-{dayList[i]}-social-distancing.csv.gz','rb') as f:
        print(f'{y}-{month}-{dayList[i]}')
        # read SG's file and keep only rows whose origin CBG is in the NYC list
        df = pd.merge(
            cbg_nyc,
            pd.read_csv(f, escapechar='\\', compression='gzip'),
            left_on="orig_cbg",
            right_on="origin_census_block_group",
            how="inner",
        )

        # unpack the bucketed_away_from_home_time JSON into bucket / count columns
        df = sgpy.unpack_json_and_merge(
            df,
            json_column='bucketed_away_from_home_time',
            key_col_name='away_from_home_time',
            value_col_name='away_count',
        )

        # calendar date is the first ten characters of the range start
        df['date_y-m-d'] = df['date_range_start'].str[:10]
        # keep only the columns needed for the pivot
        dff = df[away_cols]
        frames.append(dff)

2019-03-08
2019-03-09
2019-03-10
2019-03-11
2019-03-12
2019-03-13
2019-03-14
2020-03-08
2020-03-09
2020-03-10
2020-03-11
2020-03-12
2020-03-13
2020-03-14


In [23]:
# Stack the daily frames, then sum device counts into a (date, origin county) table
# with one column per time-away-from-home bucket.
df_away = pd.concat(frames)
df_away = pd.pivot_table(
    df_away,
    values=['away_count'],
    index=['date_y-m-d','orig_stco'],
    columns=['away_from_home_time'],
    aggfunc='sum',   # string alias; passing np.sum raises a FutureWarning in recent pandas
    fill_value=0,    # buckets with no devices show 0 instead of NaN
    margins=True,    # adds the 'All' total row and column
)

In [24]:
# Preview the first ten rows of the time-away-from-home pivot
df_away.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count
Unnamed: 0_level_1,away_from_home_time,1081-1200,1201-1320,121-180,1321-1440,181-240,21-45,241-300,301-360,361-420,421-480,46-60,481-540,541-600,601-660,61-120,661-720,721-840,841-960,961-1080,<20,>1440,All
date_y-m-d,orig_stco,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
2019-03-08,36005,877,482,4818,482,4217,4416,5197,4189,3633,3373,2068,3125,2596,1886,6110,1478,2043,1406,1078,46827,0,100301
2019-03-08,36047,1518,835,7883,775,7253,7164,8273,7143,6369,6225,3270,5631,4425,3490,9671,2543,3369,2268,1823,65445,0,155373
2019-03-08,36061,975,634,4595,489,3933,3883,4742,3987,3531,3506,1755,3149,2639,1979,5577,1601,2104,1445,1208,37590,0,89322
2019-03-08,36081,1202,800,7083,664,6045,6243,7293,6249,5953,5840,2734,5824,4585,3270,8676,2497,2893,1792,1441,58098,0,139182
2019-03-08,36085,370,214,1941,164,1687,1429,1911,1816,1958,1767,638,1738,1325,860,2212,611,812,492,411,13060,0,35416
2019-03-09,36005,1087,765,4669,879,4040,4094,4736,3346,2473,2143,1976,1990,1700,1370,6339,1160,1680,1403,1196,48666,9,95721
2019-03-09,36047,1949,1336,8077,1346,6862,6744,7594,5414,3963,3511,3363,3115,2727,2350,10653,1941,2990,2224,1985,70724,13,148881
2019-03-09,36061,1449,1080,4624,951,3970,3972,4427,3084,2291,2066,1829,1724,1542,1276,6111,1116,1828,1535,1503,39720,8,86106
2019-03-09,36081,1502,1164,7306,1154,6209,5985,6745,5041,3847,3496,2823,3187,2876,2190,9788,1706,2430,1832,1638,62737,13,133669
2019-03-09,36085,456,313,2196,336,1894,1522,1915,1494,1146,917,754,836,656,543,2855,442,577,453,466,14349,1,34121


In [25]:
# Run another iteration for just counts of devices
#'device_count','completely_home_device_count','part_time_work_behavior_devices','full_time_work_behavior_devices',

In [26]:
# Third pass over the same daily files: collect the plain device-count
# columns for each NYC origin (no JSON unpacking needed here).
frames = []
device_cols = [
    'date_y-m-d',
    'orig_stco',
    'device_count',
    'completely_home_device_count',
    'part_time_work_behavior_devices',
    'full_time_work_behavior_devices',
]
# indices 7..13 of dayList are calendar days 08..14
for y, i in [(yr, idx) for yr in years for idx in range(7,14)]:
    with fs.open(f'sg-c19-response/social-distancing/v2/{y}/{month}/{dayList[i]}/{y}-{month}-{dayList[i]}-social-distancing.csv.gz','rb') as f:
        print(f'{y}-{month}-{dayList[i]}')
        # read SG's file and keep only rows whose origin CBG is in the NYC list
        df = pd.merge(
            cbg_nyc,
            pd.read_csv(f, escapechar='\\', compression='gzip'),
            left_on="orig_cbg",
            right_on="origin_census_block_group",
            how="inner",
        )

        # calendar date is the first ten characters of the range start
        df['date_y-m-d'] = df['date_range_start'].str[:10]
        # keep only the columns needed for the aggregation
        dff = df[device_cols]
        frames.append(dff)

2019-03-08
2019-03-09
2019-03-10
2019-03-11
2019-03-12
2019-03-13
2019-03-14
2020-03-08
2020-03-09
2020-03-10
2020-03-11
2020-03-12
2020-03-13
2020-03-14


In [27]:
# Stack the daily frames and total each device-count column per (date, origin county).
# The string alias 'sum' replaces np.sum, which raises a FutureWarning in recent pandas.
df_device = pd.concat(frames)
df_device = df_device.groupby(['date_y-m-d','orig_stco']).agg({
    'device_count': 'sum',
    'completely_home_device_count': 'sum',
    'part_time_work_behavior_devices': 'sum',
    'full_time_work_behavior_devices': 'sum',
})

In [None]:
#Run a concat of all three tables to make master table with origin/day index

In [28]:
# Place the three summaries side by side, aligned on their shared
# (date, origin county) index, then turn that index back into ordinary columns.
# NOTE(review): the pivots were built with margins=True, so their 'All' row has
# no counterpart in df_device - confirm the resulting blank cells are acceptable.
df_master = pd.concat([df_device, df_dest, df_away], axis=1).reset_index()


In [29]:
# Create the output folder first so the export cannot fail on a missing
# directory at the very end of a long run.
os.makedirs('output', exist_ok=True)
df_master.to_excel(f'output/{month}-wk2_bigtable.xlsx')


In [30]:
# Report total wall-clock time since `start` was recorded in the credentials cell
end = time.time()
elapsed = end - start
print(f'Run time - {elapsed} seconds')

Run time - 4587.112530946732 seconds
