## Produce a daily Origin-Destination trip matrix for NYC resident census block groups (CBGs)
## Include time-away-from-home buckets and various device counts

##### Output is a master Excel table, by origin NYC borough and day, of destinations, time away from home, and device behavior

In [1]:
import pandas as pd
import numpy as np
import s3fs
import os
import time

In [2]:
from geo import stco,sub
from safegraph_py_functions import safegraph_py_functions as sgpy

In [3]:
# Load the dotenv IPython extension and run its %dotenv magic so the S3
# credentials are taken from the environment instead of being hard-coded here.
# NOTE(review): presumably %dotenv reads a local .env file — confirm against python-dotenv docs.
%load_ext dotenv
%dotenv
# os.getenv returns None when the variable is not set.
myAccessKey = os.getenv('myAccessKey')
mySecretKey = os.getenv('mySecretKey')

# Wall-clock start; the final cell subtracts this to report total run time.
start = time.time()

In [4]:
# Lift pandas' display limits so frames are shown without column/row truncation.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [5]:
# Load the table of NYC origin census block groups (CBGs).
# Later cells inner-join on its `orig_cbg` column and aggregate by `orig_stco`.
nyc_cbg_path = '../data/nyc_cbg.csv'
cbg_nyc = pd.read_csv(nyc_cbg_path)

In [6]:
# Build the S3 filesystem handle used to open the SafeGraph (SG) files.
# Key and secret are the values read from the environment earlier in the notebook.
wasabi_client_options = {
    'endpoint_url': 'https://s3.wasabisys.com',
    'region_name': 'us-east-1',
}
fs = s3fs.S3FileSystem(
    profile="safegraphws",
    key=myAccessKey,
    secret=mySecretKey,
    client_kwargs=wasabi_client_options,
)

## Running for one month only (list more than one year in `years` for a year-over-year comparison)

In [7]:
# Date settings for this run: a single test month, processed for every year listed in `years`
month = "09"
years = ["2020"]
# Zero-padded day-of-month strings "01" .. "31"
dayList = [f"{day_num:02d}" for day_num in range(1, 32)]
# Index bounds into dayList; the loops use range(d1, d2), i.e. days "01" .. "30"
d1, d2 = 0, 30

# To run all days and months, replace the range and adjust the for loops to run through lists like these:
#monthList =["01","02","03","04","05","06","07","08"]
#dayNumList =[31, 29, 31, 30, 31, 30, 31,31]

In [8]:
## Iterate over the daily files and collect origin -> destination trip counts
## (destination county when it is listed in `stco`, 'O31CR' for everything else)
frames = []
for y in years:
    for i in range(d1, d2):
        day = dayList[i]
        with fs.open(f'sg-c19-response/social-distancing/v2/{y}/{month}/{day}/{y}-{month}-{day}-social-distancing.csv.gz', 'rb') as f:
            print(f'{y}-{month}-{day}')
            # read SG's file
            df = pd.read_csv(f, escapechar='\\', compression='gzip')

        # filter NYC's Origin CBGs: the inner join keeps only origins listed in cbg_nyc
        df = pd.merge(cbg_nyc, df, left_on="orig_cbg", right_on="origin_census_block_group", how="inner")

        # unpack json destination cbgs into destination_cbg / dest_cbg_count columns
        # NOTE(review): presumably one row per origin-destination pair — confirm against safegraph_py_functions
        df = sgpy.unpack_json_and_merge(df, json_column='destination_cbgs', key_col_name='destination_cbg', value_col_name='dest_cbg_count')

        ## Make new columns
        # origin cbg as text so it can be compared with destination_cbg
        df['orig_cbg'] = df['orig_cbg'].astype(str)
        df['date_y-m-d'] = df['date_range_start'].str[:10]
        # separate home trips from other trips for later aggregation.
        # Vectorised comparison: same values as a row-wise apply, without a
        # Python call per row, and it also works when the frame is empty.
        df['is_home'] = df['orig_cbg'] == df['destination_cbg']
        # id destinations by county fips (first five characters of the cbg id)
        df['dest_stco_all'] = df['destination_cbg'].str[:5]
        # id 31cr counties (those found in stco) and label all others 'O31CR'
        df['dest_stco_reg'] = df['dest_stco_all'].where(df['dest_stco_all'].isin(stco), 'O31CR')
        # same idea through the `sub` lookup; counties it does not map become 'O31CR'
        df['dest_sub'] = df['dest_stco_all'].map(sub).fillna('O31CR')

        # Make new table with select columns
        dff = df[['date_y-m-d','orig_stco','dest_cbg_count','dest_stco_reg','dest_sub','is_home']]
        frames.append(dff)

2020-09-01
2020-09-02
2020-09-03
2020-09-04
2020-09-05
2020-09-06
2020-09-07
2020-09-08
2020-09-09
2020-09-10
2020-09-11
2020-09-12
2020-09-13
2020-09-14
2020-09-15
2020-09-16
2020-09-17
2020-09-18
2020-09-19
2020-09-20
2020-09-21
2020-09-22
2020-09-23
2020-09-24
2020-09-25
2020-09-26
2020-09-27
2020-09-28
2020-09-29
2020-09-30


In [9]:
# Stack the per-day destination frames and pivot them into one row per
# (date, origin county) with one column per (destination county, is_home) pair.
# NOTE: `frames` is reassigned by the later loops, so this cell must run
# right after the destination loop.
df_dest = pd.concat(frames)
df_dest = pd.pivot_table(
    df_dest,
    values=['dest_cbg_count'],
    index=['date_y-m-d', 'orig_stco'],
    columns=['dest_stco_reg', 'is_home'],
    aggfunc='sum',  # string alias; recent pandas deprecates passing np.sum here
    fill_value=0,
    margins=True,  # adds the 'All' totals
)

In [10]:
# Preview the first ten rows of the destination pivot
df_dest.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count
Unnamed: 0_level_1,dest_stco_reg,09001,09005,09009,34003,34013,34017,34019,34021,34023,34025,34027,34029,34031,34035,34037,34039,34041,36005,36005,36027,36047,36047,36059,36061,36061,36071,36079,36081,36081,36085,36085,36087,36103,36105,36111,36119,O31CR,All
Unnamed: 0_level_2,is_home,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,False,True,False,True,False,False,False,False,False,False,Unnamed: 39_level_2
date_y-m-d,orig_stco,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3,Unnamed: 23_level_3,Unnamed: 24_level_3,Unnamed: 25_level_3,Unnamed: 26_level_3,Unnamed: 27_level_3,Unnamed: 28_level_3,Unnamed: 29_level_3,Unnamed: 30_level_3,Unnamed: 31_level_3,Unnamed: 32_level_3,Unnamed: 33_level_3,Unnamed: 34_level_3,Unnamed: 35_level_3,Unnamed: 36_level_3,Unnamed: 37_level_3,Unnamed: 38_level_3,Unnamed: 39_level_3
2020-09-01,36005,265,10,96,358,232,225,10,18,78,52,43,58,99,12,12,74,7,38975,38978,88,1920,0,293,10087,0,105,50,1796,0,144,0,175,203,24,41,3310,2812,100650
2020-09-01,36047,140,14,88,294,413,443,21,78,401,318,78,181,107,76,32,250,18,2327,0,55,78556,71205,1550,10868,0,139,19,7064,0,1081,0,100,870,94,99,324,7580,184883
2020-09-01,36061,155,42,144,516,290,400,8,27,147,159,99,136,138,37,14,122,6,4162,0,95,2695,0,519,30596,32806,110,33,1977,0,198,0,119,957,26,66,626,4972,82397
2020-09-01,36081,201,19,116,399,225,307,10,29,154,109,71,114,103,34,40,138,22,2487,0,79,8732,0,9651,10976,0,167,33,77505,78157,291,0,132,2006,53,106,702,5641,198809
2020-09-01,36085,50,4,46,165,225,338,39,47,931,526,63,460,59,102,47,335,9,344,0,16,4249,0,177,2133,0,66,5,679,0,28908,25482,35,158,35,26,63,2542,68364
2020-09-02,36005,276,6,94,381,203,209,5,20,63,43,31,46,98,14,9,94,4,36960,38653,57,2053,0,350,9481,0,148,36,1754,0,133,0,159,197,18,37,3533,2613,97778
2020-09-02,36047,142,20,108,292,403,395,25,68,404,264,87,155,98,54,26,216,15,2273,0,53,75906,70289,1648,10534,0,134,21,6873,0,1204,0,112,742,83,93,282,7576,180595
2020-09-02,36061,158,41,122,507,306,335,6,31,127,165,110,104,128,37,25,129,11,4064,0,85,2639,0,523,28952,32838,110,34,1972,0,181,0,123,904,40,49,651,4753,80260
2020-09-02,36081,241,18,111,411,201,330,14,44,189,101,66,66,74,30,20,115,18,2550,0,65,8635,0,9965,10692,0,165,30,72525,76943,290,0,109,1908,59,103,644,5553,192285
2020-09-02,36085,50,4,34,142,249,386,42,74,946,674,56,499,60,112,28,359,11,317,0,10,4338,0,165,2105,0,63,5,589,0,27143,25450,25,135,38,32,53,2443,66637


In [11]:
#Run an iteration to unpack the bucketed time away from home

In [12]:
# Second pass over the same daily files: unpack the
# bucketed_away_from_home_time JSON column and keep one small frame per day.
frames = []
for y in years:
    for day_idx in range(d1, d2):
        day = dayList[day_idx]
        daily_key = f'sg-c19-response/social-distancing/v2/{y}/{month}/{day}/{y}-{month}-{day}-social-distancing.csv.gz'
        with fs.open(daily_key, 'rb') as f:
            print(f'{y}-{month}-{day}')
            # read SG's file
            sg_day = pd.read_csv(f, escapechar='\\', compression='gzip')
            # keep only origins listed in cbg_nyc
            nyc_day = pd.merge(
                cbg_nyc,
                sg_day,
                left_on="orig_cbg",
                right_on="origin_census_block_group",
                how="inner",
            )

            # unpack the time-away buckets into away_from_home_time / away_count columns
            away_long = sgpy.unpack_json_and_merge(
                nyc_day,
                json_column='bucketed_away_from_home_time',
                key_col_name='away_from_home_time',
                value_col_name='away_count',
            )

            # calendar date = first ten characters of date_range_start
            away_long['date_y-m-d'] = away_long['date_range_start'].str[:10]
            # keep only the columns needed for the pivot
            frames.append(away_long[['date_y-m-d', 'orig_stco', 'away_from_home_time', 'away_count']])

2020-09-01
2020-09-02
2020-09-03
2020-09-04
2020-09-05
2020-09-06
2020-09-07
2020-09-08
2020-09-09
2020-09-10
2020-09-11
2020-09-12
2020-09-13
2020-09-14
2020-09-15
2020-09-16
2020-09-17
2020-09-18
2020-09-19
2020-09-20
2020-09-21
2020-09-22
2020-09-23
2020-09-24
2020-09-25
2020-09-26
2020-09-27
2020-09-28
2020-09-29
2020-09-30


In [13]:
# Stack the per-day away-time frames and pivot them into one row per
# (date, origin county) with one column per time-away bucket.
# NOTE: `frames` must still hold the output of the away-time loop.
# NOTE: bucket labels are strings, so the pivot orders them as text
# (e.g. '1081-1200' comes before '121-180').
df_away = pd.concat(frames)
df_away = pd.pivot_table(
    df_away,
    values=['away_count'],
    index=['date_y-m-d', 'orig_stco'],
    columns=['away_from_home_time'],
    aggfunc='sum',  # string alias; recent pandas deprecates passing np.sum here
    fill_value=0,
    margins=True,  # adds the 'All' totals
)

In [14]:
# Preview the first ten rows of the time-away pivot
df_away.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count
Unnamed: 0_level_1,away_from_home_time,1081-1200,1201-1320,121-180,1321-1440,181-240,21-45,241-300,301-360,361-420,421-480,46-60,481-540,541-600,601-660,61-120,661-720,721-840,841-960,961-1080,<20,All
date_y-m-d,orig_stco,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
2020-09-01,36005,930,650,2821,983,2660,3032,1697,1558,1538,1674,1384,1833,1335,934,3727,824,1050,736,738,23053,53157
2020-09-01,36047,1562,1361,5186,1801,4505,5467,3103,2728,2692,2960,2528,3227,2585,1807,6871,1380,2093,1403,1240,40593,95092
2020-09-01,36061,999,794,2867,976,2647,2994,1476,1214,1293,1267,1285,1271,1115,802,3791,761,1122,826,771,22934,51205
2020-09-01,36081,1615,1297,5467,1633,4665,5880,3264,2885,2855,3155,2677,3450,2855,2013,7699,1541,1998,1222,1216,42525,99912
2020-09-01,36085,640,492,2031,510,1751,2043,1168,959,960,1042,830,1095,873,603,2759,436,617,392,447,11696,31344
2020-09-02,36005,941,636,2468,933,2207,2808,1649,1386,1438,1564,1259,1634,1297,871,3413,755,1040,771,752,22650,50472
2020-09-02,36047,1756,1110,4389,1698,3834,4885,2802,2517,2610,2890,2273,3165,2684,1649,6221,1328,1810,1322,1263,39808,90014
2020-09-02,36061,1103,798,2374,961,2101,2616,1233,1168,1167,1350,1184,1373,1345,762,3365,656,990,841,827,22727,48941
2020-09-02,36081,1589,1077,4446,1608,3794,5163,2860,2639,2860,3233,2292,3455,3156,1828,6427,1306,1574,1225,1224,41666,93422
2020-09-02,36085,598,487,1693,514,1382,1851,1044,933,967,1098,780,1145,993,516,2531,417,510,412,450,11500,29821


In [15]:
# Run another iteration for just counts of devices
#'device_count','completely_home_device_count','part_time_work_behavior_devices','full_time_work_behavior_devices',

In [16]:
# Third pass over the daily files: keep only the plain device-count columns.
device_count_cols = [
    'device_count',
    'completely_home_device_count',
    'part_time_work_behavior_devices',
    'full_time_work_behavior_devices',
]
# Only these SG columns are used below, so the large JSON columns are not parsed.
sg_needed_cols = ['origin_census_block_group', 'date_range_start'] + device_count_cols

frames = []
for y in years:
    for i in range(d1, d2):
        with fs.open(f'sg-c19-response/social-distancing/v2/{y}/{month}/{dayList[i]}/{y}-{month}-{dayList[i]}-social-distancing.csv.gz', 'rb') as f:
            print(f'{y}-{month}-{dayList[i]}')
            # read SG's file, restricted to the columns this cell uses
            df = pd.read_csv(f, escapechar='\\', compression='gzip', usecols=sg_needed_cols)

        # filter NYC's Origin CBGs: the inner join keeps only origins listed in cbg_nyc
        df = pd.merge(cbg_nyc, df, left_on="orig_cbg", right_on="origin_census_block_group", how="inner")

        ## Make new columns
        df['date_y-m-d'] = df['date_range_start'].str[:10]
        # Make new table with select columns
        dff = df[['date_y-m-d', 'orig_stco'] + device_count_cols]
        frames.append(dff)

2020-09-01
2020-09-02
2020-09-03
2020-09-04
2020-09-05
2020-09-06
2020-09-07
2020-09-08
2020-09-09
2020-09-10
2020-09-11
2020-09-12
2020-09-13
2020-09-14
2020-09-15
2020-09-16
2020-09-17
2020-09-18
2020-09-19
2020-09-20
2020-09-21
2020-09-22
2020-09-23
2020-09-24
2020-09-25
2020-09-26
2020-09-27
2020-09-28
2020-09-29
2020-09-30


In [17]:
# Stack the per-day device frames and total each device count per
# (date, origin county).
# NOTE: `frames` must still hold the output of the device-count loop.
df_device = pd.concat(frames)
df_device = df_device.groupby(['date_y-m-d', 'orig_stco']).agg({
    # string alias 'sum'; recent pandas deprecates passing np.sum to agg
    'device_count': 'sum',
    'completely_home_device_count': 'sum',
    'part_time_work_behavior_devices': 'sum',
    'full_time_work_behavior_devices': 'sum',
})

In [18]:
#Run a concat of all three tables to make master table with origin/day index

In [19]:
# Place the device, destination and time-away tables side by side, aligned on
# their shared (date, origin county) index, then turn that index into columns.
# NOTE(review): the two pivots were built with margins=True while df_device was
# not, so any 'All' totals row presumably has empty device columns — verify.
master_parts = [df_device, df_dest, df_away]
df_master = pd.concat(master_parts, axis=1).reset_index()

In [20]:
# Write the master table to Excel. The target folder is created first so a
# fresh checkout does not fail on a missing directory.
# NOTE: the year in the file name is fixed at 2020 regardless of `years`.
out_path = f'output/Outflows/{month}-2020-_bigtable.xlsx'
os.makedirs(os.path.dirname(out_path), exist_ok=True)
df_master.to_excel(out_path)


In [21]:
# Report the notebook's total wall-clock run time, measured from `start`
end = time.time()
elapsed = end - start
print(f'Run time - {elapsed} seconds')

Run time - 2718.268038749695 seconds
