## Produce a daily origin-destination trip matrix for NYC resident census block groups (CBGs)
## Include time away from home buckets, and various device counts

##### Output: a master Excel table, by day and origin NYC borough, of trip destinations, time away from home, and device behavior

In [1]:
import pandas as pd
import numpy as np
import s3fs
import os
import time

In [2]:
from geo import stco,sub
from safegraph_py_functions import safegraph_py_functions as sgpy

In [3]:
%load_ext dotenv
%dotenv
myAccessKey = os.getenv('myAccessKey')
mySecretKey = os.getenv('mySecretKey')

start = time.time()

In [4]:
# Lift pandas' display truncation so wide/long previews are rendered in full
for opt_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(opt_name, None)

In [5]:
# read nyc origin cbgs (provides the orig_cbg / orig_stco columns used by later cells)
cbg_nyc = pd.read_csv('../data/nyc_cbg.csv')

# Geography crosswalk workbook. An empty path makes pd.read_excel fail and stops the
# notebook on a fresh run, so the read is skipped (nyc_geoxwalk is None) until a path
# is filled in here.
# NOTE(review): nyc_geoxwalk is not referenced by any other visible cell — confirm
# whether it is still needed, and supply the intended path if so.
geoxwalk_path = ''
nyc_geoxwalk = pd.read_excel(geoxwalk_path) if geoxwalk_path else None

In [6]:
# Filesystem handle for the SafeGraph data, using the SG key/secret loaded earlier
# and the Wasabi S3-compatible endpoint
wasabi_client_kwargs = {
    'endpoint_url': 'https://s3.wasabisys.com',
    'region_name': 'us-east-1',
}
fs = s3fs.S3FileSystem(
    profile="safegraphws",
    key=myAccessKey,
    secret=mySecretKey,
    client_kwargs=wasabi_client_kwargs,
)

## RUNNING FOR 1 MONTH ONLY, Y-o-Y COMPARISON

In [7]:
# Date settings: a single test month; add "2020" to `years` for the year-over-year comparison
month = "12"
years = ["2019"] #,"2020"
# zero-padded day-of-month labels "01".."31", as used in the daily file paths
dayList = [f"{day_num:02d}" for day_num in range(1, 32)]
# index window into dayList: range(d1, d2) walks all 31 entries
d1, d2 = 0, 31

# If running all days and months - replace range and adjust for loop to run through lists below:
#monthList =["01","02","03","04","05","06","07","08"]
#dayNumList =[31, 29, 31, 30, 31, 30, 31,31]

In [8]:
## Iterate and create pivot for home county to destination county (in region + outside)
# Only these columns of the SafeGraph daily file are used in this cell; limiting the
# read avoids parsing every other column for each day.
# NOTE(review): assumes sgpy.unpack_json_and_merge needs only the JSON column it is
# asked to unpack — confirm against the installed safegraph_py_functions version.
dest_usecols = ['origin_census_block_group', 'date_range_start', 'destination_cbgs']

frames = []
for y in years:
    for i in range(d1, d2):
        day = dayList[i]
        path = f'sg-c19-response/social-distancing/v2/{y}/{month}/{day}/{y}-{month}-{day}-social-distancing.csv.gz'
        with fs.open(path, 'rb') as f:
            print(f'{y}-{month}-{day}')
            # read SG's file; the frame is fully in memory once read_csv returns,
            # so the remote handle is released before the processing below
            df = pd.read_csv(f, escapechar='\\', compression='gzip', usecols=dest_usecols)

        # filter NYC's Origin CBGs (inner join keeps only origins listed in cbg_nyc)
        df = pd.merge(cbg_nyc, df, left_on="orig_cbg", right_on="origin_census_block_group", how="inner")

        # unpack json destination cbgs: one row per (origin cbg, destination cbg) pair
        df = sgpy.unpack_json_and_merge(df, json_column='destination_cbgs', key_col_name='destination_cbg', value_col_name='dest_cbg_count')

        ## Make new columns
        # origin cbg as text so it can be compared with the destination key
        df['orig_cbg'] = df['orig_cbg'].astype(str)
        df['date_y-m-d'] = df['date_range_start'].str[:10]
        # separate home trips from other trips for later aggregation
        # (vectorized comparison instead of a per-row Python lambda)
        df['is_home'] = df['orig_cbg'] == df['destination_cbg']
        # id destinations by county fips (first 5 characters of the cbg id)
        df['dest_stco_all'] = df['destination_cbg'].str[:5]
        # id 31cr counties; every destination county not in `stco` is labelled 'O31CR'
        df['dest_stco_reg'] = df['dest_stco_all'].where(df['dest_stco_all'].isin(stco), 'O31CR')
        # map counties through `sub`; unmapped counties are labelled 'O31CR'
        df['dest_sub'] = df['dest_stco_all'].map(sub).fillna('O31CR')

        # Make new table with select columns
        dff = df[['date_y-m-d', 'orig_stco', 'dest_cbg_count', 'dest_stco_reg', 'dest_sub', 'is_home']]
        frames.append(dff)

2019-12-01
2019-12-02
2019-12-03
2019-12-04
2019-12-05
2019-12-06
2019-12-07
2019-12-08
2019-12-09
2019-12-10
2019-12-11
2019-12-12
2019-12-13
2019-12-14
2019-12-15
2019-12-16
2019-12-17
2019-12-18
2019-12-19
2019-12-20
2019-12-21
2019-12-22
2019-12-23
2019-12-24
2019-12-25
2019-12-26
2019-12-27
2019-12-28
2019-12-29
2019-12-30
2019-12-31


In [9]:
# Stack the daily frames, then total the trip counts with one row per
# (day, origin county) and one column per (destination county, is_home) pair.
# margins=True appends an 'All' total row and column; empty cells become 0.
df_dest = pd.concat(frames)
df_dest = pd.pivot_table(
    df_dest,
    values=['dest_cbg_count'],
    index=['date_y-m-d', 'orig_stco'],
    columns=['dest_stco_reg', 'is_home'],
    aggfunc='sum',  # string alias; passing np.sum raises a FutureWarning in pandas >= 2.1
    fill_value=0,
    margins=True,
)

In [10]:
# preview the first 10 (day, origin county) rows of the destination pivot
df_dest.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count
Unnamed: 0_level_1,dest_stco_reg,09001,09005,09009,34003,34013,34017,34019,34021,34023,34025,34027,34029,34031,34035,34037,34039,34041,36005,36005,36027,36047,36047,36059,36061,36061,36071,36079,36081,36081,36085,36085,36087,36103,36105,36111,36119,O31CR,All
Unnamed: 0_level_2,is_home,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,False,True,False,True,False,False,False,False,False,False,Unnamed: 39_level_2
date_y-m-d,orig_stco,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3,Unnamed: 23_level_3,Unnamed: 24_level_3,Unnamed: 25_level_3,Unnamed: 26_level_3,Unnamed: 27_level_3,Unnamed: 28_level_3,Unnamed: 29_level_3,Unnamed: 30_level_3,Unnamed: 31_level_3,Unnamed: 32_level_3,Unnamed: 33_level_3,Unnamed: 34_level_3,Unnamed: 35_level_3,Unnamed: 36_level_3,Unnamed: 37_level_3,Unnamed: 38_level_3,Unnamed: 39_level_3
2019-12-01,36005,340,18,161,524,301,290,21,41,199,59,90,28,155,52,12,146,23,38763,57027,105,1803,0,451,9906,0,197,38,2416,0,151,0,181,233,25,33,3530,4314,121633
2019-12-01,36047,304,49,193,475,685,503,45,138,566,558,202,169,176,62,36,357,57,1845,0,117,83845,103050,1834,12523,0,222,34,8533,0,1382,0,164,1018,109,111,495,12827,232684
2019-12-01,36061,545,125,216,900,766,669,49,177,308,368,214,105,139,133,43,286,21,3785,0,213,3214,0,1252,45053,53673,188,77,4206,0,323,0,216,1729,87,167,1238,14575,135060
2019-12-01,36081,346,29,218,580,455,371,17,85,272,174,112,79,133,68,45,235,34,1742,0,97,8662,0,9362,11931,0,257,43,81532,103299,347,0,167,1718,131,102,632,9305,232580
2019-12-01,36085,45,4,41,248,240,242,15,36,847,430,57,111,70,70,31,341,18,158,0,26,3200,0,143,1749,0,47,0,576,0,25359,31381,23,112,49,25,66,2915,68675
2019-12-02,36005,263,13,67,400,192,324,6,24,127,44,74,25,89,13,7,111,8,52905,57741,50,2572,0,345,18847,0,89,29,3087,0,104,0,136,202,12,16,3842,2426,144190
2019-12-02,36047,171,12,78,314,593,649,25,81,543,156,113,75,152,49,8,304,15,2210,0,36,114573,104025,1858,27687,0,72,17,10934,0,1323,0,103,720,19,35,441,7781,275172
2019-12-02,36061,222,13,92,504,534,602,6,52,166,79,81,31,107,39,12,155,8,4759,0,57,3683,0,488,66238,55753,76,20,3937,0,191,0,101,337,15,44,794,7674,146870
2019-12-02,36081,242,4,88,380,415,463,7,41,204,60,63,26,96,40,20,162,10,2866,0,37,13721,0,10538,26184,0,65,36,105386,103957,279,0,63,1476,26,31,733,5354,273073
2019-12-02,36085,33,0,46,178,241,431,10,35,591,216,48,34,44,63,2,341,2,282,0,10,5707,0,130,4433,0,24,0,779,0,32029,31235,19,59,10,8,62,1318,78420


In [11]:
#Run an iteration to unpack the bucketed time away from home

In [12]:
# Collect, for every configured day, the bucketed time-away-from-home counts of NYC origin CBGs.
# Only these columns of the SafeGraph daily file are used in this cell; limiting the
# read avoids parsing every other column for each day.
# NOTE(review): assumes sgpy.unpack_json_and_merge needs only the JSON column it is
# asked to unpack — confirm against the installed safegraph_py_functions version.
away_usecols = ['origin_census_block_group', 'date_range_start', 'bucketed_away_from_home_time']

frames = []
for y in years:
    for i in range(d1, d2):
        with fs.open(f'sg-c19-response/social-distancing/v2/{y}/{month}/{dayList[i]}/{y}-{month}-{dayList[i]}-social-distancing.csv.gz','rb') as f:
            print(f'{y}-{month}-{dayList[i]}')
            # read SG's file
            df = pd.read_csv(f, escapechar='\\', compression='gzip', usecols=away_usecols)
            # filter NYC's Origin CBGs (inner join keeps only origins listed in cbg_nyc)
            df = pd.merge(cbg_nyc, df, left_on="orig_cbg", right_on="origin_census_block_group", how="inner")

            # unpack json bucketed time away from home: one row per (origin cbg, time bucket)
            df = sgpy.unpack_json_and_merge(df, json_column='bucketed_away_from_home_time', key_col_name='away_from_home_time', value_col_name='away_count')

            ## Make new columns
            df['date_y-m-d'] = df['date_range_start'].str[:10]
            # Make new table with select columns
            dff = df[['date_y-m-d', 'orig_stco', 'away_from_home_time', 'away_count']]
            frames.append(dff)

2019-12-01
2019-12-02
2019-12-03
2019-12-04
2019-12-05
2019-12-06
2019-12-07
2019-12-08
2019-12-09
2019-12-10
2019-12-11
2019-12-12
2019-12-13
2019-12-14
2019-12-15
2019-12-16
2019-12-17
2019-12-18
2019-12-19
2019-12-20
2019-12-21
2019-12-22
2019-12-23
2019-12-24
2019-12-25
2019-12-26
2019-12-27
2019-12-28
2019-12-29
2019-12-30
2019-12-31


In [13]:
# Stack the daily frames, then total the away counts with one row per
# (day, origin county) and one column per time-away bucket.
# margins=True appends an 'All' total row and column; empty cells become 0.
df_away = pd.concat(frames)
df_away = pd.pivot_table(
    df_away,
    values=['away_count'],
    index=['date_y-m-d', 'orig_stco'],
    columns=['away_from_home_time'],
    aggfunc='sum',  # string alias; passing np.sum raises a FutureWarning in pandas >= 2.1
    fill_value=0,
    margins=True,
)

In [14]:
# preview the first 10 (day, origin county) rows of the time-away pivot
df_away.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count,away_count
Unnamed: 0_level_1,away_from_home_time,1081-1200,1201-1320,121-180,1321-1440,181-240,21-45,241-300,301-360,361-420,421-480,46-60,481-540,541-600,601-660,61-120,661-720,721-840,841-960,961-1080,<20,All
date_y-m-d,orig_stco,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
2019-12-01,36005,866,504,2916,837,2174,3087,2047,1498,1326,1268,1398,1212,1044,816,3795,699,1109,1023,929,37695,66243
2019-12-01,36047,1646,1070,5800,1484,4443,5968,4022,2832,2611,2368,2649,2385,1943,1748,8005,1484,2329,1889,1808,62290,118774
2019-12-01,36061,1404,1008,3293,1095,2486,3615,2212,1519,1440,1260,1640,1216,1299,1316,4666,1236,1973,1674,1565,32993,68910
2019-12-01,36081,1444,919,5935,1268,4502,5911,4201,2844,2551,2484,2660,2353,2000,1747,8369,1459,2163,1675,1454,58294,114233
2019-12-01,36085,473,294,1937,352,1475,2075,1378,831,626,610,820,572,488,425,2827,411,608,497,452,15755,32906
2019-12-02,36005,615,468,2792,563,2502,3199,2649,2433,3014,2773,1412,2675,1867,1219,3793,801,1106,906,639,31627,67053
2019-12-02,36047,1165,892,5535,967,4726,5662,5023,5200,6415,6146,2435,5834,3966,2523,7245,1708,2128,1594,1228,49524,119916
2019-12-02,36061,910,687,3223,660,2855,3320,2957,2695,3072,3159,1410,2915,2208,1570,4236,1133,1466,1086,1033,29027,69622
2019-12-02,36081,1023,830,5374,936,4683,5458,4960,4932,6513,6233,2447,5974,4167,2717,6894,1719,2050,1407,1160,45904,115381
2019-12-02,36085,318,260,1548,287,1451,1562,1590,1724,2227,2030,667,1670,1086,701,2153,448,548,412,294,11869,32845


In [15]:
# Run another iteration for just counts of devices
#'device_count','completely_home_device_count','part_time_work_behavior_devices','full_time_work_behavior_devices',

In [16]:
# Collect, for every configured day, the per-CBG device counts of NYC origin CBGs.
# Only these columns of the SafeGraph daily file are used in this cell; limiting the
# read avoids parsing every other column for each day.
device_usecols = [
    'origin_census_block_group',
    'date_range_start',
    'device_count',
    'completely_home_device_count',
    'part_time_work_behavior_devices',
    'full_time_work_behavior_devices',
]

frames = []
for y in years:
    for i in range(d1, d2):
        with fs.open(f'sg-c19-response/social-distancing/v2/{y}/{month}/{dayList[i]}/{y}-{month}-{dayList[i]}-social-distancing.csv.gz','rb') as f:
            print(f'{y}-{month}-{dayList[i]}')
            # read SG's file
            df = pd.read_csv(f, escapechar='\\', compression='gzip', usecols=device_usecols)
            # filter NYC's Origin CBGs (inner join keeps only origins listed in cbg_nyc)
            df = pd.merge(cbg_nyc, df, left_on="orig_cbg", right_on="origin_census_block_group", how="inner")

            ## Make new columns
            df['date_y-m-d'] = df['date_range_start'].str[:10]
            # Make new table with select columns
            dff = df[['date_y-m-d', 'orig_stco', 'device_count', 'completely_home_device_count', 'part_time_work_behavior_devices', 'full_time_work_behavior_devices']]
            frames.append(dff)

2019-12-01
2019-12-02
2019-12-03
2019-12-04
2019-12-05
2019-12-06
2019-12-07
2019-12-08
2019-12-09
2019-12-10
2019-12-11
2019-12-12
2019-12-13
2019-12-14
2019-12-15
2019-12-16
2019-12-17
2019-12-18
2019-12-19
2019-12-20
2019-12-21
2019-12-22
2019-12-23
2019-12-24
2019-12-25
2019-12-26
2019-12-27
2019-12-28
2019-12-29
2019-12-30
2019-12-31


In [17]:
# Stack the daily frames and total each device-count column per (day, origin county).
# Uses the groupby .sum() method directly; passing np.sum callables to .agg raises a
# FutureWarning in pandas >= 2.1.
device_cols = [
    'device_count',
    'completely_home_device_count',
    'part_time_work_behavior_devices',
    'full_time_work_behavior_devices',
]
df_device = pd.concat(frames)
df_device = df_device.groupby(['date_y-m-d', 'orig_stco'])[device_cols].sum()

In [18]:
#Run a concat of all three tables to make master table with origin/day index

In [19]:
# Join the three per-(day, origin county) tables side by side on their shared index,
# then turn that index back into ordinary columns.
# NOTE(review): both pivots were built with margins=True, so they carry an 'All' row
# that df_device lacks — expect missing device counts on that row; confirm it is wanted.
master_parts = [df_device, df_dest, df_away]
df_master = pd.concat(master_parts, axis=1).reset_index()

In [20]:
# Write the master table to Excel. The year part of the file name follows `years`
# (joined with '-') so it stays accurate when more years are added; with the single
# year "2019" the name is unchanged: output/dara/{month}-2019-_bigtable.xlsx
year_label = '-'.join(years)
out_path = f'output/dara/{month}-{year_label}-_bigtable.xlsx'
# create the output folder if it is missing so a long run does not fail at the last step
os.makedirs(os.path.dirname(out_path), exist_ok=True)
df_master.to_excel(out_path)


In [21]:
# Report total wall-clock time since `start` was captured in the setup cell
end = time.time()
elapsed = end - start
print('Run time - {} seconds'.format(elapsed))

Run time - 4452.025114059448 seconds
