## Produce daily Origin-Destination trip matrix for NYC resident boroughs to NYC metro region municipalities

##### Output is daily csv o-d matrices, O = NYC total or NYC borough and D = NYC metro region municipality (county subdivision - census designated place geography)

In [1]:
import pandas as pd
import numpy as np
import s3fs
import os
import time

In [2]:
from geo import stco,sub
from safegraph_py_functions import safegraph_py_functions as sgpy

In [3]:
%load_ext dotenv
%dotenv
myAccessKey = os.getenv('myAccessKey')
mySecretKey = os.getenv('mySecretKey')

start = time.time()

In [4]:
# Never truncate displayed DataFrames: show every column and every row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [5]:
# Load the two lookup tables:
#   cbg_nyc    - the NYC census-block-group list (CSV)
#   orig_subpl - the census-block-group -> sub-place crosswalk (Excel)
cbg_nyc = pd.read_csv(filepath_or_buffer='../data/nyc_cbg.csv')
orig_subpl = pd.read_excel(io='../data/31cr_cbg-subpl_xwalk.xlsx')

In [7]:
# Relabel every crosswalk column by swapping the substring "dest" for "orig".
# In this notebook the crosswalk is joined on the trip origin, so its columns
# are named accordingly.
orig_subpl.rename(columns=lambda name: name.replace('dest', 'orig'), inplace=True)

Unnamed: 0,bgrp_str,orig_cbg,orig_subpl,orig_subpl_lbl
0,90010100000.0,90010101011,900133620,"Greenwich town (Fairfield, CT)"
1,90010100000.0,90010101012,900133620,"Greenwich town (Fairfield, CT)"
2,90010100000.0,90010101013,900133620,"Greenwich town (Fairfield, CT)"
3,90010100000.0,90010101014,900133620,"Greenwich town (Fairfield, CT)"
4,90010100000.0,90010101021,900133620,"Greenwich town (Fairfield, CT)"


# Normalise the crosswalk join key.
#
# The crosswalk columns have already been renamed from "dest*" to "orig*", so
# the key column is `orig_cbg`. The previous version of this cell still read
# `orig_subpl['dest_cbg']`, which no longer exists and raised a KeyError (and
# called `.str.zfill` on a column that had not been cast to string).
#
# The key is deliberately kept as an integer instead of a zero-padded string:
# the daily files are loaded further down with pd.read_csv and no dtype
# override, and pandas refuses to merge a string key against a numeric key.
# NOTE(review): assumes `origin_census_block_group` is parsed as an integer
# column by those reads - confirm before switching either side to strings.
orig_subpl['orig_cbg'] = orig_subpl['orig_cbg'].astype('int64')

In [8]:
# Open a handle on the SafeGraph bucket through its S3-compatible endpoint,
# authenticating with the key/secret pair read from the environment.
fs = s3fs.S3FileSystem(
    profile="safegraphws",
    key=myAccessKey,
    secret=mySecretKey,
    client_kwargs=dict(
        endpoint_url='https://s3.wasabisys.com',
        region_name='us-east-1',
    ),
)

In [9]:
# Date variables: `y0` is the year used for the test-month comparison cells and
# `y` is the year used for the 2020 cells.
y0 = "2019"
y = "2020"

# Zero-padded month ("01".."12") and day-of-month ("01".."31") labels used to
# build the per-day object keys.
monthList = [f"{month:02d}" for month in range(1, 13)]
dayList = [f"{day:02d}" for day in range(1, 32)]

# Number of days in each month of a non-leap year, January first.
dayNumList = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]

In [10]:
# State+county FIPS codes of the five NYC boroughs.
NYC = [
    '36005',  # Bronx
    '36047',  # Kings (Brooklyn)
    '36061',  # New York (Manhattan)
    '36081',  # Queens
    '36085',  # Richmond (Staten Island)
]

In [11]:
## Test month - individual days for year-over-year comparison

# One small frame is collected per day and the list is concatenated afterwards.
frames = []
for m in range(1):  # first month only
    for d in range(2):  # first two days only; range(dayNumList[m]) covers the whole month
        day_path = f'sg-c19-response/social-distancing/v2/{y0}/{monthList[m]}/{dayList[d]}/{y0}-{monthList[m]}-{dayList[d]}-social-distancing.csv.gz'
        with fs.open(day_path, 'rb') as f:
            print(f'{y0}-{monthList[m]}-{dayList[d]}')
            # read the daily SafeGraph file
            df = pd.read_csv(f, escapechar='\\', compression='gzip')

        # Inner join against the crosswalk: keeps only rows whose origin CBG is
        # listed in orig_subpl and attaches the crosswalk columns to them.
        df = orig_subpl.merge(df, left_on="orig_cbg", right_on="origin_census_block_group", how="inner")

        # Expand the JSON in `destination_cbgs` into `destination_cbg` /
        # `dest_cbg_count` columns (presumably one row per destination CBG -
        # see safegraph_py_functions.unpack_json_and_merge).
        df = sgpy.unpack_json_and_merge(
            df,
            json_column='destination_cbgs',
            key_col_name='destination_cbg',
            value_col_name='dest_cbg_count',
        )

        # Calendar date = first 10 characters of the range-start timestamp.
        df['date_y-m-d'] = df['date_range_start'].str.slice(stop=10)

        # Destination state+county = first 5 characters of the destination CBG;
        # keep only trips that end in one of the NYC counties.
        df['dest_stco'] = df['destination_cbg'].str.slice(stop=5)
        df = df.loc[df['dest_stco'].isin(NYC)]

        # Retain just the columns needed for the pivot tables.
        dff = df[['date_y-m-d', 'orig_subpl', 'dest_stco', 'dest_cbg_count']]
        frames.append(dff)
            

2019-01-01
2019-01-02


In [12]:
# Stack the per-day frames row-wise into one long table.
df_dest = pd.concat(frames, axis=0)

In [13]:
# Daily matrix of summed dest_cbg_count: one row per (date, origin sub-place),
# one column per destination NYC county, zero where there is no pair, plus the
# "All" margin row/column.
# The aggregation is named by the string 'sum' instead of passing np.sum:
# recent pandas versions emit a FutureWarning for the NumPy callable and will
# stop mapping it to the built-in group sum.
df_dest_bor = pd.pivot_table(
    df_dest,
    values=['dest_cbg_count'],
    index=['date_y-m-d', 'orig_subpl'],
    columns=['dest_stco'],
    aggfunc='sum',
    fill_value=0,
    margins=True,
)

In [15]:
# Preview the first five rows of the borough matrix.
df_dest_bor.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count,dest_cbg_count
Unnamed: 0_level_1,dest_stco,36005,36047,36061,36081,36085,All
date_y-m-d,orig_subpl,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2019-01-01,3601011,1,4,25,75,3,108
2019-01-01,3601594,1,1,1,2,0,5
2019-01-01,3602044,3,18,13,21,1,56
2019-01-01,3602374,1,3,7,1,1,13
2019-01-01,3602737,0,1,4,3,0,8


In [12]:
# Daily NYC-total table: dest_cbg_count summed over all NYC destination
# counties for each (date, origin sub-place), plus the "All" margin row.
#
# The previous version pivoted on `dest_sub` / `dest_subpl`, but df_dest only
# carries the columns date_y-m-d, orig_subpl, dest_stco and dest_cbg_count, so
# the call raised a KeyError. Dropping the column dimension gives the citywide
# total that complements the per-borough matrix.
# NOTE(review): confirm that a citywide total is the intended content of the
# "nycxsubpl" output.
df_dest_nyc = pd.pivot_table(
    df_dest,
    values=['dest_cbg_count'],
    index=['date_y-m-d', 'orig_subpl'],
    aggfunc='sum',
    fill_value=0,
    margins=True,
)

In [13]:
# Write the y0 (2019) tables to disk.
# y0 already holds a four-digit year, so the previous "20{y0}" token produced
# a doubled year ("...09-12-202019.csv"). Using the last two digits of y0
# after the literal "20" yields "2019".
# NOTE(review): the "09-12" month range in these names does not match the test
# loop that fills df_dest (January, first two days) - confirm the intended label.
df_dest_bor.to_csv(f'output/Destinations/dest_coxsubpl_daily_09-12-20{y0[-2:]}.csv')
df_dest_nyc.to_csv(f'output/Destinations/dest_nycxsubpl_daily_09-12-20{y0[-2:]}.csv')

### 2020

In [14]:
# Days per month for January-October of a leap year (February has 29 days).
dayNumList = [
    31, 29, 31, 30, 31,  # Jan - May
    30, 31, 31, 30, 31,  # Jun - Oct
]

In [16]:
## January to October

# One small frame is collected per day and the list is concatenated afterwards.
frames = []
for m in range(10):  # months 01..10
    for d in range(dayNumList[m]):  # every day of that month
        day_path = f'sg-c19-response/social-distancing/v2/{y}/{monthList[m]}/{dayList[d]}/{y}-{monthList[m]}-{dayList[d]}-social-distancing.csv.gz'
        with fs.open(day_path, 'rb') as f:
            print(f'{y}-{monthList[m]}-{dayList[d]}')
            # read the daily SafeGraph file
            df = pd.read_csv(f, escapechar='\\', compression='gzip')

        # Inner join against the crosswalk: keeps only rows whose origin CBG is
        # listed in orig_subpl and attaches the crosswalk columns to them.
        df = orig_subpl.merge(df, left_on="orig_cbg", right_on="origin_census_block_group", how="inner")

        # Expand the JSON in `destination_cbgs` into `destination_cbg` /
        # `dest_cbg_count` columns (presumably one row per destination CBG -
        # see safegraph_py_functions.unpack_json_and_merge).
        df = sgpy.unpack_json_and_merge(
            df,
            json_column='destination_cbgs',
            key_col_name='destination_cbg',
            value_col_name='dest_cbg_count',
        )

        # Calendar date = first 10 characters of the range-start timestamp.
        df['date_y-m-d'] = df['date_range_start'].str.slice(stop=10)

        # Destination state+county = first 5 characters of the destination CBG;
        # keep only trips that end in one of the NYC counties.
        df['dest_stco'] = df['destination_cbg'].str.slice(stop=5)
        df = df.loc[df['dest_stco'].isin(NYC)]

        # Retain just the columns needed for the pivot tables.
        dff = df[['date_y-m-d', 'orig_subpl', 'dest_stco', 'dest_cbg_count']]
        frames.append(dff)

2020-01-01
2020-01-02
2020-01-03
2020-01-04
2020-01-05
2020-01-06
2020-01-07
2020-01-08
2020-01-09
2020-01-10
2020-01-11
2020-01-12
2020-01-13
2020-01-14
2020-01-15
2020-01-16
2020-01-17
2020-01-18
2020-01-19
2020-01-20
2020-01-21
2020-01-22
2020-01-23
2020-01-24
2020-01-25
2020-01-26
2020-01-27
2020-01-28
2020-01-29
2020-01-30
2020-01-31
2020-02-01
2020-02-02
2020-02-03
2020-02-04
2020-02-05
2020-02-06
2020-02-07
2020-02-08
2020-02-09
2020-02-10
2020-02-11
2020-02-12
2020-02-13
2020-02-14
2020-02-15
2020-02-16
2020-02-17
2020-02-18
2020-02-19
2020-02-20
2020-02-21
2020-02-22
2020-02-23
2020-02-24
2020-02-25
2020-02-26
2020-02-27
2020-02-28
2020-02-29
2020-03-01
2020-03-02
2020-03-03
2020-03-04
2020-03-05
2020-03-06
2020-03-07
2020-03-08
2020-03-09
2020-03-10
2020-03-11
2020-03-12
2020-03-13
2020-03-14
2020-03-15
2020-03-16
2020-03-17
2020-03-18
2020-03-19
2020-03-20
2020-03-21
2020-03-22
2020-03-23
2020-03-24
2020-03-25
2020-03-26
2020-03-27
2020-03-28
2020-03-29
2020-03-30
2020-03-31

In [17]:
# Stack the per-day 2020 frames row-wise into one long table.
df_dest = pd.concat(frames, axis=0)

In [18]:
# Daily matrix of summed dest_cbg_count: one row per (date, origin sub-place),
# one column per destination NYC county, zero where there is no pair, plus the
# "All" margin row/column.
# The aggregation is named by the string 'sum' instead of passing np.sum:
# recent pandas versions emit a FutureWarning for the NumPy callable and will
# stop mapping it to the built-in group sum.
df_dest_bor = pd.pivot_table(
    df_dest,
    values=['dest_cbg_count'],
    index=['date_y-m-d', 'orig_subpl'],
    columns=['dest_stco'],
    aggfunc='sum',
    fill_value=0,
    margins=True,
)

In [19]:
# Save the origin sub-place x borough matrix as CSV.
df_dest_bor.to_csv(path_or_buf='output/Destinations/orig_subplxbor_daily_01-10-20.csv')

## Get device count for all subplaces, daily

In [21]:
## January to October

# Daily device counts per origin sub-place; one aggregated frame per day.
frames2 = []

# The three count columns that are summed per sub-place.
device_cols = ['device_count', 'completely_home_device_count', 'candidate_device_count']

for m in range(0, 10):
    for d in range(0, dayNumList[m]):
        with fs.open(f'sg-c19-response/social-distancing/v2/{y}/{monthList[m]}/{dayList[d]}/{y}-{monthList[m]}-{dayList[d]}-social-distancing.csv.gz','rb') as f:
            print(f'{y}-{monthList[m]}-{dayList[d]}')
            # Read only the columns this cell uses. The daily file also carries
            # JSON columns such as destination_cbgs, which are not needed here,
            # so skipping them saves parse time and memory for every day read.
            df = pd.read_csv(
                f,
                escapechar='\\',
                compression='gzip',
                usecols=['origin_census_block_group', 'date_range_start'] + device_cols,
            )

        # Inner join against the crosswalk: keeps only rows whose origin CBG is
        # listed in orig_subpl and attaches the origin sub-place id.
        df = pd.merge(orig_subpl, df, left_on="orig_cbg", right_on="origin_census_block_group", how="inner")

        # Calendar date = first 10 characters of the range-start timestamp.
        df['date_y-m-d'] = df['date_range_start'].str[:10]

        # Sum the device counts over all CBGs of each sub-place for the day.
        # The built-in 'sum' is used via .sum() rather than np.sum, which
        # recent pandas versions warn about when passed to agg.
        dff = df.groupby(['date_y-m-d', 'orig_subpl'])[device_cols].sum().reset_index()
        frames2.append(dff)

2020-01-01
2020-01-02
2020-01-03
2020-01-04
2020-01-05
2020-01-06
2020-01-07
2020-01-08
2020-01-09
2020-01-10
2020-01-11
2020-01-12
2020-01-13
2020-01-14
2020-01-15
2020-01-16
2020-01-17
2020-01-18
2020-01-19
2020-01-20
2020-01-21
2020-01-22
2020-01-23
2020-01-24
2020-01-25
2020-01-26
2020-01-27
2020-01-28
2020-01-29
2020-01-30
2020-01-31
2020-02-01
2020-02-02
2020-02-03
2020-02-04
2020-02-05
2020-02-06
2020-02-07
2020-02-08
2020-02-09
2020-02-10
2020-02-11
2020-02-12
2020-02-13
2020-02-14
2020-02-15
2020-02-16
2020-02-17
2020-02-18
2020-02-19
2020-02-20
2020-02-21
2020-02-22
2020-02-23
2020-02-24
2020-02-25
2020-02-26
2020-02-27
2020-02-28
2020-02-29
2020-03-01
2020-03-02
2020-03-03
2020-03-04
2020-03-05
2020-03-06
2020-03-07
2020-03-08
2020-03-09
2020-03-10
2020-03-11
2020-03-12
2020-03-13
2020-03-14
2020-03-15
2020-03-16
2020-03-17
2020-03-18
2020-03-19
2020-03-20
2020-03-21
2020-03-22
2020-03-23
2020-03-24
2020-03-25
2020-03-26
2020-03-27
2020-03-28
2020-03-29
2020-03-30
2020-03-31

In [22]:
# Stack the per-day device-count frames row-wise into one table.
df_subpl_device = pd.concat(frames2, axis=0)

In [23]:
# Preview the first five rows of the device-count table.
df_subpl_device.head(n=5)

Unnamed: 0,date_y-m-d,orig_subpl,device_count,completely_home_device_count,candidate_device_count
0,2020-01-01,3601011,349,111,585
1,2020-01-01,3601594,36,9,107
2,2020-01-01,3602044,519,152,975
3,2020-01-01,3602374,137,35,195
4,2020-01-01,3602737,34,9,46


In [24]:
# Save the daily device counts per origin sub-place as CSV.
df_subpl_device.to_csv(path_or_buf='output/Destinations/orig_subpl_device_daily_01-10-20.csv')