## Generate a table for NYC geographies by device count and time away from home

In [1]:
import pandas as pd
import numpy as np
import s3fs
import os
import time

In [2]:
from geo import stco,sub
from safegraph_py_functions import safegraph_py_functions as sgpy

In [3]:
# Load the dotenv IPython extension and run it so that variables from a local
# .env file become available through the process environment.
%load_ext dotenv
%dotenv
# Storage credentials; os.getenv returns None when a variable is not set.
myAccessKey = os.getenv('myAccessKey')
mySecretKey = os.getenv('mySecretKey')

# Wall-clock start time, used by the final cell to report total run time.
start = time.time()

In [4]:
# Show every column and every row when a DataFrame is displayed
# (None removes pandas' default truncation limits).
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [5]:
# Load the list of NYC origin census block groups (CBGs); used below to keep
# only NYC rows from each daily file.
cbg_nyc = pd.read_csv(filepath_or_buffer='../data/nyc_cbg.csv')

In [6]:
# Open an S3-compatible filesystem handle for the SafeGraph (SG) bucket,
# authenticating with the access key and secret and pointing the client at
# the Wasabi endpoint.
fs = s3fs.S3FileSystem(
    profile="safegraphws",
    key=myAccessKey,
    secret=mySecretKey,
    client_kwargs={
        'endpoint_url': 'https://s3.wasabisys.com',
        'region_name': 'us-east-1',
    },
)

In [7]:
# Month used for the single-month test run below (zero-padded, as it appears
# in the bucket paths).
month = "04"

# Values for iterating through all available data.
monthList = ["01", "02", "03", "04", "05", "06", "07", "08"]
# Days per month, aligned one-to-one with monthList (January..August).
# NOTE: 29 for February is only correct for leap years such as 2020;
# 2019 has 28 days in February.
dayNumList = [31, 29, 31, 30, 31, 30, 31, 31]
years = ["2019", "2020"]
# Zero-padded day-of-month strings "01".."31".
dayList = [f"{day:02d}" for day in range(1, 32)]

In [None]:
##Test Month: Year over Year Comparison
# For every year in `years`, read each daily social-distancing file of `month`,
# keep only NYC origin CBGs, expand the away-from-home time buckets and collect
# the trimmed per-day tables in `frames`.

# Local import: only this cell needs `calendar`, to size the month correctly.
import calendar

frames = []
for y in years:
    # Actual number of days in this month/year (28-31) instead of assuming a
    # 30-day month, so changing `month` neither skips day 31 nor requests
    # dates that do not exist.
    days_in_month = calendar.monthrange(int(y), int(month))[1]
    for day in dayList[:days_in_month]:
        with fs.open(f'sg-c19-response/social-distancing/v2/{y}/{month}/{day}/{y}-{month}-{day}-social-distancing.csv.gz','rb') as f:
            # Progress indicator: one line per file processed.
            print(f'{y}-{month}-{day}')
            # read SG's file
            df = pd.read_csv(f, escapechar='\\', compression='gzip')
            # filter NYC's Origin CBGs (inner join drops every non-NYC origin)
            df = pd.merge(cbg_nyc, df, left_on="orig_cbg", right_on="origin_census_block_group", how="inner")

            # Unpack the JSON away-from-home time buckets.
            # NOTE(review): presumably yields one row per (CBG, bucket) with the
            # bucket label in 'away_from_home_time' and its count in
            # 'time_away_count' - confirm against safegraph_py_functions.
            df = sgpy.unpack_json_and_merge(df, json_column='bucketed_away_from_home_time', key_col_name='away_from_home_time', value_col_name='time_away_count')

            #Clean up & make new analysis columns
            df['orig_cbg'] = df['orig_cbg'].apply(str)
            # Alternative geography (CBG id without its last digit):
            #df['orig_tract'] = df['orig_cbg'].str[:-1]
            # First five characters of the CBG id.
            df['orig_stco'] = df['orig_cbg'].str[:5]
            # First ten characters of the range start, i.e. the date part.
            df['date_y-m-d'] = df['date_range_start'].str[:10]

            #Make new table with select columns & pick geo for analysis
            dff = df[['orig_stco','date_y-m-d','away_from_home_time','time_away_count']]
            frames.append(dff)

    #mid = time.time()
    #elapsed_mid = mid - start
    #print(f'Mid run time - {elapsed_mid}')

2019-04-01
2019-04-02
2019-04-03
2019-04-04
2019-04-05
2019-04-06
2019-04-07
2019-04-08
2019-04-09
2019-04-10
2019-04-11
2019-04-12
2019-04-13
2019-04-14
2019-04-15
2019-04-16
2019-04-17
2019-04-18
2019-04-19
2019-04-20
2019-04-21
2019-04-22
2019-04-23
2019-04-24


In [None]:
# Stack all per-day tables into a single long DataFrame.
dff = pd.concat(objs=frames)

In [None]:
# Sum time-away counts per origin state+county prefix (orig_stco) and day,
# with one column per away-from-home time bucket; margins=True appends the
# "All" totals. aggfunc is passed as the string 'sum' because passing np.sum
# is deprecated in recent pandas and emits a FutureWarning; the result is the
# same.
dfff = pd.pivot_table(
    dff,
    values=['time_away_count'],
    index=['orig_stco', 'date_y-m-d'],
    columns=['away_from_home_time'],
    aggfunc='sum',
    fill_value=0,
    margins=True,
)

In [None]:
# Preview the first five rows of the pivot table.
dfff.head(5)

In [None]:
# Write the pivot table to output/<month>_test.csv. The directory is created
# first because to_csv raises an OSError when the target folder is missing.
os.makedirs('output', exist_ok=True)
dfff.to_csv(f'output/{month}_test.csv')

In [None]:
# Report total wall-clock time elapsed since `start` was recorded.
end = time.time()
elapsed = end - start
print(f'Run time - {elapsed} seconds')