In [1]:
import pandas as pd
import numpy as np
import s3fs
import os
import time

In [2]:
from geo import stco,sub
from safegraph_py_functions import safegraph_py_functions as sgpy

In [3]:
%load_ext dotenv
%dotenv
myAccessKey = os.getenv('myAccessKey')
mySecretKey = os.getenv('mySecretKey')

start = time.time()

In [4]:
# Lift pandas' display truncation: None means "no limit" for both the number
# of columns and the number of rows rendered for a frame.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [5]:
# NYC census-block-group lookup table; its orig_cbg column is the key the
# social-distancing files are joined against further down.
cbg_nyc = pd.read_csv(filepath_or_buffer='../data/nyc_cbg.csv')

In [6]:
# S3-compatible filesystem handle for the SG bucket, pointed at the Wasabi
# endpoint and authenticated with the key/secret read from the environment.
fs = s3fs.S3FileSystem(
    profile="safegraphws",
    key=myAccessKey,
    secret=mySecretKey,
    client_kwargs={
        'endpoint_url': 'https://s3.wasabisys.com',
        'region_name': 'us-east-1',
    },
)

In [7]:
# Years and zero-padded months (January through August) to sample.
# No per-day lists are kept: the cells below only ever read day "01" of each month.
years = [str(year) for year in (2019, 2020)]
monthList = [f"{month:02d}" for month in range(1, 9)]

In [8]:
# Sample the first day of every (year, month) pair to inspect the data
# structure and the candidate device counts. One merged frame per sampled
# day is collected in `frames`.
frames = []
for y in years:
    for m in monthList:
        first_day = f'{y}-{m}-01'
        s3_key = f'sg-c19-response/social-distancing/v2/{y}/{m}/01/{y}-{m}-01-social-distancing.csv.gz'
        with fs.open(s3_key, 'rb') as f:
            # Progress marker, printed once the remote file has been opened.
            print(first_day)
            # SG's daily file: gzip-compressed CSV with backslash as the escape character.
            df = pd.read_csv(f, escapechar='\\', compression='gzip')
            # Inner join keeps only rows whose origin CBG appears in cbg_nyc.
            dff = cbg_nyc.merge(
                df,
                left_on="orig_cbg",
                right_on="origin_census_block_group",
                how="inner",
            )
            frames.append(dff)

2019-01-01
2019-02-01
2019-03-01
2019-04-01
2019-05-01
2019-06-01
2019-07-01
2019-08-01
2020-01-01
2020-02-01
2020-03-01
2020-04-01
2020-05-01
2020-06-01
2020-07-01
2020-08-01


In [9]:
# Stack the per-day samples into a single frame (each frame keeps its own
# row labels), then derive a plain YYYY-MM-DD date from the first ten
# characters of the date_range_start timestamp string.
df = pd.concat(frames)
df['date_y-m-d'] = df['date_range_start'].str.slice(stop=10)
df.head()

Unnamed: 0,Pop_10E,orig_st,orig_co,orig_cbg,orig_stco,origin_census_block_group,date_range_start,date_range_end,device_count,distance_traveled_from_home,bucketed_distance_traveled,median_dwell_at_bucketed_distance_traveled,completely_home_device_count,median_home_dwell_time,bucketed_home_dwell_time,at_home_by_each_hour,part_time_work_behavior_devices,full_time_work_behavior_devices,destination_cbgs,delivery_behavior_devices,median_non_home_dwell_time,candidate_device_count,bucketed_away_from_home_time,median_percentage_time_home,bucketed_percentage_time_home,mean_home_dwell_time,mean_non_home_dwell_time,mean_distance_traveled_from_home,date_y-m-d
0,11091,36,5,360050001001,36005,360050001001,2019-01-01T00:00:00-05:00,2019-01-02T00:00:00-05:00,10,171.0,"{""16001-50000"":4,""0"":1,""2001-8000"":1,""1-1000"":...","{""16001-50000"":5,""2001-8000"":10,""8001-16000"":4...",2,0,"{"">1080"":1,""<60"":5}","[1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1]",1,1,"{""360470363002"":1,""360810716001"":1,""3604703010...",1,30,45,"{""21-45"":1,""46-60"":1,""<20"":3,""61-120"":1,""1081-...",0,"{""0-25"":7,""76-100"":2,""26-50"":1}",147.0,200.0,171.0,2019-01-01
1,1120,36,5,360050002001,36005,360050002001,2019-01-01T00:00:00-05:00,2019-01-02T00:00:00-05:00,66,1631.0,"{""16001-50000"":13,""0"":42,""2001-8000"":2,""1-1000...","{""16001-50000"":120,""<1000"":5,""2001-8000"":25,""1...",41,735,"{""721-1080"":3,""361-720"":9,""61-360"":11,""<60"":11...","[36,35,35,34,35,33,34,35,34,34,33,30,34,33,34,...",1,1,"{""340230049002"":1,""360050090002"":1,""3600500190...",1,0,228,"{""21-45"":1,""46-60"":2,""721-840"":2,""<20"":45,""61-...",100,"{""0-25"":3,""76-100"":52,""26-50"":4}",776.0,111.0,1787.0,2019-01-01
2,1974,36,5,360050002002,36005,360050002002,2019-01-01T00:00:00-05:00,2019-01-02T00:00:00-05:00,134,887.0,"{""16001-50000"":3,""0"":69,"">50000"":6,""2001-8000""...","{""16001-50000"":49,"">50000"":81,""<1000"":130,""200...",69,792,"{""721-1080"":23,""361-720"":15,""61-360"":19,""<60"":...","[68,64,67,69,74,76,79,82,81,84,82,79,78,76,75,...",1,1,"{""340297222003"":1,""360050127011"":1,""3600500900...",1,0,393,"{""21-45"":9,""481-540"":2,""46-60"":3,""721-840"":2,""...",100,"{""0-25"":23,""76-100"":98,""51-75"":10,""26-50"":3}",752.0,161.0,3653.0,2019-01-01
3,1240,36,5,360050002003,36005,360050002003,2019-01-01T00:00:00-05:00,2019-01-02T00:00:00-05:00,95,1819.0,"{""16001-50000"":4,""0"":44,"">50000"":5,""2001-8000""...","{""16001-50000"":559,"">50000"":117,""<1000"":107,""2...",47,620,"{""721-1080"":12,""361-720"":16,""61-360"":17,""<60"":...","[44,46,42,47,47,49,48,49,51,52,45,47,40,39,41,...",6,3,"{""360050127011"":1,""360750215023"":1,""3606100860...",4,7,257,"{""21-45"":8,""481-540"":1,""541-600"":1,""46-60"":1,""...",99,"{""0-25"":14,""76-100"":63,""51-75"":2,""26-50"":10}",686.0,159.0,8330.0,2019-01-01
4,0,36,5,360050004000,36005,360050004000,2019-01-01T00:00:00-05:00,2019-01-02T00:00:00-05:00,29,1956.0,"{""16001-50000"":1,""0"":10,""2001-8000"":11,""1001-2...","{""16001-50000"":15,""1001-2000"":124,""2001-8000"":...",8,0,"{""721-1080"":3,""<60"":22,""361-720"":1,""61-360"":3}","[5,1,1,0,2,2,4,2,6,3,2,3,2,1,1,2,2,3,3,1,6,5,5,7]",1,1,"{""360050121012"":1,""360050431005"":1,""3600501170...",1,67,61,"{""21-45"":1,""481-540"":2,""541-600"":1,""721-840"":1...",0,"{""0-25"":15,""76-100"":8,""51-75"":2,""26-50"":1}",122.0,233.0,2041.0,2019-01-01


In [10]:
# Narrow the combined frame to the five columns needed for the
# candidate-device check, then preview the result.
dff = df.loc[:, [
    'date_y-m-d',
    'orig_cbg',
    'candidate_device_count',
    'device_count',
    'Pop_10E',
]]
dff.head()

Unnamed: 0,date_y-m-d,orig_cbg,candidate_device_count,device_count,Pop_10E
0,2019-01-01,360050001001,45,10,11091
1,2019-01-01,360050002001,228,66,1120
2,2019-01-01,360050002002,393,134,1974
3,2019-01-01,360050002003,257,95,1240
4,2019-01-01,360050004000,61,29,0


In [11]:
# Persist the device-count check table as CSV (row labels included, as before).
# DataFrame.to_csv does not create missing parent directories and raises
# OSError when the target folder is absent, so make sure the relative
# "output" folder exists first; this is a no-op when it is already there.
os.makedirs("output", exist_ok=True)
dff.to_csv("output/candidate_device_check.csv")