## This notebook pulls candidate device panel, device panel, and completely home device panel for U.S. counties and the national total

### Purpose is for sample bias understanding

In [2]:
import pandas as pd
import numpy as np
import s3fs
import os
import time

In [3]:
from geo import stco,sub
from safegraph_py_functions import safegraph_py_functions as sgpy

In [3]:
%load_ext dotenv
%dotenv
myAccessKey = os.getenv('myAccessKey')
mySecretKey = os.getenv('mySecretKey')

start = time.time()

In [4]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [5]:
cbg_nyc = pd.read_csv('../data/nyc_cbg.csv')

In [6]:
# specify the SG key and secret
fs = s3fs.S3FileSystem(profile="safegraphws", key=myAccessKey, secret=mySecretKey, client_kwargs={'endpoint_url': 'https://s3.wasabisys.com', 'region_name':'us-east-1'})

### Prep county-level table for 2019

Records represent individual CBGs (for later aggregation)

In [7]:
# years, months and days
#years = ['2019','2020']
monthList =["01","02","03","04","05","06", "07","08","09","10","11","12"]
dayNumList =[31, 28, 31, 30, 31, 30, 31, 31,30,31,30,31] 
dayList =["01","02","03","04","05","06","07","08","09","10","11","12","13","14","15","16","17","18","19","20","21","22","23","24","25","26","27","28","29","30","31"]

In [9]:
##Check candidate device count for 2019
frames = []
for m in range (0,12):
    for d in range(0, dayNumList[m]):
        with fs.open(f'sg-c19-response/social-distancing/v2/2019/{monthList[m]}/{dayList[d]}/2019-{monthList[m]}-{dayList[d]}-social-distancing.csv.gz','rb') as f:
            print(f'2019-{monthList[m]}-{dayList[d]}')
            # read SG's file
            df = pd.read_csv(f, escapechar='\\', compression='gzip')
            #create table for all counties in the U.S.
            df['orig_stco'] = df['origin_census_block_group'].apply(str).str[:5]
            df['date_y-m-d'] = df['date_range_start'].str[:10]
            dff = df[['date_y-m-d','orig_stco','completely_home_device_count','candidate_device_count','device_count']]
            dff = dff.groupby(['date_y-m-d','orig_stco']).agg({'device_count':np.sum,'completely_home_device_count':np.sum,\
                                                    'candidate_device_count':np.sum}).reset_index()
            frames.append(dff)

2019-01-01
2019-01-02
2019-01-03
2019-01-04
2019-01-05
2019-01-06
2019-01-07
2019-01-08
2019-01-09
2019-01-10
2019-01-11
2019-01-12
2019-01-13
2019-01-14
2019-01-15
2019-01-16
2019-01-17
2019-01-18
2019-01-19
2019-01-20
2019-01-21
2019-01-22
2019-01-23
2019-01-24
2019-01-25
2019-01-26
2019-01-27
2019-01-28
2019-01-29
2019-01-30
2019-01-31
2019-02-01
2019-02-02
2019-02-03
2019-02-04
2019-02-05
2019-02-06
2019-02-07
2019-02-08
2019-02-09
2019-02-10
2019-02-11
2019-02-12
2019-02-13
2019-02-14
2019-02-15
2019-02-16
2019-02-17
2019-02-18
2019-02-19
2019-02-20
2019-02-21
2019-02-22
2019-02-23
2019-02-24
2019-02-25
2019-02-26
2019-02-27
2019-02-28
2019-03-01
2019-03-02
2019-03-03
2019-03-04
2019-03-05
2019-03-06
2019-03-07
2019-03-08
2019-03-09
2019-03-10
2019-03-11
2019-03-12
2019-03-13
2019-03-14
2019-03-15
2019-03-16
2019-03-17
2019-03-18
2019-03-19
2019-03-20
2019-03-21
2019-03-22
2019-03-23
2019-03-24
2019-03-25
2019-03-26
2019-03-27
2019-03-28
2019-03-29
2019-03-30
2019-03-31
2019-04-01

In [10]:
df_19 = pd.concat(frames)

In [11]:
df_19.head(20)

Unnamed: 0,date_y-m-d,orig_stco,device_count,completely_home_device_count,candidate_device_count
0,2019-01-01,10001,11809,5302,34903
1,2019-01-01,10003,34237,15251,102795
2,2019-01-01,10005,13089,5593,38141
3,2019-01-01,10010,4708,1829,11575
4,2019-01-01,10030,19655,7717,49888
5,2019-01-01,10059,1570,594,4546
6,2019-01-01,10070,1702,623,4200
7,2019-01-01,10090,5224,1901,11986
8,2019-01-01,10119,519,236,1696
9,2019-01-01,10139,1188,480,3511


### Prep county-level table for 2020

Records represent individual CBGs (for later aggregation)

In [12]:
# updated february day for leap year
monthList =["01","02","03","04","05","06", "07","08"]
dayNumList =[31, 29, 31, 30, 31, 30, 31, 31] 

In [13]:
##Check candidate device count for 2019
frames = []
for m in range (0,8):
    for d in range(0, dayNumList[m]):
        with fs.open(f'sg-c19-response/social-distancing/v2/2020/{monthList[m]}/{dayList[d]}/2020-{monthList[m]}-{dayList[d]}-social-distancing.csv.gz','rb') as f:
            print(f'2020-{monthList[m]}-{dayList[d]}')
            # read SG's file
            df = pd.read_csv(f, escapechar='\\', compression='gzip')
            #create table for all counties in the U.S.
            df['orig_stco'] = df['origin_census_block_group'].apply(str).str[:5]
            df['date_y-m-d'] = df['date_range_start'].str[:10]
            dff = df[['date_y-m-d','orig_stco','completely_home_device_count','candidate_device_count','device_count']]
            dff = dff.groupby(['date_y-m-d','orig_stco']).agg({'device_count':np.sum,'completely_home_device_count':np.sum,\
                                                    'candidate_device_count':np.sum}).reset_index()
            frames.append(dff)

2020-01-01
2020-01-02
2020-01-03
2020-01-04
2020-01-05
2020-01-06
2020-01-07
2020-01-08
2020-01-09
2020-01-10
2020-01-11
2020-01-12
2020-01-13
2020-01-14
2020-01-15
2020-01-16
2020-01-17
2020-01-18
2020-01-19
2020-01-20
2020-01-21
2020-01-22
2020-01-23
2020-01-24
2020-01-25
2020-01-26
2020-01-27
2020-01-28
2020-01-29
2020-01-30
2020-01-31
2020-02-01
2020-02-02
2020-02-03
2020-02-04
2020-02-05
2020-02-06
2020-02-07
2020-02-08
2020-02-09
2020-02-10
2020-02-11
2020-02-12
2020-02-13
2020-02-14
2020-02-15
2020-02-16
2020-02-17
2020-02-18
2020-02-19
2020-02-20
2020-02-21
2020-02-22
2020-02-23
2020-02-24
2020-02-25
2020-02-26
2020-02-27
2020-02-28
2020-02-29
2020-03-01
2020-03-02
2020-03-03
2020-03-04
2020-03-05
2020-03-06
2020-03-07
2020-03-08
2020-03-09
2020-03-10
2020-03-11
2020-03-12
2020-03-13
2020-03-14
2020-03-15
2020-03-16
2020-03-17
2020-03-18
2020-03-19
2020-03-20
2020-03-21
2020-03-22
2020-03-23
2020-03-24
2020-03-25
2020-03-26
2020-03-27
2020-03-28
2020-03-29
2020-03-30
2020-03-31

In [14]:
df_20 = pd.concat(frames)

In [15]:
df_20.head(20)

Unnamed: 0,date_y-m-d,orig_stco,device_count,completely_home_device_count,candidate_device_count
0,2020-01-01,10001,11472,3924,20752
1,2020-01-01,10003,30270,10734,56514
2,2020-01-01,10005,13572,4537,24242
3,2020-01-01,10010,5501,1578,8303
4,2020-01-01,10030,20761,5997,31957
5,2020-01-01,10059,1660,452,2848
6,2020-01-01,10070,2040,560,3167
7,2020-01-01,10090,6206,1779,9031
8,2020-01-01,10119,617,201,1131
9,2020-01-01,10139,1399,385,2353


### Add 2019 and 2020 tables together

In [16]:
table = pd.concat([df_19,df_20],axis=0)
table.head(20)

Unnamed: 0,date_y-m-d,orig_stco,device_count,completely_home_device_count,candidate_device_count
0,2019-01-01,10001,11809,5302,34903
1,2019-01-01,10003,34237,15251,102795
2,2019-01-01,10005,13089,5593,38141
3,2019-01-01,10010,4708,1829,11575
4,2019-01-01,10030,19655,7717,49888
5,2019-01-01,10059,1570,594,4546
6,2019-01-01,10070,1702,623,4200
7,2019-01-01,10090,5224,1901,11986
8,2019-01-01,10119,519,236,1696
9,2019-01-01,10139,1188,480,3511


In [17]:
table.to_csv("output/dara/full_device_check_uscounties.csv")

In [18]:
#aggregate to NYC total
table_US = table.groupby(['date_y-m-d']).agg({'device_count':np.sum,'completely_home_device_count':np.sum,\
                                                    'candidate_device_count':np.sum})
table_US.head(20)

Unnamed: 0_level_0,device_count,completely_home_device_count,candidate_device_count
date_y-m-d,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-01-01,21923101,9316880,63024349
2019-01-02,22066595,8264103,63025365
2019-01-03,21982481,7792329,63025135
2019-01-04,21766510,7387935,63025082
2019-01-05,21791853,8517615,63024583
2019-01-06,22358350,9401222,63025026
2019-01-07,22384427,8365092,63025806
2019-01-08,22709157,8053236,63025662
2019-01-09,23154577,7973555,63026350
2019-01-10,23371864,7820860,63026222


In [19]:
table_US.to_excel("output/dara/full_candidate_device_us-total.xlsx")

In [20]:
end = time.time()
elapsed = end - start
print(f'Run time - {elapsed} seconds')

Run time - 25075.88927602768 seconds


## Import table from EDM and reshape the counties (for later analysis)

In [16]:
counties = pd.read_csv('../data/devicecount_uscounties_201003.csv')

In [17]:
counties.head()

Unnamed: 0,date_range_start,origin_fips,device_count,completely_home_device_count,candidate_device_count
0,2020-04-12T00:00:00-04:00,26065,14670,7459,39576
1,2020-04-12T00:00:00-04:00,39093,18987,9562,46515
2,2020-04-12T00:00:00-05:00,47149,26900,14783,56125
3,2020-04-12T00:00:00-07:00,6083,13037,6206,36671
4,2020-04-12T00:00:00-04:00,37153,2856,1149,6767


In [18]:
counties = counties.rename(columns={"date_range_start":'date-y-m-d'})
counties['date-y-m-d'] = counties['date-y-m-d'].str[:10]
counties.head()

Unnamed: 0,date-y-m-d,origin_fips,device_count,completely_home_device_count,candidate_device_count
0,2020-04-12,26065,14670,7459,39576
1,2020-04-12,39093,18987,9562,46515
2,2020-04-12,47149,26900,14783,56125
3,2020-04-12,6083,13037,6206,36671
4,2020-04-12,37153,2856,1149,6767


In [19]:
co_device = pd.pivot_table(counties,values=['device_count'],index=['date-y-m-d'],columns=['origin_fips'],aggfunc=np.sum,fill_value=0,margins=True)
co_home = pd.pivot_table(counties,values=['completely_home_device_count'],index=['date-y-m-d'],columns=['origin_fips'],aggfunc=np.sum,fill_value=0,margins=True)
co_candidate = pd.pivot_table(counties,values=['candidate_device_count'],index=['date-y-m-d'],columns=['origin_fips'],aggfunc=np.sum,fill_value=0,margins=True)

In [21]:
co_device.to_excel('output/Device Counts/device_UScounties_daily.xlsx')
co_home.to_excel('output/Device Counts/comphome_UScounties_daily.xlsx')
co_candidate.to_excel('output/Device Counts/candidatedevice_UScounties_daily.xlsx')