In [1]:
import sys
sys.path.append('/home/sfang/windows/gitlab/stanleysfang/surveillance_2019_ncov/prod')

In [2]:
from google.cloud import bigquery
from BigQueryWrapper import QueryRunner, Loader, Extractor
import pandas as pd
import datetime
import re

In [3]:
# GCP project the BigQuery client below is created for.
project_id = 'stanleysfang'

In [4]:
# One BigQuery client, shared by every helper created in this notebook.
client = bigquery.Client(project=project_id)

In [5]:
# Helpers from the local BigQueryWrapper module, all bound to the same client.
# NOTE(review): only `loader` is used by the cells visible in this notebook;
# `qr` and `extractor` look unused here - confirm before removing.
qr = QueryRunner(client=client)
loader = Loader(client=client)
extractor = Extractor(client=client)

### Functions

In [166]:
def find_all_cols(url, start_dt=datetime.date(2020, 1, 22), end_dt=None):
    """Collect every column name used by the daily report CSVs in a date range.

    One CSV per calendar day is read from ``url + 'MM-DD-YYYY.csv'``; each
    processed date is printed as a progress indicator.

    Args:
        url: Prefix the per-day file name is appended to (must already end
            with the path separator).
        start_dt: First report date, inclusive.
        end_dt: Last report date, inclusive. ``None`` (the default) means
            yesterday, resolved when the function is called.

    Returns:
        set of str: the union of the column names of all files read.
    """
    if end_dt is None:
        # Resolve "yesterday" on every call. A default expression in the
        # signature is evaluated only once, when the def cell runs, and would
        # go stale in a kernel that stays alive past midnight.
        end_dt = datetime.date.today() - datetime.timedelta(days=1)

    col_set = set()
    for d in pd.date_range(start_dt, end_dt):
        print(d.strftime('%Y-%m-%d'))
        df = pd.read_csv(url + d.strftime('%m-%d-%Y') + '.csv')
        col_set.update(df.columns)
    print(col_set)
    return col_set

In [134]:
def standardize_daily_reports(df, col_mapping, col_order):
    """Return a copy of a raw daily report with standardized names, dtypes and order.

    The input frame is left untouched, so the cell calling this function can
    be re-run on the same raw frame.

    Args:
        df: Raw daily report as read from the CSV.
        col_mapping: dict mapping each raw column name to a
            ``(standard_name, dtype)`` tuple. Several raw names may map to the
            same tuple.
        col_order: list of standard column names giving the output order.

    Returns:
        pandas.DataFrame with exactly the columns in ``col_order``. Standard
        columns that have no raw counterpart in ``df`` are added, filled with
        missing values of the mapped dtype.

    Raises:
        KeyError: if ``df`` has a column that is not a key of ``col_mapping``.
    """
    unmapped = [col for col in df.columns if col not in col_mapping]
    if unmapped:
        raise KeyError('columns missing from col_mapping: {}'.format(unmapped))

    def _is_unitless_datetime(dtype):
        # pandas >= 2 rejects 'datetime64' without a unit in astype() and in
        # the Series constructor, so that spelling is handled separately.
        return str(dtype) == 'datetime64'

    out = df.copy()
    for col in df.columns:
        dtype = col_mapping[col][1]
        if _is_unitless_datetime(dtype):
            out[col] = pd.to_datetime(out[col])
        else:
            out[col] = out[col].astype(dtype)
    out.columns = [col_mapping[col][0] for col in df.columns]

    # Add the standard columns this particular file does not provide.
    for col, dtype in set(col_mapping.values()):
        if col not in out.columns:
            fill_dtype = 'datetime64[ns]' if _is_unitless_datetime(dtype) else dtype
            out[col] = pd.Series(index=out.index, dtype=fill_dtype)

    return out[col_order]

### US

In [167]:
# Report date to load: yesterday, relative to the kernel's local clock.
d = datetime.date.today() - datetime.timedelta(days=1)
# Uncomment to (re)load one specific day instead:
# d = datetime.date(2020, 4, 12)

In [7]:
# Folder of the US daily reports in the CSSEGISandData/COVID-19 repository;
# the per-day file name ('MM-DD-YYYY.csv') is appended when reading.
url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports_us/'

In [8]:
# Column names for the US report. They are applied by position
# (pd.read_csv(header=0, names=cols) below), so the order here must match
# the column order of the CSV being read.
cols = [
    'province_state',
    'country_region',
    'last_update',
    'latitude',
    'longitude',
    'confirmed',
    'deaths',
    'recovered',
    'active',
    'FIPS',
    'incident_rate',
    'total_test_results',
    'people_hospitalized',
    'case_fatality_ratio',
    'UID',
    'iso3',
    'testing_rate',
    'hospitalization_rate'
]

In [9]:
# pandas dtypes passed to pd.read_csv for the US report, keyed by the names in `cols`.
# NOTE(review): count columns are read as float64, presumably because they can
# be empty in the source file (NaN needs a float column) - confirm.
dtypes = {
    'province_state': 'object',
    'country_region': 'object',
    # 'last_update': 'datetime64', # use parse_dates argument in pd.read_csv
    'latitude': 'float64',
    'longitude': 'float64',
    'confirmed': 'float64',
    'deaths': 'float64',
    'recovered': 'float64',
    'active': 'float64',
    'FIPS': 'float64',
    'incident_rate': 'float64',
    'total_test_results': 'float64',
    'people_hospitalized': 'float64',
    'case_fatality_ratio': 'float64',
    'UID': 'float64',
    'iso3': 'object',
    'testing_rate': 'float64',
    'hospitalization_rate': 'float64',
}

In [10]:
# (column name, BigQuery type) pairs passed to loader.load_df as `schema`,
# listed in the same order as `cols`.
# NOTE(review): several columns are INT64 here but float64 in the DataFrame;
# the conversion is presumably handled inside Loader.load_df - confirm.
schema = [
    ('province_state', 'STRING'),
    ('country_region', 'STRING'),
    ('last_update', 'TIMESTAMP'),
    ('latitude', 'FLOAT64'),
    ('longitude', 'FLOAT64'),
    ('confirmed', 'INT64'),
    ('deaths', 'INT64'),
    ('recovered', 'INT64'),
    ('active', 'INT64'),
    ('FIPS', 'INT64'),
    ('incident_rate', 'FLOAT64'),
    ('total_test_results', 'INT64'),
    ('people_hospitalized', 'INT64'),
    ('case_fatality_ratio', 'FLOAT64'),
    ('UID', 'INT64'),
    ('iso3', 'STRING'),
    ('testing_rate', 'FLOAT64'),
    ('hospitalization_rate', 'FLOAT64'),
]

In [11]:
# Read the US report for date `d`. header=0 together with names=cols drops the
# file's own header row and relabels the columns by position.
# NOTE(review): if the upstream file gains, loses or reorders columns, the
# labels shift silently - confirm the layout for the dates being loaded.
daily_report_us = pd.read_csv(
    url + d.strftime('%m-%d-%Y') + '.csv',
    header=0, names=cols,
    dtype=dtypes, parse_dates=['last_update']
)

In [12]:
daily_report_us.head()

Unnamed: 0,province_state,country_region,last_update,latitude,longitude,confirmed,deaths,recovered,active,FIPS,incident_rate,total_test_results,people_hospitalized,case_fatality_ratio,UID,iso3,testing_rate,hospitalization_rate
0,Alabama,US,2020-11-16 05:30:30,32.3182,-86.9023,217822.0,3248.0,88038.0,126536.0,1.0,4442.45934,1459406.0,,1.491126,84000001.0,USA,29764.449027,
1,Alaska,US,2020-11-16 05:30:30,61.3707,-152.4044,23814.0,98.0,7164.0,16552.0,2.0,3255.302135,867382.0,,0.411523,84000002.0,USA,118568.509114,
2,American Samoa,US,2020-11-16 05:30:30,-14.271,-170.132,0.0,0.0,,0.0,60.0,0.0,1768.0,,,16.0,ASM,3177.512985,
3,Arizona,US,2020-11-16 05:30:30,33.7298,-111.4312,275436.0,6302.0,45400.0,223734.0,4.0,3784.128439,1974498.0,,2.288009,84000004.0,USA,27127.006037,
4,Arkansas,US,2020-11-16 05:30:30,34.9697,-92.3731,133040.0,2183.0,114312.0,16545.0,5.0,4408.503667,1505554.0,,1.64086,84000005.0,USA,49889.058401,


In [13]:
# Load the frame into the table partition for date `d`: the '$YYYYMMDD' suffix
# addresses a single daily partition of the destination table.
# NOTE(review): whether an existing partition is replaced or appended to is
# decided inside Loader.load_df - confirm before re-running for a loaded day.
load_job = loader.load_df(
    daily_report_us,
    'stanleysfang.surveillance_2019_ncov.csse_covid_19_daily_reports_us${}'.format(d.strftime('%Y%m%d')),
    schema=schema,
    time_partitioning=True
)

In [14]:
load_job.result()

<google.cloud.bigquery.job.load.LoadJob at 0x7f02038c52b0>

### Global

In [143]:
# Report date to load: yesterday at midnight. Subtracting a timedelta (rather
# than building datetime(year, month, day - 1)) also works on the first day
# of a month, where `day - 1` would be 0 and raise ValueError.
d = datetime.datetime.combine(
    datetime.date.today() - datetime.timedelta(days=1),
    datetime.time()
)
# Uncomment to (re)load one specific day instead:
# d = datetime.datetime(2020, 11, 17)

In [144]:
# Folder of the global daily reports in the CSSEGISandData/COVID-19 repository;
# the per-day file name ('MM-DD-YYYY.csv') is appended when reading.
url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/'

In [146]:
# Maps every raw header spelling seen in the global daily reports to a
# (standard column name, pandas dtype) tuple. Several spellings of the same
# field share one tuple, so a file using any of them yields the same output
# column in standardize_daily_reports.
# The datetime dtype carries an explicit unit: pandas >= 2 rejects the
# unit-less 'datetime64' in astype() and in the Series constructor.
col_mapping_global = {
    'FIPS': ('FIPS', 'float64'), # Federal Information Processing Standards code that uniquely identifies counties within the US
    'Admin2': ('county', 'object'), # US only
    'Province/State': ('province_state', 'object'),
    'Province_State': ('province_state', 'object'),
    'Country_Region': ('country_region', 'object'),
    'Country/Region': ('country_region', 'object'),
    'Combined_Key': ('combined_key', 'object'),
    'Latitude': ('latitude', 'float64'),
    'Lat': ('latitude', 'float64'),
    'Longitude': ('longitude', 'float64'),
    'Long_': ('longitude', 'float64'),
    'Confirmed': ('confirmed', 'float64'),
    'Deaths': ('deaths', 'float64'),
    'Recovered': ('recovered', 'float64'),
    'Active': ('active', 'float64'),
    'Incident_Rate': ('incident_rate', 'float64'),
    'Incidence_Rate': ('incident_rate', 'float64'),
    'Case_Fatality_Ratio': ('case_fatality_ratio', 'float64'),
    'Case-Fatality_Ratio': ('case_fatality_ratio', 'float64'),
    'Last Update': ('last_update', 'datetime64[ns]'),
    'Last_Update': ('last_update', 'datetime64[ns]'),
}

In [147]:
# Output column order for standardize_daily_reports. Each standard name used
# in col_mapping_global appears here exactly once.
col_order_global = [
    'FIPS',
    'county',
    'province_state',
    'country_region',
    'combined_key',
    'latitude',
    'longitude',
    'confirmed',
    'deaths',
    'recovered',
    'active',
    'incident_rate',
    'case_fatality_ratio',
    'last_update',
]

In [148]:
# Read the global report for date `d` with its original headers, then map
# them to the standard names, dtypes and column order.
daily_report_global = pd.read_csv(url + d.strftime('%m-%d-%Y') + '.csv')
daily_report_global = standardize_daily_reports(daily_report_global, col_mapping_global, col_order_global)

In [151]:
daily_report_global.shape

(38, 14)

In [149]:
daily_report_global.dtypes

FIPS                          float64
county                         object
province_state                 object
country_region                 object
combined_key                   object
latitude                      float64
longitude                     float64
confirmed                     float64
deaths                        float64
recovered                     float64
active                        float64
incident_rate                 float64
case_fatality_ratio           float64
last_update            datetime64[ns]
dtype: object

In [150]:
daily_report_global.head()

Unnamed: 0,FIPS,county,province_state,country_region,combined_key,latitude,longitude,confirmed,deaths,recovered,active,incident_rate,case_fatality_ratio,last_update
0,,,Anhui,Mainland China,,,,1.0,,,,,,2020-01-22 17:00:00
1,,,Beijing,Mainland China,,,,14.0,,,,,,2020-01-22 17:00:00
2,,,Chongqing,Mainland China,,,,6.0,,,,,,2020-01-22 17:00:00
3,,,Fujian,Mainland China,,,,1.0,,,,,,2020-01-22 17:00:00
4,,,Gansu,Mainland China,,,,,,,,,,2020-01-22 17:00:00


In [44]:
# Column names for the global report in source-file order.
# NOTE(review): this rebinds the `cols` name defined for the US report, and no
# visible cell in the Global section reads it (the report is built with
# standardize_daily_reports instead). Looks like a leftover from a positional
# `names=cols` approach - confirm before relying on it.
cols = [
    'FIPS',
    'county',
    'province_state',
    'country_region',
    'last_update',
    'latitude',
    'longitude',
    'confirmed',
    'deaths',
    'recovered',
    'active',
    'combined_key',
    'incident_rate',
    'case_fatality_ratio'
]

In [45]:
# pandas dtypes for the global report, keyed by the names in `cols`.
# NOTE(review): this rebinds the `dtypes` name defined for the US report, and
# no visible cell in the Global section reads it (dtypes come from
# col_mapping_global instead) - confirm whether it is still needed.
dtypes = {
    'FIPS': 'float64',
    'county': 'object',
    'province_state': 'object',
    'country_region': 'object',
    # 'last_update': 'datetime64', # use parse_dates argument in pd.read_csv
    'latitude': 'float64',
    'longitude': 'float64',
    'confirmed': 'float64',
    'deaths': 'float64',
    'recovered': 'float64',
    'active': 'float64',
    'combined_key': 'object',
    'incident_rate': 'float64',
    'case_fatality_ratio': 'float64'
}

In [46]:
# (column name, BigQuery type) pairs passed to loader.load_df for the global report.
# NOTE(review): the order here differs from col_order_global, which is the
# column order of daily_report_global: 'last_update' is 5th and 'combined_key'
# 12th here, while col_order_global has 'combined_key' 5th and 'last_update'
# last. Whether that matters depends on whether Loader.load_df matches columns
# by name or by position - confirm, and align the two if it is positional.
schema = [
    ('FIPS', 'INT64'),
    ('county', 'STRING'),
    ('province_state', 'STRING'),
    ('country_region', 'STRING'),
    ('last_update', 'TIMESTAMP'),
    ('latitude', 'FLOAT64'),
    ('longitude', 'FLOAT64'),
    ('confirmed', 'INT64'),
    ('deaths', 'INT64'),
    ('recovered', 'INT64'),
    ('active', 'INT64'),
    ('combined_key', 'STRING'),
    ('incident_rate', 'FLOAT64'),
    ('case_fatality_ratio', 'FLOAT64')
]

In [41]:
daily_report_global.head()

Unnamed: 0,FIPS,county,province_state,country_region,last_update,latitude,longitude,confirmed,deaths,recovered,active,combined_key,incident_rate,case_fatality_ratio
0,,,,Afghanistan,2020-11-16 05:25:57,33.93911,67.709953,43240.0,1617.0,35092.0,6531.0,Afghanistan,111.075887,3.739593
1,,,,Albania,2020-11-16 05:25:57,41.1533,20.1683,27830.0,623.0,12889.0,14318.0,Albania,967.058169,2.238591
2,,,,Algeria,2020-11-16 05:25:57,28.0339,1.6596,67679.0,2154.0,44633.0,20892.0,Algeria,154.338404,3.182671
3,,,,Andorra,2020-11-16 05:25:57,42.5063,1.5218,5872.0,76.0,4747.0,1049.0,Andorra,7599.818805,1.294278
4,,,,Angola,2020-11-16 05:25:57,-11.2027,17.8739,13451.0,322.0,6444.0,6685.0,Angola,40.92646,2.393874


In [13]:
# Load the global frame into the table partition for date `d` ('$YYYYMMDD'
# suffix on the table name).
# NOTE(review): uses whichever `schema` was bound last in the kernel; make sure
# the global schema cell has been run after the US one.
load_job = loader.load_df(
    daily_report_global,
    'stanleysfang.surveillance_2019_ncov.csse_covid_19_daily_reports_global${}'.format(d.strftime('%Y%m%d')),
    schema=schema,
    time_partitioning=True
)

In [14]:
load_job.result()

<google.cloud.bigquery.job.load.LoadJob at 0x7f02038c52b0>

### OOP

In [15]:
class CSSECovid19DailyReports:
    def __init__(self, client=None, run_project="stanleysfang"):
        """Bind the BigQuery client and build the wrapper helpers around it.

        Args:
            client: Existing BigQuery client to reuse. When it is falsy
                (e.g. ``None``) a new client is created for ``run_project``.
            run_project: Project used only when a new client has to be created.
        """
        # Truthiness check on purpose: any falsy `client` triggers creation.
        self.client = client if client else bigquery.Client(project=run_project)

        # The effective project always comes from the client actually in use,
        # so a passed-in client takes precedence over `run_project`.
        self.run_project = self.client.project

        shared = self.client
        self.qr = QueryRunner(client=shared)
        self.loader = Loader(client=shared)
        self.extractor = Extractor(client=shared)
    
    def update_us(self, dt, end_dt=None, destination_table='stanleysfang.surveillance_2019_ncov.csse_covid_19_daily_reports_us'):
        """Download US daily reports and load each one into its table partition.

        For every requested date the CSV ``MM-DD-YYYY.csv`` is read from the
        US daily reports folder, its columns are relabelled by position, and
        the frame is loaded into ``destination_table$YYYYMMDD``. Progress is
        printed per date.

        Args:
            dt: A date as ``"YYYY-mm-dd"`` string, ``datetime.date`` or
                ``datetime.datetime``, or a list of these.
            end_dt: Optional end date (same scalar types). When both ``dt``
                and ``end_dt`` are scalars, every day from ``dt`` to
                ``end_dt`` inclusive is loaded. Ignored when ``dt`` is a list.
            destination_table: Fully qualified table name without partition
                suffix.

        Raises:
            AssertionError: if ``dt`` (or an element of a ``dt`` list) has an
                unsupported type.
            ValueError: if a date string is not in ``"YYYY-mm-dd"`` format.
        """
        type_msg = 'dt must be a str in "YYYY-mm-dd" format or a datetime.date/datetime.datetime object or a list of these'
        # datetime.datetime (and pandas.Timestamp) are subclasses of
        # datetime.date, so this also covers everything accepted before.
        date_like = (str, datetime.date)

        dt_list = None
        if isinstance(dt, date_like) and isinstance(end_dt, date_like):
            dt_list = pd.date_range(start=dt, end=end_dt).tolist()
        elif isinstance(dt, date_like):
            dt_list = [dt]
        elif isinstance(dt, list):
            dt_list = dt

        assert isinstance(dt_list, list), type_msg

        # Validate and normalise every entry before the first load, so a bad
        # element cannot leave the table updated for only part of the request.
        report_dates = []
        for d in dt_list:
            assert isinstance(d, date_like), type_msg
            if isinstance(d, str):
                d = datetime.datetime.strptime(d, '%Y-%m-%d')
            report_dates.append(d)

        url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports_us/'
        # Applied by position (header=0, names=cols), so the order must match
        # the column order of the CSV.
        cols = [
            'province_state',
            'country_region',
            'last_update',
            'latitude',
            'longitude',
            'confirmed',
            'deaths',
            'recovered',
            'active',
            'FIPS',
            'incident_rate',
            'total_test_results',
            'people_hospitalized',
            'case_fatality_ratio',
            'UID',
            'iso3',
            'testing_rate',
            'hospitalization_rate'
        ]
        dtypes = {
            'province_state': 'object',
            'country_region': 'object',
            # 'last_update': 'datetime64', # use parse_dates argument in pd.read_csv
            'latitude': 'float64',
            'longitude': 'float64',
            'confirmed': 'float64',
            'deaths': 'float64',
            'recovered': 'float64',
            'active': 'float64',
            'FIPS': 'float64',
            'incident_rate': 'float64',
            'total_test_results': 'float64',
            'people_hospitalized': 'float64',
            'case_fatality_ratio': 'float64',
            'UID': 'float64',
            'iso3': 'object',
            'testing_rate': 'float64',
            'hospitalization_rate': 'float64',
        }
        schema = [
            ('province_state', 'STRING'),
            ('country_region', 'STRING'),
            ('last_update', 'TIMESTAMP'),
            ('latitude', 'FLOAT64'),
            ('longitude', 'FLOAT64'),
            ('confirmed', 'INT64'),
            ('deaths', 'INT64'),
            ('recovered', 'INT64'),
            ('active', 'INT64'),
            ('FIPS', 'INT64'),
            ('incident_rate', 'FLOAT64'),
            ('total_test_results', 'INT64'),
            ('people_hospitalized', 'INT64'),
            ('case_fatality_ratio', 'FLOAT64'),
            ('UID', 'INT64'),
            ('iso3', 'STRING'),
            ('testing_rate', 'FLOAT64'),
            ('hospitalization_rate', 'FLOAT64'),
        ]

        for d in report_dates:
            print('Updating ' + d.strftime('%Y-%m-%d') + ' ... ', end='', flush=True)
            daily_report_us = pd.read_csv(
                url + d.strftime('%m-%d-%Y') + '.csv',
                header=0, names=cols,
                dtype=dtypes, parse_dates=['last_update']
            )
            load_job = self.loader.load_df(
                daily_report_us,
                '{destination_table}${partition}'.format(destination_table=destination_table, partition=d.strftime('%Y%m%d')),
                schema=schema,
                time_partitioning=True
            )
            # Block until the load finishes so 'Done' is only printed on success.
            load_job.result()
            print('Done')
    
    def update_global(self, dt, end_dt=None, destination_table='stanleysfang.surveillance_2019_ncov.csse_covid_19_daily_reports_global'):


In [16]:
# Reuse the notebook-level client instead of letting the class create its own.
daily_reports = CSSECovid19DailyReports(client)

In [18]:
# Backfill the US table: one load per day for the whole range, both end dates included.
daily_reports.update_us('2020-06-21', '2020-11-15')

Updating 2020-06-21 ... Done
Updating 2020-06-22 ... Done
Updating 2020-06-23 ... Done
Updating 2020-06-24 ... Done
Updating 2020-06-25 ... Done
Updating 2020-06-26 ... Done
Updating 2020-06-27 ... Done
Updating 2020-06-28 ... Done
Updating 2020-06-29 ... Done
Updating 2020-06-30 ... Done
Updating 2020-07-01 ... Done
Updating 2020-07-02 ... Done
Updating 2020-07-03 ... Done
Updating 2020-07-04 ... Done
Updating 2020-07-05 ... Done
Updating 2020-07-06 ... Done
Updating 2020-07-07 ... Done
Updating 2020-07-08 ... Done
Updating 2020-07-09 ... Done
Updating 2020-07-10 ... Done
Updating 2020-07-11 ... Done
Updating 2020-07-12 ... Done
Updating 2020-07-13 ... Done
Updating 2020-07-14 ... Done
Updating 2020-07-15 ... Done
Updating 2020-07-16 ... Done
Updating 2020-07-17 ... Done
Updating 2020-07-18 ... Done
Updating 2020-07-19 ... Done
Updating 2020-07-20 ... Done
Updating 2020-07-21 ... Done
Updating 2020-07-22 ... Done
Updating 2020-07-23 ... Done
Updating 2020-07-24 ... Done
Updating 2020-