# IMPORT AND SETUP

In [1]:
#Utility
import urllib
import os
import datetime
import logging
from io import BytesIO
import time

#Data Science
import pandas as pd
import numpy as np

#Google API
from google.cloud import bigquery
from google.cloud import storage # Imports the Google Cloud storage library

In [2]:
# Execution target switch read by the setup cell: 'cloud' starts a Spark
# session on YARN, any other value takes the local-credentials branch.
deployment = 'local' #local or cloud

In [3]:
if deployment == 'cloud':
    from pyspark.sql import SparkSession #ONLY FOR CLOUD DEPLOYMENT
    #Start spark session
    spark = SparkSession \
        .builder \
        .config("spark.jars.packages", "com.google.cloud.spark:spark-bigquery-with-dependencies_2.11:0.17.0")\
        .master('yarn') \
        .appName('spark-bigquery-ryder') \
        .getOrCreate()
else:
    #Set credentials !FOR LOCAL ONLY, DON'T COPY TO PYSPARK
    #The same key file used to be assigned once per client; one assignment
    #made before either client is created covers both
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="covid-jul25-**************.json"

#Instantiate the clients once, after the branch above has prepared the environment
#BigQuery client
bigquery_client = bigquery.Client()
#Cloud Storage client
storage_client = storage.Client()

In [4]:
#Working environment: project, compute location and storage bucket
PROJECT_ID = 'covid-jul25'
REGION = 'us-west3'
ZONE = REGION + '-a'
#'{BUCKET_NAME}' stays in the value as literal text; no formatting is applied to it
BUCKET = REGION + '-{BUCKET_NAME}'
BUCKET_LINK = 'gs://' + BUCKET

# UPDATE STATE DATA ON BIGQUERY FOR REFERENCE

## Exclusion lists

In [5]:
#Territories left out of the tables, keyed by postal abbreviation
_excluded_territories = {
    'AS': 'American Samoa',
    'VI': 'United States Virgin Islands',
    'MP': 'Commonwealth of the Northern Mariana Islands',
    'GU': 'Guam',
    'PR': 'Puerto Rico',
}
#Full names and abbreviations, in matching order
excludestate = list(_excluded_territories.values())
excludestateabbr = list(_excluded_territories)

## Countyarea table

In [6]:
#Construct countyarea
# Drops and recreates `usprojections.countyarea` from two BigQuery public
# tables: every county row of geo_us_boundaries.counties is kept (RIGHT JOIN)
# and matched to utility_us.us_states_area on state_fips_code.
# Derived columns: lat_long ("lat,lon" text) and statecounty ("state-county" text).
query_job = bigquery_client.query(
    """
    DROP TABLE IF EXISTS `covid-jul25.usprojections.countyarea`;
    CREATE TABLE `covid-jul25.usprojections.countyarea` AS
    (SELECT DISTINCT region_code, division_code, A.state_fips_code, state_name, state_abbreviation , county_name, concat(int_point_lat,',',int_point_lon) as lat_long, concat(state_name,'-',county_name) as statecounty
    FROM `bigquery-public-data.utility_us.us_states_area` as A
    RIGHT JOIN
    (SELECT DISTINCT county_name, state_fips_code,
    int_point_lat, int_point_lon,
    FROM `bigquery-public-data.geo_us_boundaries.counties`) as B
    ON A.state_fips_code = B.state_fips_code)
    """
)
results = query_job.result()  # Waits for job to complete.

# Load the rebuilt table into pandas; the query uses the legacy SQL dialect,
# hence the [project.dataset.table] bracket syntax.
sql = """
    SELECT *
    FROM [covid-jul25.usprojections.countyarea]
"""
countyarea = pd.read_gbq(sql, dialect='legacy')

Downloading: 100%|█████████████████████████████████████████████████████████████| 3233/3233 [00:00<00:00, 3978.58rows/s]


## Create state dictionary

In [7]:
#State abbrev list
#Distinct (state_name, state_abbreviation) pairs found in countyarea
stateabbrlist = countyarea.loc[:,['state_name','state_abbreviation']].drop_duplicates()

#Filter the pairs once, row by row. Filtering the names and the abbreviations
#separately and zipping the two lists pairs them up wrongly as soon as one
#exclusion list matches a row that the other one misses.
_keep = ~(stateabbrlist['state_name'].isin(excludestate)
          | stateabbrlist['state_abbreviation'].isin(excludestateabbr))
#A pair with a missing name or abbreviation cannot form a usable mapping
_statepairs = stateabbrlist[_keep].dropna()
statelist = _statepairs['state_name'].to_list()
stateabb = _statepairs['state_abbreviation'].to_list()

#statedict: abbreviation -> state name; rstatedict: state name -> abbreviation
statedict = dict(zip(stateabb, statelist))
rstatedict = dict(zip(statelist,stateabb))

## Write the filtered table (excluded territories removed) back to BigQuery

In [8]:
#Excluding some states
#.copy() makes the filtered result an independent frame, so the column added
#below is not an assignment on a slice (pandas SettingWithCopyWarning)
countyarea = countyarea[~countyarea['state_name'].isin(excludestate)].copy()
#Add region ISO
countyarea['region'] = 'US-' + countyarea['state_abbreviation']
#Write back to BigQuery, replacing the table
countyarea.to_gbq('usprojections.countyarea',if_exists='replace')

1it [00:03,  3.79s/it]


# UPDATE RT RESULTS

## Read from https://rt.live/ and create the temp_rt table for the update

In [None]:
#Download the rt.live estimates into a frame
rtdf = pd.read_csv('https://d14wlfuexuxgcm.cloudfront.net/covid/rt.csv')
#Stamp the load time and put region in ISO format ('US-' prefix)
rtdf = rtdf.assign(
    update_time=datetime.datetime.now(),
    region='US-' + rtdf['region'],
)
#Replace the staging table
rtdf.to_gbq('usprojections.temp_rt',if_exists='replace')

## Backup script to recreate table

In [None]:
# #RECREATE THE RT_RESULTS TABLE IF NEEDS TO
# DROP TABLE IF EXISTS `covid-jul25.usprojections.rt_results`;
# CREATE TABLE `covid-jul25.usprojections.rt_results` AS
# SELECT * except(date), cast(date as date) as date FROM `covid-jul25.usprojections.temp_rt`

# #LATEST RESULTS
# DROP TABLE IF EXISTS `covid-jul25.usprojections.latest_rt_results`;
# CREATE TABLE `covid-jul25.usprojections.latest_rt_results` AS
# (SELECT * except(row) FROM
# (SELECT * except(date), cast(date as date) as date, row_number() OVER(PARTITION BY region ORDER BY cast(date as date) DESC) row
# FROM `covid-jul25.usprojections.temp_rt`)
# WHERE row = 1)

## Update the static rt_results table using temp

In [None]:
# Refresh two static tables from the staging table temp_rt in one script:
#   rt_results        - every staged row, with `date` cast to DATE
#   latest_rt_results - only the most recent row per region (row_number = 1
#                       when ordered by date descending)
# Each table is emptied with DELETE and refilled with INSERT, so the table
# itself stays in place.
query_job = bigquery_client.query(
"""
#ALL rt
DELETE FROM `covid-jul25.usprojections.rt_results` WHERE True;
INSERT INTO `covid-jul25.usprojections.rt_results`
SELECT * except(date), cast(date as date) as date FROM `covid-jul25.usprojections.temp_rt`;

#ONLY latest rt
DELETE FROM `covid-jul25.usprojections.latest_rt_results` WHERE True;
INSERT INTO `covid-jul25.usprojections.latest_rt_results`
(SELECT * except(row) FROM
(SELECT * except(date), cast(date as date) as date, row_number() OVER(PARTITION BY region ORDER BY cast(date as date) DESC) row
FROM `covid-jul25.usprojections.temp_rt`)
WHERE row = 1);
""")
# Block until the script has finished
results = query_job.result()

## Update the static rt_duration table with rankings and additional metrics (e.g. days with Rt at or above 1)

In [None]:
# Empty and refill rt_duration with two rows per region (UNION ALL):
#   type = 'latest'   - the rt_results values on the region's most recent date
#   type = 'earliest' - the rt_results values on the region's first date
# Subquery M holds the per-region aggregates: first and last date, the span in
# days (`duration`), the number of rows whose mean is below 1 (`rt_under`) and
# the number of remaining rows (`rt_over`), plus both counts divided by the span.
# `rank` orders the rows by mean, highest first, separately in each half of the union.
query_job = bigquery_client.query(
    """
    -- DROP TABLE IF EXISTS `covid-jul25.usprojections.rt_duration`
    -- CREATE TABLE `covid-jul25.usprojections.rt_duration` AS
    DELETE FROM `covid-jul25.usprojections.rt_duration` WHERE True;
    INSERT INTO `covid-jul25.usprojections.rt_duration`
    SELECT mean as d_mean , median as d_median, lower_80 as d_lower_80, upper_80 as d_upper_80, M.max_date as date, M.* except(max_date), 'latest' as type,
    RANK() OVER(ORDER BY mean DESC) rank
    FROM `covid-jul25.usprojections.rt_results` as S
    RIGHT JOIN
    (SELECT MAX(date) as max_date, MIN(date) as min_date, DATE_DIFF(MAX(date),MIN(date),DAY)+1 as duration, region,
    SUM(CASE WHEN mean < 1 THEN 0 ELSE 1 END)/(DATE_DIFF(MAX(date),MIN(date),DAY)+1) as rt_over_ratio, 
    SUM(CASE WHEN mean < 1 THEN 1 ELSE 0 END)/(DATE_DIFF(MAX(date),MIN(date),DAY)+1) as rt_under_ratio,
    SUM(CASE WHEN mean < 1 THEN 0 ELSE 1 END) as rt_over,
    SUM(CASE WHEN mean < 1 THEN 1 ELSE 0 END) as rt_under
    FROM `covid-jul25.usprojections.rt_results`
    GROUP BY region) as M
    ON S.region = M.region
    AND S.date = M.max_date

    UNION ALL

    SELECT mean as d_mean , median as d_median, lower_80 as d_lower_80, upper_80 as d_upper_80, M.min_date as date, M.* except(min_date), 'earliest' as type,
    RANK() OVER(ORDER BY mean DESC) rank
    FROM `covid-jul25.usprojections.rt_results` as S
    RIGHT JOIN
    (SELECT MAX(date) as max_date, MIN(date) as min_date, DATE_DIFF(MAX(date),MIN(date),DAY)+1 as duration, region,
    SUM(CASE WHEN mean < 1 THEN 0 ELSE 1 END)/(DATE_DIFF(MAX(date),MIN(date),DAY)+1) as rt_over_ratio, 
    SUM(CASE WHEN mean < 1 THEN 1 ELSE 0 END)/(DATE_DIFF(MAX(date),MIN(date),DAY)+1) as rt_under_ratio,
    SUM(CASE WHEN mean < 1 THEN 0 ELSE 1 END) as rt_over,
    SUM(CASE WHEN mean < 1 THEN 1 ELSE 0 END) as rt_under
    FROM `covid-jul25.usprojections.rt_results`
    GROUP BY region) as M
    ON S.region = M.region
    AND S.date = M.min_date
    """)
# Block until the script has finished
results = query_job.result()

# UPDATE CASE NUMBERS FROM JOHNS HOPKINS

In [9]:
#Johns Hopkins CSSE time series: confirmed and deaths come from the US files,
#recovered from the global file
_csse_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_{}.csv"
confirmed, deaths, recovered = (
    pd.read_csv(_csse_url.format(series))
    for series in ('confirmed_US', 'deaths_US', 'recovered_global')
)

## Formatting the dataframe for export to bigquery

In [10]:
#Metadata columns of the JHU files that are not carried over
xlist = ['UID', 'iso2', 'iso3', 'code3','Country_Region', 'Lat', 'Long_','Combined_Key']


def _index_by_lookup(raw, dropcols):
    """Return a copy of raw without dropcols, indexed by county+state.

    A 'Lookup' column (Admin2 concatenated with Province_State) is appended
    as the last column and its values become the row index, so one county
    can be fetched with ``.loc[lookup]``.

    The index is assigned directly. The earlier transpose / relabel /
    transpose round trip produced the same labels, but copied the whole
    frame twice and turned every column into object dtype on the way.
    """
    frame = raw.copy()
    frame['Lookup'] = frame['Admin2'] + frame['Province_State']
    frame = frame.loc[:, [col for col in frame.columns if col not in dropcols]]
    frame.index = frame['Lookup'].to_list()
    return frame


#Formatting confirmed as df ready for import for maindf
df = _index_by_lookup(confirmed, xlist)

#Formatting deaths as df1 ready for import for maindf
df1 = _index_by_lookup(deaths, xlist)

#Date columns: what is left of df1 after the four leading non-date columns and
#the trailing 'Lookup'. (A first list derived from the confirmed frame used to
#be built and immediately overwritten; only this one was ever used.)
#NOTE(review): assumes the deaths file starts with FIPS, Admin2, Province_State, Population - confirm
includelist = df1.columns.to_list()
datelist = [col for col in includelist[4:] if col != 'Lookup']

#Create a lookup list to loop over for maindf
#value_counts() leaves out rows whose lookup is missing
lookuplist = sorted(df1['Lookup'].value_counts().index.to_list())

## Constructing main dataframe and write to bigquery

In [11]:
#Construction of maindf
colnames = ['lookup','state','county','FIPS','population','date','confirmed','deaths']

#Pre-allocate one row per (county, date)
maindf = pd.DataFrame(index=range(0,len(lookuplist)*len(datelist)), columns=colnames)
#Column positions for .iloc, looked up once instead of on every iteration
colpos = {name: pos for pos, name in enumerate(colnames)}

#Fill one block of consecutive rows per county
j = 0
for i in lookuplist:
    testdf = df.loc[i,:]
    testdf1 = df1.loc[i,:]

    #Confirmed cases: everything between the three leading columns and the trailing 'Lookup'
    timeseries = testdf.iloc[3:-1]
    date = timeseries.index.to_list()
    timeseries = timeseries.to_list()
    length = len(timeseries)

    #Deaths: the deaths rows carry one more leading column (read as population below)
    timeseries1 = testdf1.iloc[4:-1].to_list()

    #Explicit .iloc access: plain integer keys such as testdf[2] on a
    #label-indexed Series are deprecated positional lookups in pandas
    rows = slice(j, j+length)
    maindf.iloc[rows,colpos['lookup']] = i
    maindf.iloc[rows,colpos['state']] = testdf.iloc[2]
    maindf.iloc[rows,colpos['county']] = testdf.iloc[1]
    maindf.iloc[rows,colpos['FIPS']] = testdf.iloc[0]
    maindf.iloc[rows,colpos['population']] = testdf1.iloc[3]
    maindf.iloc[rows,colpos['date']] = date
    maindf.iloc[rows,colpos['confirmed']] = timeseries
    maindf.iloc[rows,colpos['deaths']] = timeseries1
    j = j+length

maindf['date1'] = pd.to_datetime(maindf['date'],format='%m/%d/%y')
#Row key: county+state lookup followed by the ISO date
maindf['lookup'] = maindf['lookup'] + maindf['date1'].astype(str).tolist()

#Use the row key as index. Assigned directly: transposing the frame twice to
#relabel it copied every cell two times for the same result
maindf.index = maindf['lookup'].to_list()

#Special cases for certain county
#Applied before statecounty is built, so the key used to join against
#countyarea carries the corrected spelling as well (it used to be applied
#afterwards and only ever changed the county column).
#NOTE(review): presumably countyarea spells the county 'Doña Ana' - confirm
maindf['county'] = maindf['county'].replace('Dona Ana','Doña Ana')
maindf['statecounty'] = maindf['state'] + '-' + maindf['county']

#converting datatypes
maindf = maindf.infer_objects()
print(maindf.dtypes)

convert_dict = {'lookup': str,
                'state': str,
                'county': str}
maindf = maindf.astype(convert_dict)

#Remove some states
#.copy() so that the region column is added to an independent frame
maindf = maindf[~maindf['state'].isin(excludestate)].copy()
#A state without an entry in rstatedict raises KeyError here
maindf['region'] = ['US-' + rstatedict[state_name] for state_name in maindf['state']]

#Write to bigquery
maindf.to_gbq('usprojections.temp_cases',if_exists='replace')

lookup                 object
state                  object
county                 object
FIPS                  float64
population              int64
date                   object
confirmed               int64
deaths                  int64
date1          datetime64[ns]
statecounty            object
dtype: object


1it [02:57, 177.53s/it]


## Update static cases table 

In [5]:
#Update the static cases table
# One multi-statement script; every target table is emptied (DELETE) and refilled (INSERT):
#   cases             - temp_cases with date1 taking the place of date, plus an empty lat_long
#   final_cases       - cases joined to countyarea on statecounty to pick up lat_long,
#                       plus deaths and confirmed per 1,000 population
#   cases_velocity    - final_cases plus the counts divided by `duration`, the row's
#                       position in the county's ascending date order
#   latest_cases_temp - newest row per statecounty, ranked by deaths and by
#                       confirmed cases per 1,000 population
# The commented-out DROP/CREATE lines inside the SQL are the alternative that
# recreates the tables instead of refilling them.
query_job = bigquery_client.query(
    """
    DELETE FROM `covid-jul25.usprojections.cases` WHERE True;
    INSERT INTO `covid-jul25.usprojections.cases`
    -- DROP TABLE IF EXISTS `covid-jul25.usprojections.cases`;
    -- CREATE TABLE `covid-jul25.usprojections.cases`
    (SELECT * except(date,date1), date1 as date, '' as lat_long
    FROM `covid-jul25.usprojections.temp_cases`);
    
    DELETE FROM `covid-jul25.usprojections.final_cases` WHERE True;
    INSERT INTO `covid-jul25.usprojections.final_cases`
    -- DROP TABLE IF EXISTS `covid-jul25.usprojections.final_cases`;
    -- CREATE TABLE `covid-jul25.usprojections.final_cases` AS
    SELECT *, deaths/population*1000 as death_per_1k, confirmed/population*1000 as confirmed_per_1k
    FROM
    (SELECT A.* except(lat_long),B.lat_long 
    FROM `covid-jul25.usprojections.cases` as A
    JOIN `covid-jul25.usprojections.countyarea` as B
    ON A.statecounty = B.statecounty);
    
    -- DROP TABLE IF EXISTS `covid-jul25.usprojections.cases_velocity`;
    -- CREATE TABLE `covid-jul25.usprojections.cases_velocity` AS
    DELETE FROM `covid-jul25.usprojections.cases_velocity` WHERE True;
    INSERT INTO `covid-jul25.usprojections.cases_velocity`
    SELECT *, confirmed/duration as confirmed_velocity, deaths/duration as deaths_velocity FROM
    (SELECT *, row_number() OVER(PARTITION BY statecounty ORDER BY date ASC) as duration
    FROM `covid-jul25.usprojections.final_cases`);
    
    DELETE FROM `covid-jul25.usprojections.latest_cases_temp` WHERE True;
    INSERT INTO `covid-jul25.usprojections.latest_cases_temp`
    -- DROP TABLE IF EXISTS `covid-jul25.usprojections.latest_cases`;
    -- CREATE TABLE `covid-jul25.usprojections.latest_cases` AS
    SELECT * except(row),
    rank() OVER (ORDER BY death_per_1k DESC) death_rank,
    rank() OVER (ORDER BY confirmed_per_1k DESC) confirmed_rank
    FROM
    (SELECT *, row_number() OVER(PARTITION BY statecounty ORDER BY date DESC) row
    FROM `covid-jul25.usprojections.final_cases`)
    WHERE row = 1
    """)
results = query_job.result()  # Waits for job to complete.

BadRequest: 400 Query error: Inserted row has wrong column count; Has 15, expected 17 at [32:5]

(job ID: a596a5db-2f06-49c1-a914-833e90375345)

                                    -----Query Job SQL Follows-----                                     

    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |
   1:
   2:    DELETE FROM `covid-jul25.usprojections.cases` WHERE True;
   3:    INSERT INTO `covid-jul25.usprojections.cases`
   4:    -- DROP TABLE IF EXISTS `covid-jul25.usprojections.cases`;
   5:    -- CREATE TABLE `covid-jul25.usprojections.cases`
   6:    (SELECT * except(date,date1), date1 as date, '' as lat_long
   7:    FROM `covid-jul25.usprojections.temp_cases`);
   8:    
   9:    DELETE FROM `covid-jul25.usprojections.final_cases` WHERE True;
  10:    INSERT INTO `covid-jul25.usprojections.final_cases`
  11:    -- DROP TABLE IF EXISTS `covid-jul25.usprojections.final_cases`;
  12:    -- CREATE TABLE `covid-jul25.usprojections.final_cases` AS
  13:    SELECT *, deaths/population*1000 as death_per_1k, confirmed/population*1000 as confirmed_per_1k
  14:    FROM
  15:    (SELECT A.* except(lat_long),B.lat_long 
  16:    FROM `covid-jul25.usprojections.cases` as A
  17:    JOIN `covid-jul25.usprojections.countyarea` as B
  18:    ON A.statecounty = B.statecounty);
  19:    
  20:    -- DROP TABLE IF EXISTS `covid-jul25.usprojections.cases_velocity`;
  21:    -- CREATE TABLE `covid-jul25.usprojections.cases_velocity` AS
  22:    DELETE FROM `covid-jul25.usprojections.cases_velocity` WHERE True;
  23:    INSERT INTO `covid-jul25.usprojections.cases_velocity`
  24:    SELECT *, confirmed/duration as confirmed_velocity, deaths/duration as deaths_velocity FROM
  25:    (SELECT *, row_number() OVER(PARTITION BY statecounty ORDER BY date ASC) as duration
  26:    FROM `covid-jul25.usprojections.final_cases`);
  27:    
  28:    DELETE FROM `covid-jul25.usprojections.latest_cases` WHERE True;
  29:    INSERT INTO `covid-jul25.usprojections.latest_cases`
  30:    -- DROP TABLE IF EXISTS `covid-jul25.usprojections.latest_cases`;
  31:    -- CREATE TABLE `covid-jul25.usprojections.latest_cases` AS
  32:    SELECT * except(row),
  33:    rank() OVER (ORDER BY death_per_1k DESC) death_rank,
  34:    rank() OVER (ORDER BY confirmed_per_1k DESC) confirmed_rank
  35:    FROM
  36:    (SELECT *, row_number() OVER(PARTITION BY statecounty ORDER BY date DESC) row
  37:    FROM `covid-jul25.usprojections.final_cases`)
  38:    WHERE row = 1
  39:    
    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |

In [6]:
#Read from latest_cases_temp table
sql = """
    SELECT *
    FROM [covid-jul25.usprojections.latest_cases_temp]
    """
lastestcases = pd.read_gbq(sql, dialect='legacy')

#Clear the latest_cases table
# DELETE removes the rows but leaves the table and its columns in place
query_job = bigquery_client.query(
    """DELETE FROM `covid-jul25.usprojections.latest_cases` WHERE True""")
results = query_job.result()

#Write to BigQuery
# NOTE(review): presumably appended from pandas because latest_cases has more
# columns than latest_cases_temp (the arima_* forecast columns are set on it
# later in this file) - confirm
lastestcases.to_gbq('usprojections.latest_cases',if_exists='append')

Downloading: 100%|█████████████████████████████████████████████████████████████| 3135/3135 [00:00<00:00, 3699.79rows/s]
1it [00:04,  4.43s/it]


# UPDATE MOBILITY DATA

## Create STATE temp table to hold updated data

In [12]:
#Create a STATE LEVEL table to hold the updated data
# Empties and refills mobility_state from the public Google mobility report:
# US rows only, the five excluded territories removed, and only rows whose
# sub_region_2 is NULL (state level). sub_region_1, sub_region_2 and
# iso_3166_2_code are renamed to state, county and region, and the load time
# is added as mobility_update_time.
query_job = bigquery_client.query(
    """
    -- DROP TABLE IF EXISTS `covid-jul25.usprojections.mobility_state`;
    -- CREATE TABLE `covid-jul25.usprojections.mobility_state` AS
    DELETE FROM `covid-jul25.usprojections.mobility_state` WHERE True;
    INSERT INTO `covid-jul25.usprojections.mobility_state`
    #STATE LEVEL
    SELECT *, CURRENT_TIMESTAMP() as mobility_update_time FROM
    (SELECT sub_region_1 as state, sub_region_2 as county, iso_3166_2_code as region, * except(sub_region_1,sub_region_2,iso_3166_2_code) FROM `bigquery-public-data.covid19_google_mobility.mobility_report`
    WHERE country_region_code = 'US'
    AND sub_region_1 NOT IN ('American Samoa','United States Virgin Islands','Commonwealth of the Northern Mariana Islands','Guam','Puerto Rico')
    AND sub_region_2 is null); #State level is null, county level is not null
    """)
# Block until the script has finished
results = query_job.result()

In [23]:
# #Mobility construction for each type
# mobilitylonglist = ['retail_and_recreation_percent_change_from_baseline','grocery_and_pharmacy_percent_change_from_baseline','parks_percent_change_from_baseline',
# 'transit_stations_percent_change_from_baseline','workplaces_percent_change_from_baseline','residential_percent_change_from_baseline']
# mobilityshortlist = ['retail & recreation','grocery & pharmacy','parks','transit','workplaces','residential']
# mobilitytablelist = ['retail_recreation','grocery_pharmacy','parks','transit','workplaces','residential']

# #dictionary with tablename as key and tuples of long and short name for values
# mobilitydict = dict(zip(mobilitytablelist,zip(mobilitylonglist,mobilityshortlist)))

In [32]:
# #Looping through the tables
# for key in mobilitydict:
#     query_job = bigquery_client.query(
#         f"""
#         DROP TABLE IF EXISTS `covid-jul25.usprojections.mobility_{key}`;
#         CREATE TABLE `covid-jul25.usprojections.mobility_{key}` AS
#         #DELETE FROM `covid-jul25.usprojections.mobility_{key}` WHERE True;
#         #INSERT INTO `covid-jul25.usprojections.mobility_{key}`
#         SELECT state, region, date, mobility_update_time, {mobilitydict[key][0]} as %change from baseline, '{mobilitydict[key][1]}' as type
#         FROM `covid-jul25.usprojections.mobility_state`
#         """)
#     results = query_job.result()

In [None]:
# Empties and refills mobility_arima_join with the US rows of the public
# Google mobility report that have no sub_region_2 and a non-NULL
# iso_3166_2_code; that code is moved to the end of the row and renamed region.
query_job = bigquery_client.query(
    """
    -- DROP TABLE `covid-jul25.usprojections.mobility_arima_join`;
    -- CREATE TABLE `covid-jul25.usprojections.mobility_arima_join` AS
    DELETE FROM `covid-jul25.usprojections.mobility_arima_join` WHERE True;
    INSERT INTO `covid-jul25.usprojections.mobility_arima_join`
    (SELECT * except(iso_3166_2_code), iso_3166_2_code as region FROM `bigquery-public-data.covid19_google_mobility.mobility_report`
    WHERE country_region_code = 'US'
    AND sub_region_2 is null
    AND iso_3166_2_code is not null)
""")
# Block until the script has finished
results = query_job.result()

# UPDATE STATIC STATS

In [13]:
# National figures: the plain average, over the rows of latest_cases, of the
# per-1,000-population death and confirmed rates. The query returns one row.
sql = """
    #Static stats updated daily
    SELECT avg(death_per_1k) as national_avg_death_per_1k,
    avg(confirmed_per_1k) as national_avg_confirmed_per_1k
    FROM [covid-jul25.usprojections.latest_cases]
    """
staticstats = pd.read_gbq(sql, dialect='legacy')

Downloading: 100%|█████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.08rows/s]


## Read rt latest data

In [14]:
#Latest Rt estimate per region
sql = """
    #Static stats updated daily
    SELECT *
    FROM [covid-jul25.usprojections.latest_rt_results]
    """
#Keep the region key plus the four estimate columns, prefix those with rt_
#and index the result by region
_rt_columns = ['mean','median','lower_80','upper_80']
rtstats = (
    pd.read_gbq(sql, dialect='legacy')
    .loc[:, ['region'] + _rt_columns]
    .rename(columns={name: 'rt_' + name for name in _rt_columns})
    .set_index('region')
)

Downloading: 100%|██████████████████████████████████████████████████████████████████| 51/51 [00:00<00:00, 151.74rows/s]


## Read state mobility latest data 

In [15]:
#State-level mobility rows
sql = """
    #Static stats updated daily
    SELECT *
    FROM [covid-jul25.usprojections.mobility_state]
    """

#The six mobility measures, each a percent change from baseline
mobilitylist = [
    category + '_percent_change_from_baseline'
    for category in ('retail_and_recreation', 'grocery_and_pharmacy', 'parks',
                     'transit_stations', 'workplaces', 'residential')
]

#Newest row per region: order by date within region and keep the last one,
#then index by region and keep only the six measures
mobilitystats = (
    pd.read_gbq(sql, dialect='legacy')
    .sort_values(['region','date'])
    .drop_duplicates(subset=['region'],keep='last')
    .set_index('region')
    .loc[:, mobilitylist]
)

Downloading: 100%|█████████████████████████████████████████████████████████████| 9282/9282 [00:02<00:00, 4414.67rows/s]


## Construct staticstatsdf on state level for static data

In [16]:
#One row per region found in countyarea; the two national averages are
#repeated on every row
_regions = list(countyarea['region'].unique())
staticstatsdf = pd.DataFrame({'region': _regions}).assign(
    national_avg_death_per_1k=staticstats.loc[0,'national_avg_death_per_1k'],
    national_avg_confirmed_per_1k=staticstats.loc[0,'national_avg_confirmed_per_1k'],
)
#Index the rows by region (the index is named 'index') while keeping region as a column
staticstatsdf = staticstatsdf.set_index(staticstatsdf['region'].rename('index'))

#Attach the Rt figures, then the mobility figures, matching on the region index
staticstatsdf = (
    staticstatsdf
    .join(rtstats, lsuffix='_caller', rsuffix='_other')
    .join(mobilitystats, lsuffix='_caller', rsuffix='_other')
)

In [17]:
#Clear table
# DELETE removes every row of latest_stats but keeps the table itself
query_job = bigquery_client.query(
    """
    DELETE FROM `covid-jul25.usprojections.latest_stats` WHERE True
    """)
results = query_job.result()

#Write to BigQuery
# Rows are appended to the table that was just emptied
staticstatsdf.to_gbq('usprojections.latest_stats',if_exists='append')

1it [00:03,  3.13s/it]


# UPDATE STATIC STATS FOR ARIMA NEXT 7 DAYS

In [None]:
#Read the raw ARIMA forecast tables from BigQuery (legacy SQL)
_forecast_query = "SELECT * FROM [covid-jul25.arimamodels.{}_US_forecast]"

sql = _forecast_query.format('confirmed')
rawconfirmeddf = pd.read_gbq(sql, dialect='legacy')

sql = _forecast_query.format('deaths')
rawdeathsdf = pd.read_gbq(sql, dialect='legacy')

In [None]:
#Forecast target: midnight UTC, seven days after today's date
today = datetime.date.today()
sd = datetime.datetime.combine(
    today + datetime.timedelta(days=7),  #7-day forecast
    datetime.time(tzinfo=datetime.timezone.utc),  #00:00 with the timezone set to UTC
)

## CONFIRMED cases next 7 days

In [None]:
#Keep the confirmed forecasts whose statecounty appears in countyarea and whose
#timestamp is exactly the 7-day target; only statecounty and forecast_value are kept
_known_county = rawconfirmeddf['statecounty'].isin(list(countyarea['statecounty']))
_on_target_day = rawconfirmeddf['forecast_timestamp'] == sd
confirmeddf = rawconfirmeddf.loc[_known_county & _on_target_day, ['statecounty','forecast_value']]

In [None]:
#Get state names: the text before the first '-' of statecounty
#Vectorised split; it also works when no forecast row was selected, a case
#in which the earlier pd.DataFrame(...)[0] lookup raised KeyError
confirmeddf = confirmeddf.assign(
    state=confirmeddf['statecounty'].str.split('-', n=1).str[0]
)

#Aggregate
# statefc_confirmed = confirmeddf.groupby('state').sum()
# statefc_confirmed['state']=statefc_confirmed.index #Set a state column

In [None]:
#Get state abbr
#Plain dictionary lookups: a state missing from rstatedict raises KeyError
stateabbr = [rstatedict[name] for name in confirmeddf['state']]
#Columns are renamed by name rather than by position, and region is built
#straight from the list, which also works for an empty frame (the earlier
#pd.DataFrame(...)[0] lookup raised KeyError there)
statefc_confirmed = (
    confirmeddf
    .drop(columns=['state'])
    .rename(columns={'forecast_value': 'confirmed_forecast'})
    .assign(region=['US-' + abbr for abbr in stateabbr])
)

## DEATHS cases next 7 days

In [None]:
#Keep the deaths forecasts whose statecounty appears in countyarea and whose
#timestamp is exactly the 7-day target; only statecounty and forecast_value are kept
_known_death_county = rawdeathsdf['statecounty'].isin(list(countyarea['statecounty']))
_death_on_target_day = rawdeathsdf['forecast_timestamp'] == sd
deathsdf = rawdeathsdf.loc[_known_death_county & _death_on_target_day, ['statecounty','forecast_value']]

In [None]:
#Get state names: the text before the first '-' of statecounty
#Vectorised split; it also works when no forecast row was selected, a case
#in which the earlier pd.DataFrame(...)[0] lookup raised KeyError
deathsdf = deathsdf.assign(
    state=deathsdf['statecounty'].str.split('-', n=1).str[0]
)

#Aggregate
# statefc_deaths = deathsdf.groupby('state').sum()
# statefc_deaths['state']=statefc_deaths.index #Set a state column

In [None]:
#Get state abbr
#Plain dictionary lookups: a state missing from rstatedict raises KeyError
stateabbr = [rstatedict[name] for name in deathsdf['state']]
#Columns are renamed by name rather than by position, and region is built
#straight from the list, which also works for an empty frame (the earlier
#pd.DataFrame(...)[0] lookup raised KeyError there)
statefc_deaths = (
    deathsdf
    .drop(columns=['state'])
    .rename(columns={'forecast_value': 'deaths_forecast'})
    .assign(region=['US-' + abbr for abbr in stateabbr])
)

## Write to BigQuery and Update

In [None]:
#Write to BigQuery
# Both per-county forecast tables are replaced on every run; they are the
# source tables of the UPDATE statements run against latest_cases.
statefc_confirmed.to_gbq('usprojections.arima_confirmed_statecounty',if_exists='replace')
statefc_deaths.to_gbq('usprojections.arima_deaths_statecounty',if_exists='replace')

In [None]:
#Update static stats
# Copies the 7-day forecasts into latest_cases, matching rows on statecounty:
# arima_confirmed_forecast from arima_confirmed_statecounty and
# arima_deaths_forecast from arima_deaths_statecounty.
query_job = bigquery_client.query(
    """
    UPDATE `covid-jul25.usprojections.latest_cases` as M
    SET M.arima_confirmed_forecast = S.confirmed_forecast
    FROM `covid-jul25.usprojections.arima_confirmed_statecounty` as S
    WHERE M.statecounty = S.statecounty;
    
    UPDATE `covid-jul25.usprojections.latest_cases` as M
    SET M.arima_deaths_forecast = S.deaths_forecast
    FROM `covid-jul25.usprojections.arima_deaths_statecounty` as S
    WHERE M.statecounty = S.statecounty;
    """)
results = query_job.result()  # Waits for job to complete.