# Elasticsearch Update Notebook

This notebook updates the Elasticsearch index with the latest datasets.

In [15]:
!pip install git+https://github.com/rbilleci/pandora.git

Collecting git+https://github.com/rbilleci/pandora.git
  Cloning https://github.com/rbilleci/pandora.git to /tmp/pip-req-build-7rg62u9x
  Running command git clone -q https://github.com/rbilleci/pandora.git /tmp/pip-req-build-7rg62u9x
Building wheels for collected packages: pandora
  Building wheel for pandora (setup.py) ... [?25ldone
[?25h  Created wheel for pandora: filename=pandora-0.1.0-py3-none-any.whl size=2681412 sha256=a526a1c68100d7873c82a77c5b24ae65468d05964da395955a471267c9ff7311
  Stored in directory: /tmp/pip-ephem-wheel-cache-623oq6zh/wheels/01/8b/d5/a72c927a738750e04a4bb4fd22f63b4b88c7b5871732e2d67b
Successfully built pandora


In [2]:
import boto3
import pandas as pd
import pandora
import pandora.data.age_distribution as age_dist
import pandora.data.oxford_data as oxford
import pandora.data.population as population
import pandora.data.temperatures as temperatures
import shutil
import os
import numpy as np
from pandora.data import geo, continent, country_code, working_day
from pandora import loader, encoders
from pandora.core_fields import DATE, COUNTRY_CODE
import datetime
from pathlib import Path
from logging import INFO, basicConfig, info
import warnings

In [3]:
# Logging: show INFO and above, as tab-separated timestamp / level / source file / message.
basicConfig(format='%(asctime)s\t%(levelname)s\t%(filename)s\t%(message)s', level=INFO)

# Silence FutureWarning noise (raised by scikit-learn).
warnings.filterwarnings(action='ignore', category=FutureWarning)

# Display settings: never truncate columns or rows when rendering a frame,
# and let DataFrame.info() list up to 1000 columns.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_info_columns', 1000)

# Download the data files

In [5]:
# Imported only for its side effect: judging by this cell's log output, importing
# the module downloads the Oxford data set and writes it to the oxford_data.csv
# file inside the installed pandora package. Nothing from the module is used by name.
import pandora.data.oxford_data_update

2021-02-09 18:02:40,676	INFO	oxford_data_update.py	download oxford data set
2021-02-09 18:02:44,637	INFO	oxford_data_update.py	writing to /opt/conda/lib/python3.7/site-packages/pandora/data/oxford_data.csv


# Generate Dataset

In [9]:

# Determine the last date present in the Oxford data set; this is the first
# day we begin predicting from. Only the 'date' column is needed for this, so
# only that column is parsed instead of the whole file.
prediction_start_date = pd.read_csv(oxford.module.location,
                                    usecols=['date'],
                                    keep_default_na=False,
                                    na_values='')['date'].max()
prediction_start_date = datetime.datetime.strptime(prediction_start_date, '%Y-%m-%d').date()

# Maximum number of days to create: builds out the dataframe with
# placeholder rows for the values we want to predict.
days_to_predict = 180

# The data range covers the ground truth data plus the time window we are predicting into.
start_date = datetime.date(2020, 1, 1)
end_date = prediction_start_date + datetime.timedelta(days=days_to_predict)

# The imputation window may extend far before or after the range we are
# predicting, so that more data samples are available for imputation calculations.
# The end is clamped so the window can never end before the data range does
# (which the fixed 2021-12-31 bound alone would allow once the Oxford data
# reaches mid-2021).
# NOTE(review): assumes loader.load expects the imputation window to cover
# [start_date, end_date] - confirm against pandora.loader.
imputation_window_start_date = datetime.date(2020, 1, 1)
imputation_window_end_date = max(datetime.date(2021, 12, 31), end_date)

# Load the data: geo.module is passed on its own, followed by the list of
# additional data modules to load.
df = loader.load(start_date,
                 end_date,
                 imputation_window_start_date,
                 imputation_window_end_date,
                 geo.module,
                 [
                     country_code.module,
                     continent.module,
                     population.module,
                     age_dist.module,
                     temperatures.module,
                     oxford.module,
                     working_day.module
                 ])

2021-02-09 18:07:42,754	INFO	loader.py	/opt/conda/lib/python3.7/site-packages/pandora/data/geo.csv - loading
2021-02-09 18:07:48,304	INFO	loader.py	/opt/conda/lib/python3.7/site-packages/pandora/data/country_code.csv - loading
2021-02-09 18:07:52,612	INFO	loader.py	/opt/conda/lib/python3.7/site-packages/pandora/data/continent.csv - loading
2021-02-09 18:07:57,070	INFO	loader.py	/opt/conda/lib/python3.7/site-packages/pandora/data/population.csv - loading
2021-02-09 18:08:01,835	INFO	imputer.py	/opt/conda/lib/python3.7/site-packages/pandora/data/population.csv - imputing population
2021-02-09 18:08:02,320	INFO	imputer.py	/opt/conda/lib/python3.7/site-packages/pandora/data/population.csv - imputing population_density
2021-02-09 18:08:02,808	INFO	imputer.py	/opt/conda/lib/python3.7/site-packages/pandora/data/population.csv - imputing population_percent_urban
2021-02-09 18:08:03,313	INFO	imputer.py	/opt/conda/lib/python3.7/site-packages/pandora/data/population.csv - imputing gdp_per_capita


# Add computed columns

In [12]:
def compute_ma(field, window_size, frame=None):
    """Add a per-region trailing moving average of ``field`` as a new column.

    The average is computed separately inside each ``geo_code`` group, over the
    current row and the ``window_size - 1`` rows before it (in the frame's
    current row order). Positions where the rolling mean is undefined (fewer
    than ``window_size`` rows seen so far in the group, or a NaN inside the
    window) are filled with 0.

    Args:
        field: name of the column to average.
        window_size: number of rows in the rolling window.
        frame: DataFrame to modify in place. Defaults to the notebook-level
            ``df`` when omitted, which keeps existing two-argument calls working.

    Returns:
        None. The result is stored in ``frame[f"{field}_ma_{window_size}"]``.
    """
    target = df if frame is None else frame
    rolling_mean = target.groupby('geo_code')[field].rolling(window_size, center=False).mean()
    # groupby().rolling() prepends the group key as an extra index level; drop it
    # so the values align with the frame's own index on assignment.
    target[f"{field}_ma_{window_size}"] = rolling_mean.fillna(0).reset_index(level=0, drop=True)

def add_working_day_tomorrow(grouped):
    """Attach a ``working_day_tomorrow`` column holding the next row's ``working_day``.

    The column is added to ``grouped`` in place and the same object is returned.
    Gaps left by the shift are filled from the following row first, then from
    the preceding row, so the last row repeats the value before it.
    """
    next_row_value = grouped['working_day'].shift(-1)
    grouped['working_day_tomorrow'] = next_row_value.bfill().ffill()
    return grouped


def add_working_day_yesterday(grouped):
    """Attach a ``working_day_yesterday`` column holding the previous row's ``working_day``.

    The column is added to ``grouped`` in place and the same object is returned.
    Gaps left by the shift are filled from the following row first, then from
    the preceding row, so the first row repeats the value after it.
    """
    previous_row_value = grouped['working_day'].shift(1)
    grouped['working_day_yesterday'] = previous_row_value.bfill().ffill()
    return grouped

def transform_column_order(df):
    """Return a copy of ``df`` with the label column first and the rest sorted by name.

    ``predicted_new_cases`` is moved to position 0; every other column follows
    in ascending name order. The input frame is left untouched. A ``KeyError``
    is raised when the label column is missing.
    """
    label = 'predicted_new_cases'
    remaining = [column for column in sorted(df.columns) if column != label]
    return df[[label] + remaining]

# Daily new cases: per-region first difference of the cumulative confirmed count.
# The first row of each region has no predecessor and is set to 0; negative
# differences (which do not make sense for a daily count) are clipped to 0.
df['new_cases'] = df.groupby('geo_code')['confirmed_cases'].diff().fillna(0).clip(lower=0)

# Label column: the following row's new_cases within the same region
# (0 where a region has no following row).
df['predicted_new_cases'] = df.groupby('geo_code')['new_cases'].shift(-1).fillna(0).clip(lower=0)

# Case counts relative to population (plain ratio: count / population).
for count_field in ('new_cases', 'confirmed_cases'):
    df[f'{count_field}_as_percent_of_population'] = df[count_field] / df['population']

# Moving averages: every field below gets one column per window size.
ma_fields = [
    'new_cases',
    'confirmed_cases',
    'specific_humidity',
    'temperature',
    'c1_school_closing',
    'c2_workplace_closing',
    'c3_cancel_public_events',
    'c4_restrictions_on_gatherings',
    'c5_close_public_transport',
    'c6_stay_at_home_requirements',
    'c7_restrictions_on_internal_movement',
    'c8_international_travel_controls',
    'h1_public_information_campaigns',
    'h2_testing_policy',
    'h3_contact_tracing',
    'h6_facial_coverings',
    'working_day',
    'new_cases_as_percent_of_population',
    'confirmed_cases_as_percent_of_population',
]
for window_size in [3, 7, 21]:
    for ma_field in ma_fields:
        compute_ma(ma_field, window_size)

# Working-day information for tomorrow and yesterday, computed region by region.
# The index is rebuilt afterwards because the per-group apply regroups the rows.
df = df.groupby('geo_code').apply(add_working_day_tomorrow).reset_index(drop=True)
df = df.groupby('geo_code').apply(add_working_day_yesterday).reset_index(drop=True)

# npi_sum: plain element-wise sum of the twelve intervention columns,
# accumulated left to right.
npi_fields = [
    'c1_school_closing',
    'c2_workplace_closing',
    'c3_cancel_public_events',
    'c4_restrictions_on_gatherings',
    'c5_close_public_transport',
    'c6_stay_at_home_requirements',
    'c7_restrictions_on_internal_movement',
    'c8_international_travel_controls',
    'h1_public_information_campaigns',
    'h2_testing_policy',
    'h3_contact_tracing',
    'h6_facial_coverings',
]
npi_total = df[npi_fields[0]]
for npi_field in npi_fields[1:]:
    npi_total = npi_total + df[npi_field]
df['npi_sum'] = npi_total

# Flag rows on or after the prediction start date (True) versus earlier,
# ground-truth rows (False).
df['predicted'] = df['date'].apply(lambda day: day >= prediction_start_date)

# Put the label column first with the remaining columns in name order,
# then order the rows by region and date.
df = transform_column_order(df).sort_values(['geo_code', 'date'])

# Write out the data

In [14]:
# Rebuild the scratch directory from nothing so files from earlier runs cannot linger.
scratch_dir = Path('temp')
shutil.rmtree(scratch_dir, ignore_errors=True)
scratch_dir.mkdir(parents=True, exist_ok=True)

# Persist the prepared frame as CSV, without the index column.
df.to_csv('temp/01-data.csv', index=False)