# Elastic Search Update Notebook

This notebook is used to update the elastic search index with the latest datasets

In [39]:
!pip install xgboost
!pip install git+https://github.com/rbilleci/pandora.git

Collecting git+https://github.com/rbilleci/pandora.git
  Cloning https://github.com/rbilleci/pandora.git to /tmp/pip-req-build-rsfc7_wp
  Running command git clone -q https://github.com/rbilleci/pandora.git /tmp/pip-req-build-rsfc7_wp
Building wheels for collected packages: pandora
  Building wheel for pandora (setup.py) ... [?25ldone
[?25h  Created wheel for pandora: filename=pandora-0.1.0-py3-none-any.whl size=2681412 sha256=120d54e423f81eed34930b641a9b812ab006267ba82c501f53b465521b01f4b8
  Stored in directory: /tmp/pip-ephem-wheel-cache-b8nfv62n/wheels/01/8b/d5/a72c927a738750e04a4bb4fd22f63b4b88c7b5871732e2d67b
Successfully built pandora


In [83]:
import boto3
import pandas as pd
import shutil
import os
import numpy as np
from pandora import loader, encoders
from sklearn.preprocessing import RobustScaler, StandardScaler
import datetime
from pathlib import Path
from logging import INFO, basicConfig, info
import warnings
import xgboost as xgb
from sklearn.metrics import mean_squared_error


In [84]:
# setup logging
basicConfig(level=INFO, format='%(asctime)s\t%(levelname)s\t%(filename)s\t%(message)s')
warnings.filterwarnings('ignore', category=FutureWarning)  # ignore FutureWarning from scikit learn

pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_info_columns = 1000

# Read the Data

In [85]:
# load the dataset and set the date column
df = pd.read_csv('temp/01-data.csv', keep_default_na=False, na_values='')
df['date'] = pd.to_datetime(df['date'])

# determine the date where prediction should begin
prediction_start_date = df[df['predicted'] == True]['date'].min().date()



  interactivity=interactivity, compiler=compiler, result=result)


# Prepare dataset for machine learning

In [86]:

# declare the encoders
enc = {}
enc['continent'] = encoders.BinaryEncoder('continent')
enc['geo_code'] = encoders.BinaryEncoder('geo_code')
enc['country_code'] = encoders.BinaryEncoder('country_code')
enc['day_of_week'] = encoders.BinaryEncoder('day_of_week')
enc['day_of_week_cyc'] = encoders.CyclicalEncoder('day_of_week')
enc['day_of_month'] = encoders.BinaryEncoder('day_of_month')
enc['day_of_month_cyc'] = encoders.CyclicalEncoder('day_of_month')
enc['day_of_year'] = encoders.BinaryEncoder('day_of_year')
enc['day_of_year_cyc'] = encoders.CyclicalEncoder('day_of_year')

def encode(df_x, fit):
    
    # encode the geo data
    if fit:
        df_x = enc['continent'].fit_transform(df_x)
        df_x = enc['geo_code'].fit_transform(df_x)
        df_x = enc['country_code'].fit_transform(df_x)
    else:
        df_x = enc['continent'].transform(df_x)
        df_x = enc['geo_code'].transform(df_x)
        df_x = enc['country_code'].transform(df_x)
        
    if fit:
        df_x = enc['day_of_week'].fit_transform(df_x)
        df_x['day_of_week'] = df_x['day_of_week'].apply(lambda x: x / 7.)
        df_x = enc['day_of_week_cyc'].fit_transform(df_x)
    else:
        df_x = enc['day_of_week'].transform(df_x)
        df_x['day_of_week'] = df_x['day_of_week'].apply(lambda x: x / 7.)
        df_x = enc['day_of_week_cyc'].transform(df_x)
        
    if fit:
        df_x['day_of_month'] = df_x['day_of_month'].apply(lambda x: x / 31.)
        df_x = enc['day_of_month_cyc'].fit_transform(df_x)
        df_x['day_of_year'] = df_x['day_of_year'].apply(lambda x: x / 366.)
        df_x = enc['day_of_year_cyc'].fit_transform(df_x)
    else:    
        df_x['day_of_month'] = df_x['day_of_month'].apply(lambda x: x / 31.)
        df_x = enc['day_of_month_cyc'].transform(df_x)
        df_x['day_of_year'] = df_x['day_of_year'].apply(lambda x: x / 366.)
        df_x = enc['day_of_year_cyc'].transform(df_x)
    
    # convert the date to an integer value
    df_x['date_day'] = df_x['date'].apply(lambda x: x.day)
    
    # drop unused columns
    df_x = df_x.drop(labels=['country_name',
                           'continent',
                           'country_code',
                           'day_of_week',
                           'day_of_month',
                           'day_of_year',
                            'country_code3',
                            'country_code_numeric',
                            'confirmed_deaths',
                            'predicted',
                            'region_name',
                            'month',                           
                            'quarter',
                            'week'], axis=1)
    return df_x

# only work within the specified range
df_ml = df.loc[df['date'] < pd.to_datetime(prediction_start_date)]
df_ml = encode(df_ml, fit=True)

# Get the train, val, test split



In [87]:
days_for_validation = 31
days_for_test = 14

def split(df: pd.DataFrame, 
          days_for_validation: int, 
          days_for_test: int) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    
    # First, sort the data by date
    df = df.sort_values('date')

    # Determine the maximum date
    date_start_test = df['date'].max() - pd.to_timedelta(days_for_test - 1, unit='d')
    date_start_validation = date_start_test - pd.to_timedelta(days_for_validation, unit='d')

    df_train = df[df['date'] < date_start_validation]
    df_validation = df[(df['date'] >= date_start_validation) & (df['date'] < date_start_test)]
    df_test = df[df['date'] >= date_start_test]

    # Debug the outpoint
    print(f"Training Range:   {df_train['date'].min().date()} - {df_train['date'].max().date()}")
    print(f"Validation Range: {df_validation['date'].min().date()} - {df_validation['date'].max().date()}")
    print(f"Test Range:       {df_test['date'].min().date()} - {df_test['date'].max().date()}")

    # Sanity Check
    if len(df.index) != len(df_train.index) + len(df_validation.index) + len(df_test.index):
        raise Exception('entries do not add up')

    return df_train, df_validation, df_test

df_train_prescaled, df_validation_prescaled, df_test_prescaled = split(df_ml, days_for_validation, days_for_test)

Training Range:   2020-01-01 - 2020-12-25
Validation Range: 2020-12-26 - 2021-01-25
Test Range:       2021-01-26 - 2021-02-08


# Scale

In [89]:
df_train = df_train_prescaled.copy()
df_validation = df_validation_prescaled.copy()
df_test = df_test_prescaled.copy()

scalers = {}

print(df_ml.columns.values)
for feature_name in df_ml.columns.values:
    if feature_name == 'geo_code' or feature_name == 'date' or feature_name == 'predicted_new_cases':
        continue
    scalers[feature_name] = StandardScaler()
    df_train[feature_name] = scalers[feature_name].fit_transform(df_train_prescaled[[feature_name]])
        
if len(df_validation_prescaled) > 0:        
    for feature_name in df_ml.columns.values:
        if feature_name == 'geo_code' or feature_name == 'date' or feature_name == 'predicted_new_cases':
            continue
        df_validation[feature_name] = scalers[feature_name].transform(df_validation_prescaled[[feature_name]])

if len(df_test_prescaled) > 0:        
    for feature_name in df_ml.columns.values:
        if feature_name == 'geo_code' or feature_name == 'date' or feature_name == 'predicted_new_cases':
            continue        
        df_test[feature_name] = scalers[feature_name].transform(df_test_prescaled[[feature_name]])

df_train = df_train.drop(labels=['geo_code', 'date'], axis=1)
df_validation = df_validation.drop(labels=['geo_code', 'date'], axis=1)
df_test = df_test.drop(labels=['geo_code', 'date'], axis=1)


['predicted_new_cases' 'age_distribution_00_04' 'age_distribution_05_14'
 'age_distribution_15_34' 'age_distribution_34_64'
 'age_distribution_65_plus' 'c1_school_closing' 'c1_school_closing_ma_21'
 'c1_school_closing_ma_3' 'c1_school_closing_ma_7' 'c2_workplace_closing'
 'c2_workplace_closing_ma_21' 'c2_workplace_closing_ma_3'
 'c2_workplace_closing_ma_7' 'c3_cancel_public_events'
 'c3_cancel_public_events_ma_21' 'c3_cancel_public_events_ma_3'
 'c3_cancel_public_events_ma_7' 'c4_restrictions_on_gatherings'
 'c4_restrictions_on_gatherings_ma_21'
 'c4_restrictions_on_gatherings_ma_3' 'c4_restrictions_on_gatherings_ma_7'
 'c5_close_public_transport' 'c5_close_public_transport_ma_21'
 'c5_close_public_transport_ma_3' 'c5_close_public_transport_ma_7'
 'c6_stay_at_home_requirements' 'c6_stay_at_home_requirements_ma_21'
 'c6_stay_at_home_requirements_ma_3' 'c6_stay_at_home_requirements_ma_7'
 'c7_restrictions_on_internal_movement'
 'c7_restrictions_on_internal_movement_ma_21'
 'c7_restrictio

# Let's Train

In [90]:

from sklearn.metrics import mean_squared_error

params_tree = {
    'nthread': 1,
    'objective':'reg:squarederror',
    'eta': 0.1
}


params_linear = {
    "booster": "gblinear",
    'nthread': 1,    
    "objective": "reg:squarederror",
}


rounds = 100
early_stopping_rounds = 10

test_x, test_y = df_test.iloc[:, 1:], df_test.iloc[:, :1]
dtest = xgb.DMatrix(data=test_x,label=test_y)
callback_monitor = xgb.callback.EvaluationMonitor(rank=0, period=10, show_stdv=False)

dtrain = xgb.DMatrix(data=df_train.iloc[:, 1:], label=df_train.iloc[:, :1])
dvalidation = xgb.DMatrix(data=df_validation.iloc[:, 1:], label=df_validation.iloc[:, :1])
watchlist = [(dvalidation, 'eval'), (dtrain, 'train')]

bst = xgb.train(params_linear, 
                dtrain, 
                rounds,
                watchlist,
                early_stopping_rounds=20,      
                callbacks=[callback_monitor],
                verbose_eval=False)
bst.save_model('temp/predictor.model')
predictions = bst.predict(dtest)
score = mean_squared_error(test_y, predictions, squared=False)



[0]	eval-rmse:8091.68994	train-rmse:4131.47510
[10]	eval-rmse:4195.99463	train-rmse:3324.67139
[20]	eval-rmse:4029.14404	train-rmse:3270.61499
[30]	eval-rmse:4028.06201	train-rmse:3254.31934
[40]	eval-rmse:4027.02441	train-rmse:3247.43579
[50]	eval-rmse:4021.57471	train-rmse:3243.72852
[60]	eval-rmse:4013.83936	train-rmse:3241.34839
[70]	eval-rmse:4005.06689	train-rmse:3239.60864
[80]	eval-rmse:3995.99072	train-rmse:3238.23291
[90]	eval-rmse:3987.03906	train-rmse:3237.09521
[99]	eval-rmse:3979.32056	train-rmse:3236.22192


In [91]:
predictions = bst.predict(dtest)


# Handle the Prescription Indexes

In [112]:

def compute_ma(df_x, field, window_size):
    df_x[f"{field}_ma_{window_size}"] = df_x.groupby('geo_code')[field].rolling(window_size, center=False).mean().fillna(0).reset_index(0, drop=True)

    
def predict(prescription):
    # filter the date range for the prediction window
    days_to_predict = 10
    date_to_predict_from = df[df['predicted'] == False]['date'].max()
    date_to_predict_to = date_to_predict_from + pd.to_timedelta(days_to_predict, unit='d')  
    df_out = df.copy()
    df_out = df_out.sort_values(['geo_code', 'date'])
    df_out = df_out[df_out['date'] <= pd.to_datetime(date_to_predict_to)]
    df_out['region_name'] = df_out['region_name'].fillna('')
    
    
    # assign npi values
    print("assigning npi values")
    for _, row in df_out.loc[df_out['date'] >= date_to_predict_from].iterrows():
        date_from = row['date']
        country_name = row['country_name']
        region_name = row['region_name']
        filter = ((prescription['Date'] == date_from) & (prescription['CountryName'] == country_name) & (prescription['RegionName'] == region_name))
        prescription_row = prescription.loc[filter] 
        row['c1_school_closing'] = prescription_row['C1_School closing'].max()
        row['c2_workplace_closing'] = prescription_row['C2_Workplace closing'].max()
        row['c3_cancel_public_events'] = prescription_row['C3_Cancel public events'].max()
        row['c4_restrictions_on_gatherings'] = prescription_row['C4_Restrictions on gatherings'].max()
        row['c5_close_public_transport'] = prescription_row['C5_Close public transport'].max()
        row['c6_stay_at_home_requirements'] = prescription_row['C6_Stay at home requirements'].max()
        row['c7_restrictions_on_internal_movement'] = prescription_row['C7_Restrictions on internal movement'].max()
        row['c8_international_travel_controls'] = prescription_row['C8_International travel controls'].max()
        row['h1_public_information_campaigns'] = prescription_row['H1_Public information campaigns'].max()
        row['h2_testing_policy'] = prescription_row['H2_Testing policy'].max()
        row['h3_contact_tracing'] = prescription_row['H3_Contact tracing'].max()
        row['h6_facial_coverings'] = prescription_row['H6_Facial Coverings'].max()            
    

    # encode the data
    df_input = encode(df_out.copy(), fit=False)

     # get iterators we'll use on the rows
    it_df = df_out.loc[df_out['date'] >= date_to_predict_from].iterrows()
    it_ml = df_input.loc[df_input['date'] >= date_to_predict_from].iterrows()
    
    # scale values
    print("scaling")
    for name in scalers:
        if name not in df_input.columns:
            continue
        if name == 'confirmed_cases':
            continue
        if name == 'new_cases':
            continue
        df_input[name] = scalers[name].transform(df_input[[name]])
    
    
    # predict each day
    print("predicting")
    for _, row in it_df:
        geo_code = row['geo_code']
        confirmed_cases = row['confirmed_cases']
        date_from = row['date']
        date_to = date_from + pd.to_timedelta(1, unit='d')  
                
        # conver to df
        model_input = next(it_ml)[1]    
        model_input = pd.DataFrame(model_input.to_frame().T)
        model_input = model_input.drop(labels=['geo_code', 'date'], axis=1)
        for name in model_input.columns:
            model_input[name] = pd.to_numeric(model_input[name])
        model_input['new_cases'] = scalers['new_cases'].transform(model_input[['new_cases']])        
        model_input['confirmed_cases'] = scalers['confirmed_cases'].transform(model_input[['confirmed_cases']])
                
        # predict
        predictions = bst.predict(xgb.DMatrix(data=model_input.iloc[:, 1:], label=model_input.iloc[:, :1]))
        new_cases = max(0., predictions[0])
        
        # assign new cases and predicted cases to next row
        filter_out = (df_out['geo_code'] == geo_code) & (df_out['date'] == date_to)
        filter_in = (df_input['geo_code'] == geo_code) & (df_input['date'] == date_to)        
        df_out.loc[filter_out, ['new_cases', 'confirmed_cases']] = [new_cases, confirmed_cases + new_cases]
        df_input.loc[filter_in, ['new_cases', 'confirmed_cases']] = [new_cases, confirmed_cases + new_cases]
        
        # update the moving averages for new and confirmed cases
        for window_size in [3, 7, 21]:
            compute_ma(df_input, 'new_cases', window_size)
            compute_ma(df_input, 'confirmed_cases', window_size)

    # debug    
    print(df_out.loc[df_out['geo_code'] == 'DE'][['date', 'new_cases', 'confirmed_cases', 'c1_school_closing']].tail(7))
        
        

# read the prescriptions and generate a prediction file for each prescription
def do_predictions():
    prescriptions = pd.read_csv('prescriptions.csv', keep_default_na=False, na_values='')
    prescriptions['Date'] = pd.to_datetime(prescriptions['Date']) 
    for i in range(1):
        print(f"generating predictions for index {i}")
        predict(prescriptions.loc[prescriptions['PrescriptionIndex'] == i])

print(datetime.datetime.now())
do_predictions()
print(datetime.datetime.now())


2021-02-11 16:17:09.500686
generating predictions for index 0
assigning npi values
scaling
predicting
            date     new_cases  confirmed_cases  c1_school_closing
25606 2021-02-12  1.828189e+09     1.830486e+09                3.0
25607 2021-02-13  1.819276e+09     1.821572e+09                3.0
25608 2021-02-14  1.811876e+09     1.814173e+09                3.0
25609 2021-02-15  1.805673e+09     1.807969e+09                3.0
25610 2021-02-16  1.800749e+09     1.803046e+09                3.0
25611 2021-02-17  1.797499e+09     1.799796e+09                3.0
25612 2021-02-18  1.793725e+09     1.796021e+09                3.0
2021-02-11 16:29:17.364573


In [105]:
df_predict.loc[df_predict['geo_code'] == 'DE'][['date', 'new_cases', 'confirmed_cases']].tail(7)

NameError: name 'df_predict' is not defined