# Elastic Search Update Notebook

This notebook is used to update the elastic search index with the latest datasets

In [39]:
!pip install xgboost
!pip install git+https://github.com/rbilleci/pandora.git

Collecting git+https://github.com/rbilleci/pandora.git
  Cloning https://github.com/rbilleci/pandora.git to /tmp/pip-req-build-rsfc7_wp
  Running command git clone -q https://github.com/rbilleci/pandora.git /tmp/pip-req-build-rsfc7_wp
Building wheels for collected packages: pandora
  Building wheel for pandora (setup.py) ... [?25ldone
[?25h  Created wheel for pandora: filename=pandora-0.1.0-py3-none-any.whl size=2681412 sha256=120d54e423f81eed34930b641a9b812ab006267ba82c501f53b465521b01f4b8
  Stored in directory: /tmp/pip-ephem-wheel-cache-b8nfv62n/wheels/01/8b/d5/a72c927a738750e04a4bb4fd22f63b4b88c7b5871732e2d67b
Successfully built pandora


In [197]:
import boto3
import pandas as pd
import shutil
import os
import numpy as np
from pandora import loader, encoders
from sklearn.preprocessing import RobustScaler, StandardScaler
import datetime
from pathlib import Path
from logging import INFO, basicConfig, info
import warnings
import xgboost as xgb
from sklearn.metrics import mean_squared_error


In [198]:
# setup logging
basicConfig(level=INFO, format='%(asctime)s\t%(levelname)s\t%(filename)s\t%(message)s')
warnings.filterwarnings('ignore', category=FutureWarning)  # ignore FutureWarning from scikit learn

pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_info_columns = 1000

# Read the Data

In [200]:
# load the dataset and set the date column
df = pd.read_csv('temp/01-data.csv', keep_default_na=False, na_values='')
df['date'] = pd.to_datetime(df['date'])
df['region_name'] = df['region_name'].fillna('')

# determine the date where prediction should begin
prediction_start_date = df[df['predicted'] == True]['date'].min().date()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138768 entries, 0 to 138767
Data columns (total 107 columns):
 #    Column                                      Non-Null Count   Dtype         
---   ------                                      --------------   -----         
 0    predicted_new_cases                         138768 non-null  float64       
 1    age_distribution_00_04                      138768 non-null  float64       
 2    age_distribution_05_14                      138768 non-null  float64       
 3    age_distribution_15_34                      138768 non-null  float64       
 4    age_distribution_34_64                      138768 non-null  float64       
 5    age_distribution_65_plus                    138768 non-null  float64       
 6    c1_school_closing                           138768 non-null  float64       
 7    c1_school_closing_ma_21                     138768 non-null  float64       
 8    c1_school_closing_ma_3                      138768 non-null  f

  interactivity=interactivity, compiler=compiler, result=result)


# Prepare dataset for machine learning

In [201]:

# declare the encoders
enc = {}
enc['continent'] = encoders.BinaryEncoder('continent')
enc['geo_code'] = encoders.BinaryEncoder('geo_code')
enc['country_code'] = encoders.BinaryEncoder('country_code')
enc['day_of_week'] = encoders.BinaryEncoder('day_of_week')
enc['day_of_week_cyc'] = encoders.CyclicalEncoder('day_of_week')
enc['day_of_month'] = encoders.BinaryEncoder('day_of_month')
enc['day_of_month_cyc'] = encoders.CyclicalEncoder('day_of_month')
enc['day_of_year'] = encoders.BinaryEncoder('day_of_year')
enc['day_of_year_cyc'] = encoders.CyclicalEncoder('day_of_year')

def encode(df_x, fit):
    
    # encode the geo data
    if fit:
        df_x = enc['continent'].fit_transform(df_x)
        df_x = enc['geo_code'].fit_transform(df_x)
        df_x = enc['country_code'].fit_transform(df_x)
    else:
        df_x = enc['continent'].transform(df_x)
        df_x = enc['geo_code'].transform(df_x)
        df_x = enc['country_code'].transform(df_x)
        
    if fit:
        df_x = enc['day_of_week'].fit_transform(df_x)
        df_x['day_of_week'] = df_x['day_of_week'].apply(lambda x: x / 7.)
        df_x = enc['day_of_week_cyc'].fit_transform(df_x)
    else:
        df_x = enc['day_of_week'].transform(df_x)
        df_x['day_of_week'] = df_x['day_of_week'].apply(lambda x: x / 7.)
        df_x = enc['day_of_week_cyc'].transform(df_x)
        
    if fit:
        df_x['day_of_month'] = df_x['day_of_month'].apply(lambda x: x / 31.)
        df_x = enc['day_of_month_cyc'].fit_transform(df_x)
        df_x['day_of_year'] = df_x['day_of_year'].apply(lambda x: x / 366.)
        df_x = enc['day_of_year_cyc'].fit_transform(df_x)
    else:    
        df_x['day_of_month'] = df_x['day_of_month'].apply(lambda x: x / 31.)
        df_x = enc['day_of_month_cyc'].transform(df_x)
        df_x['day_of_year'] = df_x['day_of_year'].apply(lambda x: x / 366.)
        df_x = enc['day_of_year_cyc'].transform(df_x)
    
    # convert the date to an integer value
    df_x['date_day'] = df_x['date'].apply(lambda x: x.day)
    
    # drop unused columns
    df_x = df_x.drop(labels=['country_name',
                           'continent',
                           'country_code',
                           'day_of_week',
                           'day_of_month',
                           'day_of_year',
                            'npi_sum',
                            'pneumonia_deaths_per_100k',
                            'pneumonia_deaths_per_100k--',
                            'country_code3',
                            'country_code_numeric',
                            'confirmed_deaths',
                            'predicted',
                            'region_name',
                            'month',                           
                            'quarter',
                            'week',
                            'temperature',
                            'year'], axis=1)
    return df_x

# only work within the specified range
df_ml = df.loc[df['date'] < pd.to_datetime(prediction_start_date)]
df_ml = encode(df_ml, fit=True)

# Get the train, val, test split



In [211]:
days_for_validation = 21
days_for_test = 14

def split(df: pd.DataFrame, 
          days_for_validation: int, 
          days_for_test: int) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    
    # First, sort the data by date
    df = df.sort_values('date')

    # Determine the maximum date
    date_start_test = df['date'].max() - pd.to_timedelta(days_for_test - 1, unit='d')
    date_start_validation = date_start_test - pd.to_timedelta(days_for_validation, unit='d')

    df_train = df[df['date'] < date_start_validation]
    df_validation = df[(df['date'] >= date_start_validation) & (df['date'] < date_start_test)]
    df_test = df[df['date'] >= date_start_test]

    # Debug the outpoint
    print(f"Training Range:   {df_train['date'].min().date()} - {df_train['date'].max().date()}")
    print(f"Validation Range: {df_validation['date'].min().date()} - {df_validation['date'].max().date()}")
    print(f"Test Range:       {df_test['date'].min().date()} - {df_test['date'].max().date()}")

    # Sanity Check
    if len(df.index) != len(df_train.index) + len(df_validation.index) + len(df_test.index):
        raise Exception('entries do not add up')

    return df_train, df_validation, df_test

df_train_prescaled, df_validation_prescaled, df_test_prescaled = split(df_ml, days_for_validation, days_for_test)

Training Range:   2020-01-01 - 2021-01-06
Validation Range: 2021-01-07 - 2021-01-27
Test Range:       2021-01-28 - 2021-02-10


# Scale

In [212]:
df_train = df_train_prescaled.copy()
df_validation = df_validation_prescaled.copy()
df_test = df_test_prescaled.copy()

scalers = {}

for feature_name in df_ml.columns.values:
    if feature_name == 'geo_code' or feature_name == 'date' or feature_name == 'predicted_new_cases':
        continue
    scalers[feature_name] = StandardScaler()
    df_train[feature_name] = scalers[feature_name].fit_transform(df_train_prescaled[[feature_name]])
        
if len(df_validation_prescaled) > 0:        
    for feature_name in df_ml.columns.values:
        if feature_name == 'geo_code' or feature_name == 'date' or feature_name == 'predicted_new_cases':
            continue
        df_validation[feature_name] = scalers[feature_name].transform(df_validation_prescaled[[feature_name]])

if len(df_test_prescaled) > 0:        
    for feature_name in df_ml.columns.values:
        if feature_name == 'geo_code' or feature_name == 'date' or feature_name == 'predicted_new_cases':
            continue        
        df_test[feature_name] = scalers[feature_name].transform(df_test_prescaled[[feature_name]])

df_train = df_train.drop(labels=['geo_code', 'date'], axis=1)
df_validation = df_validation.drop(labels=['geo_code', 'date'], axis=1)
df_test = df_test.drop(labels=['geo_code', 'date'], axis=1)


# Let's Train

In [213]:

from sklearn.metrics import mean_squared_error

params_tree = {
    'nthread': 1,
    'objective':'reg:squarederror',
    'eta': 0.1
}


params_linear = {
    "booster": "gblinear",
    'nthread': 1,    
    "objective": "reg:squarederror",
}


rounds = 1000
early_stopping_rounds = 10

test_x, test_y = df_test.iloc[:, 1:], df_test.iloc[:, :1]
dtest = xgb.DMatrix(data=test_x,label=test_y)
callback_monitor = xgb.callback.EvaluationMonitor(rank=0, period=10, show_stdv=False)

dtrain = xgb.DMatrix(data=df_train.iloc[:, 1:], label=df_train.iloc[:, :1])
dvalidation = xgb.DMatrix(data=df_validation.iloc[:, 1:], label=df_validation.iloc[:, :1])
watchlist = [(dvalidation, 'eval'), (dtrain, 'train')]

bst = xgb.train(params_linear, 
                dtrain, 
                rounds,
                watchlist,
                early_stopping_rounds=20,      
                callbacks=[callback_monitor],
                verbose_eval=False)
bst.save_model('temp/predictor.model')
predictions = bst.predict(dtest)
score = mean_squared_error(test_y, predictions, squared=False)



[0]	eval-rmse:8251.42188	train-rmse:4282.09522
[10]	eval-rmse:3946.68311	train-rmse:3375.67920
[20]	eval-rmse:3737.02148	train-rmse:3313.60913
[30]	eval-rmse:3768.69971	train-rmse:3294.04102
[40]	eval-rmse:3790.04639	train-rmse:3286.70752
[50]	eval-rmse:3795.46240	train-rmse:3283.09448
[60]	eval-rmse:3791.55029	train-rmse:3280.67554
[70]	eval-rmse:3782.72803	train-rmse:3278.71387
[80]	eval-rmse:3771.71826	train-rmse:3276.99976
[90]	eval-rmse:3760.03516	train-rmse:3275.47559
[100]	eval-rmse:3748.52002	train-rmse:3274.10840
[110]	eval-rmse:3737.57812	train-rmse:3272.89258
[120]	eval-rmse:3727.39111	train-rmse:3271.82056
[130]	eval-rmse:3718.01318	train-rmse:3270.86963
[140]	eval-rmse:3709.42895	train-rmse:3270.02783
[150]	eval-rmse:3701.60303	train-rmse:3269.29224
[160]	eval-rmse:3694.46558	train-rmse:3268.64331
[170]	eval-rmse:3687.96729	train-rmse:3268.07397
[180]	eval-rmse:3682.04492	train-rmse:3267.57275
[190]	eval-rmse:3676.64526	train-rmse:3267.13379
[200]	eval-rmse:3671.71802	trai

In [214]:
predictions = bst.predict(dtest)


# Handle the Prescription Indexes

In [None]:

def add_geo_date_index(df_x):
    df_x['_index'] = df_x['CountryName'] + df_x['RegionName'] + df_x['Date'].dt.strftime('%Y%m%d')
    return df_x
    

# read the prescriptions and generate a prediction file for each prescription
def evaluate_intervention_plans(days):
    df_intervention_plans = pd.read_csv('prescriptions.csv', keep_default_na=False, na_values='')
    df_intervention_plans['RegionName'] = df_intervention_plans['RegionName'].fillna('')
    df_intervention_plans['Date'] = pd.to_datetime(df_intervention_plans['Date']) 
    df_intervention_plans = add_geo_date_index(df_intervention_plans)
    for i in range(10):
        print(f"{datetime.datetime.now()} - generating predictions for index {i}")
        df_intervention_plan = df_intervention_plans.loc[df_intervention_plans['PrescriptionIndex'] == i]
        evaluate_intervention_plan(i, df_intervention_plan, days)

def evaluate_intervention_plan(prescription_index, 
                               df_intervention_plan, 
                               days_to_predict):
    print(f"{datetime.datetime.now()} - applying intervention plan")
    date_to_predict_from = df[df['predicted'] == False]['date'].max()
    date_to_predict_to = date_to_predict_from + pd.to_timedelta(days_to_predict, unit='d')  
    df_out = df.copy()
    df_out = df_out[df_out['date'] <= pd.to_datetime(date_to_predict_to) + pd.to_timedelta(1, unit='d')  ]
    df_out = df_out.sort_values(['geo_code', 'date'])
    
    for _, row in df_out.loc[df_out['date'] >= date_to_predict_from].iterrows():
        date_from = row['date']
        country_name = row['country_name']
        region_name = row['region_name']
        key = country_name + region_name + date_from.strftime('%Y%m%d')
        ip_row = df_intervention_plan.loc[df_intervention_plan['_index'] == key] 
        row['c1_school_closing'] = ip_row['C1_School closing'].max()
        row['c2_workplace_closing'] = ip_row['C2_Workplace closing'].max()
        row['c3_cancel_public_events'] = ip_row['C3_Cancel public events'].max()
        row['c4_restrictions_on_gatherings'] = ip_row['C4_Restrictions on gatherings'].max()
        row['c5_close_public_transport'] = ip_row['C5_Close public transport'].max()
        row['c6_stay_at_home_requirements'] = ip_row['C6_Stay at home requirements'].max()
        row['c7_restrictions_on_internal_movement'] = ip_row['C7_Restrictions on internal movement'].max()
        row['c8_international_travel_controls'] = ip_row['C8_International travel controls'].max()
        row['h1_public_information_campaigns'] = ip_row['H1_Public information campaigns'].max()
        row['h2_testing_policy'] = ip_row['H2_Testing policy'].max()
        row['h3_contact_tracing'] = ip_row['H3_Contact tracing'].max()
        row['h6_facial_coverings'] = ip_row['H6_Facial Coverings'].max()    
    prediction = predict(df_out, date_to_predict_from, date_to_predict_to)
    prediction.to_csv(f"_plan_{prescription_index}.csv", index=False)

def predict_baseline(days_to_predict):
    print(f"{datetime.datetime.now()} - evaluating baselin plan")
    date_to_predict_from = df[df['predicted'] == False]['date'].max()
    date_to_predict_to = date_to_predict_from + pd.to_timedelta(days_to_predict, unit='d')  
    df_out = df.copy()
    df_out = df_out[df_out['date'] <= pd.to_datetime(date_to_predict_to) + pd.to_timedelta(1, unit='d')  ]
    df_out = df_out.sort_values(['geo_code', 'date'])        
    prediction = predict(df_out, date_to_predict_from, date_to_predict_to)    
    prediction.to_csv(f"_plan_baseline.csv", index=False)
    
def predict(df_out, 
            date_to_predict_from, 
            date_to_predict_to):
    # predict each country
    print(f"{datetime.datetime.now()} - predicting")
    df_out = df_out.groupby(['geo_code']).apply(
        lambda g: predict_for_geo(g, date_to_predict_from, date_to_predict_to)).reset_index(0, drop=True)
    
    # filter out any extra days
    df_out = df_out[df_out['date'] <= pd.to_datetime(date_to_predict_to)]
    return df_out


def predict_for_geo(df_geo,
                    date_to_predict_from, 
                    date_to_predict_to):
    geo_code = df_geo['geo_code'].iloc[0]
    df_input = df_geo.copy()
    df_input = encode(df_input, fit=False)

    # scale values
    for name in scalers:
        if name not in df_input.columns:
            continue
        if name == 'confirmed_cases':
            continue
        if name == 'new_cases':
            continue
        df_input[name] = scalers[name].transform(df_input[[name]])
    
    # get iterators we'll use on the rows
    df_geo_it = df_geo.loc[df_geo['date'] >= date_to_predict_from].iterrows()
    df_input_it = df_input.loc[df_input['date'] >= date_to_predict_from].iterrows()
    
    # predict each day
    new_cases = 0.
    new_cases_ma = []
    confirmed_cases = 0.
    confirmed_cases_ma = []
    first_row = True
    
    for x, row in df_geo_it:
        if first_row:
            first_row = False
            new_cases = row['new_cases']
            new_cases_ma = [new_cases] * 21
            confirmed_cases = row['confirmed_cases']     
            confirmed_cases_ma = [confirmed_cases] * 21       
        else:            
            row['new_cases'] = new_cases
            row['new_cases_as_percent_of_population'] = new_cases / row['population']              
            row['confirmed_cases'] = confirmed_cases
            row['confirmed_cases_as_percent_of_population'] = confirmed_cases / row['population']        
        
        # update the moving averages for this row
        for window_size in [3, 7, 21]:
            row[f"new_cases_ma_{window_size}"] = np.mean(new_cases_ma[-window_size:])
            row[f"confirmed_cases_ma_{window_size}"] = np.mean(confirmed_cases_ma[-window_size:])
        
        # get the dates
        date_from = row['date']
        date_to = date_from + pd.to_timedelta(1, unit='d')  

        # convert to df
        model_input = next(df_input_it)[1]    
        model_input = pd.DataFrame(model_input.to_frame().T)
        model_input['new_cases'] = new_cases
        model_input['new_cases_as_percent_of_population'] = row['new_cases_as_percent_of_population']
        model_input['confirmed_cases'] = confirmed_cases
        model_input['confirmed_cases_as_percent_of_population'] = row['confirmed_cases_as_percent_of_population']
        model_input = model_input.drop(labels=['geo_code', 'date'], axis=1)
        for name in model_input.columns:
            model_input[name] = pd.to_numeric(model_input[name])
        model_input['new_cases'] = scalers['new_cases'].transform(model_input[['new_cases']])        
        model_input['new_cases_as_percent_of_population'] = scalers['new_cases_as_percent_of_population'].transform(model_input[['new_cases_as_percent_of_population']])        
        model_input['confirmed_cases'] = scalers['confirmed_cases'].transform(model_input[['confirmed_cases']])
        model_input['confirmed_cases_as_percent_of_population'] = scalers['confirmed_cases_as_percent_of_population'].transform(model_input[['confirmed_cases_as_percent_of_population']])

        # predict
        predictions = bst.predict(xgb.DMatrix(data=model_input.iloc[:, 1:], label=model_input.iloc[:, :1]))

        # update calculations
        new_cases = max(0., predictions[0])
        confirmed_cases += new_cases
        # add to moving averages
        new_cases_ma.append(new_cases)
        confirmed_cases_ma.append(confirmed_cases) 
        # assign the row
        df_geo.loc[x] = row
        
    return df_geo
 

predict_baseline(90)
evaluate_intervention_plans(90)

2021-02-12 00:30:42.021873 - evaluating baselin plan
2021-02-12 00:30:42.165762 - predicting
2021-02-12 00:54:43.605991 - generating predictions for index 0
2021-02-12 00:54:43.619722 - applying intervention plan
2021-02-12 00:55:37.741257 - predicting
2021-02-12 01:20:04.268035 - generating predictions for index 1
2021-02-12 01:20:04.271597 - applying intervention plan
2021-02-12 01:21:00.143422 - predicting
2021-02-12 01:45:57.392249 - generating predictions for index 2
2021-02-12 01:45:57.396042 - applying intervention plan
2021-02-12 01:46:53.479173 - predicting
2021-02-12 02:11:51.268466 - generating predictions for index 3
2021-02-12 02:11:51.271888 - applying intervention plan
2021-02-12 02:12:47.260838 - predicting
2021-02-12 02:36:54.456971 - generating predictions for index 4
2021-02-12 02:36:54.460503 - applying intervention plan
2021-02-12 02:37:48.969809 - predicting


In [None]:
df_predict.loc[df_predict['geo_code'] == 'DE'][['date', 'new_cases', 'confirmed_cases']].tail(7)