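# Prediction Model Training Notebook

This notebook trains an XGBoost model on the prepared dataset (`temp/01-data.csv`) and uses it to generate day-by-day predictions for each prescription in `prescriptions.csv`. (The previous title, "Elastic Search Update Notebook", did not match the contents: nothing below touches an Elasticsearch index.)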

In [3]:
!pip install xgboost
!pip install git+https://github.com/rbilleci/pandora.git

Collecting git+https://github.com/rbilleci/pandora.git
  Cloning https://github.com/rbilleci/pandora.git to /tmp/pip-req-build-a7w_ctbc
  Running command git clone -q https://github.com/rbilleci/pandora.git /tmp/pip-req-build-a7w_ctbc
Building wheels for collected packages: pandora
  Building wheel for pandora (setup.py) ... [?25ldone
[?25h  Created wheel for pandora: filename=pandora-0.1.0-py3-none-any.whl size=2681412 sha256=c3747069c2561901e403d263153eb5ed96c5ab86d6f89b30d892701e81850a19
  Stored in directory: /tmp/pip-ephem-wheel-cache-2gablfqc/wheels/01/8b/d5/a72c927a738750e04a4bb4fd22f63b4b88c7b5871732e2d67b
Successfully built pandora


In [2]:
import boto3
import pandas as pd
import shutil
import os
import numpy as np
from pandora import loader, encoders
from sklearn.preprocessing import RobustScaler, StandardScaler
import datetime
from pathlib import Path
from logging import INFO, basicConfig, info
import warnings
import xgboost as xgb
from sklearn.metrics import mean_squared_error


In [3]:
# logging: tab-separated timestamp, level, source file and message at INFO level
basicConfig(format='%(asctime)s\t%(levelname)s\t%(filename)s\t%(message)s', level=INFO)
# silence FutureWarning noise (raised by scikit-learn)
warnings.filterwarnings('ignore', category=FutureWarning)

# pandas display: never truncate rows/columns, and let .info() list up to 1000 columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_info_columns', 1000)

# Read the Data

In [17]:
# read the prepared dataset (only empty strings count as missing) and parse the date column
df = pd.read_csv('temp/01-data.csv', na_values='', keep_default_na=False)
df['date'] = pd.to_datetime(df['date'])

# prediction begins on the earliest date that is flagged as predicted
prediction_start_date = df.loc[df['predicted'] == True, 'date'].min().date()



  interactivity=interactivity, compiler=compiler, result=result)


# Prepare dataset for machine learning

In [20]:

# encoder registry, keyed by name; the *_cyc entries are the cyclical
# encoders for the same calendar column as their binary counterpart
enc = {
    'continent': encoders.BinaryEncoder('continent'),
    'geo_code': encoders.BinaryEncoder('geo_code'),
    'country_code': encoders.BinaryEncoder('country_code'),
    'day_of_week': encoders.BinaryEncoder('day_of_week'),
    'day_of_week_cyc': encoders.CyclicalEncoder('day_of_week'),
    'day_of_month': encoders.BinaryEncoder('day_of_month'),
    'day_of_month_cyc': encoders.CyclicalEncoder('day_of_month'),
    'day_of_year': encoders.BinaryEncoder('day_of_year'),
    'day_of_year_cyc': encoders.CyclicalEncoder('day_of_year'),
}

def encode(df_x, fit):
    """Encode the geo and calendar columns and drop the columns the model does not use.

    The work is done on a copy of ``df_x``, so the caller's frame is never
    modified. (Writing ``date_day`` straight into a slice of another frame is
    what triggered pandas' SettingWithCopyWarning.)

    Parameters
    ----------
    df_x : pd.DataFrame
        Frame containing a datetime ``date`` column, the geo and calendar
        columns handled by the module-level ``enc`` encoders, and every column
        listed in the final ``drop`` call.
    fit : bool
        ``True`` fits each encoder while transforming (``fit_transform``);
        ``False`` reuses the already fitted encoders (``transform``).

    Returns
    -------
    pd.DataFrame
        The encoded frame without the raw/unused columns.
    """
    # never mutate the caller's frame
    df_x = df_x.copy()

    def apply_encoder(name, frame):
        # single place that decides between fitting and re-using an encoder
        encoder = enc[name]
        return encoder.fit_transform(frame) if fit else encoder.transform(frame)

    # day of the month taken from the date, as an integer feature
    df_x['date_day'] = df_x['date'].dt.day

    # encode the geo data
    for name in ('continent', 'geo_code', 'country_code'):
        df_x = apply_encoder(name, df_x)

    # day of week: binary encoding first, then scale the raw value before
    # the cyclical encoding (same order as the original implementation)
    df_x = apply_encoder('day_of_week', df_x)
    df_x['day_of_week'] = df_x['day_of_week'] / 7.0
    df_x = apply_encoder('day_of_week_cyc', df_x)

    # day of month / day of year: only scaled and cyclically encoded (keep it simple)
    df_x['day_of_month'] = df_x['day_of_month'] / 31.0
    df_x = apply_encoder('day_of_month_cyc', df_x)
    df_x['day_of_year'] = df_x['day_of_year'] / 366.0
    df_x = apply_encoder('day_of_year_cyc', df_x)

    # drop unused columns
    return df_x.drop(labels=['country_name',
                             'continent',
                             'geo_code',
                             'country_code',
                             'day_of_week',
                             'day_of_month',
                             'day_of_year',
                             'country_code3',
                             'country_code_numeric',
                             'confirmed_deaths',
                             'predicted',
                             'region_name',
                             'month',
                             'quarter',
                             'week'], axis=1)

# train only on rows dated before the prediction start; this call fits the encoders
df_ml = encode(df.loc[df['date'] < pd.to_datetime(prediction_start_date)], fit=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


# Get the train, val, test split



In [22]:
# number of trailing days handed to split() for the validation and test sets
days_for_validation = 31
days_for_test = 14

def split(df: pd.DataFrame, 
          days_for_validation: int, 
          days_for_test: int) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    """Split ``df`` chronologically into train, validation and test frames.

    The last ``days_for_test`` days (counted back from the latest date) form
    the test set, the ``days_for_validation`` days before them the validation
    set, and everything earlier the training set. The date range of each part
    is printed.

    Raises
    ------
    Exception
        If the three parts do not add up to the number of input rows.
    """
    # chronological order first
    ordered = df.sort_values('date')
    dates = ordered['date']

    # boundaries are counted back from the latest date in the data
    test_start = dates.max() - pd.to_timedelta(days_for_test - 1, unit='d')
    validation_start = test_start - pd.to_timedelta(days_for_validation, unit='d')

    train_part = ordered[dates < validation_start]
    validation_part = ordered[(dates >= validation_start) & (dates < test_start)]
    test_part = ordered[dates >= test_start]

    # debug output: date range covered by each part
    labelled_parts = (('Training Range:   ', train_part),
                      ('Validation Range: ', validation_part),
                      ('Test Range:       ', test_part))
    for label, part in labelled_parts:
        print(f"{label}{part['date'].min().date()} - {part['date'].max().date()}")

    # sanity check: every row must land in exactly one part
    total_rows = len(train_part.index) + len(validation_part.index) + len(test_part.index)
    if total_rows != len(ordered.index):
        raise Exception('entries do not add up')

    return train_part, validation_part, test_part

# split the encoded frame; the *_prescaled frames stay unscaled and are the input of the scaling step
df_train_prescaled, df_validation_prescaled, df_test_prescaled = split(df_ml, days_for_validation, days_for_test)

Training Range:   2020-01-01 - 2020-12-25
Validation Range: 2020-12-26 - 2021-01-25
Test Range:       2021-01-26 - 2021-02-08


# Scale

In [23]:
# one StandardScaler per feature column, fitted on the training split only;
# the date and the prediction target are left unscaled
scalers = {}
df_train = df_train_prescaled.copy()
for feature_name in df_ml.columns.values:
    if feature_name in ('date', 'predicted_new_cases'):
        continue
    scalers[feature_name] = StandardScaler()
    df_train[feature_name] = scalers[feature_name].fit_transform(df_train_prescaled[[feature_name]])

# validation and test reuse the scalers fitted above (skipped when the split is empty)
df_validation = df_validation_prescaled.copy()
if len(df_validation_prescaled) > 0:
    for feature_name in scalers:
        df_validation[feature_name] = scalers[feature_name].transform(df_validation_prescaled[[feature_name]])

df_test = df_test_prescaled.copy()
if len(df_test_prescaled) > 0:
    for feature_name in scalers:
        df_test[feature_name] = scalers[feature_name].transform(df_test_prescaled[[feature_name]])

# the date column is not a model input
df_train = df_train.drop(columns=['date'])
df_validation = df_validation.drop(columns=['date'])
df_test = df_test.drop(columns=['date'])


# Let's Train

In [25]:

# NOTE: mean_squared_error is already imported in the imports cell at the top
# of the notebook, so the duplicate mid-notebook import was removed.

# parameters for a tree booster (not passed to xgb.train below)
params_tree = {
    'nthread': 1,
    'objective':'reg:squarederror',
    'eta': 0.1
}

# parameters for the linear booster that is actually trained
params_linear = {
    "booster": "gblinear",
    'nthread': 1,
    "objective": "reg:squarederror",
}

rounds = 100
# 20 is the value the training call effectively used before (it was hard-coded
# in the call while this variable said 10); the variable is now the single source.
early_stopping_rounds = 20

# the first column is the label, all remaining columns are features
test_x, test_y = df_test.iloc[:, 1:], df_test.iloc[:, :1]
dtest = xgb.DMatrix(data=test_x, label=test_y)
callback_monitor = xgb.callback.EvaluationMonitor(rank=0, period=10, show_stdv=False)

dtrain = xgb.DMatrix(data=df_train.iloc[:, 1:], label=df_train.iloc[:, :1])
dvalidation = xgb.DMatrix(data=df_validation.iloc[:, 1:], label=df_validation.iloc[:, :1])
# XGBoost uses the LAST watchlist entry for early stopping, so the validation
# set must come last; with the training set last, early stopping only tracked
# the training error. (This also swaps the column order of the progress log.)
watchlist = [(dtrain, 'train'), (dvalidation, 'eval')]

bst = xgb.train(params_linear,
                dtrain,
                rounds,
                watchlist,
                early_stopping_rounds=early_stopping_rounds,
                callbacks=[callback_monitor],
                verbose_eval=False)
bst.save_model('temp/predictor.model')
predictions = bst.predict(dtest)
# RMSE on the test set; sqrt(MSE) avoids the `squared=False` argument, which
# newer scikit-learn releases deprecate and then remove
score = np.sqrt(mean_squared_error(test_y, predictions))



[0]	eval-rmse:8091.68066	train-rmse:4131.48633
[10]	eval-rmse:4195.98486	train-rmse:3324.66919
[20]	eval-rmse:4029.15161	train-rmse:3270.61499
[30]	eval-rmse:4028.06714	train-rmse:3254.31982
[40]	eval-rmse:4027.02539	train-rmse:3247.43579
[50]	eval-rmse:4021.57471	train-rmse:3243.72827
[60]	eval-rmse:4013.83936	train-rmse:3241.34839
[70]	eval-rmse:4005.06616	train-rmse:3239.60889
[80]	eval-rmse:3995.98877	train-rmse:3238.23267
[90]	eval-rmse:3987.03931	train-rmse:3237.09546
[99]	eval-rmse:3979.31958	train-rmse:3236.22241


In [26]:
# predict on the test matrix with the trained booster (repeats the call at the end of the training cell)
predictions = bst.predict(dtest)


# Handle the Prescription Indexes

In [48]:

def compute_ma(df_x, field, window_size):
    """Add a per-``geo_code`` trailing moving average of ``field`` to ``df_x`` in place.

    The result is stored in the column ``f"{field}_ma_{window_size}"``. Rows
    that do not yet have a full window within their geo_code group get 0.
    Nothing is returned.
    """
    column_name = f"{field}_ma_{window_size}"
    per_geo = df_x.groupby('geo_code')[field]
    averaged = per_geo.rolling(window_size, center=False).mean().fillna(0)
    # drop the geo_code index level so the values align with df_x's own index
    df_x[column_name] = averaged.reset_index(0, drop=True)


def predict_day(
        df_predict,
        group, 
        prescription):
    """Predict the next day's cases from one day of rows and write them into ``df_predict``.

    Parameters
    ----------
    df_predict : pd.DataFrame
        Full prediction frame; updated in place (``new_cases`` and
        ``confirmed_cases`` of the following day, plus all moving-average columns).
    group : pd.DataFrame
        Rows to predict from; the latest ``date`` in it is treated as "today".
        It is copied before any modification.
    prescription : pd.DataFrame or None
        Prescription rows (``CountryName``, ``RegionName``, ``Date``,
        ``C1_School closing``, ...). When given, the rows dated "today"
        override the matching rows of ``group`` before prediction.

    Relies on the module-level ``encode``, ``scalers``, ``bst`` and ``compute_ma``.
    """
    # "today" and the day the prediction is written to
    date_from = group['date'].max()
    date_to = date_from + pd.to_timedelta(1, unit='d')

    # keep the geo codes: they are dropped by encode() but are needed to map
    # each prediction back to the original data frame
    group = group.copy()
    geo_codes = group['geo_code'].values

    # try applying the prescriptions
    if prescription is not None:
        for _, row in prescription.loc[prescription['Date'] == date_from].iterrows():
            # National rows have a missing RegionName and NaN == NaN is False,
            # so missing regions have to be matched explicitly.
            if pd.isna(row['RegionName']):
                region_match = group['region_name'].isna()
            else:
                region_match = group['region_name'] == row['RegionName']
            # was `roup[...]` (NameError) and shadowed the builtin `filter`
            row_filter = (group['country_name'] == row['CountryName']) & region_match
            group.loc[row_filter, 'c1_school_closing'] = row['C1_School closing']
            #group.loc[row_filter, 'c2_workplace_closing'] = row['C2_Workplace closing']
            #group.loc[row_filter, 'c3_cancel_public_events'] = row['C3_Cancel public events']
            #group.loc[row_filter, 'c4_restrictions_on_gatherings'] = row['C4_Restrictions on gatherings']
            #group.loc[row_filter, 'c5_close_public_transport'] = row['C5_Close public transport']
            #group.loc[row_filter, 'c6_stay_at_home_requirements'] = row['C6_Stay at home requirements']
            #group.loc[row_filter, 'c7_restrictions_on_internal_movement'] = row['C7_Restrictions on internal movement']
            #group.loc[row_filter, 'c8_international_travel_controls'] = row['C8_International travel controls']
            #group.loc[row_filter, 'h1_public_information_campaigns'] = row['H1_Public information campaigns']
            #group.loc[row_filter, 'h2_testing_policy'] = row['H2_Testing policy']
            #group.loc[row_filter, 'h3_contact_tracing'] = row['H3_Contact tracing']
            #group.loc[row_filter, 'h6_facial_coverings'] = row['H6_Facial Coverings']

    # encode with the already fitted encoders and scale with the fitted scalers
    group = encode(group, False)
    group = group.drop(labels=['date'], axis=1)
    for scaler_name in scalers:
        group[scaler_name] = scalers[scaler_name].transform(group[[scaler_name]])

    # run the prediction algorithm (first column is the label, the rest are features)
    print(f"predicting from {date_from} for {date_to}")
    gx, gy = group.iloc[:, 1:], group.iloc[:, :1]
    dg = xgb.DMatrix(data=gx,label=gy)
    predictions = bst.predict(dg)

    # apply the predictions to the NEXT day; the date masks do not depend on
    # the geo code, so they are computed once instead of once per row
    geo_column = df_predict['geo_code']
    rows_from = df_predict['date'] == date_from
    rows_to = df_predict['date'] == date_to
    for geo_code, prediction in zip(geo_codes, predictions):
        value = max(0, prediction)  # case counts cannot be negative
        same_geo = geo_column == geo_code
        filter_from = same_geo & rows_from
        filter_to = same_geo & rows_to
        df_predict.loc[filter_to, 'new_cases'] = value
        df_predict.loc[filter_to, 'confirmed_cases'] = df_predict.loc[filter_from, 'confirmed_cases'].max() + value

    # update the moving averages
    ma_fields = ['new_cases',
                 'confirmed_cases',
                 'c1_school_closing',
                 'c2_workplace_closing',
                 'c3_cancel_public_events',
                 'c4_restrictions_on_gatherings',
                 'c5_close_public_transport',
                 'c6_stay_at_home_requirements',
                 'c7_restrictions_on_internal_movement',
                 'c8_international_travel_controls',
                 'h1_public_information_campaigns',
                 'h2_testing_policy',
                 'h3_contact_tracing',
                 'h6_facial_coverings',
                 'new_cases_as_percent_of_population',
                 'confirmed_cases_as_percent_of_population']
    for window_size in [3, 7, 21]:
        for field in ma_fields:
            compute_ma(df_predict, field, window_size)


def predict(prescription, days_to_predict=3):
    """Roll the model forward day by day and return the resulting frame.

    Parameters
    ----------
    prescription : pd.DataFrame or None
        Prescription rows forwarded to ``predict_day``.
    days_to_predict : int, default 3
        Number of days to predict after the last date that is not flagged as
        predicted in the module-level ``df``.

    Returns
    -------
    pd.DataFrame
        A sorted copy of ``df`` limited to the prediction window, with the
        predicted values and moving averages filled in by ``predict_day``.
        (Earlier versions returned nothing and discarded this frame.)
    """
    # filter the date range for the prediction window
    date_to_predict_from = df[df['predicted'] == False]['date'].max()
    date_to_predict_to = date_to_predict_from + pd.to_timedelta(days_to_predict, unit='d')
    df_predict = df.sort_values(['date', 'geo_code'])
    # explicit copy: predict_day assigns into this frame
    df_predict = df_predict[df_predict['date'] <= pd.to_datetime(date_to_predict_to)].copy()

    # Predict out the dates one at a time. The rows of each day are re-read
    # from df_predict so that a day sees the values (and moving averages)
    # written by the previous day's prediction; groupby().apply() worked on a
    # snapshot taken before any prediction was made. The last day of the
    # window is only a target, so no prediction is started from it.
    one_day = pd.to_timedelta(1, unit='d')
    date_from = date_to_predict_from
    while date_from < date_to_predict_to:
        group = df_predict.loc[df_predict['date'] == date_from]
        if not group.empty:
            predict_day(df_predict, group, prescription)
        date_from = date_from + one_day

    return df_predict

# read the prescriptions and generate a prediction for each prescription index
def do_predictions():
    """Load ``prescriptions.csv`` and call ``predict`` once per prescription index 0-9.

    Only empty strings are read as missing values; the ``Date`` column is
    parsed to datetimes and a summary of the frame is printed first.
    """
    prescriptions = pd.read_csv('prescriptions.csv', na_values='', keep_default_na=False)
    prescriptions['Date'] = pd.to_datetime(prescriptions['Date'])
    prescriptions.info()

    for index in range(10):
        print(f"generating predictions for index {index}")
        rows_for_index = prescriptions['PrescriptionIndex'] == index
        predict(prescriptions.loc[rows_for_index])

# run the predictions for every prescription index (the recorded run of this cell ended in a KeyboardInterrupt)
do_predictions()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212400 entries, 0 to 212399
Data columns (total 18 columns):
 #   Column                                Non-Null Count   Dtype         
---  ------                                --------------   -----         
 0   PrescriptionIndex                     212400 non-null  int64         
 1   CountryName                           212400 non-null  object        
 2   RegionName                            50400 non-null   object        
 3   Date                                  212400 non-null  datetime64[ns]
 4   C1_School closing                     212400 non-null  int64         
 5   C2_Workplace closing                  212400 non-null  int64         
 6   C3_Cancel public events               212400 non-null  int64         
 7   C4_Restrictions on gatherings         212400 non-null  int64         
 8   C5_Close public transport             212400 non-null  int64         
 9   C6_Stay at home requirements          212400 non-null  int6

KeyboardInterrupt: 

In [None]:
# show the last 7 rows of date / new_cases / confirmed_cases for geo_code 'DE'
# NOTE(review): in the cells above df_predict is only assigned inside predict();
# this cell needs a notebook-level df_predict to run on a fresh kernel - confirm where it comes from
df_predict.loc[df_predict['geo_code'] == 'DE'][['date', 'new_cases', 'confirmed_cases']].tail(7)