# Elasticsearch Update Notebook

This notebook is used to update the Elasticsearch index with the latest datasets.

In [None]:
!pip install xgboost

In [None]:
import boto3
import pandas as pd
import shutil
import os
import uuid
import numpy as np
from sklearn.preprocessing import RobustScaler
from sagemaker import get_execution_role, session
import datetime
from pathlib import Path
from elasticsearch import helpers
from requests_aws4auth import AWS4Auth
from elasticsearch import Elasticsearch, RequestsHttpConnection
from logging import INFO, basicConfig, info
import warnings

In [None]:
# Silence FutureWarning messages (emitted by scikit-learn) to keep cell output readable.
warnings.filterwarnings(action='ignore', category=FutureWarning)

# Log at INFO level with a tab-separated timestamp / level / file / message layout.
basicConfig(
    format='%(asctime)s\t%(levelname)s\t%(filename)s\t%(message)s',
    level=INFO,
)

In [None]:
# Display settings: `None` lifts the column/row limit when rendering frames;
# DataFrame.info() lists per-column details for up to 1000 columns.
display_options = {
    'display.max_columns': None,
    'display.max_rows': None,
    'display.max_info_columns': 1000,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Read the Data

In [None]:
# Load the prepared dataset.
# - keep_default_na=False / na_values='': only empty fields are treated as missing,
#   so literal strings such as "NA" are kept as values.
# - parse_dates=['date']: later cells compare `date` against timestamps and call
#   `.day` / `.date()` on its values, which requires a datetime column.
df = pd.read_csv('temp/01-data.csv', keep_default_na=False, na_values='', parse_dates=['date'])

In [None]:
def compute_ma(field, window_size):
    """Add a per-geo trailing moving average of `field` to the global `df`.

    The result is stored in a new column named ``f"{field}_ma_{window_size}"``.
    NaN results (e.g. rows where the window is not yet full) are replaced with 0.

    :param field: name of the column in the global `df` to average
    :param window_size: number of rows in the rolling window
    """
    per_geo = df.groupby('geo_code')[field]
    rolling_mean = per_geo.rolling(window_size, center=False).mean()
    # Drop the geo_code index level added by the grouped rolling so the values
    # align with the original row index of `df` on assignment.
    aligned = rolling_mean.fillna(0).reset_index(0, drop=True)
    df[f"{field}_ma_{window_size}"] = aligned

def add_working_day_tomorrow(grouped):
    """Add a `working_day_tomorrow` column holding the next row's `working_day`.

    The last row has no successor, so it is filled from the nearest available
    value (back-fill first, then forward-fill).

    :param grouped: frame with a `working_day` column; modified in place
    :return: the same frame, with the new column added
    """
    next_row_value = grouped['working_day'].shift(-1)
    grouped['working_day_tomorrow'] = next_row_value.bfill().ffill()
    return grouped


def add_working_day_yesterday(grouped):
    """Add a `working_day_yesterday` column holding the previous row's `working_day`.

    The first row has no predecessor, so it is filled from the nearest available
    value (back-fill first, then forward-fill).

    :param grouped: frame with a `working_day` column; modified in place
    :return: the same frame, with the new column added
    """
    previous_row_value = grouped['working_day'].shift(1)
    grouped['working_day_yesterday'] = previous_row_value.bfill().ffill()
    return grouped

def transform_column_order(df):
    """Return a copy of `df` with the label column first and the rest sorted by name.

    :param df: frame containing a `predicted_new_cases` column
    :return: new frame whose first column is `predicted_new_cases`, followed by
             all remaining columns in alphabetical order
    :raises KeyError: if `predicted_new_cases` is not a column of `df`
    """
    target = 'predicted_new_cases'
    alphabetical = df.reindex(sorted(df.columns), axis=1)
    feature_columns = [name for name in alphabetical.columns if name != target]
    return alphabetical[[target] + feature_columns]

# Daily new cases per geo: row-to-row change of the cumulative confirmed cases.
# Missing differences (first row of each geo) become 0, and negative values
# (which do not make sense for this column) are clamped to 0.
# NOTE(review): diff/shift/rolling in this cell follow the current row order
# within each geo_code, while the sort by date only happens at the end of the
# cell — presumably the CSV is already ordered by date; confirm.
df['new_cases'] = df.groupby('geo_code')['confirmed_cases'].diff().fillna(0).clip(lower=0)

# Label: the next row's new cases within the same geo (0 for the last row of each geo).
df['predicted_new_cases'] = df.groupby('geo_code')['new_cases'].shift(-1).fillna(0).clip(lower=0)

# Case counts relative to population (stored as a plain ratio, not multiplied by 100).
for case_column in ('new_cases', 'confirmed_cases'):
    df[f'{case_column}_as_percent_of_population'] = df[case_column] / df['population']

# Policy indicator columns; also summed into `npi_sum` further down.
npi_columns = [
    'c1_school_closing',
    'c2_workplace_closing',
    'c3_cancel_public_events',
    'c4_restrictions_on_gatherings',
    'c5_close_public_transport',
    'c6_stay_at_home_requirements',
    'c7_restrictions_on_internal_movement',
    'c8_international_travel_controls',
    'h1_public_information_campaigns',
    'h2_testing_policy',
    'h3_contact_tracing',
    'h6_facial_coverings',
]

# Every column that gets 3-, 7- and 21-row moving averages.
moving_average_fields = (
    ['new_cases', 'confirmed_cases', 'specific_humidity', 'temperature']
    + npi_columns
    + ['working_day',
       'new_cases_as_percent_of_population',
       'confirmed_cases_as_percent_of_population']
)

for window_size in [3, 7, 21]:
    for ma_field in moving_average_fields:
        compute_ma(ma_field, window_size)

# Add working day information for tomorrow and yesterday (computed per geo)
df = df.groupby('geo_code').apply(add_working_day_tomorrow).reset_index(drop=True)
df = df.groupby('geo_code').apply(add_working_day_yesterday).reset_index(drop=True)

# Sum of all policy indicators, added left to right in the order listed above.
npi_total = df[npi_columns[0]]
for npi_column in npi_columns[1:]:
    npi_total = npi_total + df[npi_column]
df['npi_sum'] = npi_total

# Put the label column first (remaining columns alphabetical), then order rows
# by geo and date.
df = transform_column_order(df)
df = df.sort_values(['geo_code', 'date'])

# Prepare dataset for machine learning

In [None]:
# Keep only rows dated strictly before the prediction start date.
prediction_cutoff = pd.to_datetime(prediction_start_date)

# Columns removed before encoding / training.
columns_excluded_from_ml = [
    'country_name',
    'country_code3',
    'country_code_numeric',
    'confirmed_deaths',
    'region_name',
    'month',
    'quarter',
    'week',
]

df_ml = df.loc[df['date'] < prediction_cutoff].drop(columns=columns_excluded_from_ml)

def encode(df_x):
    """Encode the date, geographic and calendar columns of `df_x`.

    Steps, in order:
      * add `date_day`, taken from the `.day` attribute of each `date` value;
      * run `encoders.BinaryEncoder` on `continent`, `geo_code` and
        `country_code`, dropping each original column afterwards;
      * run a one-hot and a cyclical encoder on `day_of_week` (scaled by 1/7);
      * run a binary and a cyclical encoder on `day_of_month` (scaled by 1/31)
        and `day_of_year` (scaled by 1/366); the scaled source columns are
        dropped after the cyclical encoding.

    NOTE(review): `encoders` is not imported in the visible cells, and the
    columns its encoders add cannot be determined from here — confirm against
    that module.
    NOTE(review): the `date_day` assignment writes into the frame that was
    passed in before `df_x` is rebound — confirm callers do not rely on the
    input staying untouched.

    :param df_x: frame with `date`, `continent`, `geo_code`, `country_code`,
                 `day_of_week`, `day_of_month` and `day_of_year` columns
    :return: the encoded frame
    """
    # extract the day component of the date as its own column
    df_x['date_day'] = df_x['date'].apply(lambda x: x.day)

    # encode the geo data; the raw categorical column is dropped after each encoding
    df_x = encoders.BinaryEncoder('continent').fit_transform(df_x).drop(labels=['continent'], axis=1)
    df_x = encoders.BinaryEncoder('geo_code').fit_transform(df_x).drop(labels=['geo_code'], axis=1)
    df_x = encoders.BinaryEncoder('country_code').fit_transform(df_x).drop(labels=['country_code'], axis=1)

    # day of week
    # NOTE(review): the one-hot encoding runs before the division by 7 — presumably
    # it leaves the original `day_of_week` column in place; confirm.
    df_x = encoders.OneHotEncoder('day_of_week').fit_transform(df_x)
    df_x['day_of_week'] = df_x['day_of_week'] / 7.0
    df_x = encoders.CyclicalEncoder('day_of_week').fit_transform(df_x).drop(labels=['day_of_week'], axis=1)

    # day of month
    df_x['day_of_month'] = df_x['day_of_month'] / 31.0 # keep it simple
    df_x = encoders.BinaryEncoder('day_of_month').fit_transform(df_x)
    df_x = encoders.CyclicalEncoder('day_of_month').fit_transform(df_x).drop(labels=['day_of_month'], axis=1)

    # day of year
    df_x['day_of_year'] = df_x['day_of_year'] / 366.0 # keep it simple
    df_x = encoders.BinaryEncoder('day_of_year').fit_transform(df_x)
    df_x = encoders.CyclicalEncoder('day_of_year').fit_transform(df_x).drop(labels=['day_of_year'], axis=1)
    return df_x

# Replace the filtered frame with its encoded version
df_ml = encode(df_ml)

# Get the train, val, test split



In [None]:
# Number of days directly before the test window reserved for validation;
# 0 leaves the validation split empty.
days_for_validation = 0
# Number of most recent days (ending at the latest date) held out for testing.
days_for_test = 14

def split(df: pd.DataFrame,
          days_for_validation: int,
          days_for_test: int) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    """Split `df` chronologically into train, validation and test frames.

    The test split covers the last `days_for_test` days up to and including the
    latest `date`. The validation split covers the `days_for_validation` days
    directly before it, and everything earlier is training data. A split may
    be empty (e.g. `days_for_validation=0`).

    :param df: frame with a datetime `date` column
    :param days_for_validation: length of the validation window in days
    :param days_for_test: length of the test window in days
    :return: tuple `(df_train, df_validation, df_test)`, each sorted by date
    :raises Exception: if the three splits do not cover every row of `df`
                       (for example when `date` contains missing values)
    """
    def _describe_range(part: pd.DataFrame) -> str:
        # An empty split has no min/max date; calling .date() on NaT would raise.
        if part.empty:
            return 'n/a (empty)'
        return f"{part['date'].min().date()} - {part['date'].max().date()}"

    # First, sort the data by date
    df = df.sort_values('date')

    # Derive the window boundaries from the latest date in the data
    date_start_test = df['date'].max() - pd.to_timedelta(days_for_test - 1, unit='d')
    date_start_validation = date_start_test - pd.to_timedelta(days_for_validation, unit='d')

    df_train = df[df['date'] < date_start_validation]
    df_validation = df[(df['date'] >= date_start_validation) & (df['date'] < date_start_test)]
    df_test = df[df['date'] >= date_start_test]

    # Report the date range covered by each split
    print(f"Training Range:   {_describe_range(df_train)}")
    print(f"Validation Range: {_describe_range(df_validation)}")
    print(f"Test Range:       {_describe_range(df_test)}")

    # Sanity check: every row must land in exactly one split
    if len(df.index) != len(df_train.index) + len(df_validation.index) + len(df_test.index):
        raise Exception('entries do not add up')

    return df_train, df_validation, df_test

# Chronological split of the encoded data; the "_prescaled" frames hold the values before scaling
df_train_prescaled, df_validation_prescaled, df_test_prescaled = split(df_ml, days_for_validation, days_for_test)

# Scale

In [None]:


# Columns left unscaled: the date (dropped below) and the label.
unscaled_columns = ('date', 'predicted_new_cases')
scaled_feature_names = [name for name in df_ml.columns.values if name not in unscaled_columns]

# Work on copies so the pre-scaled frames stay available.
df_train = df_train_prescaled.copy()
df_validation = df_validation_prescaled.copy()
df_test = df_test_prescaled.copy()

# Fit one RobustScaler per feature, using the training split only.
scalers = {}
for feature_name in scaled_feature_names:
    scalers[feature_name] = RobustScaler()
    df_train[feature_name] = scalers[feature_name].fit_transform(df_train_prescaled[[feature_name]])

# Apply the training-fitted scalers to the other splits; empty splits are skipped.
for scaled_frame, prescaled_frame in ((df_validation, df_validation_prescaled),
                                      (df_test, df_test_prescaled)):
    if len(prescaled_frame) > 0:
        for feature_name in scaled_feature_names:
            scaled_frame[feature_name] = scalers[feature_name].transform(prescaled_frame[[feature_name]])

# The date was only needed for splitting; it is not a model feature.
df_train = df_train.drop(columns=['date'])
df_validation = df_validation.drop(columns=['date'])
df_test = df_test.drop(columns=['date'])


# HPO Baby!

In [15]:

# This is a manual step used to determine the best hyperparameters.
# The export code below is disabled by wrapping it in a string literal; remove the
# surrounding triple quotes to run it.
# NOTE(review): it references `bucket`, which is not defined in the visible cells —
# confirm it is set before enabling.

# Note: the CSV files are written with header=False, i.e. without a header row.
"""
df_train.to_csv(f"hpo_train.csv", index=False, header=False)
df_validation.to_csv(f"hpo_validation.csv", index=False, header=False)
df_test.to_csv(f"hpo_test.csv", index=False, header=False)

# push to s3
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join('hpo', 'train')).upload_file('hpo_train.csv')
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join('hpo', 'validation')).upload_file('hpo_validation.csv')
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join('hpo', 'test')).upload_file('hpo_test.csv')
"""

# Let's Train

In [None]:
import xgboost as xgb
import numpy as np
from sklearn.metrics import mean_squared_error

# Separate label and features for each split: the first column is the label,
# every other column is a feature.
# NOTE(review): assumes column 0 is still `predicted_new_cases` after encoding
# and scaling — confirm the encoders preserve column order.
train_y, train_x = df_train.iloc[:, :1], df_train.iloc[:, 1:]
validation_y, validation_x = df_validation.iloc[:, :1], df_validation.iloc[:, 1:]
test_y, test_x = df_test.iloc[:, :1], df_test.iloc[:, 1:]

# Training matrix consumed by the cross-validation below.
data_dmatrix = xgb.DMatrix(data=train_x, label=train_y)

# Hyperparameters used for the cross-validation run.
params = {
    "alpha": 0.734,
    # "booster": "gbtree",
    "colsample_bytree": 0.9247351903173018,
    "colsample_bylevel": 0.11338785332863967,
    "eta": 0.2857594277665818,
    "gamma": 0.7840414267431137,
    # "learning_rate": 0.1,
    "max_delta_step": 0,
    "max_depth": 43,
    "max_leaves": 0,
    "min_child_weight": 0.015174585370894897,
    "objective": "reg:squarederror",
    "subsample": 0.5982549859025215,
}

# Alternative parameter set; not passed to xgb.cv in this cell.
params_baseline = {
    "max_depth": 6,
    "min_child_weight": 1,
    "eta": .3,
    "subsample": 1,
    "colsample_bytree": 1,
    "objective": "reg:squarederror",
}

# 10-fold cross-validation over at most 20 boosting rounds, evaluated with RMSE.
cv_results = xgb.cv(
    params=params,
    dtrain=data_dmatrix,
    num_boost_round=20,
    nfold=10,
    metrics="rmse",
    early_stopping_rounds=10,
    as_pandas=True,
    verbose_eval=True,
    seed=123,
)
print(cv_results.tail(100))
print((cv_results["test-rmse-mean"]).tail(10))




[0]	train-rmse:7030.52534+146.13767	test-rmse:7017.84634+1264.82343
[1]	train-rmse:6014.82485+152.46684	test-rmse:6051.61816+1363.17175
[2]	train-rmse:5153.38862+151.06150	test-rmse:5271.16826+1461.30391
[3]	train-rmse:4452.34253+162.47070	test-rmse:4639.33069+1552.80275
[4]	train-rmse:3857.96174+167.17000	test-rmse:4126.22151+1645.06974
[5]	train-rmse:3365.56177+160.58723	test-rmse:3722.10637+1718.97338
[6]	train-rmse:2938.35625+157.62463	test-rmse:3396.18481+1778.71113
[7]	train-rmse:2574.99138+155.65825	test-rmse:3147.93709+1830.01902
[8]	train-rmse:2282.22495+156.46754	test-rmse:2971.96692+1868.51628
[9]	train-rmse:2041.65498+163.99757	test-rmse:2819.78458+1900.49468
[10]	train-rmse:1818.10314+152.91726	test-rmse:2690.87959+1929.98852
[11]	train-rmse:1618.93767+143.72061	test-rmse:2602.85756+1949.31768
[12]	train-rmse:1446.69102+137.13164	test-rmse:2537.62733+1963.47213
[13]	train-rmse:1308.88207+128.32482	test-rmse:2488.94933+1975.21751
[14]	train-rmse:1186.30573+126.88331	test-rm

In [273]:
# Show the first ten rows of the cross-validation results
print(cv_results.head(10))

    train-rmse-mean  train-rmse-std  test-rmse-mean  test-rmse-std
0         14.448232        0.457032       14.128955       3.542234
1         13.281850        0.453128       13.022488       3.631210
2         12.247323        0.451868       12.034971       3.731681
3         11.357578        0.527235       11.216321       3.785879
4         10.516548        0.530564       10.433459       3.872672
5          9.828135        0.591498        9.789183       3.954284
6          9.142688        0.583163        9.192409       4.022693
7          8.485017        0.562325        8.619303       4.099982
8          7.929716        0.564755        8.147279       4.160788
9          7.432303        0.567991        7.728239       4.210274
10         6.981730        0.556943        7.346896       4.272597
11         6.590199        0.541217        7.070919       4.311739
12         6.198319        0.515502        6.778163       4.365013
13         5.871631        0.526149        6.519623       4.40

# Write the model, scalers, and training data