# Elastic Search Update Notebook

This notebook is used to update the elastic search index with the latest datasets

In [3]:
!pip install xgboost
!pip install git+https://github.com/rbilleci/pandora.git

Collecting git+https://github.com/rbilleci/pandora.git
  Cloning https://github.com/rbilleci/pandora.git to /tmp/pip-req-build-a7w_ctbc
  Running command git clone -q https://github.com/rbilleci/pandora.git /tmp/pip-req-build-a7w_ctbc
Building wheels for collected packages: pandora
  Building wheel for pandora (setup.py) ... [?25ldone
[?25h  Created wheel for pandora: filename=pandora-0.1.0-py3-none-any.whl size=2681412 sha256=c3747069c2561901e403d263153eb5ed96c5ab86d6f89b30d892701e81850a19
  Stored in directory: /tmp/pip-ephem-wheel-cache-2gablfqc/wheels/01/8b/d5/a72c927a738750e04a4bb4fd22f63b4b88c7b5871732e2d67b
Successfully built pandora


In [212]:
import boto3
import pandas as pd
import shutil
import os
import numpy as np
from pandora import loader, encoders
from sklearn.preprocessing import RobustScaler, StandardScaler
import datetime
from pathlib import Path
from logging import INFO, basicConfig, info
import warnings
import xgboost as xgb
from sklearn.metrics import mean_squared_error


In [155]:
# setup logging
basicConfig(level=INFO, format='%(asctime)s\t%(levelname)s\t%(filename)s\t%(message)s')
warnings.filterwarnings('ignore', category=FutureWarning)  # ignore FutureWarning from scikit learn

pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_info_columns = 1000

# Read the Data

In [170]:
# load the dataset and set the date column
df = pd.read_csv('temp/01-data.csv', keep_default_na=False, na_values='')
df['date'] = pd.to_datetime(df['date'])

# determine the date where prediction should begin
prediction_start_date = df[df['predicted'] == True]['date'].min().date()


  interactivity=interactivity, compiler=compiler, result=result)


# Prepare dataset for machine learning

In [215]:
# only work within the specified range
df_ml = df.loc[df['date'] < pd.to_datetime(prediction_start_date)]
enc = {}
enc['continent'] = encoders.BinaryEncoder('continent')
enc['geo_code'] = encoders.BinaryEncoder('geo_code')
enc['country_code'] = encoders.BinaryEncoder('country_code')
enc['day_of_week'] = encoders.OneHotEncoder('day_of_week')
enc['day_of_week_cyc'] = encoders.CyclicalEncoder('day_of_week')
enc['day_of_month'] = encoders.OneHotEncoder('day_of_month')
enc['day_of_month_cyc'] = encoders.CyclicalEncoder('day_of_month')
enc['day_of_year'] = encoders.BinaryEncoder('day_of_year')
enc['day_of_year_cyc'] = encoders.CyclicalEncoder('day_of_year')

def encode(df_x, fit):
    # convert the date to an integer value
    df_x['date_day'] = df_x['date'].apply(lambda x: x.day)

    # encode the geo data
    if fit:
        df_x = enc['continent'].fit_transform(df_x)
        df_x = enc['geo_code'].fit_transform(df_x)
        df_x = enc['country_code'].fit_transform(df_x)
    else:
        df_x = enc['continent'].transform(df_x)
        df_x = enc['geo_code'].transform(df_x)
        df_x = enc['country_code'].transform(df_x)
    if fit:
        df_x = enc['day_of_week'].fit_transform(df_x)
        df_x['day_of_week'] = df_x['day_of_week'] / 7.0
        df_x = enc['day_of_week_cyc'].fit_transform(df_x)
    else:
        df_x = enc['day_of_week'].transform(df_x)
        df_x['day_of_week'] = df_x['day_of_week'] / 7.0
        df_x = enc['day_of_week_cyc'].transform(df_x)
    if fit:
        df_x = enc['day_of_month'].fit_transform(df_x)
        df_x['day_of_month'] = df_x['day_of_month'] / 31.0 # keep it simple
        df_x = enc['day_of_month_cyc'].fit_transform(df_x)
        df_x = enc['day_of_year'].fit_transform(df_x)
        df_x['day_of_year'] = df_x['day_of_year'] / 366.0 # keep it simple
        df_x = enc['day_of_year_cyc'].fit_transform(df_x)
    else:
        df_x = enc['day_of_month'].transform(df_x)
        df_x['day_of_month'] = df_x['day_of_month'] / 31.0 # keep it simple
        df_x = enc['day_of_month_cyc'].transform(df_x)
        df_x = enc['day_of_year'].transform(df_x)
        df_x['day_of_year'] = df_x['day_of_year'] / 366.0 # keep it simple
        df_x = enc['day_of_year_cyc'].transform(df_x)
        
    
    # drop unused columns
    df_x = df_x.drop(labels=['country_name',
                           'continent',
                           'geo_code',
                           'country_code',
                           'day_of_week',
                           'day_of_month',
                           'day_of_year',
                            'country_code3',
                            'country_code_numeric',
                            'confirmed_deaths',
                            'predicted',
                            'region_name',
                            'month',                           
                            'quarter',
                            'week'], axis=1)
    return df_x

df_ml = encode(df_ml, fit=True)





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


# Get the train, val, test split



In [181]:
days_for_validation = 31
days_for_test = 14

def split(df: pd.DataFrame, 
          days_for_validation: int, 
          days_for_test: int) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    
    # First, sort the data by date
    df = df.sort_values('date')

    # Determine the maximum date
    date_start_test = df['date'].max() - pd.to_timedelta(days_for_test - 1, unit='d')
    date_start_validation = date_start_test - pd.to_timedelta(days_for_validation, unit='d')

    df_train = df[df['date'] < date_start_validation]
    df_validation = df[(df['date'] >= date_start_validation) & (df['date'] < date_start_test)]
    df_test = df[df['date'] >= date_start_test]

    # Debug the outpoint
    print(f"Training Range:   {df_train['date'].min().date()} - {df_train['date'].max().date()}")
    print(f"Validation Range: {df_validation['date'].min().date()} - {df_validation['date'].max().date()}")
    print(f"Test Range:       {df_test['date'].min().date()} - {df_test['date'].max().date()}")

    # Sanity Check
    if len(df.index) != len(df_train.index) + len(df_validation.index) + len(df_test.index):
        raise Exception('entries do not add up')

    return df_train, df_validation, df_test

df_train_prescaled, df_validation_prescaled, df_test_prescaled = split(df_ml, days_for_validation, days_for_test)

Training Range:   2020-01-01 - 2020-12-25
Validation Range: 2020-12-26 - 2021-01-25
Test Range:       2021-01-26 - 2021-02-08


# Scale

In [182]:
df_train = df_train_prescaled.copy()
df_validation = df_validation_prescaled.copy()
df_test = df_test_prescaled.copy()

scalers = {}


for feature_name in df_ml.columns.values:
    if feature_name == 'date' or feature_name == 'predicted_new_cases':
        continue
    scalers[feature_name] = StandardScaler()
    df_train[feature_name] = scalers[feature_name].fit_transform(df_train_prescaled[[feature_name]])
        
if len(df_validation_prescaled) > 0:        
    for feature_name in df_ml.columns.values:
        if feature_name == 'date' or feature_name == 'predicted_new_cases':
            continue
        df_validation[feature_name] = scalers[feature_name].transform(df_validation_prescaled[[feature_name]])

if len(df_test_prescaled) > 0:        
    for feature_name in df_ml.columns.values:
        if feature_name == 'date' or feature_name == 'predicted_new_cases':
            continue        
        df_test[feature_name] = scalers[feature_name].transform(df_test_prescaled[[feature_name]])

df_train = df_train.drop(labels=['date'], axis=1)
df_validation = df_validation.drop(labels=['date'], axis=1)
df_test = df_test.drop(labels=['date'], axis=1)


# HPO Baby!

In [183]:

# this is a manual step to determine the best parameters...

# note that HPO does not have a header column
"""
df_train.to_csv(f"hpo_train.csv", index=False, header=False)
df_validation.to_csv(f"hpo_validation.csv", index=False, header=False)
df_test.to_csv(f"hpo_test.csv", index=False, header=False)

# push to s3
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join('hpo', 'train')).upload_file('hpo_train.csv')
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join('hpo', 'validation')).upload_file('hpo_validation.csv')
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join('hpo', 'test')).upload_file('hpo_test.csv')
"""

'\ndf_train.to_csv(f"hpo_train.csv", index=False, header=False)\ndf_validation.to_csv(f"hpo_validation.csv", index=False, header=False)\ndf_test.to_csv(f"hpo_test.csv", index=False, header=False)\n\n# push to s3\nboto3.Session().resource(\'s3\').Bucket(bucket).Object(os.path.join(\'hpo\', \'train\')).upload_file(\'hpo_train.csv\')\nboto3.Session().resource(\'s3\').Bucket(bucket).Object(os.path.join(\'hpo\', \'validation\')).upload_file(\'hpo_validation.csv\')\nboto3.Session().resource(\'s3\').Bucket(bucket).Object(os.path.join(\'hpo\', \'test\')).upload_file(\'hpo_test.csv\')\n'

# Let's Train

In [184]:



params_hpo = {
    'alpha': 0.03244945870956029,
    'nthread': 1,
    'objective':'reg:squarederror',
    'eta': 0.16740916905083,
    'colsample_bylevel': 0.8953507439536397,
    'colsample_bytree': 0.9459754246923018,
    'gamma': 0.4185824423681137,
    'max_delta_step': 0,
    'max_depth': 6,
    'min_child_weight': 0.8210632572458949,
    'subsample': 0.7371710015275215
    
    
}

params_tree = {
    'nthread': 1,
    'objective':'reg:squarederror',
    'eta': 0.1
}

params_dart = {
    "booster": "dart",
    'nthread': 1,    
    "objective": "reg:squarederror",
    "rate_drop": 0.1
}

params_linear = {
    "booster": "gblinear",
    'nthread': 1,    
    "objective": "reg:squarederror",
}


In [185]:

from sklearn.metrics import mean_squared_error

use_walk_forward = False

test_x, test_y = df_test.iloc[:, 1:], df_test.iloc[:, :1]
dtest = xgb.DMatrix(data=test_x,label=test_y)
early_stopping_rounds = 10
callback_monitor = xgb.callback.EvaluationMonitor(rank=0, period=10, show_stdv=False)

if use_walk_forward:
    df_train_and_validation = pd.concat([df_train, df_validation])
    training_loops = 3
    training_steps = 50
    records = df_train_and_validation.shape[0]
    records_per_step = int(records / training_steps)


    # The number of training rounds
    trained_once = False
    for z in range(0, training_loops):

        # Step through the dataset
        for step in range(0, training_steps):

            # get the ranges we'll work from
            train_start = records_per_step * step
            val_start = records_per_step * (step + 1)
            val_end = val_start + records_per_step
            
            # make sure there is sufficient validation data available to run an iteration
            if (records - val_start) < records_per_step:
                print(f"finishing walk with {records - val_start} records remaining")
                break

            # slice our training and validation sets
            train, val = df_train_and_validation[train_start:val_start], df_train_and_validation[val_start:val_end]

            # Get the train and val data, then train the model
            tx, ty = train.iloc[:, 1:], train.iloc[:, :1]
            vx, vy = val.iloc[:, 1:], val.iloc[:, :1]

            # setup the training dataset        
            dtrain = xgb.DMatrix(data=tx,label=ty)
            dvalidation = xgb.DMatrix(data=vx,label=vy)
            watchlist = [(dvalidation, 'eval'), (dtrain, 'train')]

            if trained_once:
                bst = xgb.train(params_linear, 
                                dtrain, 
                                1000, 
                                watchlist, 
                                early_stopping_rounds=early_stopping_rounds,
                                verbose_eval=False,
                                xgb_model='temp/predictor.model')
                bst.save_model('temp/predictor.model')
            else:
                bst = xgb.train(params_linear, 
                                dtrain, 
                                1000, 
                                watchlist,
                                early_stopping_rounds=early_stopping_rounds,      
                                verbose_eval=False)
                bst.save_model('temp/predictor.model')
                trained_once = True


            # get the status
            predictions = bst.predict(dtest)
            score = mean_squared_error(test_y, predictions, squared=False)
            print(f"walk={z+1}/{training_loops}, step={step+1}/{training_steps}, {len(train)}, {len(val)}, rmse={score}")
else:
    
    dtrain = xgb.DMatrix(data=df_train.iloc[:, 1:], label=df_train.iloc[:, :1])
    dvalidation = xgb.DMatrix(data=df_validation.iloc[:, 1:], label=df_validation.iloc[:, :1])
    watchlist = [(dvalidation, 'eval'), (dtrain, 'train')]
    
    bst = xgb.train(params_linear, 
                    dtrain, 
                    1000, 
                    watchlist,
                    early_stopping_rounds=early_stopping_rounds,      
                    callbacks=[callback_monitor],
                    verbose_eval=False)
    bst.save_model('temp/predictor.model')
    predictions = bst.predict(dtest)
    score = mean_squared_error(test_y, predictions, squared=False)
    print(score)
    


[0]	eval-rmse:8100.29346	train-rmse:4129.94922
[10]	eval-rmse:4225.42432	train-rmse:3324.32739
[20]	eval-rmse:4043.60181	train-rmse:3270.14624
[30]	eval-rmse:4039.64380	train-rmse:3253.82935
[40]	eval-rmse:4037.38037	train-rmse:3246.95752
[50]	eval-rmse:4030.91553	train-rmse:3243.26343
[60]	eval-rmse:4022.33081	train-rmse:3240.89331
[70]	eval-rmse:4012.86548	train-rmse:3239.16211
[80]	eval-rmse:4003.22412	train-rmse:3237.78980
[90]	eval-rmse:3993.83057	train-rmse:3236.65527
[100]	eval-rmse:3984.92651	train-rmse:3235.69653
[110]	eval-rmse:3976.62744	train-rmse:3234.87793
[120]	eval-rmse:3968.97095	train-rmse:3234.18115
[130]	eval-rmse:3961.95581	train-rmse:3233.57764
[140]	eval-rmse:3955.55493	train-rmse:3233.06323
[150]	eval-rmse:3949.72559	train-rmse:3232.61621
[160]	eval-rmse:3944.42090	train-rmse:3232.23022
[170]	eval-rmse:3939.58520	train-rmse:3231.89893
[180]	eval-rmse:3935.17749	train-rmse:3231.60644
[190]	eval-rmse:3931.15210	train-rmse:3231.35889
[200]	eval-rmse:3927.47754	trai

In [186]:
predictions = bst.predict(dtest)
score = mean_squared_error(test_y, predictions, squared=False)
print(score)

4297.474437630029


# Save the scalers

In [187]:
from pickle import dump

dump(scalers, open('temp/scalers.pkl', 'wb'))

# Save the Scaled Data, used for predictions

In [217]:
# let's sorty by geo code and date first
df_baseline = df.copy()
df_baseline = df_baseline.drop(labels=['predicted_new_cases'], axis=1)
df_baseline = df_baseline.sort_values(['geo_code', 'date'])

filename_prefix = 'prediction-baseline'
dir_output_prediction_baseline = 'temp/prediction-baseline'

# recreate the directory, deleting any existing content
shutil.rmtree(dir_output_prediction_baseline, ignore_errors=True)
Path(dir_output_prediction_baseline).mkdir(parents=True, exist_ok=True)

# for each geography, write a JSON and CSV file
for geo_code in df_baseline['geo_code'].unique():
    df_geo = df_baseline.loc[df_baseline['geo_code'] == geo_code].copy()
    
    # encode and scale
    df_geo = encode(df_geo, fit=False)
    for scaler_name in scalers:
        if scaler_name in df_geo.columns:
            df_geo[scaler_name] = scalers[scaler_name].transform(df_geo[[scaler_name]])    
    
    # write the output
    df_geo.to_csv(f"{dir_output_prediction_baseline}/{filename_prefix}-{geo_code.replace('/', '-')}.csv", 
                  index=False)


In [218]:
df_ml.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95580 entries, 0 to 138114
Data columns (total 178 columns):
 #    Column                                          Non-Null Count  Dtype         
---   ------                                          --------------  -----         
 0    predicted_new_cases                             95580 non-null  float64       
 1    age_distribution_00_04                          95580 non-null  float64       
 2    age_distribution_05_14                          95580 non-null  float64       
 3    age_distribution_15_34                          95580 non-null  float64       
 4    age_distribution_34_64                          95580 non-null  float64       
 5    age_distribution_65_plus                        95580 non-null  float64       
 6    c1_school_closing                               95580 non-null  float64       
 7    c1_school_closing_ma_21                         95580 non-null  float64       
 8    c1_school_closing_ma_3           