# Elastic Search Update Notebook

This notebook updates the Elasticsearch index with the latest datasets.

In [32]:
import boto3
from sagemaker import get_execution_role, session
# Resolve the SageMaker execution role, then take the region and the default
# bucket from a single boto3 session.
boto_session = boto3.Session()
role = get_execution_role()
region = boto_session.region_name
bucket = session.Session(boto_session).default_bucket()

arn:aws:iam::675175040308:role/DashboardRole
eu-central-1


In [1]:
!pip install --upgrade --force-reinstall git+https://github.com/rbilleci/pandora.git

Collecting git+https://github.com/rbilleci/pandora.git
  Cloning https://github.com/rbilleci/pandora.git to /tmp/pip-req-build-03_ii95i
  Running command git clone -q https://github.com/rbilleci/pandora.git /tmp/pip-req-build-03_ii95i
Collecting pandas~=1.2.1
  Using cached pandas-1.2.1-cp37-cp37m-manylinux1_x86_64.whl (9.9 MB)
Processing /root/.cache/pip/wheels/8a/82/52/2f7cb5b39aad6b4beb08a6741a756fc3d1e104224c2b42fa1b/fnvhash-0.1.0-py3-none-any.whl
Collecting scikit-learn~=0.24.1
  Using cached scikit_learn-0.24.1-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
Collecting workalendar~=14.1.0
  Using cached workalendar-14.1.0-py3-none-any.whl (187 kB)
Collecting category-encoders~=2.2.2
  Using cached category_encoders-2.2.2-py2.py3-none-any.whl (80 kB)
Collecting numpy>=1.16.5
  Using cached numpy-1.20.0-cp37-cp37m-manylinux2010_x86_64.whl (15.3 MB)
Collecting python-dateutil>=2.7.3
  Using cached python_dateutil-2.8.1-py2.py3-none-any.whl (227 kB)
Collecting pytz>=2017.3
  Using cach

In [6]:

import json
from datetime import date
from logging import basicConfig, INFO

import pandas as pd

import pandora
import pandora.data.age_distribution as age_dist
import pandora.data.oxford_data as oxford
import pandora.data.population as population
import pandora.data.temperatures as temperatures
from pandora.data import geo, continent, country_code, working_day
from pandora import loader
from pandora.core_fields import DATE, COUNTRY_CODE

# Tab-separated log lines: timestamp, level, source file, message.
basicConfig(format='%(asctime)s\t%(levelname)s\t%(filename)s\t%(message)s', level=INFO)

# Lift pandas' display limits so wide frames are shown without truncation.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_info_columns', 1000)


# Update the data files

In [22]:
# Import the updater submodule explicitly. `import pandora` does not by itself
# guarantee that `pandora.data.oxford_data_update` has been loaded, so without
# this line the call can fail with AttributeError on a freshly restarted kernel.
import pandora.data.oxford_data_update

# Refresh the Oxford data files (the recorded run log reports
# "download oxford data set" for this call).
pandora.data.oxford_data_update.update()

2021-02-07 19:17:35,242	INFO	oxford_data_update.py	download oxford data set


# Build the data files

In [23]:
# Date range to load; the imputation window spans the same range.
start_date = date(2020, 1, 1)
end_date = date.today()
imputation_window_start_date = date(2020, 1, 1)
imputation_window_end_date = end_date

# Modules handed to the loader as a list; geo.module is passed separately,
# ahead of this list.
data_modules = [
    country_code.module,
    continent.module,
    population.module,
    age_dist.module,
    temperatures.module,
    oxford.module,
    working_day.module,
]

df = loader.load(
    start_date,
    end_date,
    imputation_window_start_date,
    imputation_window_end_date,
    geo.module,
    data_modules,
)
df.info()

2021-02-07 19:17:52,331	INFO	loader.py	/opt/conda/lib/python3.7/site-packages/pandora/data/geo.csv - loading
2021-02-07 19:17:55,225	INFO	loader.py	/opt/conda/lib/python3.7/site-packages/pandora/data/country_code.csv - loading
2021-02-07 19:17:57,475	INFO	loader.py	/opt/conda/lib/python3.7/site-packages/pandora/data/continent.csv - loading
2021-02-07 19:18:00,108	INFO	loader.py	/opt/conda/lib/python3.7/site-packages/pandora/data/population.csv - loading
2021-02-07 19:18:04,197	INFO	imputer.py	/opt/conda/lib/python3.7/site-packages/pandora/data/population.csv - imputing population
2021-02-07 19:18:04,590	INFO	imputer.py	/opt/conda/lib/python3.7/site-packages/pandora/data/population.csv - imputing population_density
2021-02-07 19:18:04,978	INFO	imputer.py	/opt/conda/lib/python3.7/site-packages/pandora/data/population.csv - imputing population_percent_urban
2021-02-07 19:18:05,390	INFO	imputer.py	/opt/conda/lib/python3.7/site-packages/pandora/data/population.csv - imputing gdp_per_capita


<class 'pandas.core.frame.DataFrame'>
Int64Index: 95344 entries, 0 to 95343
Data columns (total 51 columns):
 #   Column                                Non-Null Count  Dtype         
---  ------                                --------------  -----         
 0   age_distribution_00_04                95344 non-null  float64       
 1   age_distribution_05_14                95344 non-null  float64       
 2   age_distribution_15_34                95344 non-null  float64       
 3   age_distribution_34_64                95344 non-null  float64       
 4   age_distribution_65_plus              95344 non-null  float64       
 5   c1_school_closing                     95344 non-null  float64       
 6   c2_workplace_closing                  95344 non-null  float64       
 7   c3_cancel_public_events               95344 non-null  float64       
 8   c4_restrictions_on_gatherings         95344 non-null  float64       
 9   c5_close_public_transport             95344 non-null  float64       
 10

# Add computed columns

In [133]:

metrics = ('cases', 'deaths')

# Daily new cases/deaths: per-geography difference of the cumulative counts.
# The first row of each geography has no predecessor and becomes 0; negative
# differences (which do not make sense for these columns) are clipped to 0.
for metric in metrics:
    daily_change = df.groupby('geo_code')[f'confirmed_{metric}'].diff().fillna(0)
    df[f'new_{metric}'] = daily_change.clip(lower=0)

# Trailing moving averages of the daily values, computed per geography.
for window_size in [3, 7, 21]:
    for metric in metrics:
        rolling_mean = (
            df.groupby('geo_code')[f'new_{metric}']
            .rolling(window_size, center=False)
            .mean()
            .fillna(0)
        )
        # Drop the geo_code index level so the result aligns with df's own index.
        df[f"new_{metric}_smoothed_ma_{window_size}"] = rolling_mean.reset_index(level=0, drop=True)

df.sample(3)

Unnamed: 0,age_distribution_00_04,age_distribution_05_14,age_distribution_15_34,age_distribution_34_64,age_distribution_65_plus,c1_school_closing,c2_workplace_closing,c3_cancel_public_events,c4_restrictions_on_gatherings,c5_close_public_transport,c6_stay_at_home_requirements,c7_restrictions_on_internal_movement,c8_international_travel_controls,confirmed_cases,confirmed_cases--,confirmed_deaths,continent,country_code,country_code3,country_code_numeric,country_name,date,day_of_month,day_of_week,day_of_year,gdp_per_capita,gdp_per_capita--,geo_code,h1_public_information_campaigns,h2_testing_policy,h3_contact_tracing,h6_facial_coverings,month,obesity_rate,obesity_rate--,pneumonia_deaths_per_100k,pneumonia_deaths_per_100k--,population,population--,population_density,population_density--,population_percent_urban,population_percent_urban--,quarter,region_name,specific_humidity,temperature,week,working_day,working_day--,year,new_cases,new_deaths,new_cases_smoothed_ma_3,new_deaths_smoothed_ma_3,new_cases_smoothed_ma_7,new_deaths_smoothed_ma_7,new_cases_smoothed_ma_21,new_deaths_smoothed_ma_21
22699,0.1461,0.2531,0.3648,0.2007,0.0354,2.0,0.0,2.0,1.0,0.0,0.0,0.0,1.0,5.0,False,0.0,AFRICA,ET,ETH,231,Ethiopia,2020-03-16,16,1,76,772.31,False,ET,2.0,1.0,1.0,0.0,3,24.022703,True,86.94,False,109224414.0,False,109.22,False,21.22,False,1,,0.011059,296.405685,12,1.0,False,2020,4.0,0.0,1.333333,0.0,0.714286,0.0,0.238095,0.0
10181,0.045,0.1018,0.2139,0.4247,0.2147,3.0,1.0,2.0,4.0,0.0,2.0,2.0,3.0,187.0,False,3.0,EUROPE,BG,BGR,100,Bulgaria,2020-03-22,22,7,82,9270.0,False,BG,2.0,1.0,0.0,0.0,3,25.0,False,12.954341,False,6948445.0,False,64.0,False,75.35,False,1,,0.005204,281.526071,12,0.0,False,2020,24.0,0.0,31.0,0.0,19.428571,0.142857,8.904762,0.142857
38697,0.1183,0.204,0.3673,0.2608,0.0495,1.0,1.0,1.0,3.0,0.0,1.0,1.0,2.0,2041.0,False,44.0,AFRICA,LS,LSO,426,Lesotho,2020-11-13,13,5,318,1299.153126,False,LS,2.0,1.0,1.0,4.0,11,16.6,False,120.102903,False,2142249.0,False,69.437813,False,28.153,False,4,,0.007611,296.85458,46,1.0,False,2020,15.0,0.0,5.0,0.0,10.571429,0.0,5.095238,0.047619


# Write the data files

In [134]:
# Sort by geo code, then by date, before writing any files.
df = df.sort_values(by=['geo_code', 'date'])

In [135]:
from pathlib import Path
import shutil

filename_prefix = 'ground-truth'
dir_output = 'output'
dir_output_geo = 'output/geo'

# Recreate the output directory, deleting any existing content.
shutil.rmtree(dir_output, ignore_errors=True)
Path(dir_output_geo).mkdir(parents=True, exist_ok=True)

# For each geography, write a JSON and a CSV file.
# The loop variable is deliberately NOT called `geo`: that name is the imported
# `pandora.data.geo` module, and rebinding it would break a re-run of the cell
# that calls loader.load(..., geo.module, ...).
for geo_code in df['geo_code'].unique():
    df_geo = df.loc[df['geo_code'] == geo_code]
    # '/' in a geo code would be read as a path separator, so replace it.
    geo_file_stem = f"{dir_output_geo}/{filename_prefix}-{geo_code.replace('/', '-')}"
    df_geo.to_json(f"{geo_file_stem}.json",
                   orient='records',
                   lines=False,
                   date_format='iso')
    df_geo.to_csv(f"{geo_file_stem}.csv", index=False)

# Also write the complete output. This must use the full frame `df`; writing
# `df_geo` here would only emit the last geography processed by the loop.
df.to_json(f"{dir_output}/{filename_prefix}.json",
           orient='records',
           lines=False,
           date_format='iso')
df.to_csv(f"{dir_output}/{filename_prefix}.csv", index=False)


In [136]:
# print(bucket)
# boto3.Session().resource('s3').Bucket(bucket).Object("ground_truth").upload_file(fn_grouth_truth)
# print("Success! You are all set to proceed.")

# Load to Elasticsearch


In [137]:
!pip install requests-aws4auth
!pip install elasticsearch



In [138]:
from requests_aws4auth import AWS4Auth
from elasticsearch import Elasticsearch, RequestsHttpConnection

# Look up the endpoint of the region-specific domain.
domain_name = f"es-pandora-{region}"
domain_status = boto3.client('es').describe_elasticsearch_domain(DomainName=domain_name)['DomainStatus']
endpoint = domain_status['Endpoint']

# Build request auth from the current boto3 session's credentials.
credentials = boto3.Session().get_credentials()
awsauth = AWS4Auth(
    credentials.access_key,
    credentials.secret_key,
    region,
    'es',
    session_token=credentials.token,
)

# HTTPS client for the domain endpoint, with certificate verification enabled.
es = Elasticsearch(
    hosts=[dict(host=endpoint, port=443)],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
)


In [139]:
import os
import uuid
from elasticsearch import helpers

def decorate_json(documents, _index, doc_type):
    """Lazily wrap each document in a bulk-action dict.

    Each document's 'date' value is cut at the first 'T' (the document is
    modified in place), and the action id is built as '<geo_code>-<date>'.

    Args:
        documents: iterable of dicts, each with a string 'date' and a 'geo_code'.
        _index: value stored under the action's "_index" key.
        doc_type: value stored under the action's "_type" key.

    Yields:
        One dict per document with the keys "_index", "_type", "_id" and
        "_source" (the document itself, not a copy).
    """
    for record in documents:
        day = record['date'].partition('T')[0]
        record['date'] = day
        action = {
            "_index": _index,
            "_type": doc_type,
            "_id": f"{record['geo_code']}-{day}",
            "_source": record,
        }
        yield action
        
            
# Bulk-load every per-geography JSON file into the index.
# `json` comes from the notebook's import cell; the original cell used it
# without any visible import and so failed on a fresh kernel.
for fn in os.listdir(dir_output_geo):
    if not fn.endswith(".json"):
        continue
    # `with` closes the file handle even if parsing fails.
    with open(f"{dir_output_geo}/{fn}") as file:
        documents = json.loads(file.read())
    helpers.bulk(es, decorate_json(documents, 'my_index_01', 'update'))

2021-02-07 23:31:13,382	INFO	base.py	POST https://search-es-pandora-eu-central-1-cqze54lhcquh7gcq7sbt7nccma.eu-central-1.es.amazonaws.com:443/_bulk [status:200 request:0.485s]
2021-02-07 23:31:13,508	INFO	base.py	POST https://search-es-pandora-eu-central-1-cqze54lhcquh7gcq7sbt7nccma.eu-central-1.es.amazonaws.com:443/_bulk [status:200 request:0.096s]
2021-02-07 23:31:13,644	INFO	base.py	POST https://search-es-pandora-eu-central-1-cqze54lhcquh7gcq7sbt7nccma.eu-central-1.es.amazonaws.com:443/_bulk [status:200 request:0.105s]
2021-02-07 23:31:13,778	INFO	base.py	POST https://search-es-pandora-eu-central-1-cqze54lhcquh7gcq7sbt7nccma.eu-central-1.es.amazonaws.com:443/_bulk [status:200 request:0.105s]
2021-02-07 23:31:13,867	INFO	base.py	POST https://search-es-pandora-eu-central-1-cqze54lhcquh7gcq7sbt7nccma.eu-central-1.es.amazonaws.com:443/_bulk [status:200 request:0.060s]
2021-02-07 23:31:13,977	INFO	base.py	POST https://search-es-pandora-eu-central-1-cqze54lhcquh7gcq7sbt7nccma.eu-central-1