This notebook updates the Elasticsearch index with the latest datasets.

In [1]:
!pip install --upgrade --force-reinstall git+https://github.com/rbilleci/pandora.git

Collecting git+https://github.com/rbilleci/pandora.git
  Cloning https://github.com/rbilleci/pandora.git to /tmp/pip-req-build-ozk09mwy
  Running command git clone -q https://github.com/rbilleci/pandora.git /tmp/pip-req-build-ozk09mwy
Collecting pandas~=1.2.1
  Using cached pandas-1.2.1-cp37-cp37m-manylinux1_x86_64.whl (9.9 MB)
Processing /root/.cache/pip/wheels/8a/82/52/2f7cb5b39aad6b4beb08a6741a756fc3d1e104224c2b42fa1b/fnvhash-0.1.0-py3-none-any.whl
Collecting scikit-learn~=0.24.1
  Using cached scikit_learn-0.24.1-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
Collecting workalendar~=14.1.0
  Using cached workalendar-14.1.0-py3-none-any.whl (187 kB)
Collecting category-encoders~=2.2.2
  Using cached category_encoders-2.2.2-py2.py3-none-any.whl (80 kB)
Collecting numpy>=1.16.5
  Using cached numpy-1.20.0-cp37-cp37m-manylinux2010_x86_64.whl (15.3 MB)
Collecting pytz>=2017.3
  Using cached pytz-2021.1-py2.py3-none-any.whl (510 kB)
Collecting python-dateutil>=2.7.3
  Using cached python_

In [2]:

from datetime import date
from logging import basicConfig, INFO

import pandas as pd

import pandora.data.age_distribution as age_dist
import pandora.data.oxford_data as oxford
import pandora.data.population as population
import pandora.data.temperatures as temperatures
from pandora.data import geo, continent, country_code, working_day
from pandora import loader
from pandora.core_fields import DATE, COUNTRY_CODE

# Log at INFO level; each record is tab-separated: timestamp, level, source file, message.
basicConfig(
    format='%(asctime)s\t%(levelname)s\t%(filename)s\t%(message)s',
    level=INFO,
)

# Remove pandas' row/column display limits and raise the number of columns
# that df.info() will list individually.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_info_columns', 1000)


In [3]:
# Print the pandas, Python, OS and dependency versions for this run, so the
# environment that produced the outputs below is on record.
pd.show_versions()

2021-02-07 18:32:04,126	INFO	utils.py	NumExpr defaulting to 2 threads.



INSTALLED VERSIONS
------------------
commit           : 9d598a5e1eee26df95b3910e3f2934890d062caa
python           : 3.7.6.final.0
python-bits      : 64
OS               : Linux
OS-release       : 4.14.214-160.339.amzn2.x86_64
Version          : #1 SMP Sun Jan 10 05:53:05 UTC 2021
machine          : x86_64
processor        : 
byteorder        : little
LC_ALL           : C.UTF-8
LANG             : C.UTF-8
LOCALE           : en_US.UTF-8

pandas           : 1.2.1
numpy            : 1.20.0
pytz             : 2021.1
dateutil         : 2.8.1
pip              : 20.0.2
setuptools       : 53.0.0
Cython           : 0.29.15
pytest           : 5.3.5
hypothesis       : 5.5.4
sphinx           : 2.4.0
blosc            : None
feather          : None
xlsxwriter       : 1.2.7
lxml.etree       : 4.5.0
html5lib         : 1.0.1
pymysql          : None
psycopg2         : None
jinja2           : 2.11.1
IPython          : 7.12.0
pandas_datareader: None
bs4              : 4.8.2
bottleneck       : 1.3.2
fsspec

In [4]:
# Date range to load and the window handed to the loader for imputation;
# both span 2020-01-01 through 2020-12-31.
start_date, end_date = date(2020, 1, 1), date(2020, 12, 31)
imputation_window_start_date, imputation_window_end_date = date(2020, 1, 1), date(2020, 12, 31)

# Modules passed to the loader as a list, after geo.module, in this order.
# NOTE(review): presumably geo.module is the base dataset the others are joined onto -
# confirm against pandora.loader.load.
feature_modules = [
    country_code.module,
    continent.module,
    population.module,
    age_dist.module,
    temperatures.module,
    oxford.module,
    working_day.module,
]

df = loader.load(
    start_date,
    end_date,
    imputation_window_start_date,
    imputation_window_end_date,
    geo.module,
    feature_modules,
)

# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

2021-02-07 18:32:06,945	INFO	loader.py	/opt/conda/lib/python3.7/site-packages/pandora/data/geo.csv - loading
2021-02-07 18:32:10,223	INFO	loader.py	/opt/conda/lib/python3.7/site-packages/pandora/data/country_code.csv - loading
2021-02-07 18:32:12,685	INFO	loader.py	/opt/conda/lib/python3.7/site-packages/pandora/data/continent.csv - loading
2021-02-07 18:32:15,101	INFO	loader.py	/opt/conda/lib/python3.7/site-packages/pandora/data/population.csv - loading
2021-02-07 18:32:20,759	INFO	imputer.py	/opt/conda/lib/python3.7/site-packages/pandora/data/population.csv - imputing population
2021-02-07 18:32:20,760	INFO	imputer.py	/opt/conda/lib/python3.7/site-packages/pandora/data/population.csv - imputing population_density
2021-02-07 18:32:20,762	INFO	imputer.py	/opt/conda/lib/python3.7/site-packages/pandora/data/population.csv - imputing population_percent_urban
2021-02-07 18:32:20,764	INFO	imputer.py	/opt/conda/lib/python3.7/site-packages/pandora/data/population.csv - imputing gdp_per_capita


<class 'pandas.core.frame.DataFrame'>
Int64Index: 86376 entries, 0 to 86375
Data columns (total 51 columns):
 #   Column                                Non-Null Count  Dtype         
---  ------                                --------------  -----         
 0   age_distribution_00_04                86376 non-null  float64       
 1   age_distribution_05_14                86376 non-null  float64       
 2   age_distribution_15_34                86376 non-null  float64       
 3   age_distribution_34_64                86376 non-null  float64       
 4   age_distribution_65_plus              86376 non-null  float64       
 5   c1_school_closing                     86376 non-null  float64       
 6   c2_workplace_closing                  86376 non-null  float64       
 7   c3_cancel_public_events               86376 non-null  float64       
 8   c4_restrictions_on_gatherings         86376 non-null  float64       
 9   c5_close_public_transport             86376 non-null  float64       
 10