In [1]:
%load_ext autoreload
%autoreload 1

In [34]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

%aimport data_cleaning
from data_cleaning import encode_dates

In [49]:
dataset_raw = pd.read_csv('/home/sam/kaggle/uncover_covid/UNCOVER/WHO/world-health-organization-who-situation-reports.csv',
                         parse_dates=[0])
dataset_raw

Unnamed: 0,date,location,new_cases,new_deaths,total_cases,total_deaths
0,2020-02-25,Afghanistan,,,1,
1,2020-02-26,Afghanistan,0.0,,1,
2,2020-02-27,Afghanistan,0.0,,1,
3,2020-02-28,Afghanistan,0.0,,1,
4,2020-02-29,Afghanistan,0.0,,1,
...,...,...,...,...,...,...
2862,2020-03-13,World,7488.0,338.0,132758,4956.0
2863,2020-03-14,World,9761.0,433.0,142534,5392.0
2864,2020-03-15,World,10967.0,343.0,153517,5735.0
2865,2020-03-16,World,13971.0,855.0,167506,6606.0


In [52]:
dataset_dated = encode_dates(dataset_raw, col='date', replacement_field='date')
dataset_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0.0)),
    ('country_encoder', OrdinalEncoder())
])
dataset = dataset_pipeline.fit_transform(dataset_dated)
dataset = pd.DataFrame(dataset, columns=dataset_dated.columns)
dataset

Unnamed: 0,location,new_cases,new_deaths,total_cases,total_deaths,dateyear,datemonth,dateday
0,0.0,3.0,0.0,0.0,0.0,0.0,1.0,24.0
1,0.0,3.0,0.0,0.0,0.0,0.0,1.0,25.0
2,0.0,3.0,0.0,0.0,0.0,0.0,1.0,26.0
3,0.0,3.0,0.0,0.0,0.0,0.0,1.0,27.0
4,0.0,3.0,0.0,0.0,0.0,0.0,1.0,28.0
...,...,...,...,...,...,...,...,...
2862,160.0,312.0,90.0,488.0,175.0,0.0,2.0,12.0
2863,160.0,313.0,93.0,489.0,176.0,0.0,2.0,13.0
2864,160.0,314.0,91.0,490.0,177.0,0.0,2.0,14.0
2865,160.0,316.0,96.0,491.0,178.0,0.0,2.0,15.0


In [53]:
# dataset_train, dataset_test = train_test_split(dataset, test_size=0.2, shuffle=True)
# dataset_test.to_csv('/home/sam/kaggle/uncover_covid/deaths_test.csv', index=False)
dataset_test = pd.read_csv('/home/sam/kaggle/uncover_covid/deaths_test.csv')
dataset_train = pd.concat([dataset, dataset_test]).drop_duplicates(keep=False)
dataset_train

Unnamed: 0,location,new_cases,new_deaths,total_cases,total_deaths,dateyear,datemonth,dateday
0,0.0,3.0,0.0,0.0,0.0,0.0,1.0,24.0
1,0.0,3.0,0.0,0.0,0.0,0.0,1.0,25.0
2,0.0,3.0,0.0,0.0,0.0,0.0,1.0,26.0
4,0.0,3.0,0.0,0.0,0.0,0.0,1.0,28.0
6,0.0,3.0,0.0,0.0,0.0,0.0,2.0,1.0
...,...,...,...,...,...,...,...,...
2862,160.0,312.0,90.0,488.0,175.0,0.0,2.0,12.0
2863,160.0,313.0,93.0,489.0,176.0,0.0,2.0,13.0
2864,160.0,314.0,91.0,490.0,177.0,0.0,2.0,14.0
2865,160.0,316.0,96.0,491.0,178.0,0.0,2.0,15.0


In [69]:
Xy_train, Xy_val = train_test_split(dataset_train, test_size=0.2)
inference = 'total_deaths'
X_train, X_val = Xy_train.drop(inference, axis=1), Xy_val.drop(inference , axis=1)
y_train, y_val = Xy_train[inference], Xy_val[inference]
X_train

Unnamed: 0,location,new_cases,new_deaths,total_cases,dateyear,datemonth,dateday
163,9.0,16.0,0.0,36.0,0.0,2.0,4.0
892,54.0,3.0,0.0,0.0,0.0,1.0,27.0
176,10.0,3.0,0.0,2.0,0.0,2.0,0.0
1450,84.0,6.0,0.0,12.0,0.0,2.0,2.0
2530,145.0,3.0,0.0,41.0,0.0,2.0,1.0
...,...,...,...,...,...,...,...
629,39.0,65.0,0.0,166.0,0.0,2.0,14.0
2459,142.0,45.0,0.0,158.0,0.0,2.0,8.0
1693,102.0,3.0,0.0,0.0,0.0,1.0,23.0
1541,90.0,3.0,0.0,21.0,0.0,1.0,21.0


In [75]:
reg = RandomForestRegressor()
reg.fit(X_train, y_train)
scores = cross_val_score(reg, X_train, y_train)
scores

array([0.99143742, 0.9959025 , 0.99186234, 0.98312608, 0.99538357])

In [82]:
print(reg.predict(X_train.iloc[0:10]))
y_train[0:10]

[ 0.15  0.    0.    0.    0.81  7.8   0.95 43.7   0.15  6.01]


163      0.0
892      0.0
176      0.0
1450     0.0
2530     1.0
2027     5.0
994      1.0
503     43.0
2351     0.0
1157     6.0
Name: total_deaths, dtype: float64