In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_columns', 500)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

The goal of this notebook is to build a simple machine learning model on COVID-19 data that predicts the number of new cases for the next day. First, the data is loaded and prepared for building the model.

In [None]:
# Read the daily CSV and convert the 'Date' column to datetime in a single chain.
data = (
    pd.read_csv('/kaggle/input/covid19-in-limeiraspbrazil/covid-limeira-daily.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)
data.head(10)

In [None]:
# Count how often each difference between consecutive 'Date' values occurs.
print(data['Date'].diff().value_counts())

Not all records are in sequence, but we need a time series that has no gaps. The maximum gap size is 5 days. The gaps are in the first records, from when the data started being published and was not very precise. Therefore, these records will be dropped.

In [None]:
# Keep sequential dates only: a row survives when the record five rows earlier
# is dated exactly five days before it.
# .copy() turns the filtered result into an independent frame, so adding
# columns to `data` afterwards does not raise pandas' SettingWithCopyWarning.
five_rows_back = data['Date'].shift(5)
data = data[(data['Date'] - five_rows_back) == pd.Timedelta(5, 'D')].copy()
print(len(data))

## Time Series References

The references below were used to build this model:

* [Séries temporais — definições e características](https://medium.com/data-sprints/s%C3%A9ries-temporais-defini%C3%A7%C3%B5es-e-caracter%C3%ADsticas-698d85f4b353)
* [Time Series #1 - Como Criar o Ambiente de Desenvolvimento para Data Science](https://www.youtube.com/watch?v=lYLGaLEvWto)
* [Time Series #2 - TUDO o Que Você Precisa Saber Para Criar Seu Modelo de Machine Learning](https://www.youtube.com/watch?v=8UTNg4bzWlE)
* [Time Series #3 - 5 Dicas para Melhorar Seus Modelos de Machine Learning](https://www.youtube.com/watch?v=_xa1Yx6ZQo4)

The test set will be the last 15 days of data. A validation set, made of the last 15 records of the remaining train set, will then be extracted from it.

In [None]:
# Target: the 'NewCases' value of the following row (shift(-1)).
# The last row has no following row, so its 'y' is NaN.
data['y'] = data['NewCases'].shift(-1)

def train_test_split(data, test_size=15):
    """Split a frame by position into a leading train part and a trailing test part.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split; rows keep their order.
    test_size : int, default 15
        Number of trailing rows that go to the test part.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        New frames, each with a fresh 0..n-1 index. `data` itself is not modified.

    Raises
    ------
    ValueError
        If `test_size` is negative.
    """
    if test_size < 0:
        raise ValueError("test_size must be non-negative")
    # Compute the cut position explicitly: a negative slice bound would turn
    # test_size=0 into `iloc[:-0]`, i.e. an empty train part.
    cut = max(len(data) - test_size, 0)
    # reset_index(drop=True) returns a new frame, so no in-place change is made
    # on a slice of the caller's data.
    train = data.iloc[:cut].reset_index(drop=True)
    test = data.iloc[cut:].reset_index(drop=True)
    return train, test

# First split off the test set from the full data, then split the remaining
# train rows again to obtain the validation set.
train, test = train_test_split(data)
train, valid = train_test_split(train)

In [None]:
def sep_feature_target(data):
    """Return `(x, y)`: `data` without its 'y' column, and the 'y' column itself."""
    target = data['y']
    features = data.drop(columns='y')
    return features, target

# Separate the feature columns (x_*) from the target column (y_*) for each split.
x_train, y_train = sep_feature_target(train)
x_valid, y_valid = sep_feature_target(valid)
x_test, y_test   = sep_feature_target(test)

In [None]:
# Drop the test rows that have no target. This is expected to be the final row
# only: its 'y' (the next row's cases) is NaN because there is no next row.
# Filtering with a mask, instead of dropping "the last row" in place, keeps the
# cell idempotent: running it again does not remove further rows.
has_target = y_test.notnull()
x_test = x_test[has_target].copy()
y_test = y_test[has_target].copy()
y_test

In [None]:
# Print the number of feature rows and target rows for each split, one line per split.
print('Checking sizes of train/dev/test sets:')
for x_part, y_part in ((x_train, y_train), (x_valid, y_valid), (x_test, y_test)):
    print(len(x_part), len(y_part))

## Baseline

The baseline model is a simple model created to get a simple, rough error value that must be beaten by the more complex models that will be built next.

Two baseline models will be used:
* baseline shift1: this model considers that the new cases will be the same as the day before.
* baseline rollingavg5: this model considers that new cases will be the rolling average of the last 5 days.

In [None]:
from sklearn.metrics import mean_squared_log_error

def calc_error(valid, predict):
    """Return the root of the mean squared log error between `valid` and `predict`."""
    msle = mean_squared_log_error(valid, predict)
    return np.sqrt(msle)

# Row i of x_valid holds the cases of day i, and y_valid[i] holds the cases of
# the following row (the target was built with shift(-1)). "Same as the day
# before" is therefore x_valid['NewCases'] itself; an additional shift(1) would
# compare the target with the value from two rows earlier.
# Errors are printed as plain RMSLE, on the same scale as the model errors.
baseline_valid = x_valid['NewCases']
print("baseline shift1 =", calc_error(y_valid, baseline_valid))

# Rolling mean of the 5 most recent rows (row i included). The first 4 rows have
# no complete window, so they are excluded from both series.
baseline_valid = x_valid['NewCases'].rolling(5).mean()
has_window = baseline_valid.notnull()
print("baseline rollingavg5 =", calc_error(y_valid[has_window], baseline_valid[has_window]))

To better capture the time series, the `feature_generation` function adds more features to the datasets: the weekday, week number and day of the year, lags (day-over-day differences of new cases for the previous days) and rolling averages.

In [None]:
import datetime as dt

def feature_generation(data_features):
    """Build the model input from a frame that has 'Date' and 'NewCases' columns.

    Added columns:
      * weekday, weekofyear (ISO week), dayofyear - taken from 'Date';
      * diff_1, diff_2, diff_3 - row-to-row change of 'NewCases', lagged by
        0, 1 and 2 rows;
      * ra3, ra7, ra10, ra14 - rolling means of 'NewCases'.
    Missing values (the first rows of every diff/rolling column) are replaced
    by 0 and 'Date' is converted to float.

    Parameters
    ----------
    data_features : pandas.DataFrame
        Must contain a datetime 'Date' column and a numeric 'NewCases' column.

    Returns
    -------
    pandas.DataFrame
        A new frame; `data_features` itself is left unchanged.
    """
    # Work on a copy: the caller's frame must not pick up the generated columns.
    features = data_features.copy()
    dates = features['Date']
    cases = features['NewCases']

    features['weekday'] = dates.dt.weekday
    features['weekofyear'] = dates.dt.isocalendar().week.astype('int32')
    features['dayofyear'] = dates.dt.dayofyear

    features['diff_1'] = cases.diff()
    features['diff_2'] = cases.shift(1).diff()
    features['diff_3'] = cases.shift(2).diff()

    for window in (3, 7, 10, 14):
        features[f'ra{window}'] = cases.rolling(window).mean()

    features = features.fillna(0)
    # The models need numeric input, so the datetime values are cast to float.
    features['Date'] = features['Date'].values.astype(float)
    return features

## First Model (RandomForestRegressor)

For this first model, a `RandomForestRegressor` will be used. The hyperparameters were tuned after a few executions of the model fit.

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit the random forest on the generated train features, then predict on the
# train and validation features and print both errors.
train_features = feature_generation(x_train)
model1 = RandomForestRegressor(min_samples_leaf=2, random_state=11, n_jobs=-1)
model1.fit(train_features, y_train)

t = model1.predict(train_features)
v = model1.predict(feature_generation(x_valid))

print('Checking the model1 errors:')
train_error = calc_error(y_train, t)
print("model1 (train set error)=", train_error)
valid_error = calc_error(y_valid, v)
print("model1 (dev set error)=", valid_error)

For a better view and understanding, the expected and predicted data are plotted below.

In [None]:
import matplotlib.pyplot as plt

def plot_results(valid,predict):
    """Plot expected values ('target') and predictions ('predict') as two lines on one 10x5 figure."""
    # assign() sets the columns one after the other, exactly like two item assignments.
    comparison = pd.DataFrame().assign(target=valid, predict=predict)
    comparison.plot(figsize=(10, 5))

In [None]:
# Train set: targets against the model1 predictions stored in `t`.
plot_results(y_train, t)
plt.show()

In [None]:
# Validation set: targets against the model1 predictions stored in `v`.
plot_results(y_valid, v)

## Second Model (LightGBM)

This is another attempt, using a more robust machine learning algorithm (LightGBM).

In [None]:
import lightgbm as lgb

# Fit LightGBM on the generated train features, then predict on the train and
# validation features and print both errors.
train_features = feature_generation(x_train)
model2 = lgb.LGBMRegressor(
    n_estimators=700,
    learning_rate=.005,
    random_state=11,
    n_jobs=-1,
)
model2.fit(train_features, y_train)

t = model2.predict(train_features)
v2 = model2.predict(feature_generation(x_valid))

print('Checking the model2 errors:')
train_error = calc_error(y_train, t)
print("model2 (train set error)=", train_error)
valid_error = calc_error(y_valid, v2)
print("model2 (dev set error)=", valid_error)

In [None]:
# Train set: targets against the predictions currently stored in `t`
# (`t` was reassigned to the model2 train predictions in the cell above).
plot_results(y_train, t)

In [None]:
# Validation set: targets against the model2 predictions stored in `v2`.
plot_results(y_valid, v2)

## Ensembling Models

In order to minimize possible overfitting, these two models will be ensembled.

In [None]:
def predict_ensemble(features, w=0.7):
    """Blend both models on the generated features.

    `w` is the weight of model2's prediction; model1 gets `1 - w`.
    Returns the weighted sum of the two prediction arrays.
    """
    generated = feature_generation(features)
    forest_pred = model1.predict(generated)
    boosted_pred = model2.predict(generated)
    return forest_pred*(1. - w) + boosted_pred*w

In [None]:
# Validation set: plot targets against the ensemble predictions and display
# the error returned by calc_error as the cell output.
ensemble = predict_ensemble(x_valid)
plot_results(y_valid, ensemble)
calc_error(y_valid, ensemble)

In [None]:
# Test set: plot targets against the ensemble predictions and display the
# error returned by calc_error as the cell output.
ensemble = predict_ensemble(x_test)
plot_results(y_test, ensemble)
calc_error(y_test, ensemble)

# NOTE(review): the line below looks like an error value and weights recorded
# from an earlier run - confirm what it refers to.
# 0.40791120181955176 | 0.1 , 0.7

## Predictions

Using the models built above, the prediction of the number of cases for the next day (the day after the last day of the dataset) follows.

In [None]:
def predict_last_rec():
    """Print the ensemble prediction of new cases for the day after the last date in `data`.

    feature_generation builds its diff_* and ra* columns from neighbouring
    rows, so the last 14 rows are passed (14 is the longest rolling window) and
    only the prediction for the final row is used. Passing the last row on its
    own would leave every one of those features at 0.
    """
    recent = data.drop(columns='y').tail(14).copy()
    next_day = data['Date'].iloc[-1] + dt.timedelta(1)
    # Pick the element explicitly: calling int() on an array is deprecated in NumPy.
    predicted = int(predict_ensemble(recent)[-1])
    print("Predicted new confirmed cases for", next_day.strftime('%m/%d/%Y') , ":", predicted)

In [None]:
# Print the predicted number of new cases for the day after the last date in `data`.
predict_last_rec()

Using the values predicted by the model for new cases and the rolling average for the other columns, here are the estimates for the next 2 weeks.

In [None]:
# Recursive forecast: predict one day, append it to the history, repeat.
FORECAST_DAYS = 14  # "next 2 weeks"; range(1, 14) would only produce 13 days
LAG_WINDOW = 14     # longest rolling window used by feature_generation (ra14)

# The diff_*/ra* features are computed from neighbouring rows, so each
# prediction is made from the last LAG_WINDOW rows rather than a single row
# (a single row would leave all of those features at 0).
history = data.drop(columns='y').tail(LAG_WINDOW).copy()

# Estimates for the other fields: mean of the last 7 observed days. They do
# not change between iterations, so they are computed once.
newdeaths = int(data['NewDeaths'].tail(7).mean())
newurc = data['URCOccupancy'].tail(7).mean()

forecast_rows = []
for _ in range(FORECAST_DAYS):
    last_rec = history.tail(1).copy()
    next_day = last_rec['Date'].iloc[0] + pd.Timedelta(days=1)
    # Predict once per day and take the value for the most recent row; the copy
    # keeps any generated feature columns out of `history`.
    next_newcases = int(predict_ensemble(history.copy())[-1])

    print("Predicted new confirmed cases for", next_day.strftime('%m/%d/%Y') , ":", next_newcases)

    last_rec['Date'] = next_day
    last_rec['NewCases'] = next_newcases
    last_rec['Confirmed'] = last_rec['Confirmed'] + next_newcases
    last_rec['Notifications'] = last_rec['Notifications'] + next_newcases
    last_rec['NewDeaths'] = newdeaths
    last_rec['URCOccupancy'] = newurc

    forecast_rows.append(last_rec)
    history = pd.concat([history, last_rec], ignore_index=True).tail(LAG_WINDOW)

# DataFrame.append was removed in pandas 2.0; build the result with one concat.
# The forecast rows have no 'y' column, so 'y' is NaN for them.
data_estimated = pd.concat([data, *forecast_rows], ignore_index=True)

In [None]:
# Display the last 15 rows of the frame that holds the observed data plus the estimated days.
data_estimated.tail(15)

In [None]:
# Last observed date, looked up by column name rather than by position so the
# cell does not depend on 'Date' being the first column of `data`.
last_dataset_date = data['Date'].iloc[-1]

is_actual = data_estimated['Date'] <= last_dataset_date
# The last observed day is part of both selections, so the two lines connect.
is_predicted = data_estimated['Date'] >= last_dataset_date

ax = data_estimated[is_actual].plot(
    x='Date',
    y='NewCases',
    figsize=(15,10),
    ylabel='Confirmed Cases',
    title='COVID Confirmed Cases - Actual vs Predictions',
)
data_estimated[is_predicted].plot(x='Date', y='NewCases', ax=ax)
ax.legend(['Actual', 'Predictions'])
plt.show()

## Conclusions

* The model captured the pattern of the time series well: working days always have more cases than weekends. This happens because the laboratories that process the results don't work on weekends.
* The estimations of new cases are strongly influenced by the deaths and weekday. Therefore, another strategy has to be taken, using direct or native methods.