# COVID predictions using RandomForest and GridSearch

**THIS NOTEBOOK IS UNDER CONSTRUCTION**

This is my second attempt at building a time-series model of new COVID cases and deaths. Unlike my first attempt (see [here](https://www.kaggle.com/lssilveira11/covid-cases-prediction-limeira-sp-brazil)), this notebook tries to predict new cases and deaths at the same time.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the daily records from the Kaggle dataset.
data = pd.read_csv('/kaggle/input/covid19-in-limeiraspbrazil/covid-limeira-daily.csv')

# parse the 'Date' column into pandas datetimes
data['Date'] = pd.to_datetime(data['Date'])

# drop columns that will not be predicted
#data.drop(columns=['UnderInvestigation', 'UnderInvestigationDeaths', 'Negative', 'NewInvestigation', 'NewNegative', 'NegativeRate', 'Active'], inplace=True)

# keep sequential dates only: a row survives when the date five rows earlier
# is exactly five days before it (the first five rows never qualify)
span_over_five_rows = data['Date'] - data['Date'].shift(5)
data = data[span_over_five_rows == pd.Timedelta(days=5)]

# fill missing URCOccupancy values with zeros
#data['URCOccupancy'] = data['URCOccupancy'].fillna(0)

# renumber the remaining rows from zero
data = data.reset_index(drop=True)

In [None]:
# Sanity check: display any rows whose NewNotifications value is negative.
data[data['NewNotifications'] < 0]

In [None]:
# Preview the first rows of the cleaned frame.
data.head()

In [None]:
# Preview the last rows of the cleaned frame.
data.tail()

In [None]:
# Summary statistics of the numeric columns.
data.describe()

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Columns the model predicts jointly.
y_cols = ['NewCases', 'NewDeaths', 'Notifications']

# The target of each row is the NEXT row's values of those columns.
# shift(-1) moves every value one row up and leaves NaN in the last row,
# which has no following day. This replaces the manual np.append with
# np.NaN, an alias that NumPy 2.0 removed (use np.nan instead).
y_val = data[y_cols].shift(-1).to_numpy()

# Store each row's target vector as a list in a single 'y' column.
data['y'] = y_val.tolist()
data.tail()

In [None]:
def sep_feature_target(data):
    """Split a frame into its feature columns and a 2-D target array.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'y' column whose cells each hold one target vector.

    Returns
    -------
    tuple
        ``(x, y)``: ``x`` is ``data`` without the 'y' column and ``y`` is
        the per-row target vectors stacked into one NumPy array.
    """
    targets = np.stack(data['y'].to_numpy())
    features = data.drop(columns='y')
    return features, targets

def train_test_split(data, split_size=15):
    """Split a frame chronologically into a leading and a trailing part.

    No shuffling is done: the last ``split_size`` rows become the second
    part and everything before them the first part.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows in the order they should be split.
    split_size : int, default 15
        Number of trailing rows to hold out. ``0`` holds out nothing (the
        first part is then the whole frame). A value larger than
        ``len(data)`` holds out every row. A negative value keeps the
        ``DataFrame.tail`` convention: the first ``-split_size`` rows form
        the first part and the rest the second.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``, both re-indexed from zero and both independent
        of ``data``, so adding columns to them later does not touch it.
    """
    n_rows = len(data)
    if split_size >= 0:
        # Computing the cut explicitly avoids `iloc[:-0]`, which is empty.
        cut = max(n_rows - split_size, 0)
    else:
        cut = min(-split_size, n_rows)

    # reset_index returns a new frame, so neither part is a view on `data`.
    train = data.iloc[:cut].reset_index(drop=True)
    test = data.iloc[cut:].reset_index(drop=True)
    return train, test

# Hold out the trailing rows as the test set (default split size), then the
# last 60 rows of what remains as the dev set.
train, test = train_test_split(data)
train, valid = train_test_split(train, 60)

x_train, y_train = sep_feature_target(train)
x_valid, y_valid = sep_feature_target(valid)
x_test, y_test = sep_feature_target(test)

# the final test row has no target, so remove it from features and targets
x_test = x_test.drop(index=x_test.index[-1])
y_test = y_test[:-1]

print('Checking sizes of train/dev/test sets:')
for features, targets in ((x_train, y_train), (x_valid, y_valid), (x_test, y_test)):
    print(len(features), len(targets))

In [None]:
# Baseline

from sklearn.metrics import mean_squared_log_error

def calc_error(valid, predict):
    """Return the root of the mean squared logarithmic error (RMSLE).

    Parameters
    ----------
    valid : array-like
        Observed values.
    predict : array-like
        Predicted values, same shape as ``valid``.
    """
    msle = mean_squared_log_error(valid, predict)
    return np.sqrt(msle)

# Persistence baseline: forecast tomorrow's values with today's.
# Each row's target `y` already holds the NEXT row's values of y_cols, so
# the most recent observation available for a row is that row's own y_cols.
# The previous construction shifted the features by one more row and thus
# compared the target with the value from two days earlier, which inflates
# the baseline error.
baseline_valid = x_valid[y_cols].to_numpy()
print("baseline shift1 =", calc_error( y_valid, baseline_valid ))

#baseline_valid = x_valid['NewCases'].shift(1).rolling(5).mean()
#print("baseline rollingavg5 =", calc_error( y_valid[y_valid.notnull()][5:], baseline_valid[baseline_valid.notnull()] ) * 100.0)

In [None]:
import datetime as dt

def ra_gen(data_features, colName, n_diffs=10, windows=(3, 7, 10, 14, 20, 30)):
    """Add lagged first differences and rolling means of one column.

    The new columns are written into ``data_features`` itself (the frame is
    modified in place) and the same frame is returned.

    Parameters
    ----------
    data_features : pandas.DataFrame
        Frame containing ``colName``; rows are taken in their current order.
    colName : str
        Source column. Also used as the prefix of every generated column.
    n_diffs : int, default 10
        Number of difference columns. ``<colName>diff_<k>`` holds the
        one-row difference of the source column as it was ``k - 1`` rows
        earlier (``diff_1`` is the plain ``.diff()``).
    windows : iterable of int, default (3, 7, 10, 14, 20, 30)
        Window lengths. ``<colName>ra<w>`` holds the rolling mean over the
        last ``w`` rows.

    Returns
    -------
    pandas.DataFrame
        ``data_features`` with the new columns appended. Rows without
        enough history hold NaN.
    """
    source = data_features[colName]

    # Differencing once and then lagging gives the same values as
    # `source.shift(k).diff()` without recomputing the difference each time.
    first_diff = source.diff()
    for lag in range(1, n_diffs + 1):
        data_features[colName + 'diff_' + str(lag)] = first_diff.shift(lag - 1)

    for window in windows:
        data_features[colName + 'ra' + str(window)] = source.rolling(window).mean()

    return data_features

def feature_generation(data_features):
    """Build the model input: calendar features plus lag/rolling statistics.

    Calendar columns ('weekday', 'weekofyear', 'dayofyear') and the
    ``ra_gen`` columns for 'NewCases', 'NewDeaths' and 'NewNotifications'
    are added to the frame that was passed in, so the caller's frame gains
    those columns too. Missing values are then replaced by 0 in a new frame,
    whose 'Date' column is converted to float.

    Parameters
    ----------
    data_features : pandas.DataFrame
        Frame with a datetime 'Date' column and the three source columns.

    Returns
    -------
    pandas.DataFrame
        The NaN-free frame with a numeric 'Date' column.
    """
    dates = data_features['Date']
    data_features['weekday'] = dates.dt.weekday
    data_features['weekofyear'] = dates.dt.isocalendar().week.astype('int32')
    data_features['dayofyear'] = dates.dt.dayofyear

    for source_col in ('NewCases', 'NewDeaths', 'NewNotifications'):
        data_features = ra_gen(data_features, source_col)

    # rows without enough history for a diff or rolling window become 0
    filled = data_features.fillna(0)

    # numeric form of the timestamp
    # (presumably nanoseconds since the epoch for datetime64[ns] - confirm)
    filled['Date'] = filled['Date'].values.astype(float)
    return filled

In [None]:
import matplotlib.pyplot as plt

def plot_results(target,predict,setName=''):
    """Plot observed against predicted values, one subplot per target column.

    Parameters
    ----------
    target : array-like
        Observed values, one column per entry of the global ``y_cols``.
    predict : array-like
        Predicted values with the same column layout.
    setName : str, default ''
        Text placed in parentheses after each subplot title.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(target_df, predict_df)``, the two inputs as frames labelled with
        ``y_cols``.
    """
    target_df = pd.DataFrame(target)
    target_df.columns = y_cols

    predict_df = pd.DataFrame(predict)
    predict_df.columns = y_cols

    # (column to draw, title prefix) for each of the three subplots
    panels = (
        ('NewCases', 'New cases'),
        ('NewDeaths', 'New deaths'),
        ('Notifications', 'Notifications'),
    )

    fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(21,5))
    for axis, (column, caption) in zip(ax, panels):
        target_df.plot(y=column, ax=axis)
        predict_df.plot(y=column, ax=axis)
        axis.legend(['Target', 'Predicted'])
        axis.set_title(caption + ' (' + setName + ')')

    return target_df, predict_df
#    results = pd.DataFrame()
#    results['target'] = valid[:,0]
#    results['predict'] = predict[:,0]
#    results.plot(figsize=(10,5))

In [None]:
from sklearn.ensemble import RandomForestRegressor

# 60% of x_train's current column count (not used by the model below)
maxfeat = int(x_train.shape[1] * 0.6)

# Random forest with fixed hyper-parameters, used as the first model.
model1 = RandomForestRegressor(
    n_estimators=200,
    max_depth=6,
    max_features=41,
    random_state=93,
    n_jobs=-1,
)

# build the training features once and reuse them for fitting and scoring
train_features = feature_generation(x_train)
model1.fit(train_features, y_train)

print(model1)
print('Checking the model1 errors:')

t = model1.predict(train_features)
print("model1 (train set error)=", calc_error(y_train, t))

v = model1.predict(feature_generation(x_valid))
print("model1 (dev set error)=", calc_error(y_valid, v))

    RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=93)
    Checking the model1 errors:
    model1 (train set error)= 0.3647075764880938
    model1 (dev set error)= 1.068082654287613
    
    RandomForestRegressor(max_depth=5, max_features=26, n_estimators=1000,
                      n_jobs=-1, random_state=93)
    Checking the model1 errors:
    model1 (train set error)= 0.5469734071346023
    model1 (dev set error)= 1.0342799360032215

In [None]:
# Show target vs. prediction for the train set, then for the dev set.
for observed, predicted, label in ((y_train, t, 'Train set'), (y_valid, v, 'Dev set')):
    plot_results(observed, predicted, label)
    plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

# Hyper-parameter grid explored for the random forest.
params = {
          'n_estimators': [100,200], 
          'min_samples_leaf': [2,3],
          'min_samples_split': [2,3],
          'max_depth': [*range(1, 6, 1)],
          # x_train gained the generated feature columns when
          # feature_generation(x_train) ran, so this spans the full width
          'max_features': [*range(1, x_train.shape[1], 20)],
         }

# The rows are in chronological order. The default 5-fold split would train
# on days that come AFTER the fold being scored; TimeSeriesSplit only ever
# validates on rows that follow the training rows. n_splits=5 keeps the
# same number of fits per candidate as the default.
# NOTE(review): candidates are ranked by RMSE, while the errors printed
# below use RMSLE (calc_error) - confirm that this mismatch is intended.
cv = GridSearchCV(model1, param_grid=params, scoring='neg_root_mean_squared_error',
                  cv=TimeSeriesSplit(n_splits=5), n_jobs=-1, verbose=1)
cv.fit(feature_generation(x_train), y_train)

print(cv)
print('Checking the cv errors:')

# GridSearchCV predicts with the best estimator refit on all of x_train
t = cv.predict(feature_generation(x_train))
print("cv (train set error)=", calc_error(y_train, t))

v = cv.predict(feature_generation(x_valid))
print("cv (dev set error)=", calc_error(y_valid, v))

In [None]:
# Hyper-parameter combination selected by the grid search
# (`cv` is the fitted GridSearchCV object).
print(cv.best_params_)

In [None]:
# Plot the tuned model's predictions. The set name is passed so that the
# subplot titles read "New cases (Train set)" instead of "New cases ()",
# matching the plots made for model1.
plot_results(y_train, t, 'Train set')
plt.show()

plot_results(y_valid, v, 'Dev set')
plt.show()