In [None]:
# Simple forecasting of COVID-19 confirmed cases and fatalities.
# The submission is predicted with CatBoost.

import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler 

# model
from catboost import Pool
from catboost import CatBoostRegressor
from catboost import CatBoostClassifier
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor

#plot
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import os
# Print the full path of every file found below the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Load the competition files (sample submission, train, test) and the
# World Happiness 2019 table.
# `parse_dates=True` only asks pandas to try parsing the *index* as dates,
# and the index here is the integer Id / ForecastId column. The column that
# actually holds dates is `Date`, so it is named explicitly instead.
subm = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-4/submission.csv')
training_data = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-4/train.csv', index_col='Id', parse_dates=['Date'])
testing_data = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-4/test.csv', index_col='ForecastId', parse_dates=['Date'])
happiest_data = pd.read_csv('/kaggle/input/world-happiness/2019.csv')

In [None]:
# Give the happiness table's country column the same name as in the
# competition data so both can be joined on `Country_Region`.
happiest_data = happiest_data.rename(columns={'Country or region': 'Country_Region'})

In [None]:
# Enrich the training rows with the per-country happiness indicators.
# Only `on=` is passed to the merge: combining `on` with `left_index=True`
# is contradictory (key column vs. index as key) and is rejected by current
# pandas releases. This also matches how the test data is merged.
# The merge result carries a fresh positional index, so the original `Id`
# index is restored afterwards (this assumes the left join keeps one output
# row per training row, i.e. at most one happiness row per country).
train_data = training_data.merge(happiest_data, how='left', on=['Country_Region'])
train_data.index = training_data.index

In [None]:
# Same enrichment for the test rows: left-join the happiness indicators on
# the country name, then put the original ForecastId index back.
test_data = (
    testing_data
    .merge(happiest_data, on=['Country_Region'], how='left')
    .set_index(testing_data.index)
)

In [None]:
# Display the test data after the happiness columns were merged in
test_data

In [None]:
# Display the training data after the happiness columns were merged in
train_data

In [None]:
# Count the missing values per column in the training data
train_data.isna().sum()

In [None]:
# Count the missing values per column in the test data
test_data.isna().sum()

In [None]:
def _as_mmdd(dates):
    """Return the dates as integers built from month and day ("%m%d", e.g. 0402 -> 402)."""
    parsed = pd.to_datetime(dates)
    return parsed.dt.strftime("%m%d").astype(int)


# Replace the Date column of both frames with its integer month-day code.
train_data['Date'] = _as_mmdd(train_data['Date'])
test_data['Date'] = _as_mmdd(test_data['Date'])

In [None]:
# Show the minimum and maximum integer-encoded dates in the training data
train_data['Date'].min(), train_data['Date'].max()

In [None]:
# Show the minimum and maximum integer-encoded dates in the test data
test_data['Date'].min(), test_data['Date'].max()

In [None]:
# Date codes shared by both data sets: every integer from the first test
# date up to and including the last training date.
first_test_date = test_data['Date'].min()
last_train_date = train_data['Date'].max()
drop_date = list(range(first_test_date, last_train_date + 1))

In [None]:
# Display the list of overlapping date codes
drop_date

In [None]:
# Remove the training rows whose date code also occurs in the test period.
# `.copy()` makes the filtered result an independent frame: the following
# cells modify `train_data` in place (dropping the target columns, filling
# missing values), which on a bare slice can trigger pandas'
# SettingWithCopyWarning.
train_data = train_data.loc[~train_data['Date'].isin(drop_date)].copy()

In [None]:
# Re-check the minimum and maximum dates of the training data after the overlap was removed
train_data['Date'].min(), train_data['Date'].max()

In [None]:
# Pull the two target vectors out of the training data, then remove both
# target columns from the feature frame.
y_conf, y_fatal = train_data['ConfirmedCases'], train_data['Fatalities']
train_data.drop(columns=['ConfirmedCases', 'Fatalities'], inplace=True)

In [None]:
# Names of the training columns stored with the "object" dtype; these are
# treated as the categorical columns for both the training and test data.
categorical_cols = [
    column for column, column_dtype in train_data.dtypes.items()
    if column_dtype == "object"
]

In [None]:
# Select the numeric columns of the training data; their missing values are
# filled with 0 in a later cell.
# The previous test `dtype == None` did not express a real selection:
# NumPy has historically coerced `None` to its default float64 dtype in such
# comparisons, so depending on the NumPy version it matched the float64
# columns or nothing at all. Selecting by numeric dtype makes this explicit.
non_cols = train_data.select_dtypes(include='number').columns.tolist()

In [None]:
# Display both column lists
non_cols, categorical_cols

In [None]:
# Fill the gaps in the categorical columns of both frames with the
# placeholder '-'.
for frame in (train_data, test_data):
    frame[categorical_cols] = frame[categorical_cols].fillna('-')

In [None]:
# Re-count the missing values per column in the training data
train_data.isna().sum()

In [None]:
# Fill the gaps in the `non_cols` columns of both frames with 0.
for frame in (train_data, test_data):
    frame[non_cols] = frame[non_cols].fillna(0)

In [None]:
# Encode the categorical columns as ordinal codes.
# The two LabelEncoder instances are created but not used below; the actual
# encoding is done by a single OrdinalEncoder fitted on the training data
# and then applied unchanged to the test data.
state_encoder, counrty_encoder = LabelEncoder(), LabelEncoder()
ord_encoder = OrdinalEncoder()
ord_encoder.fit(train_data[categorical_cols])

encod_train_data = train_data.copy()
encod_train_data[categorical_cols] = ord_encoder.transform(train_data[categorical_cols])

encod_test_data = test_data.copy()
encod_test_data[categorical_cols] = ord_encoder.transform(test_data[categorical_cols])

In [None]:
#scaler = MinMaxScaler()
#scale_train_data = scaler.fit_transform(encod_train_data)
#scaler_test_data = scaler.transform(encod_test_data)

In [None]:
# Show the encoded row with index label 500 from each frame
# (a label lookup on the Id / ForecastId index, not a position)
encod_train_data.loc[500], encod_test_data.loc[500]

In [None]:
# Hold out 5% of the encoded training rows (and the matching
# 'ConfirmedCases' targets) for validation; the rest is used for fitting.
X_train, X_valid, y_train, y_valid = train_test_split(
    encod_train_data,
    y_conf,
    test_size=0.05,
    train_size=0.95,
    random_state=0,
)

In [None]:
#model_0 = CatBoostRegressor(iterations=12000, 
#                          depth=9, 
#                          learning_rate=0.4, 
#                          loss_function='RMSE',
#                          random_seed=20,
#                          verbose=False)
#model_1 = CatBoostRegressor(iterations=12000, 
#                          depth=9, 
#                          learning_rate=0.4, 
#                          loss_function='RMSE',
#                          verbose=False)

In [None]:
#def rmse_score(random_seed):
#    rmse = np.sqrt(-cross_val_score(CatBoostRegressor(iterations=8000, 
#                          depth=6, 
#                          learning_rate=0.4, 
#                          loss_function='RMSE',
#                          random_seed=random_seed,
#                          verbose=False),X_train, y_train, scoring="neg_mean_squared_error", cv = 3))
#    return(rmse)

In [None]:
#models = {}
#for num, model in enumerate([model_0, model_1]):
    #models[num] = rmse_score(model)

In [None]:
#models

In [None]:
#plt.figure(figsize=(12,8))
#for i in models:
    #sns.lineplot(data=models[i], label=i)

In [None]:
#for x in models:
    #print(x, models[x].mean())

In [None]:
#metrics = [0, 30, 50]
#results = {}
#for x in metrics:
    #results[x] = rmse_score(x)

In [None]:
#results

In [None]:
#plt.figure(figsize=(12,8))
#for i in results:
    #sns.lineplot(data=results[i], label=i)

In [None]:
#for x in metrics:
    #print(x, results[x].mean())

In [None]:
# Configure the CatBoost regressor used for the 'ConfirmedCases' target.
model = CatBoostRegressor(
    loss_function='RMSE',
    iterations=12000,
    learning_rate=0.4,
    depth=6,
    verbose=False,
)

In [None]:
# Fit the model on the training split and show the training plot.
model.fit(X_train, y=y_train, plot=True)

In [None]:
# Predict the validation split and report the mean absolute error.
preds = model.predict(X_valid)
print('MAE:', mean_absolute_error(y_true=y_valid, y_pred=preds))

In [None]:
# Predict the validation split and report the mean squared error.
preds = model.predict(X_valid)
print('MSE:', mean_squared_error(y_true=y_valid, y_pred=preds))

In [None]:
# `model.score` on the training and validation splits, as a percentage
# rounded to two decimals.
x_list = [X_train, X_valid]
y_list = [y_train, y_valid]

scoring = [
    round(model.score(features, target) * 100, 2)
    for features, target in zip(x_list, y_list)
]
scoring

In [None]:
# Predict 'ConfirmedCases' for the encoded test data with the fitted model
final_preds_conf = model.predict(encod_test_data)

In [None]:
# Training features for the fatalities model: the encoded training data
# plus the known 'ConfirmedCases' values as an extra column.
train_data_preds_confirm = encod_train_data.assign(ConfirmedCases=y_conf)

In [None]:
# Test features for the fatalities model: the encoded test data plus the
# predicted 'ConfirmedCases' values as an extra column.
test_data_preds_confirm = encod_test_data.assign(ConfirmedCases=final_preds_conf)

In [None]:
# Hold out 5% of the training rows (features including 'ConfirmedCases',
# and the matching 'Fatalities' targets) for validation.
X_train_f, X_valid_f, y_train_f, y_valid_f = train_test_split(
    train_data_preds_confirm,
    y_fatal,
    test_size=0.05,
    train_size=0.95,
    random_state=0,
)

In [None]:
#def rmse_score(learning_rate):
#    rmse = np.sqrt(-cross_val_score(CatBoostRegressor(iterations=12000, 
#                          depth=6, 
#                          learning_rate=learning_rate, 
#                          loss_function='RMSE',
#                          verbose=False),X_train_f, y_train_f, scoring="neg_mean_squared_error", cv = 5))
#    return(rmse)

In [None]:
#metrics = [0.08, 0.1, 0.13]
#results = {}
#for x in metrics:
    #results[x] = rmse_score(x)

In [None]:
#results

In [None]:
#plt.figure(figsize=(12,8))
#for i in results:
    #sns.lineplot(data=results[i], label=i)

In [None]:
#for x in metrics:
    #print(x, results[x].mean())

In [None]:
# Configure the CatBoost regressor used for the 'Fatalities' target.
# This rebinds `model`, replacing the regressor fitted for 'ConfirmedCases'.
model = CatBoostRegressor(
    loss_function='RMSE',
    iterations=12000,
    learning_rate=0.1,
    depth=6,
    verbose=False,
)

In [None]:
# Fit the model on the fatalities training split and show the training plot.
model.fit(X_train_f, y=y_train_f, plot=True)

In [None]:
# Predict the fatalities validation split and report the mean absolute error.
preds_f = model.predict(X_valid_f)
print('MAE:', mean_absolute_error(y_true=y_valid_f, y_pred=preds_f))

In [None]:
# Predict the fatalities validation split and report the mean squared error.
preds_f = model.predict(X_valid_f)
print('MSE:', mean_squared_error(y_true=y_valid_f, y_pred=preds_f))

In [None]:
# `model.score` on the fatalities training and validation splits, as a
# percentage rounded to two decimals.
x_list_f = [X_train_f, X_valid_f]
y_list_f = [y_train_f, y_valid_f]

scoring = [
    round(model.score(features, target) * 100, 2)
    for features, target in zip(x_list_f, y_list_f)
]
scoring

In [None]:
# Predict 'Fatalities' for the test data; the input frame carries the
# predicted 'ConfirmedCases' values as a feature column
final_preds_fatal = model.predict(test_data_preds_confirm)

In [None]:
# Assemble the submission table: one row per ForecastId with both
# predicted targets.
submission_columns = {
    'ForecastId': test_data.index,
    'ConfirmedCases': final_preds_conf,
    'Fatalities': final_preds_fatal,
}
output = pd.DataFrame(submission_columns)

In [None]:
# Case and fatality counts cannot be negative, so any negative prediction
# is replaced by 0.
output['ConfirmedCases'] = output['ConfirmedCases'].mask(output['ConfirmedCases'] < 0, 0)
output['Fatalities'] = output['Fatalities'].mask(output['Fatalities'] < 0, 0)

In [None]:
# Write the predictions to the submission file (without the row index)
# and confirm completion.
output.to_csv(path_or_buf='submission.csv', index=False)
print('Complete!')

In [None]:
# Show the last 10 rows of the submission table
output.tail(10)

In [None]:
# Summary statistics of the submission table
output.describe()

In [None]:
# Line plot of both predicted targets over the rows of the submission table.
fig, ax = plt.subplots(figsize=(12, 8))
sns.lineplot(data=output['ConfirmedCases'], label="ConfirmedCases", ax=ax)
sns.lineplot(data=output['Fatalities'], label="Fatalities", ax=ax)

In [None]:
# Scatter plot of predicted confirmed cases per ForecastId, coloured by the
# predicted fatalities.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    x=output['ForecastId'],
    y=output['ConfirmedCases'],
    hue=output['Fatalities'],
    ax=ax,
)