In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.
from sklearn.model_selection import train_test_split

from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.impute import SimpleImputer

from shapely.geometry import Point,Polygon
import requests

**Loading Training and Testing Data**

In [None]:
# Read the three competition CSV files (train, test, sample submission) in one pass;
# the generator yields the frames in the same order as the names on the left.
train_data, test_data, submission_csv = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/covid19-global-forecasting-week-4/train.csv',
        '/kaggle/input/covid19-global-forecasting-week-4/test.csv',
        '/kaggle/input/covid19-global-forecasting-week-4/submission.csv',
    )
)

In [None]:
# Preview the first five rows of the raw training data.
train_data.head(n=5)

**Convert column dtypes and parse string dates into pandas datetimes**

In [None]:
# Column -> dtype mappings used to cast both frames.
# The test frame only has the two location columns; the training frame also
# carries the two integer targets.
# NOTE(review): casting Province_State to str is expected to turn missing
# provinces into the literal string 'nan' (the later `== 'nan'` check depends
# on that) — confirm for the pandas version in use.
convert_dict_test = {'Province_State': str, 'Country_Region': str}
convert_dict = {**convert_dict_test, 'ConfirmedCases': int, 'Fatalities': int}

train_data = train_data.astype(convert_dict)
test_data = test_data.astype(convert_dict_test)

In [None]:
# Parse the 'Date' strings of both frames into pandas datetime values.
# `infer_datetime_format=True` is intentionally not passed: the argument has been
# deprecated since pandas 2.0 (format inference is the default behaviour there)
# and supplying it only triggers a warning.
train_data['Date'] = pd.to_datetime(train_data['Date'])
test_data['Date'] = pd.to_datetime(test_data['Date'])

In [None]:
# Encode every date as the integer formed by its month and day ('%m%d'),
# e.g. April 2nd -> '0402' -> 402. The year is discarded by this encoding.
#
# Plain column assignment is used on purpose: `df.loc[:, 'Date'] = ...` writes
# into the existing datetime column in recent pandas versions instead of
# replacing it, so the column would not reliably end up with an integer dtype.
train_data['Date'] = train_data['Date'].dt.strftime('%m%d').astype(int)
test_data['Date'] = test_data['Date'].dt.strftime('%m%d').astype(int)

In [None]:
def _location_key(frame):
    """Build one location label per row from province and country.

    Rows without a province keep the plain ``Country_Region`` value; all other
    rows get ``'<Province_State> <Country_Region>'``.

    A province counts as missing when it is a real NaN or the literal string
    ``'nan'`` (what a missing value looks like after being cast to ``str``),
    so the result does not depend on how the column was cast earlier.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with ``Province_State`` and ``Country_Region`` columns.

    Returns
    -------
    numpy.ndarray
        One label per row of ``frame``.
    """
    province = frame['Province_State']
    country = frame['Country_Region']
    no_province = province.isna() | (province == 'nan')
    return np.where(no_province, country, province + ' ' + country)


train_data['Country_Region'] = _location_key(train_data)
test_data['Country_Region'] = _location_key(test_data)

#train_data['Province_State'] = np.where(train_data['Province_State'] == 'nan',train_data['Country_Region'],train_data['Province_State']+train_data['Country_Region'])
#test_data['Province_State'] = np.where(test_data['Province_State'] == 'nan',test_data['Country_Region'],test_data['Province_State']+test_data['Country_Region'])

In [None]:
# Province_State has been folded into Country_Region, so remove it from both frames.
train_data = train_data.drop('Province_State', axis=1)
test_data = test_data.drop('Province_State', axis=1)

In [None]:
# Show the first two test rows after the province column was removed.
test_data.head(n=2)

**Label Encoding Country**

In [None]:
#get list of categorical variables
s = (train_data.dtypes == 'object')
object_cols = list(s[s].index)

In [None]:
from sklearn.preprocessing import LabelEncoder

**Try using Label Encoder**

In [None]:
# Two encoders are created; only label_encoder2 is used by the active code below.
label_encoder1, label_encoder2 = LabelEncoder(), LabelEncoder()

#train_data['Province_State'] = label_encoder1.fit_transform(train_data['Province_State'])
#test_data['Province_State'] = label_encoder1.transform(test_data['Province_State'])

# Learn the label -> integer mapping from the training locations, then apply
# that same mapping to both frames.
label_encoder2.fit(train_data['Country_Region'])
train_data['Country_Region'] = label_encoder2.transform(train_data['Country_Region'])
test_data['Country_Region'] = label_encoder2.transform(test_data['Country_Region'])

In [None]:
# Show the first two training rows after label encoding.
train_data.head(n=2)

In [None]:
# Show the first two test rows after label encoding.
test_data.head(n=2)

In [None]:
# Keep the ForecastId column so it can be written to the submission file
# after it has been dropped from the feature frame.
Test_id = test_data['ForecastId']

In [None]:
# Remove the row-identifier columns, which are not used as model features.
# Reassignment replaces the original `inplace=True` calls so this cell follows
# the same pattern as the earlier Province_State drop.
train_data = train_data.drop(columns=['Id'])
test_data = test_data.drop(columns=['ForecastId'])

**Check missing values**

In [None]:
# Count missing entries per training column and print only the columns that have any.
missing_val_count_by_column = train_data.isna().sum()
print(missing_val_count_by_column.loc[missing_val_count_by_column.gt(0)])

**Prepare features and targets** (XGBRegressor is imported here, but the models trained below are decision trees)

In [None]:
from xgboost.sklearn import XGBRegressor

In [None]:
# Show a single training row to confirm which columns remain.
train_data.head(n=1)

In [None]:
# Features: encoded location and MMDD date. Targets: the two counts to predict.
X_train = train_data.loc[:, ['Country_Region', 'Date']]
y_train = train_data.loc[:, ['ConfirmedCases', 'Fatalities']]

**Make decision tree regressor model**

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Decision tree that is later fitted on ConfirmedCases.
# Only `random_state` is set explicitly. The previous call spelled out every
# default of an older scikit-learn release, including `presort`,
# `min_impurity_split` and `criterion='mse'`, which newer releases no longer
# accept; relying on the library defaults keeps the cell runnable across versions.
tree_regressor1 = DecisionTreeRegressor(random_state=6967)

In [None]:
# Decision tree that is later fitted on Fatalities.
# Only `random_state` is set explicitly. The previous call spelled out every
# default of an older scikit-learn release, including `presort`,
# `min_impurity_split` and `criterion='mse'`, which newer releases no longer
# accept; relying on the library defaults keeps the cell runnable across versions.
tree_regressor2 = DecisionTreeRegressor(random_state=6967)

In [None]:
# Fit the first tree on the confirmed-case counts.
tree_regressor1.fit(X_train, y_train['ConfirmedCases'])

In [None]:
# Fit the second tree on the fatality counts.
tree_regressor2.fit(X_train, y_train['Fatalities'])

**Prediction**

In [None]:
# Predict confirmed cases for the test rows.
# The feature columns are selected explicitly, in the order used for fitting,
# so the call does not depend on which columns happen to remain in test_data.
best_best_estimate_1 = tree_regressor1.predict(test_data[X_train.columns])

In [None]:
# Predict fatalities for the test rows.
# The feature columns are selected explicitly, in the order used for fitting,
# so the call does not depend on which columns happen to remain in test_data.
best_best_estimate_2 = tree_regressor2.predict(test_data[X_train.columns])

**Submission**

In [None]:
# Start from an empty frame; the submission columns are attached in the next cell.
df_sub = pd.DataFrame(data=None)

In [None]:

# Attach the forecast ids, then each prediction rounded to zero decimals,
# in the column order ForecastId, ConfirmedCases, Fatalities.
df_sub['ForecastId'] = Test_id
for target_column, estimate in (
    ('ConfirmedCases', best_best_estimate_1),
    ('Fatalities', best_best_estimate_2),
):
    df_sub[target_column] = np.round(estimate, 0)

# Write the submission file without the index column.
df_sub.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame as the cell output for a final visual check.
df_sub