In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file beneath it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)
        
from pandas_datareader import data
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Dataset

In [None]:
# Read the competition's train and test CSV files into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/covid19-global-forecasting-week-1/train.csv',
        '../input/covid19-global-forecasting-week-1/test.csv',
    )
)

# Both frames are kept in one list so later cells can apply the same
# transformation to each. Note: this rebinds the name `data`, replacing the
# `data` object imported from pandas_datareader in the first cell.
data = [train_data, test_data]


In [None]:
# Show the (rows, columns) shape of the training frame.
train_data.shape

In [None]:
# Show the column labels of the training frame.
train_data.columns

In [None]:
# Count missing values per column of the training frame.
train_data.isna().sum()

In [None]:
# Build a country -> integer code lookup from the training data.
# Codes are assigned in order of first appearance, starting at 0.
arr1 = train_data['Country/Region'].unique()

# Size the code list from the data instead of a hard-coded count: with a fixed
# range, zip() would silently drop every country beyond the fixed length.
arr2 = list(range(len(arr1)))

country_dict = dict(zip(arr1, arr2))

In [None]:
# Display the country -> integer code lookup.
country_dict

In [None]:
# Add an integer-coded country column to both the train and the test frame.
for dataset in data:
    country_codes = dataset['Country/Region'].map(country_dict)
    # The int cast fails if any country is absent from country_dict, because
    # map() leaves NaN for unmatched keys.
    dataset['Country_int'] = country_codes.astype(int)

In [None]:
# Preview the first five training rows.
train_data.head(n=5)

In [None]:
# Preview the first five test rows.
test_data.head(n=5)

In [None]:
# Show the column labels of the test frame.
test_data.columns

In [None]:
# Encode every named province/state as an integer that is unique within its
# country (1..n, in order of first appearance in the training data).
#
# The lookup is keyed on (country, state) and applied in a single pass per
# frame. Re-mapping the whole column once per country would overwrite the
# codes written by earlier countries and apply one country's codes to the
# rows of every other country.
state_codes = {}
for country, country_rows in train_data.groupby('Country/Region', sort=False):
    named_states = country_rows['Province/State'].dropna().unique()
    for code, state in enumerate(named_states, start=1):
        state_codes[(country, state)] = code

for dataset in data:
    country_state_pairs = zip(dataset['Country/Region'], dataset['Province/State'])
    # Rows without a province, or with a pair never seen in training, stay
    # NaN here so that a later fill step can assign them a default code.
    dataset['Province/State'] = [
        state_codes.get(pair, np.nan) for pair in country_state_pairs
    ]

In [None]:
# Preview the first training rows whose country is China.
is_china = train_data['Country/Region'].eq('China')
train_data.loc[is_china].head()

In [None]:
# Replace missing province/state codes with 0 in both frames.
for dataset in data:
    # Assign the result back to the column: calling fillna(inplace=True) on a
    # selected column is chained assignment and does not update the parent
    # frame when pandas Copy-on-Write is active.
    dataset['Province/State'] = dataset['Province/State'].fillna(0)

In [None]:
# Preview the first five training rows after the province/state fill.
train_data.head(n=5)

# Selecting Target & Feature Variables

In [None]:
# Positional split of the training frame: the first two columns go to X and
# the third column goes to Y. X is rebound further down when the model
# feature matrix is built.
# NOTE(review): the selection is by position, not by name, so what lands in
# Y depends on the CSV column order -- confirm it is the intended column.
X, Y = train_data.iloc[:, :2], train_data.iloc[:, 2]

#  Casting to a specified dtype

In [None]:
# Inspect the Python type of the Date column object.
type(train_data['Date'])

In [None]:
# Parse the Date column of both frames into pandas datetimes.
for dataset in data:
    parsed_dates = dataset['Date'].pipe(pd.to_datetime)
    dataset['Date'] = parsed_dates

In [None]:
# Derive day-of-month and month features from the parsed Date column.
# The vectorized .dt accessor replaces a Python-level lambda per row; it
# requires Date to already hold datetimes (converted in the previous cell).
for dataset in data:
    dataset['Day'] = dataset['Date'].dt.day
    dataset['Month'] = dataset['Date'].dt.month

In [None]:
# Cast both target columns of the training frame to integers, in place on the
# existing frame so the `data` list keeps pointing at the same object.
for target_column in ('ConfirmedCases', 'Fatalities'):
    train_data[target_column] = train_data[target_column].astype(int)

# Correlation heatmap

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
# Non-numeric columns (e.g. the country name strings) are excluded explicitly:
# pandas >= 2.0 no longer drops them silently and DataFrame.corr() raises.
corr = train_data.select_dtypes(include='number').corr()
plt.figure(figsize=(11, 7))
ax = sns.heatmap(corr, annot=True)
ax.set_title('Correlation between numeric training columns');

# Creating a model

In [None]:
# Targets: one series per quantity to forecast.
y1 = train_data['ConfirmedCases']
y2 = train_data['Fatalities']

# Feature matrices: drop identifiers, raw text/date columns, coordinates and
# (for the training frame) the targets; whatever remains is used as features.
X = train_data.drop(
    columns=['Id', 'Country/Region', 'Lat', 'Long', 'Date',
             'ConfirmedCases', 'Fatalities', 'Province/State']
)
X_test = test_data.drop(
    columns=['ForecastId', 'Country/Region', 'Lat', 'Long', 'Date', 'Province/State']
)

In [None]:
# Fit a seeded decision tree on the confirmed-cases target.
rt = DecisionTreeRegressor(random_state=0)
rt.fit(X, y1)

# fit() returns the estimator itself, so model_r is the same fitted object.
model_r = rt


In [None]:
# Fit a default XGBoost regressor on the confirmed-cases target.
XGB_regressor = XGBRegressor()
XGB_regressor.fit(X, y1)

# Score on the same data the model was fitted on (an in-sample figure).
train_score = XGB_regressor.score(X, y1)
train_score

In [None]:
# Predict on the test features with the XGBoost model.
# NOTE(review): `y_pred` is not read again in the visible notebook (the later
# `Y_pred` is a different name) -- confirm whether this cell is still needed.
y_pred = XGB_regressor.predict(X_test)

In [None]:
# Fit a random forest on the confirmed-cases target.
# The forest is seeded (like the decision tree above) so that the reported
# score is reproducible across "Restart & Run All".
Random_Forest_regressor = RandomForestRegressor(random_state=0)
Random_Forest_regressor.fit(X, y1)

# Score on the training data itself, i.e. an in-sample figure.
Random_Forest_regressor.score(X, y1)

# Visualizing the model

In [None]:
from sklearn import tree
import graphviz

# Render the fitted decision tree.
# `rt` is a regressor, so no class_names are passed: class labels only apply
# to classification trees. The drawing is limited to the top levels because a
# fully grown tree is too large to read.
dot_data = tree.export_graphviz(
    rt,
    feature_names=list(X),
    filled=True,
    max_depth=3,
)
graphviz.Source(dot_data)

# Predictions

In [None]:
# Fatalities need their own model: XGB_regressor was fitted on the
# confirmed-cases target (y1), so reusing it here would copy the
# confirmed-cases forecast into the Fatalities column.
fatalities_regressor = XGBRegressor()
fatalities_regressor.fit(X, y2)

FatalitiesPred = fatalities_regressor.predict(X_test)
FatalitiesPred = pd.DataFrame(FatalitiesPred, columns=['Fatalities'])

In [None]:
# Confirmed-cases forecast for the test rows, wrapped in a one-column frame.
ConfirmedCasesPred = pd.DataFrame({'ConfirmedCases': XGB_regressor.predict(X_test)})

In [None]:
# Take the submission ids from the test frame as a one-column DataFrame.
ForecastId = test_data['ForecastId'].to_frame()

In [None]:
# Assemble the submission table: id, confirmed cases, fatalities side by side
# (rows are aligned on the index).
submission_parts = [ForecastId, ConfirmedCasesPred, FatalitiesPred]
pred_file = pd.concat(submission_parts, axis='columns')

In [None]:
# Preview the first five submission rows.
pred_file.head(n=5)

In [None]:
# Summary statistics of the submission columns.
pred_file.describe()

In [None]:
# The test set carries no ground-truth labels, so the error is estimated on a
# hold-out slice of the training data. Subtracting the prediction table from
# the test feature matrix is not an error metric.
# NOTE(review): a random split mixes dates, so this figure is optimistic for
# forecasting; a date-based split would be stricter.
X_fit, X_val, y_fit, Y_true = train_test_split(
    X, y1, test_size=0.2, random_state=0
)  # Y_true = Y (original values)
holdout_model = XGBRegressor()
holdout_model.fit(X_fit, y_fit)

# Calculated values
Y_pred = holdout_model.predict(X_val)  # Y_pred = Y'

# Mean Squared Error
MSE = mean_squared_error(Y_true, Y_pred)

In [None]:
# Display the mean squared error computed in the previous cell.
MSE

In [None]:
# Write the submission table to the working directory without the index column.
pred_file.to_csv(path_or_buf='submission.csv', index=False)