In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input mount,
# then print one path per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

In [None]:
# Show every column when a DataFrame is displayed instead of eliding the middle ones.
# set_option is called on the documented top-level `pd` namespace rather than
# through the incidental `pd.pandas` self-reference attribute.
pd.set_option('display.max_columns', None)

In [None]:
# Read the competition's train and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/covid19-global-forecasting-week-5/train.csv',
        '../input/covid19-global-forecasting-week-5/test.csv',
    )
)

In [None]:
# Preview the first five rows of the training data.
train.head()

In [None]:
# Preview the first five rows of the test data.
test.head()

In [None]:
# (rows, columns) of the training data.
train.shape

In [None]:
# (rows, columns) of the test data.
test.shape

In [None]:
# Number of missing values in each training column.
train.isna().sum()

In [None]:
# Number of missing values in each test column.
test.isna().sum()

In [None]:
# Column dtypes, non-null counts and memory usage of the training data.
train.info()

In [None]:
# Column dtypes, non-null counts and memory usage of the test data.
test.info()

In [None]:
# Largest Population value recorded among each country's rows.
train.groupby('Country_Region')['Population'].max()

### Now let's extract the day of the month and the month from the Date column

In [None]:
# Parse the Date strings once, then keep two calendar components as features.
# Note: the 'date' column holds the day of the month, not a full date.
train_dates = pd.to_datetime(train['Date'], format='%Y-%m-%d')
train['date'] = train_dates.dt.day
train['month'] = train_dates.dt.month

#### Similarly for the test dataset

In [None]:
# Parse the Date strings once, then keep two calendar components as features.
# Note: the 'date' column holds the day of the month, not a full date.
test_dates = pd.to_datetime(test['Date'], format='%Y-%m-%d')
test['date'] = test_dates.dt.day
test['month'] = test_dates.dt.month

# Data visualization

### Population Vs Country

In [None]:
# Donut chart of each country's share of population.
# A country's population is taken as the largest Population value found among
# its rows. The aggregation is materialised as its own small frame so that the
# frame handed to plotly is the one the `values` / `names` columns come from
# (previously the full `train` frame was passed next to aggregated Series of a
# different length).
population_by_country = train.groupby('Country_Region', as_index=False)['Population'].max()
fig = px.pie(
    data_frame=population_by_country,
    values='Population',
    names='Country_Region',
    hole=0.30,
)
# Put the percentage and the country label inside each slice.
fig.update_traces(textposition='inside', textinfo='percent+label')

## Target Value Vs Country

In [None]:
# Donut chart of TargetValue split by country.
# NOTE(review): no filter on Target is applied here, so rows for every Target
# category are pooled into the same slices — confirm that is intended.
fig = px.pie(
    train,
    names='Country_Region',
    values='TargetValue',
    hole=0.30,
)
# Put the percentage and the country label inside each slice.
fig.update_traces(textinfo='percent+label', textposition='inside')

## Confirmed Cases vs Fatalities

In [None]:
# One bar per Target category; bar height is seaborn's default aggregate
# (the mean) of TargetValue for that category.
_fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=train, x='Target', y='TargetValue', ax=ax)

## Top 10
## Confirmed cases Vs Country

In [None]:
# Rows describing confirmed cases only.
conf_train = train[train['Target'] == 'ConfirmedCases']
# Total TargetValue per country, reduced to the ten largest totals.
large10 = (
    conf_train
    .groupby('Country_Region')['TargetValue']
    .sum()
    .nlargest(10)
)
# Two-column frame (Country_Region, TargetValue) for plotting.
Large10 = large10.reset_index()

In [None]:
# Bar chart of the ten countries with the largest summed TargetValue.
_fig, ax = plt.subplots(figsize=(15, 10))
sns.barplot(x='Country_Region', y='TargetValue', data=Large10, ax=ax)

## Top 10
## Fatalities Vs Country 

In [None]:
# Rows describing fatalities only (the variable keeps the name `conf_train`
# used throughout the notebook).
conf_train = train[train['Target'] == 'Fatalities']
# Total TargetValue per country, reduced to the ten largest totals.
large10 = (
    conf_train
    .groupby('Country_Region')['TargetValue']
    .sum()
    .nlargest(10)
)
# Two-column frame (Country_Region, TargetValue) for plotting.
Large10 = large10.reset_index()

In [None]:
# Bar chart of the ten countries with the largest summed TargetValue.
_fig, ax = plt.subplots(figsize=(15, 10))
sns.barplot(x='Country_Region', y='TargetValue', data=Large10, ax=ax)

In [None]:
# Hand-picked countries to compare.
top5 = ['US', 'United Kingdom', 'Brazil', 'Russia', 'India']
# Confirmed-case rows, restricted to the selected countries.
conf_train = train[train['Target'] == 'ConfirmedCases']
is_selected_country = conf_train['Country_Region'].isin(top5)
fdf = conf_train[is_selected_country]
# One line per country: TargetValue over Date.
px.line(fdf, x="Date", y='TargetValue', color='Country_Region')

In [None]:
# Confirmed-case rows for India only.
is_india_confirmed = train['Target'].eq('ConfirmedCases') & train['Country_Region'].eq('India')
conf_train = train[is_india_confirmed]
# TargetValue over Date, with the Target name shown on hover.
px.line(conf_train, x="Date", y='TargetValue', color='Country_Region', hover_name="Target")

### The first confirmed case of COVID-19 in India was reported on 30 January

In [None]:
# Fatality rows for India only (the variable keeps the name `conf_train`
# used throughout the notebook).
is_india_fatality = train['Target'].eq('Fatalities') & train['Country_Region'].eq('India')
conf_train = train[is_india_fatality]
# TargetValue over Date, with the Target name shown on hover.
px.line(conf_train, x="Date", y='TargetValue', color='Country_Region', hover_name="Target")

### The first fatality from COVID-19 in India was reported on 11 March

### Let's handle the categorical feature

In [None]:
# Encoder that maps each distinct label to an integer code.
le=LabelEncoder()

In [None]:
# Learn the country -> integer mapping from the training data, then replace the
# country names with their integer codes.
train_countries = train["Country_Region"]
train["Country_Region"] = le.fit(train_countries).transform(train_countries)

#### Similarly for the test dataset

In [None]:
# Encode the test countries with the mapping already learned from the training
# data. Re-fitting the encoder on the test set would assign codes from the test
# set's own labels, which only agree with the training codes when both sets
# contain exactly the same countries. With `transform`, a country that was not
# seen during fitting raises a ValueError instead of being silently mis-coded.
test["Country_Region"] = le.transform(test["Country_Region"])

In [None]:
# Keep the row identifiers before they are removed from the feature frame.
train_id = train['Id']
# Remove the columns that are not used as model inputs.
# NOTE(review): 'Target' is dropped too, so from here on the frame no longer
# records which Target category a row's TargetValue belongs to — confirm intended.
train = train.drop(columns=['County', 'Province_State', 'Target', 'Date', 'Id'])

In [None]:
# Preview the training frame after encoding and column removal.
train.head()

In [None]:
# Keep the forecast identifiers; they are needed to build the submission.
test_id = test['ForecastId']
# Remove the columns that are not used as model inputs.
test = test.drop(columns=['County', 'Province_State', 'Target', 'Date', 'ForecastId'])

In [None]:
# Preview the test frame after encoding and column removal.
test.head()

In [None]:
# Standardizer (zero mean, unit variance per column) for the feature matrix.
# StandardScaler is imported in the notebook's import cell with the other
# dependencies rather than mid-notebook.
scaler = StandardScaler()

In [None]:
# Regression target.
y = train['TargetValue']
# Feature matrix: every remaining column except the target.
X = train.drop(columns=['TargetValue'])

In [None]:
# Learn the per-column mean and standard deviation from the training features,
# then standardize those same features.
train_scaled = scaler.fit(X).transform(X)

In [None]:
# Standardize the test features with the statistics learned from the training
# features. Selecting test[X.columns] pins the column set and order to exactly
# what the scaler was fitted on, instead of relying on the two frames happening
# to share the same layout; a missing column raises a KeyError here.
test_scaled = scaler.transform(test[X.columns])

## XGBoostRegressor

In [None]:
# Gradient-boosted tree regressor: 1500 boosting rounds, trees up to depth 5.
xgb_params = {'n_estimators': 1500, 'max_depth': 5}
xgbr = XGBRegressor(**xgb_params)
# Train on the standardized features against TargetValue.
xgbr.fit(train_scaled, y)

In [None]:
# Predict TargetValue for every row of the standardized test features.
prediction=xgbr.predict(test_scaled)

In [None]:
# Pair each forecast identifier with its predicted value.
output = test_id.to_frame(name='Id').assign(TargetValue=prediction)
output.head()

In [None]:
# 5%, 50% and 95% quantiles of the predictions within each Id, each returned as
# a two-column frame (Id, TargetValue).
# NOTE(review): if every Id occurs exactly once in `output`, each group holds a
# single value and all three quantiles equal that value — verify against the data.
predictions_by_id = output.groupby(['Id'])['TargetValue']
a, b, c = (
    predictions_by_id.quantile(q=quantile_level).reset_index()
    for quantile_level in (0.05, 0.5, 0.95)
)

In [None]:
# Label each quantile frame's value column with its quantile level.
a.columns = ['Id', 'q0.05']
b.columns = ['Id', 'q0.5']
c.columns = ['Id', 'q0.95']
# Place the three quantile columns side by side in one frame keyed by Id.
# `axis` is passed by keyword: pandas 2.x accepts only the objects positionally
# and raises a TypeError for a positional axis. The former self-assignments of
# the three quantile columns were no-ops and have been removed.
a = pd.concat([a, b['q0.5'], c['q0.95']], axis=1)

In [None]:
# Reshape the wide quantile table into long form: one row per (Id, quantile).
quantiles_long = pd.melt(a, id_vars=['Id'], value_vars=['q0.05', 'q0.5', 'q0.95'])
# Strip the leading "q" so only the numeric quantile level remains.
quantile_level = quantiles_long['variable'].str.replace("q", "", regex=False)
# Submission rows are keyed by "<Id>_<quantile level>".
sub = pd.DataFrame({
    'ForecastId_Quantile': quantiles_long['Id'].astype(str) + '_' + quantile_level,
    'TargetValue': quantiles_long['value'],
}).reset_index(drop=True)
sub.to_csv("submission.csv", index=False)
sub.head()