Importing all the necessary libraries.

In [None]:
import numpy as np # linear algebra
import pandas as pd #for input/output and data processing
import matplotlib.pyplot as plt    #for visualizations
import plotly.express as px
from sklearn.ensemble import RandomForestRegressor
!pip install plotly
import plotly.express as px
%matplotlib inline

Loading the dataset.

In [None]:
# Read the competition's train, test and sample-submission CSV files.
d_train, d_test, d_sample = map(pd.read_csv, (
    '../input/covid19-global-forecasting-week-5/train.csv',
    '../input/covid19-global-forecasting-week-5/test.csv',
    '../input/covid19-global-forecasting-week-5/submission.csv',
))

In [None]:
# Column names, dtypes and non-null counts of the training frame.
d_train.info()

In [None]:
# Column names, dtypes and non-null counts of the test frame.
d_test.info()

In [None]:
# Column names, dtypes and non-null counts of the sample-submission frame.
d_sample.info()

In [None]:
# Display the sample-submission frame read from submission.csv.
d_sample

In [None]:
# Number of missing values in each column of the training frame.
d_train.isnull().sum()

In [None]:
# Number of missing values in each column of the test frame.
d_test.isnull().sum()

In [None]:
# Show the training rows ordered by ascending TargetValue (d_train itself is not modified).
d_train.sort_values('TargetValue')

# Exploratory Data Analysis

In [None]:
# Pie chart of TargetValue by country. Slice labels are drawn inside the
# slices, and any label that cannot fit at font size 12 is hidden.
fig_pie = (
    px.pie(d_train, names='Country_Region', values='TargetValue')
    .update_traces(textposition='inside')
    .update_layout(uniformtext_mode='hide', uniformtext_minsize=12)
)
fig_pie.show()
# The largest slices indicate the countries that contribute most to TargetValue.

In [None]:
# One bar per country. Note that the plotted statistic is the per-country
# *mean* of TargetValue, even though the title says MAX.
d_train.groupby('Country_Region')['TargetValue'].mean().plot.bar(
    color='red',
    title="Countries with COVID-19 MAX",
    figsize=(40, 20),
)

In [None]:
# Rank countries by their summed TargetValue on the most recent date and keep the top 10.
last_date = d_train['Date'].max()
df_countries = (
    d_train.loc[d_train['Date'] == last_date]
    .groupby('Country_Region', as_index=False)['TargetValue']
    .sum()
    .nlargest(10, 'TargetValue')
)

# Per-date, per-country sums, restricted to those 10 countries by an inner merge.
# Both frames carry a 'TargetValue' column, so the merge suffixes them _x / _y;
# the left-hand (per-date) one becomes 'Cases'.
# NOTE(review): every row of a country is summed here regardless of the
# 'Target' column; confirm that folding all Target kinds into 'Cases' is intended.
df_trend = (
    d_train.groupby(['Date', 'Country_Region'], as_index=False)['TargetValue']
    .sum()
    .merge(df_countries, on='Country_Region')
    .rename(columns={'Country_Region': 'Country', 'TargetValue_x': 'Cases'})
)
px.line(
    df_trend,
    x='Date',
    y='Cases',
    color='Country',
    title='COVID19 Total Cases growth for top 10 worst affected countries in the World',
)

# Data Pre-Processing

In [None]:
# Preview the first rows of the training frame before any columns are dropped.
d_train.head()

In [None]:
# Remove the location columns and the 'Target' label from both frames.
# This cell is not re-runnable: a second run raises KeyError because the
# columns are already gone.
d_train = d_train.drop(columns=['County', 'Province_State', 'Country_Region', 'Target'])
d_test = d_test.drop(columns=['County', 'Province_State', 'Country_Region', 'Target'])

d_train

In [None]:
from sklearn.preprocessing import OrdinalEncoder

def create_feature(df):
    """Add calendar features derived from the ``Date`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-typed ``Date`` column (the ``.dt`` accessor is
        used, so string dates must be converted with ``pd.to_datetime`` first).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place, with the extra columns
        ``day``, ``month``, ``dayofweek``, ``dayofyear``, ``quarter`` and
        ``weekofyear``.

    Raises
    ------
    AttributeError
        If ``Date`` is not datetime-like.
    """
    dates = df['Date'].dt
    df['day'] = dates.day
    df['month'] = dates.month
    df['dayofweek'] = dates.dayofweek
    df['dayofyear'] = dates.dayofyear
    df['quarter'] = dates.quarter
    # ``Series.dt.weekofyear`` was deprecated in pandas 1.1 and removed in 2.0;
    # ``isocalendar().week`` is its replacement. It returns the nullable UInt32
    # dtype, so cast to int64 to keep the plain-integer column the old accessor
    # produced (this cast fails if ``Date`` contains NaT).
    df['weekofyear'] = dates.isocalendar().week.astype('int64')
    return df

In [None]:
def train_split(data, days):
    """Split ``data`` chronologically, holding out the last ``days`` days.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a datetime-typed ``Date`` column.
    days : int or float
        Length of the hold-out window, counted back from the latest date.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(earlier, later)``: rows dated on or before ``max(Date) - days`` and
        rows dated after it.
    """
    # The original used ``dt.timedelta`` but no ``dt`` module is imported in the
    # notebook; ``pd.Timedelta`` needs only pandas, which is already in scope.
    cutoff = data['Date'].max() - pd.Timedelta(days=days)
    return data[data['Date'] <= cutoff], data[data['Date'] > cutoff]

In [None]:
# Earliest and latest dates in the test frame. These are taken from the Date
# column as it stands at this point, before the conversion cells further down.
test_date_min, test_date_max = d_test['Date'].min(), d_test['Date'].max()

In [None]:
def avoid_date_leakage(data, date=test_date_min):
    """Return only the rows of ``data`` whose ``Date`` is strictly before ``date``.

    ``date`` defaults to the value ``test_date_min`` held when this function
    was defined; later changes to ``test_date_min`` do not affect the default.
    """
    before_cutoff = data['Date'] < date
    return data[before_cutoff]

In [None]:
def to_integer(dt_time):
    """Encode a date-like object as the integer ``YYYYMMDD``.

    ``dt_time`` is any object exposing ``year``, ``month`` and ``day``
    attributes; the result is ``year * 10000 + month * 100 + day``.
    """
    year_month = dt_time.year * 100 + dt_time.month
    return year_month * 100 + dt_time.day

In [None]:
# Parse the Date column of both frames into pandas datetimes.
d_train = d_train.assign(Date=pd.to_datetime(d_train['Date']))
d_test = d_test.assign(Date=pd.to_datetime(d_test['Date']))

In [None]:
# Re-encode the datetimes as 'YYYYMMDD' strings (e.g. 2020-05-10 -> '20200510').
# The column must be datetime-typed when this runs, since `.dt` is used.
d_train = d_train.assign(Date=d_train['Date'].dt.strftime('%Y%m%d'))
d_test = d_test.assign(Date=d_test['Date'].dt.strftime('%Y%m%d'))

The `Date` column is now stored as a string in `YYYYMMDD` form.

In [None]:
# Preview the training frame after the column drop and the Date re-encoding.
d_train.head()

**Now, split the training data into train and validation sets for the model.**

In [None]:
# Positional feature/target slices; not referenced again in the cells shown here.
X = d_train.iloc[:, 1:4]
Y = d_train.iloc[:, 4]
from sklearn.model_selection import train_test_split

# Features are every column except the identifier and the target itself.
target = d_train['TargetValue']
predictors = d_train.drop(columns=['TargetValue', 'Id'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    predictors,
    target,
    random_state=0,
    test_size=0.2,
)

Applying a Random Forest regressor with scikit-learn's default number of estimators (100 as of scikit-learn 0.22).

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
# Standardise the features, then fit a random forest on all available cores.
pipeline_dt = Pipeline(steps=[
    ('scaler2', StandardScaler()),
    ('RandomForestRegressor: ', RandomForestRegressor(random_state=0, n_jobs=-1)),
])
# fit() returns the pipeline itself, so the validation predictions can be chained.
prediction = pipeline_dt.fit(X_train, Y_train).predict(X_test)

Calculating the score.

In [None]:
# Evaluate the fitted pipeline on the held-out split using its `score` method.
score = pipeline_dt.score(X_test, Y_test)
print(f"Score: {score}")

In [None]:
# Drop the identifier so d_test holds only model features, and label the
# remaining row index 'Id'. Not re-runnable: a second run raises KeyError
# because 'ForecastId' is already gone.
d_test = d_test.drop(columns=['ForecastId']).rename_axis('Id')
d_test

# **Submission**

In [None]:
# Predictions for the held-out split X_test (the same call that produced
# `prediction` in the modelling cell), displayed as the cell output.
y_pred2 = pipeline_dt.predict(X_test)
y_pred2

In [None]:
# Predict on the prepared test features.
predictions = pipeline_dt.predict(d_test)

# int() truncates each prediction toward zero.
pred_list = list(map(int, predictions))

# Pair every prediction with the row index of d_test.
output = pd.DataFrame({'Id': d_test.index, 'TargetValue': pred_list})
print(output)

# Output

In [None]:
# Per-Id 5%, 50% and 95% quantiles of the predicted TargetValue.
# Fix: `h` was computed with q=0.05 (a copy of `f`) even though the next cell
# labels it 'q0.95'; it now uses q=0.95.
# NOTE(review): `output` is built with one row per d_test index value, so each
# group appears to hold a single prediction and all three quantiles coincide.
# Confirm whether real prediction intervals were intended.
f = output.groupby(['Id'])['TargetValue'].quantile(q=0.05).reset_index()
g = output.groupby(['Id'])['TargetValue'].quantile(q=0.5).reset_index()
h = output.groupby(['Id'])['TargetValue'].quantile(q=0.95).reset_index()

In [None]:
# Give each quantile frame a descriptive value-column name.
f.columns = ['Id', 'q0.05']
g.columns = ['Id', 'q0.5']
h.columns = ['Id', 'q0.95']

# Put the three quantiles side by side. `axis` is passed by keyword because
# pandas deprecated the positional form `pd.concat(objs, 1)` and pandas 2.x
# rejects it.
f = pd.concat([f, g['q0.5'], h['q0.95']], axis=1)

# Bound every quantile to the range [0, 10000].
f['q0.05'] = f['q0.05'].clip(0, 10000)
f['q0.5'] = f['q0.5'].clip(0, 10000)
f['q0.95'] = f['q0.95'].clip(0, 10000)
f

In [None]:
# Shift every Id up by one. Re-running this cell shifts the ids again.
f['Id'] += 1
f

# Submission File

In [None]:
# Reshape to one row per (Id, quantile), build the '<Id>_<quantile>' key
# (e.g. '1_0.05'), keep only the two submission columns and write them to disk.
sub = (
    f.melt(id_vars=['Id'], value_vars=['q0.05', 'q0.5', 'q0.95'])
    .assign(
        # Strip the leading 'q' so 'q0.05' becomes '0.05'.
        variable=lambda m: m['variable'].str.replace("q", "", regex=False),
        ForecastId_Quantile=lambda m: m['Id'].astype(str) + '_' + m['variable'],
        TargetValue=lambda m: m['value'],
    )
    [['ForecastId_Quantile', 'TargetValue']]
    .reset_index(drop=True)
)
sub.to_csv("submission.csv", index=False)
sub

# Make sure to press that Upvote button if you like it!
# Will surely try to update and improve the score...