In [0]:
# Required Python Lbraries

import numpy as np 
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
import plotly.express as px
import os
import math

from datetime import datetime
from sklearn.metrics import mean_squared_error
%matplotlib inline
#Model Libraries
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold
#from sklearn.tree import DecisionTreeRegressor
#from sklearn.ensemble import BaggingRegressor
#from sklearn.tree import ExtraTreeRegressor



## Read Data

In [0]:
# Reading data for training and test datset
train = pd.read_csv('../input/covid19-global-forecasting-week-5/train.csv')
test = pd.read_csv('../input/covid19-global-forecasting-week-5/test.csv')

In [0]:
#Finding features in the dataset 
train.columns

Index(['Id', 'County', 'Province_State', 'Country_Region', 'Population',
       'Weight', 'Date', 'Target', 'TargetValue'],
      dtype='object')

### Checking Missing Values

In [0]:
#Checking missing values in the training dataset
train.isnull().sum()

Id                    0
County            67840
Province_State    39644
Country_Region        0
Population            0
Weight                0
Date                  0
Target                0
TargetValue           0
dtype: int64

In [0]:
#Checking missing values in the testing dataset
test.isnull().sum()

ForecastId            0
County            28800
Province_State    16830
Country_Region        0
Population            0
Weight                0
Date                  0
Target                0
dtype: int64

In [0]:
# Sorting data as per TargetValue'
train.sort_values(by=['TargetValue'])

Unnamed: 0,Id,County,Province_State,Country_Region,Population,Weight,Date,Target,TargetValue
48096,63465,,,Spain,46438422,0.056646,2020-04-24,ConfirmedCases,-10034.0
27118,35755,,,France,66710000,0.055507,2020-04-29,ConfirmedCases,-2512.0
27104,35741,,,France,66710000,0.055507,2020-04-22,ConfirmedCases,-2206.0
22894,30171,,,Ecuador,16545799,0.060163,2020-05-07,ConfirmedCases,-1583.0
32416,42753,,,Japan,126960000,0.053592,2020-04-28,ConfirmedCases,-417.0
...,...,...,...,...,...,...,...,...,...
729220,963073,,,US,324141489,0.051029,2020-04-08,ConfirmedCases,32826.0
729212,963065,,,US,324141489,0.051029,2020-04-04,ConfirmedCases,33267.0
729266,963119,,,US,324141489,0.051029,2020-05-01,ConfirmedCases,34037.0
729224,963077,,,US,324141489,0.051029,2020-04-10,ConfirmedCases,35098.0


## Data Preprocessing

In [0]:
#Dropping the features County and Province for null values and Country Region and Target as they are not required
train = train.drop(['County', 'Province_State','Country_Region','Target'], axis=1)
test = test.drop(['County', 'Province_State','Country_Region','Target'], axis=1)
train.head()

Unnamed: 0,Id,Population,Weight,Date,TargetValue
0,1,27657145,0.058359,2020-01-23,0.0
1,2,27657145,0.583587,2020-01-23,0.0
2,3,27657145,0.058359,2020-01-24,0.0
3,4,27657145,0.583587,2020-01-24,0.0
4,5,27657145,0.058359,2020-01-25,0.0


In [0]:
#Encoding the features to get quantittative feature from qualititative feature
from sklearn.preprocessing import OrdinalEncoder

def create_feature(df):
    df['day'] = df['Date'].dt.day
    df['month'] = df['Date'].dt.month
    df['dayofweek'] = df['Date'].dt.dayofweek
    df['dayofyear'] = df['Date'].dt.dayofyear
    df['quarter'] = df['Date'].dt.quarter
    df['weekofyear'] = df['Date'].dt.weekofyear
    return df

In [0]:
def train_dev_split(df, days):
    date = df['Date'].max() - dt.timedelta(days=days)
    return df[df['Date'] <= date], df[df['Date'] > date]

In [0]:
test_date_min = test['Date'].min()
test_date_max = test['Date'].max()

In [0]:
def avoid_date_leakage(df, date=test_date_min):
    return df[df['Date'] < date]

In [0]:
def to_integer(dt_time):#Data Conversion
    return 10000*dt_time.year + 100*dt_time.month + dt_time.day

In [0]:
train['Date'] = pd.to_datetime(train['Date'])
test['Date'] = pd.to_datetime(test['Date'])

In [0]:
train['Date'] = train['Date'].dt.strftime('%Y%m%d')
test['Date'] = test['Date'].dt.strftime('%Y%m%d')

In [0]:
train.head()#Modified training dataset

Unnamed: 0,Id,Population,Weight,Date,TargetValue
0,1,27657145,0.058359,20200123,0.0
1,2,27657145,0.583587,20200123,0.0
2,3,27657145,0.058359,20200124,0.0
3,4,27657145,0.583587,20200124,0.0
4,5,27657145,0.058359,20200125,0.0


In [0]:
train.info()#Training dataset information

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 734156 entries, 0 to 734155
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Id           734156 non-null  int64  
 1   Population   734156 non-null  int64  
 2   Weight       734156 non-null  float64
 3   Date         734156 non-null  object 
 4   TargetValue  734156 non-null  float64
dtypes: float64(2), int64(2), object(1)
memory usage: 28.0+ MB


In [0]:
test.head()#Checking modified testing dataset

Unnamed: 0,ForecastId,Population,Weight,Date
0,1,27657145,0.058359,20200427
1,2,27657145,0.583587,20200427
2,3,27657145,0.058359,20200428
3,4,27657145,0.583587,20200428
4,5,27657145,0.058359,20200429


## Using Random Forest Regressor to find Target values

In [0]:
from sklearn.model_selection import train_test_split

predictors = train.drop(['TargetValue', 'Id'], axis=1)
target = train['TargetValue'] #Splitting dataset into test and valid dataset
X_train, X_valid, y_train, y_valid = train_test_split(predictors, target, test_size=0.33, random_state=0)

In [0]:
model = RandomForestRegressor(n_estimators=100,n_jobs = -1)
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [0]:
#Validation dateset prediction
predictions=model.predict(X_valid)
predictions

array([0.        , 0.        , 0.        , ..., 1.26833694, 0.        ,
       0.015     ])

In [0]:
val_mse = mean_squared_error(predictions,y_valid)
val_mse#Validation mean square error

3501.2657841950395

In [0]:
test.head()

Unnamed: 0,ForecastId,Population,Weight,Date
0,1,27657145,0.058359,20200427
1,2,27657145,0.583587,20200427
2,3,27657145,0.058359,20200428
3,4,27657145,0.583587,20200428
4,5,27657145,0.058359,20200429


In [0]:
features = ['Population', 'Weight','Date']
X_test = test[features]
test_preds = model.predict(X_test)

In [0]:
test_preds#Predictions based on test dataset

array([114.07      ,   3.27463492, 114.07      , ...,   2.5       ,
         0.20166667,   2.5       ])

In [0]:
pred_list =[int(x) for x in test_preds]
output = pd.DataFrame({'Id': test.index, 'TargetValue': pred_list})











































## Finding Quanlite values from the output

In [0]:
a = output.groupby(['Id'])['TargetValue'].quantile(q=0.05).reset_index() 
b = output.groupby(['Id'])['TargetValue'].quantile(q=0.5).reset_index() 
c = output.groupby(['Id'])['TargetValue'].quantile(q=0.05).reset_index()

In [0]:
a.columns = ['Id', 'q0.05']
b.columns = ['Id', 'q0.5']
c.columns = ['Id', 'q0.95']

a = pd.concat([a,b['q0.5'],c['q0.95']], 1)

a['q0.05'] = a['q0.05'].clip(0, 10000)
a['q0.5'] = a['q0.5'].clip(0, 10000)
a['q0.95'] = a['q0.95'].clip(0, 10000)

a

Unnamed: 0,Id,q0.05,q0.5,q0.95
0,0,114.0,114.0,114.0
1,1,3.0,3.0,3.0
2,2,114.0,114.0,114.0
3,3,3.0,3.0,3.0
4,4,114.0,114.0,114.0
...,...,...,...,...
311665,311665,2.0,2.0,2.0
311666,311666,0.0,0.0,0.0
311667,311667,2.0,2.0,2.0
311668,311668,0.0,0.0,0.0


In [0]:
a['Id'] = a['Id'] + 1
a

Unnamed: 0,Id,q0.05,q0.5,q0.95
0,1,114.0,114.0,114.0
1,2,3.0,3.0,3.0
2,3,114.0,114.0,114.0
3,4,3.0,3.0,3.0
4,5,114.0,114.0,114.0
...,...,...,...,...
311665,311666,2.0,2.0,2.0
311666,311667,0.0,0.0,0.0
311667,311668,2.0,2.0,2.0
311668,311669,0.0,0.0,0.0


## Submission

In [0]:
sub=pd.melt(a, id_vars=['Id'], value_vars=['q0.05','q0.5','q0.95'])
sub['variable']=sub['variable'].str.replace("q","", regex=False)
sub['ForecastId_Quantile']=sub['Id'].astype(str)+'_'+sub['variable']
sub['TargetValue']=sub['value']
sub=sub[['ForecastId_Quantile','TargetValue']]
sub.reset_index(drop=True,inplace=True)
sub.to_csv("submission.csv",index=False)