In [None]:
"""
“I confirm that this is my own work, except where clearly indicated.”
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import os
import numpy as np 
from scipy.stats import norm 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly
plotly.offline.init_notebook_mode() # For not show up chart error
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from IPython.display import HTML
%matplotlib inline
from tqdm import tqdm

# Show up to 100 columns when a wide DataFrame is displayed
pd.set_option('display.max_columns', 100)

In [None]:
# Loading the training and the test set (in that order) from the input directory
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train-3/train_3.csv', '../input/test-3/test_3.csv')
)


Before proceeding with the analysis, let's first make sure that there are no duplicate observations.

## Data at first sight

Here is an excerpt of the data description for the competition:

* Values of -1 indicate that the feature was missing from the observation.
* We are after 2 outputs from the model we are going to build: ConfirmedCases and Fatalities. These columns are labelled as `inter` later on.

Ok, that's important information to get us started. Let's have a quick look at the first and last rows to confirm all of this.


In [None]:
# Projecting the first 5 rows of the training set
train.head()

In [None]:
# Projecting the last 5 rows of the training set
train.tail()

In [None]:
# Checking the dimensions (rows, columns) of the training set
train.shape

Let's see if there are any duplicate observations in the training set.

In [None]:
# Dropping duplicates. drop_duplicates() returns a new DataFrame instead of
# modifying `train`, so the result has to be assigned back. The index is rebuilt
# so that the row labels stay consecutive if any rows were removed.
train = train.drop_duplicates().reset_index(drop=True)

# Checking the dimensions again
train.shape

No duplicates. Let's make sure that the test set has the dimensions that it is supposed to have.

In [None]:
# Checking the dimensions (rows, columns) of the test set
test.shape

We are missing 2 variables but these are the columns that we are after, so we are good. Next, let's take a first look at the data types in the training set.

In [None]:
# Listing the columns of the training set with their non-null counts and dtypes
train.info()

There are quite a few interval data as denoted by the data type **int64** and **float64**. There are also some categorical as denoted by the dtype **object**. This implies that later on we shall create dummy variables as we will see below. But first, let's turn object variables to category to let python know that these are categorical variables. 

In [None]:
# Columns that hold labels rather than measurements
categorical_columns = ['continent', 'country_code', 'Country_Region']

# Declaring them as pandas categoricals, first in the training set and then in the test set
for frame in (train, test):
    for column in categorical_columns:
        frame[column] = frame[column].astype('category')


## Data Management

To facilitate the data management, we'll store meta-information about the variables in a DataFrame. The method for the preparation of the metadata is mainly inspired by Bert Carremans' kernel from another competition: https://www.kaggle.com/bertcarremans/data-preparation-exploration. It's great how we can learn new things by participating in competitions on platforms such as Kaggle.

So all kudos for the technique of data management seen here go to Bert.

As for the metadata, the structure will be as follows:

**role**: input, ID, ConfirmedCases, Fatalities

**level**: nominal, interval, ordinal, inter (used for the two output columns)

**keep**: True or False

**dtype**: int, float, str

In [None]:
# Creating the meta data


def _variable_level(frame, name):
    """Return the measurement level stored in the metadata for column `name`.

    Name-based rules are checked first, dtype-based rules afterwards:

    * the two output columns and any name containing 'int' -> 'inter'
    * 'population'                                          -> 'interval'
    * the label columns, 'Id' and any name containing 'cat' -> 'nominal'
    * any other numeric column                              -> 'interval'
    * everything else (e.g. object columns)                 -> 'ordinal'

    Every column therefore receives a level of its own; nothing is carried over
    from the column that was processed before it.
    """
    if 'int' in name or name in ('ConfirmedCases', 'Fatalities'):
        return 'inter'
    if name == 'population':
        return 'interval'
    if 'cat' in name or name in ('continent', 'country_code', 'Country_Region', 'Id'):
        return 'nominal'
    # Covers every integer/float width, not only the platform default ones
    if pd.api.types.is_numeric_dtype(frame[name]):
        return 'interval'
    return 'ordinal'


## Something to store information
data = []

## Creating a loop
for f in train.columns:
    # Creating a Dict that contains all the metadata for the variable
    data.append({
        'varname': f,
        # The outputs, the identifier and the date use their own name as role
        'role': f if f in ('ConfirmedCases', 'Fatalities', 'Id', 'Date') else 'input',
        'level': _variable_level(train, f),
        # Keep is True for all variables except for the id
        'keep': f != 'Id',
        'dtype': train[f].dtype,
    })

meta_columns = ['varname', 'role', 'level', 'keep', 'dtype']

# Saving the meta-train data
meta = pd.DataFrame(data, columns=meta_columns).set_index('varname')

# Saving the meta-test data
# NOTE(review): this is built from the *training* columns, exactly like `meta`, so it
# also lists the two output columns. Confirm whether it should be derived from
# `test.columns` instead.
meta_test = pd.DataFrame(data, columns=meta_columns).set_index('varname')

Below the number of variables per role and level are displayed.

In [None]:
 # Displaying the metadata table (one row per variable)
 meta

Here is also a more concise version of the data types in the data set.

In [None]:
# Number of variables for every (role, level) combination in the training metadata
meta.groupby(['role', 'level']).size().reset_index(name='count')

Let's also look at the test set. This also helps verify that everything is okay with the test set as well.

In [None]:
# Number of variables for every (role, level) combination in the test metadata
meta_test.groupby(['role', 'level']).size().reset_index(name='count')

Great. Everything seems to be working fine. Let's proceed with the analysis.

## Descriptive statistics

### Interval Data

Let's start via looking at the distribution of the interval data first.

In [None]:
# Calling the interval data: variables with level 'interval' that are flagged to be kept
is_kept_interval = (meta['level'] == 'interval') & meta['keep']
v = meta.index[is_kept_interval]

# Summary statistics of those columns
train[v].describe()

First, it seems that the distributions of the variables in the dataset differ quite significantly. More precisely, the mean and the standard deviation differ by a large margin across variables. The min and max values vary widely as well. This suggests the need to scale the variables later on.

It is also clear that we have missing values (denoted by -1) in quite a few of the columns in the dataset. 

Let's see the quality of the data, how many values are missing from each variable.

In [None]:
# Initiating an empty list to store the names of the variables with missing values
vars_with_missing = []

# Going through every column of the training set and counting the -1 placeholders
n_rows = train.shape[0]
for f in train.columns:
    missings = (train[f] == -1).sum()
    if missings <= 0:
        continue

    vars_with_missing.append(f)
    missings_perc = missings/n_rows
    print('Variable {} has {} records ({:.2%}) with missing values'.format(f, missings, missings_perc))

print('In total, there are {} variables with missing values'.format(len(vars_with_missing)))

We have approximately 1% of observations missing from each column. Since I was the one who compiled information from various sources, I knew this already. One of the countries/regions in the given sets is the cruise ship "Diamond Princess" which obviously doesn't have any demographic information as a region as the rest of the countries/regions. There are also a couple other countries mainly from Africa for which the WHO did not have any available demographic information however, these observations do not constitute a large number of the observations. Hence, I decide to leave these observations untouched for now.

### Nominal Data

Let's look at the nominal data next. Let's start with the cardinality.


In [None]:
# Calling the nominal data: variables with level 'nominal' that are flagged to be kept
is_kept_nominal = (meta['level'] == 'nominal') & meta['keep']
v = meta.index[is_kept_nominal]

# Count, cardinality, most frequent value and its frequency for each of them
train[v].describe()

Quick observations:

* 188 distinct _**Country_Code**_ values
* 173 distinct _**Country_Region**_ values
* 5 distinct _**Continent**_ values
* America seems to be the continent with the most observations in the dataset. More precisely, America seems to capture appx 33% of the total observations.

The fact that we have fewer **Country_Region** values stems from the fact that some provinces are included in country_code and have their own distinct country code.

## Exploratory Data Visualization

### Nominal

Let's visualise the Confirmed cases and Fatalities per Country/Region. 

The code for this visualization comes from the Kee's kernel found in this link https://www.kaggle.com/keedong/covid19-exponential-model2-kee


In [None]:
# Daily totals per country. Only the two outcome columns are aggregated, so the
# categorical and text columns of `train` never reach sum().
df_now = (
    train.groupby(['Date', 'Country_Region'])[['ConfirmedCases', 'Fatalities']]
    .sum()
    .sort_values(['Country_Region', 'Date'])
    .reset_index()
)

# Day-over-day change, computed within each country so that the first day of one
# country is not compared with the last day of the previous one
df_now['New Cases'] = df_now.groupby('Country_Region')['ConfirmedCases'].diff()
df_now['New Fatalities'] = df_now.groupby('Country_Region')['Fatalities'].diff()

# Keeping only the most recent row of every country
df_now = df_now.groupby('Country_Region').tail(1).reset_index(drop = True)

# One row with two panels: top 10 countries by cases (left) and by fatalities (right)
fig = make_subplots(rows = 1, cols = 2)

df_now = df_now.sort_values('ConfirmedCases', ascending = False)
fig.add_bar(x=df_now['Country_Region'].head(10), y = df_now['ConfirmedCases'].head(10), row=1, col=1, name = 'Total cases')

df_now = df_now.sort_values('Fatalities', ascending=False)
fig.add_bar(x=df_now['Country_Region'].head(10), y = df_now['Fatalities'].head(10), row=1, col=2, name = 'Total Fatalities')

# Displaying the finished figure
fig

Total cases in the US are higher than in any other country in the world. In fatalities, however, Italy has the most, followed by Spain.

In [None]:
# Calling the nominal data
v = meta[(meta.level == 'nominal') & (meta.keep)].index

for f in v:
    # plt.subplots already creates the figure; an extra plt.figure() call would
    # leave an empty figure behind for every variable
    fig, ax = plt.subplots(figsize=(20,10))

    # Calculate the mean Fatalities per category value
    cat_perc = train[[f, 'Fatalities']].groupby([f],as_index=False).mean()
    cat_perc.sort_values(by='Fatalities', ascending=False, inplace=True)

    # Bar plot
    # Order the bars descending on the mean of Fatalities
    sns.barplot(ax=ax, x=f, y='Fatalities', data=cat_perc, order=cat_perc[f])
    ax.set_ylabel('Fatalities', fontsize=18)
    ax.set_xlabel(f, fontsize=18)
    ax.tick_params(axis='both', which='major', labelsize=18)
    plt.show()

    # Releasing the figure so that the loop does not accumulate open figures
    plt.close(fig)

It appears that Confirmed cases in Asia top the confirmed cases anywhere else but the Fatalities are more severe in Europe. As we saw above Italy and Spain play a major role to that.

### Interval

In [None]:
# Calling the interval data
v = meta[(meta.level == 'interval') & (meta.keep)].index

for f in v:
    # plt.subplots already creates the figure; an extra plt.figure() call would
    # leave an empty figure behind for every variable
    fig, ax = plt.subplots(figsize=(20,10))

    # Calculate the mean ConfirmedCases per distinct value of the variable
    cat_perc = train[[f, 'ConfirmedCases']].groupby([f],as_index=False).mean()
    cat_perc.sort_values(by='ConfirmedCases', ascending=False, inplace=True)

    # Bar plot
    # Order the bars descending on the mean of ConfirmedCases
    sns.barplot(ax=ax, x=f, y='ConfirmedCases', data=cat_perc, order=cat_perc[f])
    ax.set_ylabel('ConfirmedCases', fontsize=18)
    ax.set_xlabel(f, fontsize=18)
    ax.tick_params(axis='both', which='major', labelsize=18)
    plt.show()

    # Releasing the figure so that the loop does not accumulate open figures
    plt.close(fig)

We confirm our observation from the descriptive statistics part above that the distributions of the interval variables vary substantially across the board. We can also see here that the data seem to be right skewed, meaning we have some high positive values.

In [None]:
# Correlation matrix. The categorical and text columns are excluded explicitly:
# recent pandas versions raise on them instead of silently skipping them.
corrmat = train.select_dtypes(include=['number', 'bool']).corr()

# Creating the plot
cg = sns.clustermap(corrmat, cmap ="YlGnBu", linewidths = 0.1)

# Keeping the row labels of the heatmap horizontal
plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(), rotation = 0)

cg

In [None]:
# ConfirmedCases correlation matrix
# k : number of variables for heatmap
k = 30

# The k variables with the largest correlation to ConfirmedCases
cols = corrmat['ConfirmedCases'].nlargest(k).index

# Pairwise correlations between those variables (one variable per column)
cm = np.corrcoef(train[cols].to_numpy(), rowvar=False)

f, ax = plt.subplots(figsize =(12, 10))
tick_labels = cols.values
sns.heatmap(
    cm,
    ax = ax,
    cmap ="binary",
    linewidths = 0.1,
    yticklabels = tick_labels,
    xticklabels = tick_labels,
)

Clearly, there are high correlations amongst some variables. Will let the algorithm further below handle this when we are dealing with feature selection. 

 ## Feature Engineering

### Creating dummy variables

The values of the categorical variables do not represent any order or magnitude. For instance, category 2 is not twice the value of category 1. Therefore we can create dummy variables to deal with that. We drop the first dummy variable as this information can be derived from the other dummy variables generated for the categories of the original variable.

In [None]:
# Calling the nominal data
is_kept_nominal = (meta['level'] == 'nominal') & meta['keep']
v = meta.index[is_kept_nominal]

print('Before dummification we have {} variables in train'.format(train.shape[1]))

# One indicator column per category; the first category of every variable is dropped
# because it is implied by the remaining indicators
train = pd.get_dummies(train, columns=v, drop_first=True)

print('After dummification we have {} variables in train'.format(train.shape[1]))

Doing the same thing for test set.

In [None]:
# Calling the nominal data
v = meta_test[(meta_test.level == 'nominal') & (meta_test.keep)].index

# The messages report on the test set, so they name it accordingly
print('Before dummification we have {} variables in test'.format(test.shape[1]))
test = pd.get_dummies(test, columns=v, drop_first=True)
print('After dummification we have {} variables in test'.format(test.shape[1]))

Next, we raise the interval variables to **polynomial degree=2** and create interactions between variables. Thanks to the get_feature_names method we can assign column names to these new variables.

In [None]:
# Calling the interval data
v = meta[(meta.level == 'interval') & (meta.keep)].index
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)

# Squares and pairwise products of the interval variables
poly_values = poly.fit_transform(train[v])

# Newer scikit-learn releases expose get_feature_names_out and no longer ship
# get_feature_names; use whichever one this installation provides
name_features = getattr(poly, 'get_feature_names_out', None) or poly.get_feature_names

# Creating the df with the interactions. It carries the index of `train`, so the
# concat below pairs every row with its own interactions.
interactions = pd.DataFrame(data=poly_values, columns=name_features(list(v)), index=train.index)
interactions = interactions.drop(columns=v)  # Remove the original columns

# Concat the interaction variables to the train data
print('Before creating interactions we have {} variables in train'.format(train.shape[1]))
train = pd.concat([train, interactions], axis=1)
print('After creating interactions we have {} variables in train'.format(train.shape[1]))

Applying the same technique to the test set.

In [None]:
# Calling the interval data
v = meta_test[(meta_test.level == 'interval') & (meta_test.keep)].index
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)

# Squares and pairwise products of the interval variables
poly_values = poly.fit_transform(test[v])

# Newer scikit-learn releases expose get_feature_names_out and no longer ship
# get_feature_names; use whichever one this installation provides
name_features = getattr(poly, 'get_feature_names_out', None) or poly.get_feature_names

# Creating the df with the interactions. It carries the index of `test`, so the
# concat below pairs every row with its own interactions.
interactions = pd.DataFrame(data=poly_values, columns=name_features(list(v)), index=test.index)
interactions = interactions.drop(columns=v)  # Remove the original columns

# Concat the interaction variables to the test data
print('Before creating interactions we have {} variables in test'.format(test.shape[1]))
test = pd.concat([test, interactions], axis=1)
print('After creating interactions we have {} variables in test'.format(test.shape[1]))

Making sure that the dataset contains no NA values.

In [None]:
# Dropping NA values. The index is rebuilt afterwards so that the row labels are
# consecutive again and agree with row positions.
train = train.dropna().reset_index(drop=True)

# Verifying that no N/A values exist
train.isnull().sum().sum()

No NA values. Let's move on with Feature selection.

## Feature selection

Personally, I prefer to let the classifier algorithm choose which features to keep, as I find it more robust. Here we use RandomForest to do the job. But there is one thing that we can do ourselves: removing features with no or very low variance. Sklearn has a handy method to do that: VarianceThreshold.

### VarianceThreshold

By default it removes features with zero variance. This will be really helpful here as we will see below that there are quite a few zero-variance variables. If we choose to remove features with less than 1% variance, we remove 346 variables as seen below.

In [None]:
# Setting the variance threshold
selector = VarianceThreshold(threshold=.01)

# Train without the variables we need for submitting (built once, used twice)
candidate_features = train.drop(['Id', 'ConfirmedCases','Fatalities','Date'], axis=1)
selector.fit(candidate_features)

# finding variables with lower variance than threshold: get_support() marks the
# features that pass, so its negation marks the ones that do not
v = candidate_features.columns[~selector.get_support()]
print('{} variables have too low variance.'.format(len(v)))

Training the RandomForest.

In [None]:
# Getting the train and labels
X_train = train.drop(['Id', 'ConfirmedCases','Fatalities','Date'], axis=1)
y_train = train['Fatalities']

# Getting the columns
feat_labels = X_train.columns

# Fitting a Random Forest Classifier
# NOTE(review): the target is a count (Fatalities), which the classifier treats as
# one class per distinct value -- confirm that a classifier is intended here.
rf = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1)

rf.fit(X_train, y_train)

# Getting the importances calculated from the RFC
importances = rf.feature_importances_

# Sorting the variables by importance (most important first)
indices = np.argsort(importances)[::-1]

# Showing the importances of the 20 most important variables, or of all of them
# when fewer than 20 are available
for f in range(min(20, len(indices))):
    print("%2d) %-*s %f" % (f + 1, 30,feat_labels[indices[f]], importances[indices[f]]))

In [None]:
# Using the already fitted forest as selector: the median feature importance is the cut-off
sfm = SelectFromModel(rf, threshold='median', prefit=True)
print('Number of features before selection: {}'.format(X_train.shape[1]))

# Throwing away all the variables which fall below the threshold level specified above
X_selected = sfm.transform(X_train)
n_features = X_selected.shape[1]
print('Number of features after selection: {}'.format(n_features))

# Creating a list with the selected variables
selected_vars = feat_labels[sfm.get_support()].tolist()

In [None]:
# Forming the final training set based on the feature selection
train = train[selected_vars + ['ConfirmedCases','Fatalities','Date']]

# Applying the selected variables to the test set as well
test = test[selected_vars + ['Date']]

# Real copies: a plain assignment would only create a second name for the same
# DataFrame, so later changes to the copy would also change `train` / `test`
train_copy = train.copy()
test_copy = test.copy()

## Feature normalization

In the last step prior to fitting a model, there are 2 things that remain to be done:
1. Encode the **Date** variable
2. Scale all numerical variables

The problem with the former is that the training set and the test set have a different number of observations and different dates in each set. Thus, if we apply the **OneHotEncoder** to each set separately, the two sets end up with different columns, which wouldn't work for modelling since both datasets need exactly the same columns for prediction. A workaround is applied to make the two column sets equal.

In [None]:
# Creating a copy of the training and test sets (.copy() is required; a plain
# assignment would only alias the same DataFrame)
train_unscaled = train_copy.copy()
test_unscaled = test_copy.copy()

Before proceeding with encoding, we split the training set to training and validation sets using **Stratified Sampling** based on the population column which as we saw earlier is skewed. 

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# A fixed random_state makes the split identical on every run of the notebook
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)

# split() yields row *positions*, so the rows are taken with iloc; loc would look
# the numbers up as index labels, which breaks once rows have been dropped
train_index, test_index = next(split.split(train_unscaled, train_unscaled["population"]))
train_strat = train_unscaled.iloc[train_index]
valid_strat = train_unscaled.iloc[test_index]

# Separating the two outputs from the inputs, for the training part ...
y_train = train_strat[['ConfirmedCases','Fatalities']]
x_train = train_strat.drop(['ConfirmedCases','Fatalities'], axis=1)

# ... and for the validation part
y_valid = valid_strat[['ConfirmedCases','Fatalities']]
x_valid = valid_strat.drop(['ConfirmedCases','Fatalities'], axis=1)

In [None]:
# Creating copies of the datasets (.copy() is required; a plain assignment would
# only alias the same DataFrame)
train_1 = x_train.copy()
test_1 = test_unscaled.copy()
valid1 = x_valid.copy()

In [None]:
## Start encoding

## Combining the 3 sets. Every set is tagged with a distinct number in a helper
## column so that it can be recovered afterwards; assign() returns tagged copies,
## which leaves the three input frames untouched.
combined = pd.concat([
    train_1.assign(train_1=2),
    valid1.assign(train_1=1),
    test_1.assign(train_1=0),
])

# Getting dummies from the combined dataset, so that all three sets receive the same
# date columns. The dtype is pinned to an integer type: boolean dummies would not be
# picked up by a later selection of numeric columns.
df = pd.get_dummies(combined['Date'], dtype='uint8')

# Concatenating the dummy set with the combined set
combined = pd.concat([combined, df], axis = 1)

## Forming the 3 end sets using the distinct numbers that we initially set and
## removing the helper column again
train_df = combined[combined["train_1"] == 2].drop(columns=["train_1"])
valid_df = combined[combined["train_1"] == 1].drop(columns=["train_1"])
test_df = combined[combined["train_1"] == 0].drop(columns=["train_1"])

Okay, now we are ready to encode the categorical variables (**Date**) and standardise the data. Doing both at the same time for all three sets (x_train, x_valid, test) using the handy tool called **pipeline** .

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Splitting between numerical and other variables
train_num = train_df.select_dtypes(include=["number"])
train_cat = train_df.select_dtypes(exclude=["number"])

# Creating a pipeline
num_pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
])

## Getting the numerical and categorical variables
num_attribs = list(train_num)
cat_attribs = list(train_cat)

# Only the numerical columns are passed on; the remaining columns are dropped by
# the ColumnTransformer
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
])

# Applying the transformation. The scaler is fitted on the training set only and
# then re-used unchanged, so that the validation and the test set are put on the
# same scale as the data the models are trained on.
x_train = full_pipeline.fit_transform(train_df)
x_valid = full_pipeline.transform(valid_df)
test_pip = full_pipeline.transform(test_df)

In [None]:
# Showing the first rows of the encoded test set instead of the whole frame
test_df.head()

In [None]:
# Checking the dimensions (rows, columns) of the transformed test set
test_pip.shape

# Model Fitting

## Model 1 - Decision Tree Regressor

Starting the model fitting part with a **DecisionTreeRegressor**. DecisionTreeRegressor is one of the most powerful algorithms there are, mainly because of its ability to fit both parametric and non-parametric data.

While fitting the model, will use 10-fold cross-validation with 3 repeats. As a performance metric, I use **Root Mean Squared Log Error** as required from the competition.

In [None]:
from numpy import absolute, mean, std
from sklearn.metrics import mean_squared_log_error, make_scorer
from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, RepeatedKFold

np.random.seed(9)

# Wrapping the mean squared log error so that Scikit-learn can use it in cross-validation;
# greater_is_better=False makes the scorer report the negated error
scorer = make_scorer(mean_squared_log_error, greater_is_better=False)

# define model
model = DecisionTreeRegressor()

# evaluate model: 10 folds, repeated 3 times
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
fold_scores = cross_val_score(model, x_train, y_train, scoring=scorer, cv=cv, n_jobs=1)

# summarize performance: undo the sign flip, then take the root of every fold's error
n_scores = np.sqrt(absolute(fold_scores))
print('Result: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

The model performed well. Let's evaluate it on the validation set.

In [None]:
np.random.seed(15)

# Fitting the model on the training set and predicting the validation set
y_pred = model.fit(x_train, y_train).predict(x_valid)

# Calculating the loss: root of the mean squared log error on the validation set
msle_valid = mean_squared_log_error(y_valid, y_pred)
loss = np.sqrt(msle_valid)
print(loss)

## Model 2 - Deep Neural Network using Dropout

A Neural network model can be a good option for the purposes of this competition. The algorithm is extremely useful in finding patterns that are too complex for being manually extracted and taught to recognize to the machine. So let’s fit a simple DNN with a small dropout rate.

For activation function in the hidden layers, **selu** is being used in order to avoid the **vanishing/exploding gradients** problem. For further explanation about the vanishing/exploding gradients problem feel free to see to this article https://www.semanticscholar.org/paper/Understanding-the-exploding-gradient-problem-Pascanu-Mikolov/c5145b1d15fea9340840cc8bb6f0e46e8934827f. 

Using selu as activation functions also leads to self-regularization which is good. Since we are using selu activation, one of the conditions for selu to work is to use **LeCun initialization**. For further information about selu activation function feel free to read this article https://arxiv.org/pdf/1804.02763.pdf

As for the output layer, **ReLU** is being used in order to ensure we only get non-negative values. Lastly, I am adding a **learning rate scheduler**, namely **ReduceLROnPlateau**, which lowers the learning rate of the algorithm when the validation loss does not improve for 5 epochs.

In [None]:
from tensorflow import keras
from functools import partial
from sklearn.model_selection import KFold
import tensorflow as tf


# Adding early stopping rules, checkpoint rules and Learning scheduling to improve the learning rate.
checkpoint_cb = keras.callbacks.ModelCheckpoint("keras_model_assign_2.h5",save_best_only = True) # saves the model whenever the monitored validation loss improves
early_stopping_cb = keras.callbacks.EarlyStopping( patience = 10, restore_best_weights=True) # Early stopping rule while preserving best weights
lr_scheduler = keras.callbacks.ReduceLROnPlateau(factor = 0.5, patience=5) # halves the learning rate when the validation score doesn't improve for 5 rounds


def make_sgd_optimizer():
    """Return a new SGD optimizer with Nesterov momentum.

    Every model gets an optimizer of its own: a shared instance would carry its
    momentum state and the learning rate already lowered by the scheduler over
    from one fold to the next.
    """
    return keras.optimizers.SGD(learning_rate = 0.001, momentum = 0.9, nesterov=True)


# Module-level optimizer, kept because the later model cells refer to it
optimizer = make_sgd_optimizer()

seed = 7
cvscores = []

# Converting the train set to array for indexing
X = np.array(x_train)
Y = np.array(y_train)

X_valid = np.array(x_valid)
Y_valid = np.array(y_valid)

# Seeding both numpy and TensorFlow (weight initialisation, dropout, shuffling)
np.random.seed(seed)
tf.random.set_seed(seed)

# define 5-fold cross validation test harness
kfold = KFold(n_splits=5, shuffle=True, random_state=1)

# The loop variables have names of their own so that the `train` and `test`
# DataFrames are not overwritten by index arrays
for train_idx, test_idx in kfold.split(X, Y):
    # create model
    model2 = tf.keras.models.Sequential([
        keras.layers.Flatten(input_shape = x_train.shape[1:]),
        keras.layers.Dense(100, activation = "selu", kernel_initializer = "lecun_normal"),
        keras.layers.Dense(100, activation = "selu", kernel_initializer = "lecun_normal"),
        keras.layers.Dense(100, activation = "selu", kernel_initializer = "lecun_normal"),
        keras.layers.Dense(50, activation = "selu", kernel_initializer = "lecun_normal"),
        keras.layers.Dropout(rate = 0.1),
        keras.layers.Dense(2, activation = "relu", kernel_initializer = "he_normal")
    ])

    # Compiling the model with a fresh optimizer for this fold
    model2.compile(loss="mean_squared_logarithmic_error",
                   optimizer = make_sgd_optimizer())

    # Fitting the model
    history_2 = model2.fit(X[train_idx], Y[train_idx], epochs=200, verbose=0,
                           validation_data = (X_valid, Y_valid),
                           callbacks = [checkpoint_cb, early_stopping_cb, lr_scheduler])

    # Evaluating the model on the held-out fold
    y_pred = model2.predict(X[test_idx])
    loss = np.sqrt(mean_squared_log_error( Y[test_idx], y_pred ))
    cvscores.append(loss)

print("Scores:",cvscores)
print("Mean:",np.mean(cvscores))
print("Standard Deviation:",np.std(cvscores))

Looking at the scores from every iteration, it appears that as the training set is shuffled in each iteration, the performance of the model deteriorates.

# Model 3 - DNN using MC Dropout

Instead of using Dropout, let's try and use Monte Carlo Dropout. MC Dropout attempts to mitigate the problem of representing model uncertainty without sacrificing either computational complexity or test accuracy so let's give it a try.

In [None]:
checkpoint_cb_2 = keras.callbacks.ModelCheckpoint("keras_model2_assign_2.h5",
                                               save_best_only = True) # saves the model whenever the monitored validation loss improves


# defining Monte Carlo Dropout layers
class MCDropout(keras.layers.Dropout):
    """Dropout layer that always runs in training mode.

    The parent implementation is called with training=True, so units are also
    dropped when the model is used for prediction.
    """

    def call (self,inputs):
        return super().call(inputs, training = True)


seed = 7
cvscores = []

# Seeding both numpy and TensorFlow (weight initialisation, dropout, shuffling)
np.random.seed(seed)
tf.random.set_seed(seed)

# define 5-fold cross validation test harness
kfold = KFold(n_splits=5, shuffle=True, random_state=1)

# Initiating K-Fold Cross-Validation while fitting the model. The loop variables have
# names of their own so that the `train` and `test` DataFrames are not overwritten.
for train_idx, test_idx in kfold.split(X, Y):
    # create model
    model3 = tf.keras.models.Sequential([
        keras.layers.Flatten(input_shape = x_train.shape[1:]),
        keras.layers.Dense(100, activation = "selu", kernel_initializer = "lecun_normal"),
        keras.layers.Dense(100, activation = "selu", kernel_initializer = "lecun_normal"),
        keras.layers.Dense(100, activation = "selu", kernel_initializer = "lecun_normal"),
        keras.layers.Dense(50, activation = "selu", kernel_initializer = "lecun_normal"),
        MCDropout(rate =0.1),
        keras.layers.Dense(2, activation = "relu", kernel_initializer = "he_normal")
    ])

    # A fresh optimizer for every fold: a shared instance would carry its momentum
    # state and the learning rate lowered by the scheduler over to the next fold
    fold_optimizer = keras.optimizers.SGD(learning_rate = 0.001, momentum = 0.9, nesterov=True)

    # Compiling the model
    model3.compile(loss="mean_squared_logarithmic_error",
                   optimizer = fold_optimizer)

    # Fitting the model
    history_3 = model3.fit(X[train_idx], Y[train_idx], epochs=200, verbose=0,
                           validation_data = (X_valid, Y_valid),
                           callbacks = [early_stopping_cb, checkpoint_cb_2, lr_scheduler])

    # Evaluating the model on the held-out fold
    y_pred = model3.predict(X[test_idx])
    loss = np.sqrt(mean_squared_log_error( Y[test_idx], y_pred ))
    cvscores.append(loss)

print("Scores:",cvscores)
print("Mean:",np.mean(cvscores))
print("Standard Deviation:",np.std(cvscores))

Adding an MCDropout layer improved the model's performance; however, the same problem as with the first model persists. Let's fit a model that requires less tuning of hyperparameters and see how it performs.

# Model 4 - DNN using Batch Normalisation

Batch Normalization makes the networks much less sensitive to the weight initialization. The drawback is that we are adding extra computation at each layer which makes the model slower to converge and predict. Also, I choose to retain the Monte Carlo Dropout layer since it appears to improve the performance.

In [None]:
# Creating a new checkpoint for a new model
checkpoint_3_cb = keras.callbacks.ModelCheckpoint("keras_model3_assign_2.h5",
                                               save_best_only = True) # saves the model whenever the monitored validation loss improves

# Setting the model
seed = 7
cvscores = []

# Seeding both numpy and TensorFlow (weight initialisation, dropout, shuffling)
np.random.seed(seed)
tf.random.set_seed(seed)

# define 5-fold cross validation test harness
kfold = KFold(n_splits=5, shuffle=True, random_state=1)

# Initiating K-Fold Cross-Validation while fitting the model. The loop variables have
# names of their own so that the `train` and `test` DataFrames are not overwritten.
for train_idx, test_idx in kfold.split(X, Y):
    # create model
    # NOTE(review): every hidden Dense layer already applies selu and is followed by
    # BatchNormalization plus a second selu Activation -- confirm that applying the
    # activation twice is intended.
    model_4 = keras.models.Sequential([
        keras.layers.Flatten(input_shape = x_train.shape[1:]),
        keras.layers.BatchNormalization(),
        keras.layers.Dense(100, activation = "selu", kernel_initializer = "lecun_normal"),
        keras.layers.BatchNormalization(),
        keras.layers.Activation("selu"),
        keras.layers.Dense(100, activation = "selu", kernel_initializer = "lecun_normal"),
        keras.layers.BatchNormalization(),
        keras.layers.Activation("selu"),
        keras.layers.Dense(100, activation = "selu", kernel_initializer = "lecun_normal"),
        keras.layers.BatchNormalization(),
        keras.layers.Activation("selu"),
        keras.layers.Dense(50, activation = "selu", kernel_initializer = "lecun_normal"),
        keras.layers.BatchNormalization(),
        keras.layers.Activation("selu"),
        MCDropout(rate =0.15),
        keras.layers.Dense(2, activation = "relu", kernel_initializer = "he_normal")
    ])

    # A fresh optimizer for every fold: a shared instance would carry its momentum
    # state and the learning rate lowered by the scheduler over to the next fold
    fold_optimizer = keras.optimizers.SGD(learning_rate = 0.001, momentum = 0.9, nesterov=True)

    # Compiling the model
    model_4.compile(loss="mean_squared_logarithmic_error",
                    optimizer = fold_optimizer)

    # Fit the model
    history_4 = model_4.fit(X[train_idx], Y[train_idx], epochs=200, verbose=0,
                            validation_data = (X_valid, Y_valid),
                            callbacks = [early_stopping_cb, checkpoint_3_cb, lr_scheduler])

    # evaluate the model on the held-out fold
    y_pred = model_4.predict(X[test_idx])
    loss = np.sqrt(mean_squared_log_error( Y[test_idx], y_pred ))
    cvscores.append(loss)

print("Scores:",cvscores)
print("Mean:",np.mean(cvscores))
print("Standard Deviation:",np.std(cvscores))

# Fine Tuning

The two better models in terms of performance on the validation set are the **DecisionTreeRegressor** and the **DNN with Dropout layer** (model2). To decide which one to use, let's optimize the Decision Tree Regressor using **RandomizedSearchCV** and test them both again on the validation set.

Through trial and error, I found that **min_samples_split** values above 50 lead to overfitting, so I limit this parameter to values below 50. I also restrict **max_depth** to the range 90:155 because, again through trial and error, values in that range lead to a smaller generalization error.

In [None]:
from scipy.stats import reciprocal
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for the two hyperparameters that are being tuned
samples_split = range(25,50)
max_depth = range(90,155)
parameters = dict(min_samples_split=samples_split, max_depth=max_depth)

seed = 7

# 100 random parameter combinations, each one evaluated with 3-fold cross-validation
rnd_search_cv = RandomizedSearchCV(
    estimator=model,
    param_distributions=parameters,
    n_iter=100,
    cv=3,
    scoring=scorer,
    random_state=0,
)

rnd_search_cv.fit(x_train, y_train)

Let's see which values were trialled during Randomized Search CV.

In [None]:
# Collecting the results
cvres = rnd_search_cv.cv_results_

# Printing every parameter combination that was tried next to its mean test score
# (the sign is flipped back because the scorer reports negated errors)
for params, mean_score in zip(cvres["params"], -cvres["mean_test_score"]):
    print(mean_score,params)

In [None]:
## Best parameters from optimization (the combination with the best mean score in the randomized search)
rnd_search_cv.best_params_

Let's compare.

In [None]:
# Forming the final model: the search's best estimator
optimized_dtc = rnd_search_cv.best_estimator_

# Getting predictions of both candidates on the validation set
opti_dtc_final_predictions = optimized_dtc.predict(x_valid)
model2_final_predictions = model2.predict(x_valid)

# Root mean squared log error of each candidate
dtc_loss, model2_loss = (
    np.sqrt(mean_squared_log_error(y_valid, predictions))
    for predictions in (opti_dtc_final_predictions, model2_final_predictions)
)

print("DTC Score:",dtc_loss)
print("DNN Score:",model2_loss)

The DNN model performs better, hence this will be the final model.

## Output

All done, creating the output file.

In [None]:
# Getting predictions from the selected network. `model2` is the fitted model;
# `model2_loss` is only its validation score and has no predict method.
final_predictions = model2.predict(test_pip)

# Creating the final submission file. The predictions are rounded to the nearest
# whole number *before* the integer conversion (converting first would simply cut
# off the decimals).
sub = pd.DataFrame(
    np.rint(final_predictions).astype(int),
    columns=["ConfirmedCases", "Fatalities"],
)

# Forecast ids are numbered from 1 in the row order of the test set
sub.insert(0, 'ForecastId', range(1, len(sub) + 1))

sub.to_csv("submission.csv", index=False)