In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import itertools
import random as rnd

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sklearn as sk
from tqdm.notebook import tqdm

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load Data & Preprocess

In [None]:
# Columns kept from the raw CSV, in the order they appear in full_table.
keptColumns = [
    # intervention / policy indicators
    "SCHOOL", "WORK", "EVENTS", "GATHERINGS", "TRANSPORTATION", "ATHOME",
    "NATIONAL", "INTERNATIONAL", "INFORMATION", "TESTING", "TRACING",
    # row identification
    "DAY", "COUNTRY",
    # country-level descriptors
    "OLD", "YOUNG", "URBAN", "DENSITY", "POPULATION", "PHYSICIANS", "BEDS",
    # epidemic measurements
    "REFF", "INFECTED", "INFECTEDINCREASE", "MORTALITY", "FATALITYINCREASE",
    # weather
    "TEMPERATURE", "HUMIDITY", "WIND",
    # remaining identifiers
    "CONTINENT", "DATE",
]

# Read the semicolon-separated file, remove the "Unnamed: 0" column, keep only
# the columns listed above and discard every row that has a missing value.
full_table = (
    pd.read_csv("/kaggle/input/covid19factorsimpact/fullCOVIDtable.csv", sep=";")
    .drop(columns="Unnamed: 0")
    .loc[:, keptColumns]
    .dropna()
)

full_table

In [None]:
# Count the rows with a REFF value for each country and display the countries
# that have fewer than 30 of them. This cell only inspects the data: nothing is
# removed from full_table here.

daysPerCountry = full_table.groupby("COUNTRY")["REFF"].count()
hasFewDays = daysPerCountry < 30
daysPerCountry[hasFewDays]

In [None]:
# Time-varying columns for which lagged copies ("<name>-<days>") are created.
factorsTS = [
    "SCHOOL",
    "WORK",
    "EVENTS",
    "GATHERINGS",
    "TRANSPORTATION",
    "ATHOME",
    "NATIONAL",
    "INTERNATIONAL",
    "INFORMATION",
    "TESTING",
    "TRACING",
    "INFECTED",
    "INFECTEDINCREASE",
    "MORTALITY",
    "FATALITYINCREASE",
    "TEMPERATURE",
    "HUMIDITY",
    "WIND",
]

# Group once: the grouping by COUNTRY is the same for every factor and lag.
countryGroups = full_table.groupby("COUNTRY")

# Collect all lag columns first and attach them in one step. Inserting the
# 18 * 30 columns one at a time fragments the DataFrame and makes pandas emit a
# PerformanceWarning.
# NOTE(review): shift() works on row position within each country, so this
# assumes the rows of a country are already in chronological order - confirm.
laggedColumns = {}
for factor in factorsTS:
    for daysInPast in range(1, 31):
        laggedColumns[factor + "-" + str(daysInPast)] = countryGroups[factor].shift(daysInPast)

laggedTable = pd.concat(
    list(laggedColumns.values()),
    axis=1,
    keys=list(laggedColumns),
)

# Dropping lag columns left over from a previous execution keeps the cell safe
# to re-run (the original per-column assignment simply overwrote them).
full_table = pd.concat(
    [full_table.drop(columns=laggedTable.columns, errors="ignore"), laggedTable],
    axis=1,
)

full_table

In [None]:
def binReff(reff):
    """Map a REFF value to an integer class.

    Returns 3 for values above 2, 2 for values in [1.5, 2], 1 for values in
    [1, 1.5) and 0 for everything else (values below 1, and NaN because every
    comparison with NaN is false).
    """
    if reff > 2:
        return 3
    if reff >= 1.5:
        return 2
    if reff >= 1:
        return 1
    return 0


def binMortality(mort):
    """Map a mortality value to the label of its bin.

    The label is the upper bound of the first bin that contains the value
    (1, 5 or 10); values above 10 are labelled 15. A value that satisfies none
    of the comparisons (NaN) is labelled 0.
    """
    for upperBound in (1, 5, 10):
        if mort <= upperBound:
            return upperBound
    if mort > 10:
        return 15
    return 0

In [None]:
# Derive the two classification targets by binning REFF and MORTALITY value by value.
for targetColumn, sourceColumn, binner in (
    ("REFFBINNED", "REFF", binReff),
    ("MORTALITYBINNED", "MORTALITY", binMortality),
):
    full_table[targetColumn] = full_table[sourceColumn].map(binner)

In [None]:
# Write the prepared table (index included) as a semicolon-separated file.
full_table.to_csv(path_or_buf="preppedData.csv", sep=";")

In [None]:
# Seed the generator so the shuffled order, and therefore the split, is repeatable.
rnd.seed(574638)

# Distinct countries in order of first appearance.
countryList = full_table["COUNTRY"].unique().tolist()
print(countryList)

rnd.shuffle(countryList)
print("\nShuffled country list :")
print(countryList)

# First 20 shuffled countries -> test, next 10 -> validation, the rest -> training.
testCountries, validationCountries, trainingCountries = (
    countryList[:20],
    countryList[20:30],
    countryList[30:],
)

In [None]:
# Same-day and static columns used as model inputs.
factors = [
    "SCHOOL",
    "WORK",
    "EVENTS",
    "GATHERINGS",
    "TRANSPORTATION",
    "ATHOME",
    "NATIONAL",
    "INTERNATIONAL",
    "INFORMATION",
    "TESTING",
    "TRACING",
    "DAY",
    "OLD",
    "YOUNG",
    "URBAN",
    "DENSITY",
    "POPULATION",
    "PHYSICIANS",
    "BEDS",
    "INFECTED",
    "INFECTEDINCREASE",
    "FATALITYINCREASE",
    "TEMPERATURE",
    "HUMIDITY",
    "WIND"
]

# Append the lagged column names "<factor>-<days>" of every time-series factor.
# FIX: the original loop iterated with `factorTS` but built the name from
# `factor`, a global left over from an earlier cell's loop, so the same 30
# names were appended once per time-series factor and the lags of the other
# factors never reached the models.
factors += [
    factorTS + "-" + str(daysInPast)
    for factorTS in factorsTS
    for daysInPast in range(1, 31)
]


def _buildSplit(countries):
    """Build the data set and model matrices for a group of countries.

    Selects the rows of full_table whose COUNTRY is in `countries`, drops rows
    with any missing value (which includes rows whose lag columns are not yet
    filled) and returns, in order:
    the filtered DataFrame, the propagation inputs (factors plus MORTALITY),
    the mortality inputs (factors only), the REFFBINNED labels and the
    MORTALITYBINNED labels, the last four as NumPy arrays.
    """
    subset = full_table[full_table["COUNTRY"].isin(countries)].dropna()
    return (
        subset,
        subset.loc[:, factors + ["MORTALITY"]].to_numpy(),
        subset.loc[:, factors].to_numpy(),
        subset["REFFBINNED"].to_numpy(),
        subset["MORTALITYBINNED"].to_numpy(),
    )


(testSet,
 X_testPropagation,
 X_testMortality,
 Y_testPropagation,
 Y_testMortality) = _buildSplit(testCountries)

(validationSet,
 X_valPropagation,
 X_valMortality,
 Y_valPropagation,
 Y_valMortality) = _buildSplit(validationCountries)

(trainingSet,
 X_trainPropagation,
 X_trainMortality,
 Y_trainPropagation,
 Y_trainMortality) = _buildSplit(trainingCountries)



In [None]:
# Display (rows, columns) of the propagation training matrix.
np.shape(X_trainPropagation)

# Decision trees

In [None]:
from sklearn import tree

In [None]:
# Candidate values for each decision-tree hyperparameter; the grid search
# evaluates every combination.
hyperTreeParamGrid = dict(
    max_depth=[None, 3, 5, 10, 15, 30],
    min_samples_split=[2, 3, 5, 7, 10, 20, 50],
    min_samples_leaf=[1, 5, 10, 20, 50],
    max_leaf_nodes=[None, 2, 5, 10, 15],
    criterion=["gini", "entropy"],
)

In [None]:
from tqdm.notebook import tqdm

In [None]:
# Exhaustive grid search over hyperTreeParamGrid: train one decision tree per
# combination on the training set and record its accuracy on the training and
# validation sets.
treeParamNames = [
    "max_depth",
    "min_samples_split",
    "min_samples_leaf",
    "max_leaf_nodes",
    "criterion",
]

# Same visiting order as nested loops with the last name varying fastest.
treeParamCombos = list(
    itertools.product(*(hyperTreeParamGrid[name] for name in treeParamNames))
)

# Rows are gathered in a list and turned into a DataFrame once at the end:
# DataFrame.append no longer exists in pandas 2.x and copied the whole frame
# on every call.
treeRecords = []
for combo in tqdm(treeParamCombos):  # progress total follows the grid size
    params = dict(zip(treeParamNames, combo))

    clf = tree.DecisionTreeClassifier(**params)
    clf = clf.fit(X_trainPropagation, Y_trainPropagation)
    train_score = clf.score(X_trainPropagation, Y_trainPropagation)
    val_score = clf.score(X_valPropagation, Y_valPropagation)

    treeRecords.append({**params, "train_score": train_score, "val_score": val_score})

# dtype=object keeps the None entries of max_depth / max_leaf_nodes as None
# (a numeric column would turn them into NaN); the scores are then cast to float.
treeClassifiers = pd.DataFrame(
    treeRecords,
    columns=treeParamNames + ["train_score", "val_score"],
    dtype=object,
).astype({"train_score": float, "val_score": float})

treeClassifiers

In [None]:
# Persist the decision-tree grid-search results.
treeClassifiers.to_csv(path_or_buf="trainedTrees.csv")

In [None]:
# Rows that reach the highest validation accuracy; ties are resolved by taking
# the smallest index label.
bestTreeValScore = treeClassifiers["val_score"].max()
topTreeParams = treeClassifiers[treeClassifiers["val_score"] == bestTreeValScore]
firstBest = min(topTreeParams.index)

# Build a fresh tree from the selected hyperparameters and train it again.
clf = tree.DecisionTreeClassifier(**{
    paramName: topTreeParams.loc[firstBest, paramName]
    for paramName in (
        "max_depth",
        "min_samples_split",
        "min_samples_leaf",
        "max_leaf_nodes",
        "criterion",
    )
})
clf = clf.fit(X_trainPropagation, Y_trainPropagation)

# Accuracy on the training, validation and test sets, printed one per line.
train_score, val_score, test_score = (
    clf.score(features, labels)
    for features, labels in (
        (X_trainPropagation, Y_trainPropagation),
        (X_valPropagation, Y_valPropagation),
        (X_testPropagation, Y_testPropagation),
    )
)
print(train_score, val_score, test_score, sep="\n")

# Ensemble Methods

## Gradient Boosted Trees

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Candidate values for the gradient-boosting grid search; only n_estimators
# has more than one candidate.
# NOTE(review): newer scikit-learn releases call this loss "log_loss" and no
# longer accept "deviance" - confirm against the installed version.
hyperGradientParamGrid = dict(
    loss=["deviance"],
    n_estimators=[10, 20, 25, 50],
    min_samples_split=[2],
    min_samples_leaf=[50],
    max_leaf_nodes=[None],
)

In [None]:
# Bufort suggests xgboost, catboost
# Grid search over hyperGradientParamGrid: train one gradient-boosting
# classifier per combination and record its accuracy on the training and
# validation sets.
gradParamNames = [
    "loss",
    "n_estimators",
    "min_samples_split",
    "min_samples_leaf",
    "max_leaf_nodes",
]

# Same visiting order as nested loops with the last name varying fastest.
gradParamCombos = list(
    itertools.product(*(hyperGradientParamGrid[name] for name in gradParamNames))
)

# Rows are gathered in a list and turned into a DataFrame once at the end:
# DataFrame.append no longer exists in pandas 2.x and copied the whole frame
# on every call.
gradientRecords = []
for combo in tqdm(gradParamCombos):  # progress total follows the grid size
    params = dict(zip(gradParamNames, combo))

    clf = GradientBoostingClassifier(**params)
    clf = clf.fit(X_trainPropagation, Y_trainPropagation)
    train_score = clf.score(X_trainPropagation, Y_trainPropagation)
    val_score = clf.score(X_valPropagation, Y_valPropagation)

    gradientRecords.append({**params, "train_score": train_score, "val_score": val_score})

# dtype=object keeps the None entries of max_leaf_nodes as None (a numeric
# column would turn them into NaN); the scores are then cast to float.
gradientClassifiers = pd.DataFrame(
    gradientRecords,
    columns=gradParamNames + ["train_score", "val_score"],
    dtype=object,
).astype({"train_score": float, "val_score": float})

gradientClassifiers

In [None]:
# Persist the gradient-boosting grid-search results.
gradientClassifiers.to_csv(path_or_buf="trainedPropagationGradBoost.csv")

In [None]:
# Rows that reach the highest validation accuracy; ties are resolved by taking
# the smallest index label.
bestGradValScore = gradientClassifiers["val_score"].max()
topGradParams = gradientClassifiers[gradientClassifiers["val_score"] == bestGradValScore]
firstBest = min(topGradParams.index)

# Build a fresh classifier from the selected hyperparameters and train it again.
clf = GradientBoostingClassifier(**{
    paramName: topGradParams.loc[firstBest, paramName]
    for paramName in (
        "loss",
        "n_estimators",
        "min_samples_split",
        "min_samples_leaf",
        "max_leaf_nodes",
    )
})
clf = clf.fit(X_trainPropagation, Y_trainPropagation)

# Accuracy on the training, validation and test sets, printed one per line.
train_score, val_score, test_score = (
    clf.score(features, labels)
    for features, labels in (
        (X_trainPropagation, Y_trainPropagation),
        (X_valPropagation, Y_valPropagation),
        (X_testPropagation, Y_testPropagation),
    )
)
print(train_score, val_score, test_score, sep="\n")

# Multi Layer Perceptron

In [None]:
from sklearn.neural_network import MLPClassifier
# bufort suggests tabnet https://pypi.org/project/tabnet/

In [None]:
# Number of columns (input features) of the propagation training matrix.
nFeats = X_trainPropagation.shape[-1]
print(nFeats)

In [None]:
# Every hidden layer gets as many units as there are input features.
inLayerN = X_trainPropagation.shape[1]

# Candidate values for the MLP grid search.
hyperMLP = {
    "alpha": [1e-3, 1, 1e4],
    "solver": ["lbfgs", "adam"],
    # One to four hidden layers of inLayerN units each. Every entry is a tuple:
    # the original first entry `(inLayerN)` was a bare int, because parentheses
    # without a trailing comma do not create a tuple.
    "layers": [(inLayerN,) * nHidden for nHidden in range(1, 5)],
}

In [None]:
# Grid search over hyperMLP: train one multi-layer perceptron per combination
# and record its accuracy on the training and validation sets.
mlpCombos = list(
    itertools.product(hyperMLP["alpha"], hyperMLP["solver"], hyperMLP["layers"])
)

# Rows are gathered in a list and turned into a DataFrame once at the end:
# DataFrame.append no longer exists in pandas 2.x and copied the whole frame
# on every call.
mlpRecords = []
# The progress total follows the grid size; the original hard-coded 30 while
# the grid it iterated held 3 * 2 * 4 = 24 combinations.
for alpha, solver, layers in tqdm(mlpCombos):
    clf = MLPClassifier(
        alpha=alpha,
        solver=solver,
        hidden_layer_sizes=layers,
    )

    clf = clf.fit(X_trainPropagation, Y_trainPropagation)
    train_score = clf.score(X_trainPropagation, Y_trainPropagation)
    val_score = clf.score(X_valPropagation, Y_valPropagation)

    mlpRecords.append({
        "alpha": alpha,
        "solver": solver,
        "layers": layers,
        "train_score": train_score,
        "val_score": val_score,
    })

# dtype=object stores each "layers" entry unchanged; the numeric columns are
# then cast to float.
mlpClassifiers = pd.DataFrame(
    mlpRecords,
    columns=["alpha", "solver", "layers", "train_score", "val_score"],
    dtype=object,
).astype({"alpha": float, "train_score": float, "val_score": float})

mlpClassifiers

In [None]:
# Persist the MLP grid-search results.
mlpClassifiers.to_csv(path_or_buf="trainedMLP.csv")

In [None]:
# Rows that reach the highest validation accuracy; ties are resolved by taking
# the smallest index label.
bestMLPValScore = mlpClassifiers["val_score"].max()
topMLPParams = mlpClassifiers[mlpClassifiers["val_score"] == bestMLPValScore]
firstBest = min(topMLPParams.index)

# Build a fresh MLP from the selected hyperparameters and train it again.
clf = MLPClassifier(
    hidden_layer_sizes=topMLPParams.loc[firstBest, "layers"],
    solver=topMLPParams.loc[firstBest, "solver"],
    alpha=topMLPParams.loc[firstBest, "alpha"],
)
clf = clf.fit(X_trainPropagation, Y_trainPropagation)

# Accuracy on the training, validation and test sets, printed one per line.
train_score, val_score, test_score = (
    clf.score(features, labels)
    for features, labels in (
        (X_trainPropagation, Y_trainPropagation),
        (X_valPropagation, Y_valPropagation),
        (X_testPropagation, Y_testPropagation),
    )
)
print(train_score, val_score, test_score, sep="\n")

# Ridge Classifier

In [None]:
from sklearn.linear_model import RidgeClassifier

In [None]:
# Candidate regularisation strengths for the ridge classifier, from 1e-3 to 1e7.
hyperRidge = dict(
    alpha=[
        1e-3, 1e-2, 1e-1,
        1, 10, 100, 1000, 10000,
        1e5, 1e6, 1e7,
    ],
)

In [None]:
# Bufort suggests xgboost, catboost
# Sweep over hyperRidge["alpha"]: train one ridge classifier per value and
# record its accuracy on the training and validation sets.
# Rows are gathered in a list and turned into a DataFrame once at the end:
# DataFrame.append no longer exists in pandas 2.x and copied the whole frame
# on every call.
ridgeRecords = []
for alpha in tqdm(hyperRidge["alpha"]):
    clf = RidgeClassifier(
        alpha=alpha
    )

    clf = clf.fit(X_trainPropagation, Y_trainPropagation)
    train_score = clf.score(X_trainPropagation, Y_trainPropagation)
    val_score = clf.score(X_valPropagation, Y_valPropagation)

    ridgeRecords.append({
        "alpha": alpha,
        "train_score": train_score,
        "val_score": val_score,
    })

# All three columns are numeric, so they are stored as floats.
ridgeClassifiers = pd.DataFrame(
    ridgeRecords,
    columns=["alpha", "train_score", "val_score"],
    dtype=float,
)

ridgeClassifiers

In [None]:
# Persist the ridge-classifier sweep results.
ridgeClassifiers.to_csv(path_or_buf="trainedRidge.csv")

In [None]:
# Rows that reach the highest validation accuracy; ties are resolved by taking
# the smallest index label.
bestRidgeValScore = ridgeClassifiers["val_score"].max()
topRidgeParams = ridgeClassifiers[ridgeClassifiers["val_score"] == bestRidgeValScore]
firstBest = min(topRidgeParams.index)

# Build a fresh ridge classifier with the selected alpha and train it again.
bestAlpha = topRidgeParams.loc[firstBest, "alpha"]
clf = RidgeClassifier(alpha=bestAlpha)
clf = clf.fit(X_trainPropagation, Y_trainPropagation)

# Accuracy on the training, validation and test sets, printed one per line.
train_score, val_score, test_score = (
    clf.score(features, labels)
    for features, labels in (
        (X_trainPropagation, Y_trainPropagation),
        (X_valPropagation, Y_valPropagation),
        (X_testPropagation, Y_testPropagation),
    )
)
print(train_score, val_score, test_score, sep="\n")