In [5]:
import argparse
import os
import pandas as pd
import tensorflow as tf
import numpy as np
import datetime
import time
import logging
from joblib import dump, load
from datetime import timedelta
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
from sklearn import svm
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_error
from sklearn.multioutput import RegressorChain
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping
import xgboost as xgb
import lightgbm as lightgbm


# settings:
numberOfInputWeeks = 3 # must be equal to the number of input weeks set in data preperator
numberOfOutputWeeks = 4 # must be equal to the number of output week set in data preperator



# data preperation
cantonKeys = ['AG','AI','AR', 'BE', 'BL', 'BS', 'FR', 'GE', 'GL', 'GR', 'JU', 'LU', 'NE', 'NW', 'OW', 'SG', 'SH', 'SO', 'SZ', 'TG', 'TI', 'UR', 'VD', 'VS', 'ZG','ZH']
data = pd.read_csv("completedata.csv")


outputCategories = ['case_inz_entries_7dayAverage',
                  'hosp_inz_entries_7dayAverage',
                  'death_inz_entries_7dayAverage',
                  'testPositvity_7dayAverage',
                  'transit_stations_percent_change_from_baseline_7dayAverage',
                  'workplaces_percent_change_from_baseline_7dayAverage'
                   ]
numberOfOutputs = len(outputCategories)

split = numberOfOutputs * numberOfOutputWeeks + 2
train_features = data[data['category']=='train'].iloc[:,0:-split].drop(['lastInputDay'], axis=1)
train_labels = data[data['category']=='train'].iloc[:,-split:-2]

validation1_features = data[data['category']=='validation 1'].iloc[:,0:-split].drop(['lastInputDay'], axis=1)
validation1_labels = data[data['category']=='validation 1'].iloc[:,-split:-2]
validation2_features = data[data['category']=='validation 2'].iloc[:,0:-split].drop(['lastInputDay'], axis=1)
validation2_labels = data[data['category']=='validation 2'].iloc[:,-split:-2]
validation1And2_labels = data[(data['category']=='validation 1') | (data['category']=='validation 2')].iloc[:,-split:-2]
validation1And2_features = data[(data['category']=='validation 1') | (data['category']=='validation 2')].iloc[:,0:-split].drop(['lastInputDay'], axis=1)

pip = Pipeline([('minmax_scaler', MinMaxScaler())])
X_train = pip.fit_transform(train_features[train_features.columns].values)
X_valid1 = pip.transform(validation1_features[train_features.columns].values)
X_valid2 = pip.transform(validation2_features[train_features.columns].values)
X_valid1And2 = pip.transform(validation1And2_features[train_features.columns].values)



In [16]:
estimators = []
numberOfEstimators = 0
results = pd.DataFrame()

for alpha in [0.00001, 0.0001,0.001,0.01,0.1,0,1,10, 100]:
    for tol in [1e-3]:
         estimators.append(
                {
                  "modelClass": "Ridge",
                  "isMultiWeek": True,
                  "alpha": alpha,
                  "tol": tol
                })          
print("Ridge: "+str(len(estimators)-numberOfEstimators))
numberOfEstimators = len(estimators)
                    
for alpha in [0.00001, 0.0001,0.001,0.01,0.1,0,1,10, 100]:
    for tol in [1e-4]:
         estimators.append(
                {
                  "modelClass": "Lasso",
                  "isMultiWeek": True,
                  "alpha": alpha,
                  "tol": tol
                })
print("Lasso: "+str(len(estimators)-numberOfEstimators))
numberOfEstimators = len(estimators)             
             

                    
for alpha in [0.00001, 0.0001,0.001,0.01,0.1,0,1,10, 100]:
    for tol in [1e-4]:
        estimators.append(
                {
                  "modelClass": "MultiTaskLasso",
                  "isMultiWeek": True,
                  "alpha": alpha,
                  "tol": tol
                })
print("MultiTaskLasso: "+str(len(estimators)-numberOfEstimators))
numberOfEstimators = len(estimators)

for alpha in [0.00001, 0.0001,0.001,0.01,0.1,0,1,10, 100]:
    for l1ratio in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
        for tol in [1e-4]:
            estimators.append(
                    {
                      "modelClass": "ElasticNet",
                      "isMultiWeek": True,
                      "alpha": alpha,
                      "l1_ratio": l1ratio,
                      "tol": tol
                    })
print("ElasticNet: "+str(len(estimators)-numberOfEstimators))
numberOfEstimators = len(estimators)
                    
                    
for max_features in ["auto","sqrt","log2"]:
    for n_estimators in [100,500,1000]:
        estimators.append(
            {
                "modelClass": "RandomForrest",
                "isMultiWeek": True,
                "n_estimators": n_estimators,
                "max_features": max_features
            }
        )
print("RandomForrest: "+str(len(estimators)-numberOfEstimators))
numberOfEstimators = len(estimators)

                    
print("from here: "+str(len(estimators)))


# fine tune search for the cases ----------------------

for alpha in [1]:
    estimators.append(
            {
              "modelClass": "MultiTaskLasso",
              "isMultiWeek": True,
              "alpha": alpha,
               "tol": 1e-5
            })

for alpha in [2,3,4,5,6,7,8,9,0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2]:
    for tol in [1e-4,1e-5]:
        estimators.append(
                {
                  "modelClass": "MultiTaskLasso",
                  "isMultiWeek": True,
                  "alpha": alpha,
                   "tol": tol
                })
    
# hosp
for alpha in [0.02, 0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.009,0.008,0.007,0.006,0.005,0.004,0.003,0.002]:
    for tol in [1e-4,1e-5]:
        estimators.append(
                {
                  "modelClass": "MultiTaskLasso",
                  "isMultiWeek": True,
                  "alpha": alpha,
                   "tol": tol
                })
    
# death

for alpha in [0.00001, 0.0001,0.001,0.01,0.1,0,1,10, 100]:
    for tol in [1e-5]:
         estimators.append(
                {
                  "modelClass": "Lasso",
                  "isMultiWeek": True,
                  "alpha": alpha,
                  "tol": tol
                })
            
for alpha in [0.002,0.003,0.004,0.005,0.006,0.007,0.008,0.009,0.0009,0.0008,0.0007,0.0006,0.0005,0.0004,0.0003,0.0002]:
    for tol in [1e-4,1e-5]:
         estimators.append(
                {
                  "modelClass": "Lasso",
                  "isMultiWeek": True,
                  "alpha": alpha,
                  "tol": tol
                })


# testpositivity
for alpha in [20,30,40,50,60,70,80,90,9,8,7,6,5,4,3,2]:
    for tol in [1e-3,1e-4]:
         estimators.append(
                {
                  "modelClass": "Ridge",
                  "isMultiWeek": True,
                  "alpha": alpha,
                  "tol": tol
                }) 
            
for alpha in [0.0001]:
    for l1ratio in [0.7]:
        for tol in [1e-5]:
            estimators.append(
                    {
                      "modelClass": "ElasticNet",
                      "isMultiWeek": True,
                      "alpha": alpha,
                      "l1_ratio": l1ratio,
                      "tol": tol
                    })

for alpha in [0.0001]:
    for l1ratio in [0.61,0.62,0.63,0.64,0.65,0.66,0.67,0.68,0.69,0.71,0.72,0.73,0.74,0.75,0.76,0.77,0.78,0.79]:
        for tol in [1e-4, 1e-5]:
            estimators.append(
                    {
                      "modelClass": "ElasticNet",
                      "isMultiWeek": True,
                      "alpha": alpha,
                      "l1_ratio": l1ratio,
                      "tol": tol
                    })

for alpha in [0.0002,0.0003,0.0004,0.0005,0.0006,0.0007,0.0008,0.0009,0.00002,0.00003,0.00004,0.00005,0.00006,0.00007,0.00008,0.00009]:
    for l1ratio in [0.7]:
        for tol in [1e-4, 1e-5]:
            estimators.append(
                    {
                      "modelClass": "ElasticNet",
                      "isMultiWeek": True,
                      "alpha": alpha,
                      "l1_ratio": l1ratio,
                      "tol": tol
                    })
print("to here (sklearn): "+str(len(estimators)))
# working            
for max_features in ["auto",0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
    for n_estimators in [1000]:
        for min_samples_leaf in [1,2,4,8]:
            estimators.append(
                {
                    "modelClass": "RandomForrest",
                    "isMultiWeek": True,
                    "n_estimators": n_estimators,
                    "max_features": max_features,
                    "min_samples_leaf": min_samples_leaf
                }
            )
            
print("until random forrest: "+str(len(estimators)))
 

for alpha in [0.021,0.022,0.023,0.024,0.025,0.026,0.027,0.028,0.029,0.011,0.012,0.013,0.014,0.015,0.016,0.017,0.018,0.019]:
    for tol in [1e-4,1e-5]:
        estimators.append(
                {
                  "modelClass": "MultiTaskLasso",
                  "isMultiWeek": True,
                  "alpha": alpha,
                   "tol": tol
                })            

            
print("total: "+str(len(estimators)))
        



parser = argparse.ArgumentParser()
parser.add_argument("estimatorId", help="computes estimator with given id",type=int)
args = parser.parse_args()
estimatorId = args.estimatorId

'''
estimatorId = 108
''' 

# constructs and saves the results of a fitted estimator    
def constructResults(estimator, task):
   
    # predictions for all weeks
    predictions1 = pd.DataFrame(estimator.predict(X_valid1), index=validation1_labels.index, columns=["pred_week_"+task+"_"+str(outputWeekNumber) for outputWeekNumber in range(0,numberOfOutputWeeks)]) 
    predictions2 = pd.DataFrame(estimator.predict(X_valid2), index=validation2_labels.index, columns=["pred_week_"+task+"_"+str(outputWeekNumber) for outputWeekNumber in range(0,numberOfOutputWeeks)])

    # validation for all weeks
    y_valid1 = validation1_labels[["output_"+task+"_"+str(outputWeekNumber) for outputWeekNumber in range(0,numberOfOutputWeeks)]]
    y_valid2 = validation2_labels[["output_"+task+"_"+str(outputWeekNumber) for outputWeekNumber in range(0,numberOfOutputWeeks)]]

    resultsDf = pd.DataFrame()
    # compute and safe results for every week
    for outputWeekNumber in range(0,numberOfOutputWeeks):
        # compute root mean squared error for validation sets
        rmse1 = np.sqrt(mean_squared_error(predictions1["pred_week_"+task+"_"+str(outputWeekNumber)], y_valid1["output_"+task+"_"+str(outputWeekNumber)]))
        rmse2 = np.sqrt(mean_squared_error(predictions2["pred_week_"+task+"_"+str(outputWeekNumber)], y_valid2["output_"+task+"_"+str(outputWeekNumber)]))
        # safe the results and all model parameters
        res = {}
        if estimators[estimatorId]["modelClass"] == "RandomForrest":
            res = {'modelClass':[estimators[estimatorId]["modelClass"]],
                 'task':[task],
                 'week':[outputWeekNumber], 
                 'model rmse 1':[rmse1], 
                 'model rmse 2':[rmse2],
                 'isMultiWeek':[estimators[estimatorId]["isMultiWeek"]],
                 "n_estimators": [estimators[estimatorId]["n_estimators"]],
                 "max_features": [estimators[estimatorId]["max_features"]],
                 "min_samples_leaf": [estimators[estimatorId]["min_samples_leaf"]]
                }               
        elif estimators[estimatorId]["modelClass"] == "ElasticNet":
            res = {'modelClass':[estimators[estimatorId]["modelClass"]],
                 'task':[task],
                 'week':[outputWeekNumber], 
                 'model rmse 1':[rmse1], 
                 'model rmse 2':[rmse2],
                 'isMultiWeek':[estimators[estimatorId]["isMultiWeek"]],
                 "alpha": [estimators[estimatorId]["alpha"]],
                 "l1_ratio": [estimators[estimatorId]["l1_ratio"]],
                 "tol": [estimators[estimatorId]["tol"]]
                }
        elif estimators[estimatorId]["modelClass"] == "Ridge":
            res = {'modelClass':[estimators[estimatorId]["modelClass"]],
                 'task':[task],
                 'week':[outputWeekNumber], 
                 'model rmse 1':[rmse1], 
                 'model rmse 2':[rmse2],
                 'isMultiWeek':[estimators[estimatorId]["isMultiWeek"]],
                 "alpha": [estimators[estimatorId]["alpha"]],
                 "tol": [estimators[estimatorId]["tol"]]
                }
        elif estimators[estimatorId]["modelClass"] == "Lasso":
            res = {'modelClass':[estimators[estimatorId]["modelClass"]],
                 'task':[task],
                 'week':[outputWeekNumber], 
                 'model rmse 1':[rmse1], 
                 'model rmse 2':[rmse2],
                 'isMultiWeek':[estimators[estimatorId]["isMultiWeek"]],
                 "alpha": [estimators[estimatorId]["alpha"]],
                 "tol": [estimators[estimatorId]["tol"]]
                }
        elif estimators[estimatorId]["modelClass"] == "MultiTaskLasso":
            res = {'modelClass':[estimators[estimatorId]["modelClass"]],
                 'task':[task],
                 'week':[outputWeekNumber], 
                 'model rmse 1':[rmse1], 
                 'model rmse 2':[rmse2],
                 'isMultiWeek':[estimators[estimatorId]["isMultiWeek"]],
                 "alpha": [estimators[estimatorId]["alpha"]],
                 "tol": [estimators[estimatorId]["tol"]]
                }
        else:
            raise ValueError('Tried to save results for an unsupported estimator')
        resultsDf = resultsDf.append(pd.DataFrame(data=res), ignore_index = True)
    return resultsDf


# training


# we just train one model per task
for task in outputCategories:
    # get train labels for all weeks
    y_train = train_labels[["output_"+task+"_"+str(outputWeekNumber) for outputWeekNumber in range(0,numberOfOutputWeeks)]].values
    
    if estimators[estimatorId]["modelClass"] == "RandomForrest":
        estimator = RandomForestRegressor(n_estimators=estimators[estimatorId]["n_estimators"], 
                                          max_features=estimators[estimatorId]["max_features"],
                                          n_jobs=-1,
                                          min_samples_leaf=estimators[estimatorId]["min_samples_leaf"]
                                         )
        estimator.fit(X_train,y_train)
        if not os.path.exists('models_sklearn/'):
            os.makedirs('models_sklearn')
        dump(estimator, "models_sklearn/"+str(estimatorId)+"_"+task+"_sklearn.pkl")
        results = results.append(constructResults(estimator, task), ignore_index = True)
    elif estimators[estimatorId]["modelClass"] == "Ridge":
        estimator = linear_model.Ridge(alpha=estimators[estimatorId]["alpha"], tol=estimators[estimatorId]["tol"])
        estimator.fit(X_train,y_train)
        if not os.path.exists('models_sklearn/'):
            os.makedirs('models_sklearn')
        dump(estimator, "models_sklearn/"+str(estimatorId)+"_"+task+"_sklearn.pkl")
        results = results.append(constructResults(estimator, task), ignore_index = True)
    elif estimators[estimatorId]["modelClass"] == "Lasso":
        estimator = linear_model.Lasso(alpha=estimators[estimatorId]["alpha"], tol=estimators[estimatorId]["tol"])
        estimator.fit(X_train,y_train)
        if not os.path.exists('models_sklearn/'):
            os.makedirs('models_sklearn')
        dump(estimator, "models_sklearn/"+str(estimatorId)+"_"+task+"_sklearn.pkl")
        results = results.append(constructResults(estimator, task), ignore_index = True)
    elif estimators[estimatorId]["modelClass"] == "ElasticNet":
        estimator = linear_model.ElasticNet(alpha=estimators[estimatorId]["alpha"],
                                            l1_ratio=estimators[estimatorId]["l1_ratio"],
                                            tol=estimators[estimatorId]["tol"])
        estimator.fit(X_train,y_train)
        if not os.path.exists('models_sklearn/'):
            os.makedirs('models_sklearn')
        dump(estimator, "models_sklearn/"+str(estimatorId)+"_"+task+"_sklearn.pkl")
        results = results.append(constructResults(estimator, task), ignore_index = True)
    elif estimators[estimatorId]["modelClass"] == "MultiTaskLasso":
        estimator = linear_model.MultiTaskLasso(alpha = estimators[estimatorId]["alpha"],
                                               tol = estimators[estimatorId]["tol"])
        estimator.fit(X_train,y_train)
        if not os.path.exists('models_sklearn/'):
            os.makedirs('models_sklearn')
        dump(estimator, "models_sklearn/"+str(estimatorId)+"_"+task+"_sklearn.pkl")
        results = results.append(constructResults(estimator, task), ignore_index = True)
    else:
        raise ValueError('Tried to fit an unsupported estimator')            


def generateModelId(dictionary):
    modelId = ""
    for key in dictionary.keys():
        modelId = modelId + key +"="+ str(dictionary[key]) +"/"
    modelId = modelId[0:-1]
    return modelId

# add a modelId
results["modelId"] = generateModelId(estimators[estimatorId])
results["modelIdNumber"] = str(estimatorId)+"_sklearn"
                
if not os.path.exists('results_sklearn/'):
    os.makedirs('results_sklearn')
results.to_csv("results_sklearn/"+str(estimatorId)+".csv", header=True, index=False)



usage: ipykernel_launcher.py [-h] estimatorId
ipykernel_launcher.py: error: argument estimatorId: invalid int value: '/home/david/.local/share/jupyter/runtime/kernel-372cb1c9-5054-45b4-8e0a-70005099deae.json'
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Ridge: 9
Lasso: 9
MultiTaskLasso: 9
ElasticNet: 81
RandomForrest: 9
from here: 117
to here (sklearn): 324
until random forrest: 364
total: 400
Traceback (most recent call last):
  File "/usr/lib/python3.9/argparse.py", line 2476, in _get_value
    result = type_func(arg_string)
ValueError: invalid literal for int() with base 10: '/home/david/.local/share/jupyter/runtime/kernel-372cb1c9-5054-45b4-8e0a-70005099deae.json'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.9/argparse.py", line 1851, in parse_known_args
    namespace, args = self._parse_known_args(args, namespace)
  File "/usr/lib/python3.9/argparse.py", line 2063, in _parse_known_args
    stop_index = consume_positionals(start_index)
  File "/usr/lib/python3.9/argparse.py", line 2019, in consume_positionals
    take_action(action, args)
  File "/usr/lib/python3.9/argparse.py", line 1912, in take_action
    argument_values = self._get_values(acti

TypeError: object of type 'NoneType' has no len()

In [None]:
'''
estimator = load("models_sklearn/"+str(0)+"_"+'case_inz_entries_7dayAverage'+"_sklearn.pkl")
estimator
'''

In [18]:
#estimators[400]

IndexError: list index out of range