## **Dataset**

In [None]:
import math
import pandas as pd
import numpy as np
from io import StringIO
from sklearn.preprocessing import MinMaxScaler
from utils import hold_out_validation

# Dataset identifier; it is also reused later to build the result directories.
problem = "ML-CUP23"
# Common path prefix of the dataset files; one of the suffixes below is appended to it.
filename = f"datasets/ML-CUP/{problem}"
# File-name suffixes that are concatenated to `filename` when a CSV is loaded.
train, test = "-TR.csv", "-TS.csv"


def has_nan(lst):
    """Return True if at least one element of `lst` is NaN, False otherwise.

    Elements are tested one by one with `math.isnan`; the scan stops at the
    first NaN found. An empty `lst` yields False.
    """
    for value in lst:
        if math.isnan(value):
            return True
    return False

def string_dataset_to_float(dataset):
    """Convert the rows of `dataset` to floats, skipping rows that contain a NaN.

    Rows are read by index (`dataset[0]` .. `dataset[len(dataset) - 1]`). A row
    is kept only when `has_nan` reports no NaN in it, and every element of a
    kept row is passed through `float()`.

    Returns:
        A NumPy array built from the kept rows.
    """
    rows = (dataset[idx] for idx in range(len(dataset)))
    kept_rows = [
        [float(value) for value in row]
        for row in rows
        if not has_nan(row)
    ]
    return np.array(kept_rows)

def retrieveData(filename, column_names, column_features):
    """Load a labelled CSV, shuffle its rows and split it into features and targets.

    The file is read without a header row, lines starting with ``#`` are
    treated as comments, and the columns are named after `column_names`. The
    ``id`` column is discarded. Of the remaining columns, the first
    ``len(column_features)`` are returned as features and the rest as targets.

    Rows containing a NaN in *any* feature or target column are dropped as a
    whole. Filtering features and targets separately would remove different
    rows from each side and silently pair features with the wrong targets.

    Args:
        filename: Path (or file-like object) accepted by `pandas.read_csv`.
        column_names: Names for every CSV column; must include ``"id"``.
        column_features: Feature column names; only its length is used.

    Returns:
        Tuple ``(X, y)`` of float NumPy arrays with the same number of rows, in
        the same (shuffled) row order.
    """
    data = pd.read_csv(filename, sep=',', header=None, comment="#", names=column_names)
    # Shuffle the rows with NumPy's global random state.
    data = data.iloc[np.random.permutation(len(data))]
    values = data.drop(columns='id').to_numpy(dtype=float)
    # One mask for the whole row keeps X and y aligned after NaN removal.
    complete_rows = ~np.isnan(values).any(axis=1)
    values = values[complete_rows]
    features = len(column_features)
    X_train = values[:, :features]
    y_train = values[:, features:]
    return X_train, y_train

# Input feature columns i1 .. i10.
column_features = [f"i{n}" for n in range(1, 11)]
# Full CSV layout: the id, the ten inputs, then the three targets t1 .. t3.
column_names = ["id"] + column_features + [f"t{n}" for n in range(1, 4)]

X_data, y_data = retrieveData(filename + train, column_names, column_features)

# Only the first element returned by hold_out_validation is used.
dataset = hold_out_validation(X_data, y_data)[0]
X_train, y_train = dataset["X_train"], dataset["y_train"]
# The validation part of the hold-out split is kept under the *_test names.
X_test, y_test = dataset["X_val"], dataset["y_val"]

## **Model Testing**

In [None]:
from activation_function import instantiate_act_func
from layer import Layer
from mlp import MLP
from losses import instantiate_loss
from grid_search import create_test
from weigth_init import instantiate_initializer
from utils import hold_out_validation
import matplotlib.pyplot as plt
from datetime import datetime
from losses import MeanEuclideanError, MeanSquaredError

In [None]:
# Disabled scratch cell: everything below is one bare string literal, so none
# of the statements inside it are executed.
# NOTE(review): the text references `scaler`, which is not defined at notebook
# level anywhere in the visible cells — it would have to be provided before
# this code could be re-enabled.
'''
filename = "results/model-assessment/ML-CUP23/release/ML-CUP23.pkl"
res = []
model = MLP()
model.load(filename)
mse, mse_norm = 0, 0
mee, mee_norm = 0, 0
mee_func, mse_func = MeanEuclideanError(), MeanSquaredError()

for x, y in zip(X_train, y_train):
    out = model.run(x)
    mse += mse_func.error(y, out)
    mee += mee_func.error(y, out)
    res.append(out)

y_train = scaler.inverse_transform(y_train)
res = scaler.inverse_transform(res)

for y, o in zip(y_train, res):
    mse_norm += mse_func.error(y, o)
    mee_norm += mee_func.error(y, o)
print("mee: ", mee/len(y_train))
print("mee norm: ", mee_norm/len(y_train))
print("mse: ", mse/len(y_train))
print("mse norm: ", mse_norm/len(y_train))
''' 

## **Model Selection**

In [None]:
from activation_function import instantiate_act_func
from layer import Layer
from mlp import MLP
from losses import instantiate_loss
from grid_search import create_test
from weigth_init import instantiate_initializer
from utils import k_fold_cross_validation, hold_out_validation
import matplotlib.pyplot as plt
from datetime import datetime
from losses import MeanEuclideanError, MeanSquaredError

In [None]:
# Directory that holds the JSON model configuration files.
model_path = "models/ML-CUP"
# Four configuration files: model1.json .. model4.json.
json_file_config = [f"{model_path}/model{idx}.json" for idx in range(1, 5)]
# `tests` is the collection iterated by the model-selection loop.
tests = create_test(json_file_config)

In [None]:
def create_model_from_test(test):
    """Build and compile an MLP from one configuration dictionary.

    `test` must provide a ``'layers'`` list whose entries carry ``'units'``,
    ``'act_func'`` and ``'inputs'``. It must also provide the settings shared
    by every layer (``'weights_initializer'``, ``'kernel_regularizer'``,
    ``'bias_regularizer'``, ``'momentum'``, ``'Nesterov'``) and the compile
    settings (``'learning_rate'``, ``'loss'``, ``'metrics'``).

    Returns:
        The MLP after `compile` has been called on it.
    """
    n_processes = None  # passed unchanged to every Layer

    def build_layer(spec):
        # One Layer per entry of test['layers']; shared options come from `test`.
        return Layer(
            spec['units'],
            instantiate_act_func(spec['act_func']),
            spec['inputs'],
            weights_initializer=instantiate_initializer(test['weights_initializer']),
            kernel_regularizer=test['kernel_regularizer'],
            bias_regularizer=test['bias_regularizer'],
            momentum=test['momentum'],
            Nesterov=test['Nesterov'],
            n_processes=n_processes
        )

    network = MLP([build_layer(spec) for spec in test['layers']])
    network.compile(test['learning_rate'], instantiate_loss(test['loss']), test['metrics'])
    return network

In [None]:
def save_result(path, test, mse_train, mee_train, mse_val, mee_val, summary):
    """Write a plain-text report of one model-selection run into `path`.

    The file is named ``<ISO timestamp>-<train metrics>-<validation metrics>.logs``
    with every metric rounded to 6 decimals. It contains four lines:
    ``str(test)``, the train metrics, the validation metrics and `summary`.

    Args:
        path: Existing directory in which the report is created.
        test: Configuration that produced the results; written via ``str()``.
        mse_train, mee_train: Training metrics.
        mse_val, mee_val: Validation metrics.
        summary: Text appended as the last line of the report.

    Raises:
        OSError: Propagated from ``open`` when the file cannot be created.
    """
    iso_date = datetime.now().replace(microsecond=0).isoformat()
    rep_train = f"mse_train:{str(round(mse_train, 6))}-mee_train:{str(round(mee_train, 6))}"
    rep_val = f"mse_val:{str(round(mse_val, 6))}-mee_val:{str(round(mee_val, 6))}"
    filename = f"{path}/{iso_date}-{rep_train}-{rep_val}"
    # The context manager closes the handle even if one of the writes raises.
    with open(f"{filename}.logs", 'w') as f:
        f.write(f"{str(test)}\n")
        f.write(f"{str(rep_train)}\n")
        f.write(f"{str(rep_val)}\n")
        f.write(f"{summary}\n")

In [None]:
# Directory that receives the model-selection logs and charts for `problem`.
path_model_selection_result = f"results/model-selection/{problem}"

In [None]:
def plot_chart(path, tr_res, vl_res, tr_label, vl_label, y_label):
    """Plot a training curve against a validation curve, save it and show it.

    The training series is drawn as a solid blue line and the validation
    series as a dashed red line, with 'Epochs' on the x axis and `y_label` on
    the y axis. The figure is saved as ``<path>/<ISO timestamp>-<y_label>.png``
    before being displayed.
    """
    iso_date = datetime.now().replace(microsecond=0).isoformat()
    curves = (
        (tr_res, {'label': tr_label, 'color': 'blue'}),
        (vl_res, {'label': vl_label, 'color': 'red', 'linestyle': '--'}),
    )
    for series, line_style in curves:
        plt.plot(series, **line_style)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.legend()
    plt.savefig(f"{path}/{iso_date}-{y_label}.png")
    plt.show()

In [None]:
# Min-max scalers for the target values handled by the model-selection loop.
scaler_f, scaler_eval = MinMaxScaler(), MinMaxScaler()

In [None]:
def compute_metrics(model, X, Y, scaler):
    """Return ``(mse, mee)`` of the model's de-normalised predictions against `Y`.

    Every input paired with a target is passed through ``model.run``. The
    collected outputs are mapped back with ``scaler.inverse_transform`` and
    compared with the targets in `Y`. Both error sums are divided by
    ``len(Y)``.
    """
    mee_func, mse_func = MeanEuclideanError(), MeanSquaredError()
    # Pairing with Y limits the predictions to the shorter of X and Y.
    predictions = [model.run(sample) for sample, _ in zip(X, Y)]
    restored = scaler.inverse_transform(predictions)
    mse_total = 0
    mee_total = 0
    for predicted, target in zip(restored, Y):
        mee_total += mee_func.error(predicted, target)
        mse_total += mse_func.error(predicted, target)
    n_targets = len(Y)
    return mse_total / n_targets, mee_total / n_targets

In [None]:
from tqdm import trange

# Model selection: train one model per configuration over the k folds and keep
# the configuration with the lowest final validation error.
best_test = None
best_inst_model = None
# Start from +inf so the first evaluated configuration always becomes the
# current best, whatever the magnitude of its error.
best_error = float("inf")
k = 5

for test in tests:
    tr_errors = []
    vl_errors = []
    tr_accuracies = []
    vl_accuracies = []
    # The configured epoch budget is split evenly across the k folds.
    epochs = round(test['epochs'] / k)
    fold = k_fold_cross_validation(X_train, y_train, k)
    print(test)
    # NOTE(review): one model instance is created per configuration and keeps
    # training across all folds (it is not re-created per fold) — confirm this
    # is the intended protocol.
    model = create_model_from_test(test)
    for f in fold:
        bar = trange(epochs)
        # The target scaling is learned from the training split only and then
        # applied unchanged to the validation split. The network is trained in
        # that scale, so its outputs can only be compared with, and mapped back
        # from, targets normalised with the same scaler.
        y_train_norm = scaler_f.fit_transform(f['y_train'])
        y_val_norm = scaler_f.transform(f['y_val'])
        for i in bar:
            tr_error, tr_accuracy = model.fit(f['X_train'], y_train_norm, 1)
            vl_error, vl_accuracy = model.evaluate(f['X_val'], y_val_norm)
            tr_errors.append(tr_error[0])
            vl_errors.append(vl_error)
            tr_accuracies.append(tr_accuracy[0])
            vl_accuracies.append(vl_accuracy)
            bar.set_description(f'(loss={vl_error})')
        # Metrics in the original target scale; overwritten on every fold, so
        # the values saved below are those of the last fold.
        mse_train, mee_train = compute_metrics(model, f['X_train'], f['y_train'], scaler_f)
        mse_val, mee_val = compute_metrics(model, f['X_val'], f['y_val'], scaler_f)

    summary = model.summary()

    # The last recorded validation values decide the ranking of configurations.
    error = vl_errors[-1]
    accuracy = vl_accuracies[-1]
    save_result(path_model_selection_result, test, mse_train, mee_train, mse_val, mee_val, summary)
    plot_chart(path_model_selection_result, tr_errors, vl_errors, "Train Error", "Valid Error", "Error")
    plot_chart(path_model_selection_result, tr_accuracies, vl_accuracies, "Train MEE", "Valid MEE", "MEE")

    if best_error > error:
        best_error = error
        best_test = test
        best_inst_model = model


## **Model Assessment**

In [None]:
# Min-max scalers for the target values handled by the model-assessment cells.
scaler_fit, scaler_test = MinMaxScaler(), MinMaxScaler()

In [None]:
def compute_metrics(model, X, Y, scaler):
    """Return ``(mse, mee)`` of the model's de-normalised predictions against `Y`.

    This repeats the `compute_metrics` defined in the Model Selection section
    and rebinds the same name. Outputs of ``model.run`` are mapped back with
    ``scaler.inverse_transform``, compared with the targets in `Y`, and both
    error sums are divided by ``len(Y)``.
    """
    mee_func, mse_func = MeanEuclideanError(), MeanSquaredError()
    # Pairing with Y limits the predictions to the shorter of X and Y.
    raw_outputs = []
    for sample, _ in zip(X, Y):
        raw_outputs.append(model.run(sample))
    restored = scaler.inverse_transform(raw_outputs)
    mse_sum, mee_sum = 0, 0
    for predicted, target in zip(restored, Y):
        mee_sum += mee_func.error(predicted, target)
        mse_sum += mse_func.error(predicted, target)
    count = len(Y)
    return mse_sum / count, mee_sum / count

In [None]:
path_model_assessment_result = f"results/model-assessment/{problem}"
# Train a fresh model with the configuration chosen by model selection.
model = create_model_from_test(best_test)
print(best_test)
tr_errors = []
vl_errors = []
tr_accuracies = []
vl_accuracies = []
outs = []

# The target normalisation does not change between epochs, so it is computed
# once, before the loop. It is learned from the training targets only; the
# hold-out targets are transformed with the same scaler so that they live in
# the scale the network is trained to output.
y_train_norm = scaler_fit.fit_transform(y_train)
y_test_norm = scaler_fit.transform(y_test)
# A later cell de-normalises predictions through `scaler_test`; keep it fitted,
# on the training targets, so it matches the model's output scale as well.
scaler_test.fit(y_train)

bar = trange(best_test['epochs'])
for i in bar:
    tr_error, tr_accuracy = model.fit(X_train, y_train_norm, 1)
    vl_error, vl_accuracy = model.evaluate(X_test, y_test_norm)
    tr_errors.append(tr_error[0])
    vl_errors.append(vl_error)
    tr_accuracies.append(tr_accuracy[0])
    vl_accuracies.append(vl_accuracy)
    bar.set_description(f'(loss={vl_error})')
# Hold-out metrics in the original target scale.
mse, mee = compute_metrics(model, X_test, y_test, scaler_fit)
summary = model.summary()
print("mee: ", mee, ", mse", mse)

In [None]:
def save_result(path, test, mee, mse, summary):
    """Write a plain-text report of the model-assessment run into `path`.

    This rebinds the `save_result` name defined in the Model Selection section
    with a different parameter list. The file is named
    ``<ISO timestamp>-mse_test:<mse>-mee_test:<mee>.logs`` with both metrics
    rounded to 6 decimals. It contains three lines: ``str(test)``, the metrics
    line and `summary`.

    Args:
        path: Existing directory in which the report is created.
        test: Configuration that produced the results; written via ``str()``.
        mee: Mean Euclidean error on the test split.
        mse: Mean squared error on the test split.
        summary: Text appended as the last line of the report.

    Raises:
        OSError: Propagated from ``open`` when the file cannot be created.
    """
    iso_date = datetime.now().replace(microsecond=0).isoformat()
    rep_test = f"mse_test:{str(round(mse, 6))}-mee_test:{str(round(mee, 6))}"
    # The file name carries the same metrics text that is written inside it.
    filename = f"{path}/{iso_date}-{rep_test}"
    # The context manager closes the handle even if one of the writes raises.
    with open(f"{filename}.logs", 'w') as f:
        f.write(f"{str(test)}\n")
        f.write(f"{str(rep_test)}\n")
        f.write(f"{summary}\n")

In [None]:
# Store the assessment report, then draw and save the two learning curves.
save_result(path_model_assessment_result, best_test, mee, mse, summary)
for tr_series, vl_series, tr_label, vl_label, y_label in (
    (tr_errors, vl_errors, "Train Error", "Valid Error", "Error"),
    (tr_accuracies, vl_accuracies, "Train MEE", "Valid MEE", "MEE"),
):
    plot_chart(path_model_assessment_result, tr_series, vl_series, tr_label, vl_label, y_label)

In [None]:
# Persist the model trained in the assessment cell, passing the assessment
# result path to its `save` method.
# NOTE(review): whether `save` expects a directory or a file path is not
# visible from this notebook — confirm against the MLP implementation.
model.save(path_model_assessment_result)

# **Compute Results**

In [None]:
import math
import pandas as pd
import numpy as np

# Same dataset identifier and path prefix as in the Dataset section; only the
# blind-test suffix is needed from here on.
problem, test = "ML-CUP23", "-TS.csv"
filename = f"datasets/ML-CUP/{problem}"

def has_nan(lst):
    """Tell whether `lst` holds at least one NaN (checked with `math.isnan`).

    This repeats the helper of the same name defined in the Dataset section.
    An empty `lst` gives False.
    """
    nan_flags = map(math.isnan, lst)
    return any(nan_flags)

def string_dataset_to_float(dataset):
    """Return the NaN-free rows of `dataset` as a NumPy array of floats.

    This repeats the helper of the same name defined in the Dataset section.
    Rows are read by index; a row for which `has_nan` is true is skipped, and
    every element of the remaining rows goes through `float()`.
    """
    converted = []
    for idx in range(len(dataset)):
        row = dataset[idx]
        if has_nan(row):
            continue
        converted.append([float(value) for value in row])
    return np.array(converted)

def retrieveData(filename, column_names, column_features):
    """Load an unlabelled CSV, shuffle its rows and return them as a float array.

    This rebinds the `retrieveData` name defined in the Dataset section; this
    version returns a single array. The file is read without a header row,
    lines starting with ``#`` are treated as comments, and every column named
    in `column_names` is kept. Rows are shuffled with NumPy's global random
    state, and rows containing a NaN are dropped by `string_dataset_to_float`.
    `column_features` is accepted but not used.

    NOTE(review): shuffling means the returned rows are not in file order —
    confirm that downstream consumers do not rely on the original ordering.
    """
    frame = pd.read_csv(filename, sep=',', header=None, comment="#", names=column_names)
    shuffled = frame.iloc[np.random.permutation(len(frame))]
    # Rebuilding the frame from the raw values resets the index after the shuffle.
    rebuilt = pd.DataFrame(shuffled.to_numpy(), columns=shuffled.columns.values)
    all_values = rebuilt.iloc[:, :].values
    return string_dataset_to_float(all_values)

# Blind-test layout: the id followed by the ten inputs i1 .. i10 (no targets).
column_names = ["id"] + [f"i{n}" for n in range(1, 11)]
column_features = list(column_names)
# X_test is rebound here: from now on it holds the blind-test rows (id included)
# instead of the hold-out split assigned in the Dataset section.
X_test = retrieveData(filename + test, column_names, column_features)

In [None]:
l = MeanSquaredError()

# Run the model on every blind-test row: column 0 is the id, and the remaining
# columns are the inputs given to the network. Outputs and ids are collected in
# two parallel lists.
res, ids = [], []
for sample in X_test:
    prediction = model.run(sample[1:])
    res.append(prediction)
    ids.append(sample[0])


In [None]:
team = "team-name_ML-CUP23-TS.csv"
path_result = f"results/model-assessment/ML-CUP23/{team}"
# The network outputs are in the scale learned from the training targets, so
# they are mapped back with `scaler_fit` (the scaler fitted on y_train), not
# with a scaler fitted on the hold-out targets. The result gets its own name
# so that `res` is left untouched and re-running this cell does not
# de-normalise twice.
res_denorm = scaler_fit.inverse_transform(res)
# NOTE(review): ids were converted to float while loading, so they are written
# as e.g. "1.0" — confirm this matches the expected submission format.
# The context manager closes the file even if a write raises.
with open(path_result, "w") as f:
    for row_id, prediction in zip(ids, res_denorm):
        res_str = [str(r) for r in prediction]
        ids_str = str(row_id)
        string = f"{ids_str}, " + ", ".join(res_str)
        f.write(string + "\n")