## **Dataset**

In [None]:
import math
import pandas as pd
import numpy as np
from io import StringIO
from sklearn.preprocessing import MinMaxScaler
from utils import hold_out_validation

# Dataset identifier and the pieces used to build the CSV paths:
# `filename` is the common prefix, `train` / `test` are the file suffixes.
problem = "ML-CUP23"
filename = f"datasets/ML-CUP/{problem}"
train = "-TR.csv"
test = "-TS.csv"

# Shared scaler instance: it is fitted inside reduce_target() and the same
# object is reused further down the file through scaler.inverse_transform()
# to map scaled values back to the original target scale.
scaler = MinMaxScaler()

def has_nan(lst):
    """Return True when at least one element of ``lst`` is NaN.

    Every element is checked with ``math.isnan``, so the elements must be
    real numbers. An empty ``lst`` yields False.
    """
    for value in lst:
        if math.isnan(value):
            return True
    return False

def reduce_target(y):
    """Fit the module-level ``scaler`` on ``y`` and return the scaled values.

    The fit happens on the shared ``scaler`` object, so that same instance
    can be used afterwards to undo the scaling with ``inverse_transform``.
    """
    scaled_targets = scaler.fit_transform(y)
    return scaled_targets

def string_dataset_to_float(dataset):
    """Convert every row of ``dataset`` to floats, dropping rows that contain NaN.

    Each value is converted with ``float`` *before* the NaN test. Checking
    first (as the previous version did) fails on rows of numeric strings,
    because ``math.isnan`` raises ``TypeError`` for ``str`` arguments.

    Args:
        dataset: iterable of rows; every row is an iterable of values that
            ``float`` accepts (numbers or numeric strings such as ``"nan"``).

    Returns:
        ``np.array`` built from the rows that were kept, in input order.

    Raises:
        ValueError, TypeError: propagated from ``float`` when a value cannot
            be converted.
    """
    new_dataset = []
    for row in dataset:
        values = [float(item) for item in row]
        # Skip incomplete samples: a single NaN invalidates the whole row.
        if not any(math.isnan(value) for value in values):
            new_dataset.append(values)
    return np.array(new_dataset)

def retrieveData(filename, column_names, column_features):
    """Load a CSV, shuffle its rows and split it into features and targets.

    Lines starting with ``#`` are treated as comments. The ``id`` column is
    discarded; of the remaining columns the first ``len(column_features)``
    (by position) are returned as features and the rest as targets.

    Rows containing a NaN anywhere (feature or target) are dropped as a
    whole. Filtering the two halves independently would remove a row from
    only one of the arrays and silently misalign samples and targets.

    Args:
        filename: path of the CSV file to read.
        column_names: names assigned to the CSV columns; must include ``id``.
        column_features: names of the feature columns; only its length is
            used, to locate the feature/target boundary.

    Returns:
        Tuple ``(X_train, y_train)`` of 2-D float arrays with the same number
        of rows, in shuffled order.

    Raises:
        KeyError: if ``column_names`` does not contain ``id``.
    """
    data = pd.read_csv(filename, sep=',', header=None, comment="#", names=column_names)
    # Shuffle the samples once, keeping every row intact.
    data = data.iloc[np.random.permutation(len(data))]
    data = data.drop(columns=['id'])
    # Convert and filter on complete rows so X and y stay aligned.
    values = data.astype(float).dropna().to_numpy(dtype=float)
    features = len(column_features)
    X_train = values[:, :features]
    y_train = values[:, features:]
    return X_train, y_train

def oneHotEncoding(X_data, l):
    """One-hot encode each sample of ``X_data`` into a flat list of 0/1.

    For the attribute at position ``i`` a group of ``l[i]`` slots is
    emitted. Values equal to 1, 2, 3 or 4 set slot 0, 1, 2 or 3 of that
    group; any other value leaves the group all zeros. The groups of one
    sample are concatenated in attribute order.

    Returns:
        A list with one encoded list per input sample.
    """
    encoded_rows = []
    for sample in X_data:
        encoded = []
        for position, value in enumerate(sample):
            slots = [0] * l[position]
            # Only the categories 1..4 are recognised.
            for category in (1, 2, 3, 4):
                if value == category:
                    slots[category - 1] = 1
                    break
            encoded.extend(slots)
        encoded_rows.append(encoded)
    return encoded_rows

# Column layout of the training CSV: one id, ten inputs, three targets.
column_names = [
    "id",
    "i1", "i2", "i3", "i4", "i5", "i6", "i7", "i8", "i9", "i10",
    "t1", "t2", "t3",
]
column_features = [
    "i1", "i2", "i3", "i4", "i5", "i6", "i7", "i8", "i9", "i10",
]

# Load the training file, then rescale the targets with the shared scaler.
X_train, y_train = retrieveData(filename + train, column_names, column_features)
y_train = reduce_target(y_train)

## **Model Testing**

In [None]:
from activation_function import instantiate_act_func
from layer import Layer
from mlp import MLP
from losses import instantiate_loss
from grid_search import create_test
from weigth_init import instantiate_initializer
from utils import k_fold_cross_validation, hold_out_validation
import matplotlib.pyplot as plt
from datetime import datetime
from losses import MeanEuclideanError, MeanSquaredError

In [None]:
# Load a previously saved model and report its average errors on each of the
# 5 folds produced by k_fold_cross_validation.
filename = "results/model-assessment/ML-CUP23/release/ML-CUP23.pkl"
dataset = k_fold_cross_validation(X_train, y_train, 5)
for data in dataset:
    # NOTE(review): this rebinds the notebook-level X_train / y_train to the
    # fold's training split and defines X_test / y_test from the fold's
    # validation split. Cells further down read these four names, so after
    # the loop they hold the values of the LAST fold.
    X_train, y_train, X_test, y_test = data["X_train"], data['y_train'], data['X_val'], data['y_val']
    # The saved model is reloaded from disk for every fold.
    model = MLP()
    model.load(filename)
    # mse / mee accumulate errors on the scaled targets; the *_norm variants
    # accumulate errors after mapping both target and output back through
    # scaler.inverse_transform (i.e. on the original target scale, despite
    # the "norm" suffix).
    mse, mse_norm = 0, 0
    mee, mee_norm = 0, 0
    mee_func, mse_func = MeanEuclideanError(), MeanSquaredError()
    # Errors are accumulated over the fold's training split; X_test / y_test
    # are not used inside this loop.
    for x, y in zip(X_train, y_train):
        out = model.run(x)
        mse += mse_func.error(y, out)
        mee += mee_func.error(y, out)
        # inverse_transform takes 2-D input, hence the [..] wrap and [0] unwrap.
        mse_norm += mse_func.error(scaler.inverse_transform([y])[0], scaler.inverse_transform([out])[0])
        mee_norm += mee_func.error(scaler.inverse_transform([y])[0], scaler.inverse_transform([out])[0])
    # Report the per-sample averages for this fold.
    print("mee: ", mee/len(X_train))
    print("mee norm: ", mee_norm/len(X_train))
    print("mse: ", mse/len(X_train))
    print("mse norm: ", mse_norm/len(X_train))
    

## **Model Selection**

In [None]:
from activation_function import instantiate_act_func
from layer import Layer
from mlp import MLP
from losses import instantiate_loss
from grid_search import create_test
from weigth_init import instantiate_initializer
from utils import k_fold_cross_validation, hold_out_validation
import matplotlib.pyplot as plt
from datetime import datetime
from losses import MeanEuclideanError, MeanSquaredError

In [None]:
# Directory that holds the JSON configuration files.
model_path = "models/ML-CUP"
# Configuration files handed to create_test().
json_file_config = [
    f"{model_path}/model1.json",
    f"{model_path}/model2.json",
    f"{model_path}/model3.json",
]
# `tests` is iterated by the model-selection loop below; each element is
# passed to create_model_from_test().
tests = create_test(json_file_config)

In [None]:
def create_model_from_test(test):
    """Build and compile an ``MLP`` from one configuration dictionary.

    One ``Layer`` is created per entry of ``test['layers']`` (reading its
    ``units``, ``act_func`` and ``inputs``). The initializer, regularizers,
    momentum and Nesterov settings are read from ``test`` itself and are
    therefore the same for every layer. The network is then compiled with
    ``test['learning_rate']``, the loss named by ``test['loss']`` and
    ``test['metrics']``.

    Returns:
        The compiled ``MLP`` instance.
    """
    n_processes = None

    def build_layer(spec):
        # A new initializer is instantiated for every layer.
        return Layer(
            spec['units'],
            instantiate_act_func(spec['act_func']),
            spec['inputs'],
            weights_initializer=instantiate_initializer(test['weights_initializer']),
            kernel_regularizer=test['kernel_regularizer'],
            bias_regularizer=test['bias_regularizer'],
            momentum=test['momentum'],
            Nesterov=test['Nesterov'],
            n_processes=n_processes
        )

    network = MLP([build_layer(spec) for spec in test['layers']])
    network.compile(test['learning_rate'], instantiate_loss(test['loss']), test['metrics'])
    return network

In [None]:
def save_result(path, test, error, accuracy, summary):
    """Write one run's configuration and summary to a timestamped log file.

    The file is ``{path}/{timestamp}-err:{error}-mee:{accuracy}.logs`` where
    the timestamp is the current local time in ISO format (seconds
    resolution) and both numbers are rounded to 6 decimals. Its content is
    ``str(test)`` on the first line followed by ``summary``.

    Args:
        path: destination directory; created if it does not exist yet.
        test: configuration of the run (written with ``str``).
        error: loss value embedded in the file name.
        accuracy: MEE value embedded in the file name.
        summary: text written after the configuration.
    """
    import os

    iso_date = datetime.now().replace(microsecond=0).isoformat()
    filename = f"{path}/{iso_date}-err:{str(round(error, 6))}-mee:{str(round(accuracy, 6))}"
    # A fresh checkout usually has no results directory yet.
    if path:
        os.makedirs(path, exist_ok=True)
    # The context manager closes the file even if a write fails.
    with open(f"{filename}.logs", 'w') as f:
        f.write(f"{str(test)}\n")
        f.write(f"{summary}\n")

In [None]:
# Directory where the model-selection loop stores its logs (save_result) and
# charts (plot_chart).
path_model_selection_result = f"results/model-selection/{problem}"

In [None]:
def plot_chart(path, tr_res, vl_res, tr_label, vl_label, y_label):
    """Plot a training and a validation curve, save the chart and show it.

    The training series is drawn as a solid blue line, the validation series
    as a dashed red line. The x axis is labelled ``Epochs`` and the y axis
    ``y_label``. The image is saved as ``{path}/{timestamp}-{y_label}.png``
    before being displayed.
    """
    iso_date = datetime.now().replace(microsecond=0).isoformat()
    filename = f"{path}/{iso_date}-{y_label}"

    curves = (
        (tr_res, {"label": tr_label, "color": 'blue'}),
        (vl_res, {"label": vl_label, "color": 'red', "linestyle": '--'}),
    )
    for values, style in curves:
        plt.plot(values, **style)

    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.legend()
    plt.savefig(f'{filename}.png')
    plt.show()

In [None]:
from tqdm import trange

# k-fold model selection: every configuration in `tests` is scored by the
# validation error at the last epoch, averaged over the k folds.
best_test = None
best_inst_model = None
# Start from +inf so the first configuration always becomes the current best,
# whatever the scale of the loss.
best_error = float("inf")
k = 5
for test in tests:
    epochs = test['epochs']
    dataset = k_fold_cross_validation(X_train, y_train, k)
    print(test)
    # One learning curve per fold; they are averaged once all folds are done.
    fold_tr_errors = []
    fold_vl_errors = []
    fold_tr_accuracies = []
    fold_vl_accuracies = []
    for fold in dataset:
        # A fresh model per fold: reusing one model across folds would
        # validate on samples it was already trained on in earlier folds.
        model = create_model_from_test(test)
        tr_errors = []
        vl_errors = []
        tr_accuracies = []
        vl_accuracies = []
        bar = trange(epochs)
        for _ in bar:
            # Train one epoch at a time so validation can be tracked per epoch.
            tr_error, tr_accuracy = model.fit(fold['X_train'], fold['y_train'], 1)
            vl_error, vl_accuracy = model.evaluate(fold['X_val'], fold['y_val'])
            tr_errors.append(tr_error[0])
            vl_errors.append(vl_error)
            tr_accuracies.append(tr_accuracy[0])
            vl_accuracies.append(vl_accuracy)
            bar.set_description(f'(loss={vl_error})')
        summary = model.summary()
        fold_tr_errors.append(tr_errors)
        fold_vl_errors.append(vl_errors)
        fold_tr_accuracies.append(tr_accuracies)
        fold_vl_accuracies.append(vl_accuracies)

    # Epoch-wise mean over the folds: these are the curves that get plotted
    # and whose last point scores the configuration.
    tr_errors = np.mean(fold_tr_errors, axis=0).tolist()
    vl_errors = np.mean(fold_vl_errors, axis=0).tolist()
    tr_accuracies = np.mean(fold_tr_accuracies, axis=0).tolist()
    vl_accuracies = np.mean(fold_vl_accuracies, axis=0).tolist()

    error = vl_errors[-1]
    accuracy = vl_accuracies[-1]

    if best_error > error:
        best_error = error
        best_test = test
        # NOTE: this is the model of the last fold, i.e. trained on k-1 folds.
        best_inst_model = model

    save_result(path_model_selection_result, test, error, accuracy, summary)
    plot_chart(path_model_selection_result, tr_errors, vl_errors, "Train Error", "Valid Error", "Error")
    plot_chart(path_model_selection_result, tr_accuracies, vl_accuracies, "Train MEE", "Valid MEE", "MEE")

## **Model Assessment**

In [None]:
path_model_assessment_result = f"results/model-assessment/{problem}"

# Train the selected configuration from scratch ten times, logging and
# plotting every run. `model` keeps the instance of the last run.
for i in range(10):
    model = create_model_from_test(best_test)
    tr_errors, vl_errors = [], []
    tr_accuracies, vl_accuracies = [], []
    bar = trange(best_test['epochs'])

    for _ in bar:
        # One epoch of training, then an evaluation on X_test / y_test.
        tr_error, tr_accuracy = model.fit(X_train, y_train, 1)
        vl_error, vl_accuracy = model.evaluate(X_test, y_test)
        tr_errors.append(tr_error[0])
        vl_errors.append(vl_error)
        tr_accuracies.append(tr_accuracy[0])
        vl_accuracies.append(vl_accuracy)
    summary = model.summary()

    # The values of the final epoch are the ones recorded in the log name.
    save_result(path_model_assessment_result, best_test, vl_error, vl_accuracy, summary)
    charts = (
        (tr_errors, vl_errors, "Train Error", "Valid Error", "Error"),
        (tr_accuracies, vl_accuracies, "Train MEE", "Valid MEE", "MEE"),
    )
    for chart in charts:
        plot_chart(path_model_assessment_result, *chart)

In [None]:
# Persist the current `model`; when the cells run top to bottom this is the
# instance left by the last iteration of the assessment loop.
# NOTE(review): a directory path is passed here while the evaluation cell
# loads ".../release/ML-CUP23.pkl" — confirm what MLP.save expects.
model.save(path_model_assessment_result)

In [None]:
from losses import MeanEuclideanError

# Average Mean Euclidean Error of `model` over X_train, on the scaled targets
# and on the targets mapped back to their original scale.
l = MeanEuclideanError()
global_error = 0
global_error_transform = 0

for x, y in zip(X_train, y_train):
    out = model.run(x)
    global_error += l.error(out, y)
    # inverse_transform works on 2-D input: wrap the vector, then take row 0
    # so the loss receives 1-D vectors exactly like in the scaled case (and
    # like the saved-model evaluation cell earlier in the file).
    out_original = scaler.inverse_transform([out])[0]
    y_original = scaler.inverse_transform([y])[0]
    global_error_transform += l.error(out_original, y_original)
print(global_error/len(X_train))
print(global_error_transform/len(X_train))

In [None]:
# NOTE(review): `l` and `global_error` are assigned but not used in this cell.
l = MeanSquaredError()
global_error = 0

# Print the model output for every sample of X_test, mapped back to the
# original target scale.
for sample in X_test:
    out = model.run(sample)
    print(scaler.inverse_transform([out]))

# **Compute Results**

In [None]:
import math
import pandas as pd
import numpy as np
from io import StringIO
from sklearn.preprocessing import MinMaxScaler
from utils import hold_out_validation

# Dataset identifier, common path prefix and suffix of the test-set CSV.
problem = "ML-CUP23"
filename = f"datasets/ML-CUP/{problem}"
test = "-TS.csv"


def has_nan(lst):
    """Tell whether ``lst`` contains at least one NaN.

    Elements are tested one by one with ``math.isnan`` and the scan stops at
    the first NaN found. An empty ``lst`` gives False.
    """
    for element in lst:
        if math.isnan(element):
            return True
    return False

def string_dataset_to_float(dataset):
    """Convert every row of ``dataset`` to floats, dropping rows that contain NaN.

    Each value is converted with ``float`` *before* the NaN test. Checking
    first (as the previous version did) fails on rows of numeric strings,
    because ``math.isnan`` raises ``TypeError`` for ``str`` arguments.

    Args:
        dataset: iterable of rows; every row is an iterable of values that
            ``float`` accepts (numbers or numeric strings such as ``"nan"``).

    Returns:
        ``np.array`` built from the rows that were kept, in input order.

    Raises:
        ValueError, TypeError: propagated from ``float`` when a value cannot
            be converted.
    """
    new_dataset = []
    for row in dataset:
        values = [float(item) for item in row]
        # Skip incomplete samples: a single NaN invalidates the whole row.
        if not any(math.isnan(value) for value in values):
            new_dataset.append(values)
    return np.array(new_dataset)

def retrieveData(filename, column_names, column_features):
    """Load a CSV without targets and return all of its columns as floats.

    Lines starting with ``#`` are treated as comments. The rows are shuffled
    with ``np.random.permutation`` and every column (the first one included)
    is kept. Rows containing NaN are removed by ``string_dataset_to_float``.

    ``column_features`` is accepted for signature compatibility but is not
    used.

    NOTE(review): the rows are shuffled here as well — confirm that a random
    order is wanted for the data this variant loads.
    """
    frame = pd.read_csv(filename, sep=',', header=None, comment="#", names=column_names)
    shuffled_order = np.random.permutation(len(frame))
    raw_rows = frame.iloc[shuffled_order].to_numpy()
    return string_dataset_to_float(raw_rows)

def oneHotEncoding(X_data, l):
    """One-hot encode each sample of ``X_data`` into a flat list of 0/1.

    The attribute at position ``i`` is expanded into ``l[i]`` slots. A value
    equal to 1, 2, 3 or 4 marks slot 0, 1, 2 or 3 respectively; every other
    value leaves its slots at zero. The per-attribute groups are concatenated
    in order.

    Returns:
        A list holding one encoded list per sample.
    """
    recognised = (1, 2, 3, 4)
    X_result = []
    for sample in X_data:
        flat = []
        for index, value in enumerate(sample):
            group = [0] * l[index]
            # First recognised category equal to the value, if any.
            hit = next((c for c in recognised if value == c), None)
            if hit is not None:
                group[hit - 1] = 1
            flat += group
        X_result.append(flat)
    return X_result

# The test CSV has an id followed by the ten inputs and no target columns.
column_names = [
    "id",
    "i1", "i2", "i3", "i4", "i5", "i6", "i7", "i8", "i9", "i10",
]
# Same names again: the id is kept in the loaded data.
column_features = list(column_names)
X_test = retrieveData(filename + test, column_names, column_features)

In [None]:
# Print the first column (the id) of every loaded test row.
for row in X_test:
    row_id = row[0]
    print(row_id)

In [None]:
# Write one line per test sample: "<id>, <out 1>, <out 2>, ...", with the
# model outputs mapped back to the original target scale.
team = "team-name_ML-CUP23-TS.csv"
path_result = f"results/model-assessment/ML-CUP23/{team}"
l = MeanSquaredError()  # NOTE(review): not used in this cell

# The context manager closes the file even if a prediction or write fails.
with open(path_result, "w") as f:
    for x in X_test:
        # Column 0 is the id; the remaining columns are the model inputs.
        out = model.run(x[1:])
        res = scaler.inverse_transform([out])
        res = [str(i) for i in res[0]]
        # X_test is a float array, so the id would otherwise be written as
        # e.g. "1.0"; emit it as an integer (assumes ids are whole numbers).
        string = f"{int(x[0])}, " + ", ".join(res)
        f.write(string + "\n")