## **Dataset**

In [None]:
import math
import pandas as pd
import numpy as np
from io import StringIO
from sklearn.preprocessing import MinMaxScaler

# Dataset identifier; it is interpolated into the dataset path below and into
# the results directories built further down in the file.
problem         = "ML-CUP23"
# Common path prefix of the dataset files; one of the suffixes below is
# appended to select a concrete CSV file.
filename        = f"datasets/ML-CUP/{problem}"
# File-name suffixes appended to `filename`.
# NOTE(review): `test` is later rebound by the `for test in tests` loop, so
# this suffix is no longer available once model selection has run.
train           = "-TR.csv"
test            = "-TS.csv"

# Module-level scaler shared by reduce_target() and by the later
# scaler.inverse_transform() calls that map values back to the original scale.
scaler = MinMaxScaler()

def has_nan(lst):
    """Return True if at least one element of ``lst`` is NaN, else False.

    Elements are checked with ``math.isnan``, so they must be real numbers.
    An empty sequence yields False.
    """
    for value in lst:
        if math.isnan(value):
            return True
    return False

def reduce_target(y):
    """Fit the module-level ``scaler`` on ``y`` and return the scaled values.

    The scaler is re-fitted on every call, so later calls to
    ``scaler.inverse_transform`` use the statistics of the most recent ``y``.
    """
    scaled_targets = scaler.fit_transform(y)
    return scaled_targets

def string_dataset_to_float(dataset):
    """Convert every row of ``dataset`` to floats, skipping rows with NaN.

    Args:
        dataset: Sequence of rows (each row a sequence of numeric values).

    Returns:
        A NumPy array holding one list of floats per retained row. Rows for
        which ``has_nan`` is true are left out, so the result may have fewer
        rows than ``dataset``; an input with no valid row gives an empty
        array.
    """
    return np.array(
        [[float(value) for value in row] for row in dataset if not has_nan(row)]
    )

def retrieveData(filename):
    """Load an ML-CUP style CSV file and return shuffled features and targets.

    The file is read without a header row, ``#`` starts a comment, and the
    columns are interpreted as ``id``, ten inputs ``i1..i10`` and three
    targets ``t1..t3``. The ``id`` column is discarded and the rows are
    shuffled with ``np.random.permutation`` (NumPy's global RNG).

    Rows containing NaN are removed with ONE shared mask, so ``X[k]`` and
    ``y[k]`` always describe the same sample. Filtering features and targets
    independently would drop a row from only one of the two arrays and shift
    every following pair.

    Args:
        filename: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        Tuple ``(X, y)`` of float NumPy arrays. When the file has no complete
        target row at all (e.g. a file without target columns), every row
        with complete features is returned in ``X`` and ``y`` is an empty
        array.
    """
    column_features = ["i1", "i2", "i3", "i4", "i5", "i6", "i7", "i8", "i9", "i10"]
    column_targets = ["t1", "t2", "t3"]
    column_names = ["id"] + column_features + column_targets

    data = pd.read_csv(filename, sep=',', header=None, comment="#", names=column_names)
    # Shuffle the rows once; features and targets are split afterwards so the
    # pairing is preserved.
    data = data.iloc[np.random.permutation(len(data))]

    X = data[column_features].to_numpy(dtype=float)
    y = data[column_targets].to_numpy(dtype=float)

    complete_features = ~np.isnan(X).any(axis=1)
    complete_targets = ~np.isnan(y).any(axis=1)

    if not complete_targets.any():
        # Unlabelled file: keep the usable feature rows, report no targets.
        return X[complete_features], np.array([])

    keep = complete_features & complete_targets
    return X[keep], y[keep]

def oneHotEncoding(X_data, l):
    """One-hot encode each sample of ``X_data`` and concatenate the pieces.

    For the feature at position ``i`` a zero vector of length ``l[i]`` is
    created; if the feature value equals 1, 2, 3 or 4 the entry at index
    ``value - 1`` is set to 1. Any other value leaves the vector all zeros.

    Args:
        X_data: Iterable of samples, each an indexable sequence of values.
        l: Sequence giving the encoded width of every feature position.

    Returns:
        A list with one flat list of 0/1 integers per sample.
    """
    encoded = []
    for sample in X_data:
        row = []
        for position, value in enumerate(sample):
            slot = [0] * l[position]
            # Only the codes 1..4 are recognised.
            if value in (1, 2, 3, 4):
                slot[int(value) - 1] = 1
            row.extend(slot)
        encoded.append(row)
    return encoded

# Load the training split (features and targets) from "<filename>-TR.csv".
X_train, y_train = retrieveData(filename + train)
# Loading of the "-TS.csv" split is currently disabled.
#X_test, y_test   = retrieveData(filename + test)
# Rescale the targets with the shared module-level scaler; the same scaler is
# used later to map values back via scaler.inverse_transform().
y_train = reduce_target(y_train)
# Bare expression: shows the scaled targets as the notebook cell output.
y_train


## **Model Selection**

In [None]:
from activation_function import instantiate_act_func
from layer import Layer
from mlp import MLP
from losses import instantiate_loss
from grid_search import create_test
from weigth_init import instantiate_initializer
from utils import k_fold_cross_validation, hold_out_validation
import matplotlib.pyplot as plt
from datetime import datetime

In [None]:
# Model configuration files handed to create_test(); the commented-out
# entries are excluded from this run.
json_file_config = [
    "models/model1.json",
    # "models/model2.json",
    # "models/model3.json",
    # "models/model4.json",
]
# `tests` is iterated by the model-selection loop, which reads each element
# through string keys such as 'epochs', 'layers' and 'learning_rate'.
tests = create_test(json_file_config)

In [None]:
def create_model_from_test(test):
    """Build and compile an ``MLP`` from one hyper-parameter configuration.

    Args:
        test: Mapping read through the keys ``'layers'`` (each entry giving
            ``'units'``, ``'act_func'`` and ``'inputs'``),
            ``'weights_initializer'``, ``'kernel_regularizer'``,
            ``'bias_regularizer'``, ``'momentum'``, ``'Nesterov'``,
            ``'learning_rate'``, ``'loss'`` and ``'metrics'``.

    Returns:
        The compiled ``MLP`` instance.
    """
    workers = None

    def build_layer(spec):
        # Layer-specific values come from `spec`; everything else is shared
        # by all layers and read from the enclosing configuration.
        return Layer(
            spec['units'],
            instantiate_act_func(spec['act_func']),
            spec['inputs'],
            weights_initializer=instantiate_initializer(test['weights_initializer']),
            kernel_regularizer=test['kernel_regularizer'],
            bias_regularizer=test['bias_regularizer'],
            momentum=test['momentum'],
            Nesterov=test['Nesterov'],
            n_processes=workers,
        )

    network = MLP([build_layer(spec) for spec in test['layers']])
    network.compile(test['learning_rate'], instantiate_loss(test['loss']), test['metrics'])
    return network

In [None]:
def save_result(path, test, error, summary):
    """Write a configuration and its summary to a timestamped ``.logs`` file.

    The file is created as ``<path>/<ISO timestamp>-err:<error rounded to 2
    decimals>.logs`` and contains ``str(test)`` on the first line followed by
    ``summary``.

    Args:
        path: Existing directory in which the log file is created.
        test: Configuration object; its ``str()`` form is stored.
        error: Numeric error embedded (rounded) in the file name.
        summary: Text written after the configuration.

    Raises:
        OSError: If the file cannot be created (e.g. ``path`` is missing).
    """
    # NOTE(review): the name contains ':' characters, which are not allowed
    # in Windows file names.
    iso_date = datetime.now().replace(microsecond=0).isoformat()
    filename = f"{path}/{iso_date}-err:{str(round(error, 2))}"
    # The context manager closes the handle even if a write raises.
    with open(f"{filename}.logs", 'w') as log_file:
        log_file.write(f"{str(test)}\n")
        log_file.write(f"{summary}\n")

In [None]:
# Directory passed to save_result() by the model-selection loop.
path_model_selection_result = f"results/model-selection/{problem}"

In [None]:
def plot_errors(train_error, val_error, path=None):
    """Plot training and validation error curves, save the figure, show it.

    Args:
        train_error: Sequence of training errors, one value per epoch.
        val_error: Sequence of validation errors, one value per epoch.
        path: Optional destination of the saved figure. When omitted the
            figure is written to ``f'{filename}.png'`` using the module-level
            ``filename``, as before; successive calls then overwrite the
            same file, so pass an explicit path to keep one plot per run.
    """
    plt.plot(train_error, label='Train Error', color='blue')
    plt.plot(val_error, label='Val Error', color='red', linestyle='--')
    plt.xlabel('epochs')
    plt.ylabel('error')
    plt.legend()
    # Save before show(): the figure is written out while it is still current.
    plt.savefig(f'{filename}.png' if path is None else path)
    plt.show()

In [None]:
from tqdm import trange

# Model selection: train every candidate configuration on a hold-out split
# and remember the one with the lowest final validation error.
k = 1
best_test = None
best_inst_model = None
# Start from +inf so the first finite error always becomes the incumbent;
# a fixed sentinel such as 1000 would leave best_test as None whenever every
# configuration scored above it.
best_error = float("inf")
for test in tests:
    epochs = round(test['epochs'] / k)
    model = create_model_from_test(test)
    dataset = hold_out_validation(X_train, y_train)
    print(test)
    tr_errors = []
    vl_errors = []
    for fold in dataset:
        bar = trange(epochs, desc='ML')
        for _ in bar:
            # Train one epoch at a time so the validation error can be
            # tracked (and displayed) after every epoch.
            tr_error = model.fit(fold['X_train'], fold['y_train'], 1)
            vl_error = model.evaluate(fold['X_val'], fold['y_val'])
            tr_errors.append(tr_error)
            vl_errors.append(vl_error)
            bar.set_description(f'ML (loss={vl_error})')
        summary = model.summary()

    plot_errors(tr_errors, vl_errors)
    # The score of a configuration is its last recorded validation error.
    error = vl_errors[-1]

    if best_error > error:
        best_error = error
        best_test = test
        best_inst_model = model

    save_result(path_model_selection_result, test, error, summary)

In [None]:
path_model_assessment_result = f"results/model-assessment/{problem}"
# NOTE(review): this freshly built model is not used in this cell; the loop
# below queries best_inst_model (the instance trained during selection).
model = create_model_from_test(best_test)
from losses import MeanSquaredError
l = MeanSquaredError()

# Average loss of the selected model, measured in the original target scale.
# `dataset` is whatever the last iteration of the selection loop assigned.
# NOTE(review): the samples come from fold['X_train'] / fold['y_train'], i.e.
# the data the model was fitted on — confirm that this is intended.
for fold in dataset:
    error = 0
    for x,y in zip(fold['X_train'], fold['y_train']):
        out = best_inst_model.run(x)
        # Undo the target scaling for both the prediction and the target.
        print(scaler.inverse_transform([out]), scaler.inverse_transform([y]))
        error += l.error(scaler.inverse_transform([out]), scaler.inverse_transform([y]))
    # Mean of the accumulated per-sample losses for this fold.
    print(error / len(fold['X_train']))
        

## **Model Assessment**

In [None]:
path_model_assessment_result = f"results/model-assessment/{problem}"
# Rebuild an untrained model from the winning configuration.
model = create_model_from_test(best_test)

# Retrain on the whole training set for the configured number of epochs.
errors = model.fit(X_train, y_train, best_test['epochs'])
# Evaluation on the test split is disabled (X_test / y_test are not loaded).
#error = model.evaluate(X_test, y_test)
summary = model.summary()

# NOTE(review): this disabled call passes five arguments and refers to
# `best_model`; save_result() takes four parameters and the selection loop
# defines `best_test` / `best_inst_model` — fix before re-enabling.
#save_result(path_model_assessment_result, best_model, error, errors, summary)

In [None]:
l = MeanSquaredError()
# Running sum of the per-sample losses over the training set. The losses are
# computed on the scaled targets (y_train was rescaled by reduce_target).
global_error = 0
for i in range(len(X_train)):
    out = model.run(X_train[i])
    # Residual of this sample expressed in the original target scale.
    print(scaler.inverse_transform([out]) - scaler.inverse_transform([y_train[i]]))
    # Accumulate the loss: previously the value returned by l.error() was
    # discarded, so global_error always stayed at 0.
    global_error += l.error(out, y_train[i])