# Import files and libraries

In [1]:
import os 
import sys

# Locate the project root: take the components of the current working directory
# up to and including the 'Machine_Learning_project' folder.
dir_path = os.getcwd().split(os.path.sep)
try:
    root_index = dir_path.index('Machine_Learning_project')
except ValueError as exc:
    # Same exception type as before, but with an actionable message.
    raise ValueError(
        "This notebook must be run from inside the 'Machine_Learning_project' "
        f"directory tree (current working directory: {os.getcwd()!r})."
    ) from exc
root_path = os.path.sep.join(dir_path[:root_index + 1])

# Make the project's helper modules importable. Skip entries that are already
# present so that re-running this cell does not keep growing sys.path.
for code_dir in ('/code/', '/code/data_loaders/', '/code/utils_keras', '/code/utils_sklearn'):
    if root_path + code_dir not in sys.path:
        sys.path.append(root_path + code_dir)


In [2]:
import numpy as np
from data import *
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn_utils import *
import matplotlib.pyplot as plt
from data_cup import *
from Trainer_Cup import *
from mee import *
from sklearn.multioutput import MultiOutputRegressor



# Monk 1

#### Import Monk 1 datasets

In [4]:
# Load the MONK-1 training and test datasets
m1_train, m1_test = (MonksDataset(split) for split in ('monk1_train', 'monk1_test'))

# Split the data into train/dev and test features and labels
monk1_parts = get_monks_data(m1_train, m1_test)
X_dev, y_dev, X_test_m1, y_test_m1 = monk1_parts

#### Encoding

In [5]:
# One-hot encode the six categorical MONK attributes using pandas get_dummies()
monk_attributes = ['a1', 'a2', 'a3', 'a4', 'a5', 'a6']
X_monk_train_ohe_pd_cat = pd.get_dummies(X_dev, columns=monk_attributes)

# get_dummies() only creates columns for the categories present in the frame it
# is given, so encoding train and test independently can yield different column
# sets. Align the test frame to the training columns: categories missing from
# the test set become all-zero columns, categories unseen in training are dropped.
X_monk_test_ohe_pd_cat = pd.get_dummies(X_test_m1, columns=monk_attributes).reindex(
    columns=X_monk_train_ohe_pd_cat.columns, fill_value=0
)

#### Grid search and evaluation

In [6]:
# SVC search space for MONK-1: one sub-grid per kernel
svc_hyperparam = [
    dict(C=[1, 10, 100, 1000], kernel=['linear']),
    dict(C=[1, 10, 100, 1000], gamma=['scale', 'auto'], kernel=['rbf']),
]

In [None]:
# Grid-search an SVC over the MONK-1 hyper-parameter grid
svc_grid_search = CustomGridSearch(estimator='svc', hyperparameters=svc_hyperparam)

y_dev_flat = y_dev.values.ravel()  # labels flattened to a 1-D array for fitting
svc_grid_search.fit(X_monk_train_ohe_pd_cat, y_dev_flat)

print()

# Evaluate on the encoded training data
train_scores = svc_grid_search.evaluate(X_monk_train_ohe_pd_cat, y_dev)
accuracy_train, mse_train = train_scores
print()
print("Accuracy Train:", accuracy_train)
print("MSE Train:", mse_train)

# Evaluate on the encoded test data
test_scores = svc_grid_search.evaluate(X_monk_test_ohe_pd_cat, y_test_m1)
accuracy_test, mse_test = test_scores
print()
print("Accuracy Test:", accuracy_test)
print("MSE Test:", mse_test)

# MONK 2

#### Import Monk 2 datasets

In [8]:
# Load the MONK-2 training and test datasets
m2_train, m2_test = (MonksDataset(split) for split in ('monk2_train', 'monk2_test'))

# Split into train/dev and test features and labels
monk2_parts = get_monks_data(m2_train, m2_test)
X_dev_m2, y_dev_m2, X_test_m2, y_test_m2 = monk2_parts

#### Encoding

In [9]:
# One-hot encode the six categorical MONK attributes using pandas get_dummies()
monk_attributes = ['a1', 'a2', 'a3', 'a4', 'a5', 'a6']
X_monk2_dev_ohe_pd_cat = pd.get_dummies(X_dev_m2, columns=monk_attributes)

# get_dummies() only creates columns for the categories present in the frame it
# is given, so align the test frame to the training columns: categories missing
# from the test set become all-zero columns, unseen ones are dropped.
X_monk2_test_ohe_pd_cat = pd.get_dummies(X_test_m2, columns=monk_attributes).reindex(
    columns=X_monk2_dev_ohe_pd_cat.columns, fill_value=0
)

#### Grid search and evaluation

In [10]:
# SVC search space (linear, rbf and polynomial kernels), one sub-grid per kernel
svc_hyperparam = [
    dict(C=[1, 10, 100, 1000], kernel=['linear']),
    dict(C=[1, 10, 100, 1000], gamma=['scale', 'auto'], kernel=['rbf']),
    dict(C=[1, 10, 50, 100], gamma=['scale', 'auto'], degree=[2, 3, 4], kernel=['poly']),
]

In [None]:
# Grid-search an SVC over the MONK-2 hyper-parameter grid
svc_grid_search = CustomGridSearch(estimator='svc', hyperparameters=svc_hyperparam)

y_dev_m2_flat = y_dev_m2.values.ravel()  # labels flattened to a 1-D array for fitting
svc_grid_search.fit(X_monk2_dev_ohe_pd_cat, y_dev_m2_flat)

# Evaluate on the encoded training data
train_scores = svc_grid_search.evaluate(X_monk2_dev_ohe_pd_cat, y_dev_m2)
accuracy_train, mse_train = train_scores
print()
print("Accuracy Train:", accuracy_train)
print("MSE Train:", mse_train)

# Evaluate on the encoded test data
test_scores = svc_grid_search.evaluate(X_monk2_test_ohe_pd_cat, y_test_m2)
accuracy_test, mse_test = test_scores
print()
print("Accuracy Test:", accuracy_test)
print("MSE Test:", mse_test)

# MONK 3

#### Import Monk 3 datasets

In [12]:
# Load the MONK-3 training and test datasets
m3_train, m3_test = (MonksDataset(split) for split in ('monk3_train', 'monk3_test'))

# Split into train/dev and test features and labels
monk3_parts = get_monks_data(m3_train, m3_test)
X_dev_m3, y_dev_m3, X_test_m3, y_test_m3 = monk3_parts

#### Encoding

In [13]:
# One-hot encode the six categorical MONK attributes using pandas get_dummies()
monk_attributes = ['a1', 'a2', 'a3', 'a4', 'a5', 'a6']
X_monk3_dev_ohe_pd_cat = pd.get_dummies(X_dev_m3, columns=monk_attributes)

# get_dummies() only creates columns for the categories present in the frame it
# is given, so align the test frame to the training columns: categories missing
# from the test set become all-zero columns, unseen ones are dropped.
X_monk3_test_ohe_pd_cat = pd.get_dummies(X_test_m3, columns=monk_attributes).reindex(
    columns=X_monk3_dev_ohe_pd_cat.columns, fill_value=0
)

#### Grid search and evaluation

In [None]:
# Grid-search an SVC for MONK-3.
# NOTE: `svc_hyperparam` is not redefined in this section; it is whatever grid
# was last assigned earlier in the notebook.
svc_grid_search = CustomGridSearch(estimator='svc', hyperparameters=svc_hyperparam)

y_dev_m3_flat = y_dev_m3.values.ravel()  # labels flattened to a 1-D array for fitting
svc_grid_search.fit(X_monk3_dev_ohe_pd_cat, y_dev_m3_flat)

# Evaluate on the encoded training data
train_scores = svc_grid_search.evaluate(X_monk3_dev_ohe_pd_cat, y_dev_m3)
accuracy_train, mse_train = train_scores
print()
print("Accuracy Train:", accuracy_train)
print("MSE Train:", mse_train)

# Evaluate on the encoded test data
test_scores = svc_grid_search.evaluate(X_monk3_test_ohe_pd_cat, y_test_m3)
accuracy_test, mse_test = test_scores
print()
print("Accuracy Test:", accuracy_test)
print("MSE Test:", mse_test)

# CUP

In [3]:
# Instantiate the CUP training dataset and the CUP blind test dataset
cup = CupDataset('Cup_tr')
blind_dataset = CupDataset('Cup_ts')

# From here on `blind` is the blind set's `.data` and `df` the training set's `.data`
blind, df = blind_dataset.data, cup.data

#### Train/Val - Test split

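The dataset is split into three parts: Train, Val and Test. The Test set (10% of the data) is held out and used only for the final model assessment. The remaining Dev set (90% of the data) is used for model selection and is further split into Train (≈88.9% of Dev, i.e. ≈80% of the data) and Val (≈11.1% of Dev, i.e. ≈10% of the data; called the "internal test" set in the code).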

In [4]:
# Split the CUP data into a development (train/validation) part and a final test part
cup.split_data(test_size=0.1, random_state=0)

# X_dev / y_dev: features and labels of the development set (train + validation);
# X_final_test / y_final_test: features and labels of the final test set
X_dev, X_final_test, y_dev, y_final_test = cup.get_splits()

# Split the development set again into training and internal test sets
internal_split = train_test_split(X_dev, y_dev, test_size=0.111, random_state=0)
X_train, X_internal_test, y_train, y_internal_test = internal_split

# Select the ten input columns of the blind dataset
blind_feature_columns = ['a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8', 'a9', 'a10']
X_blind = blind[blind_feature_columns]


#### Preprocessing

To preprocess the data, a polynomial transformation (***PolynomialFeatures***) with a fixed degree of 2 is applied and its first (constant) column is dropped. Next, the ***arctanh*** (inverse hyperbolic tangent) function is applied element-wise to rescale the data.

In [5]:
# Degree-2 polynomial expansion, fitted on the training features only
poly = PolynomialFeatures(degree=2)
poly.fit(X_train)

# Apply the same pipeline to every split: expand, drop the first column, then
# take arctanh element-wise.
# NOTE(review): arctanh is only finite on (-1, 1); this assumes every expanded
# feature lies in that range - confirm against the data.
X_train_poly, X_internal_test_poly, X_final_test_poly, X_dev_poly, X_blind_poly = (
    np.arctanh(poly.transform(features)[:, 1:])
    for features in (X_train, X_internal_test, X_final_test, X_dev, X_blind)
)

#### Grid search for linear kernel

In [6]:
# Search space for the linear-kernel SVR (keys carry the `estimator__` prefix
# used when the SVR is wrapped in a multi-output regressor).
# `gamma` is a kernel coefficient for the rbf/poly/sigmoid kernels only and has
# no effect on a linear kernel, so a single value is kept: searching both
# 'scale' and 'auto' fitted every (C, epsilon) candidate twice for no gain.
svr_hyperparam_cup = {
    'estimator__C': [1, 10, 100, 500, 1000],
    'estimator__kernel': ['linear'],
    'estimator__epsilon': [0.001, 0.01, 0.05, 0.1],
    'estimator__gamma': ['scale'],
    'estimator__max_iter': [100000],  # cap on solver iterations during the search
}



In [None]:
# Grid search for the multi-output SVR (cv_strategy='kfold', 5 splits) over the
# linear-kernel grid, fitted on the preprocessed training split
search_options = dict(
    estimator='multi_svr',
    cv_strategy='kfold',
    hyperparameters=svr_hyperparam_cup,
    cv_splits=5,
    multi_output=True,
)
svc_grid_search = CustomGridSearch_cv(**search_options)
svc_grid_search.fit(X_train_poly, y_train)
print()

In [None]:
# Hard-coded linear-kernel configuration (presumably copied from the grid-search
# output - verify it still matches after re-running the search).
# NOTE(review): the search grid sets max_iter=100000 but this model uses SVR's
# default max_iter - confirm that is intended.
best_params = dict(C=10, gamma='scale', kernel='linear', epsilon=0.1)

# One SVR per output target, fitted on the preprocessed training split
base_svr = SVR(**best_params)
best_svr = MultiOutputRegressor(base_svr)
best_svr.fit(X_train_poly, y_train)

In [None]:
# Training-split error. Predict once and reuse the result for both metrics
# instead of running the model twice on the same input.
train_pred = best_svr.predict(X_train_poly)
mse_train = mean_squared_error(y_train, train_pred)
mee_train = MEE(y_train, train_pred)

print(mse_train, mee_train)

In [None]:
# Internal-test-split error. Predict once and reuse the result for both metrics
# instead of running the model twice on the same input.
internal_pred = best_svr.predict(X_internal_test_poly)
mse_internal = mean_squared_error(y_internal_test, internal_pred)
mee_internal = MEE(y_internal_test, internal_pred)

print(mse_internal, mee_internal)

##### Model retrained on the development set (train + validation) and error estimated on the final test set

In [None]:
# Refit the current best_svr on the whole development set (preprocessed features and targets)
best_svr.fit(X_dev_poly, y_dev)

In [None]:
# Development-set error. Predict once and reuse the result for both metrics
# instead of running the model twice on the same input.
dev_pred = best_svr.predict(X_dev_poly)
mse_dev = mean_squared_error(y_dev, dev_pred)
mee_dev = MEE(y_dev, dev_pred)

print(mse_dev, mee_dev)

In [18]:
# Predict the targets of the final test set (preprocessed features)
test_pred = best_svr.predict(X_final_test_poly)

In [None]:
# Final-test-set error (MSE and MEE) computed from the stored predictions
mse_test, mee_test = (
    mean_squared_error(y_final_test, test_pred),
    MEE(y_final_test, test_pred),
)
print(f"Test loss: {mse_test:.4f}, Test MEE: {mee_test:.4f}")

#### Grid Search for poly kernel

In [38]:
# Search space for the polynomial-kernel SVR (keys carry the `estimator__`
# prefix used when the SVR is wrapped in a multi-output regressor)
svr_hyperparam_cup = dict(
    estimator__C=[0.1, 1, 10, 100, 1000],
    estimator__coef0=[0.0],
    estimator__gamma=['scale'],
    estimator__degree=[2, 3, 4],
    estimator__epsilon=[0.01, 0.1],
    estimator__kernel=['poly'],
    estimator__max_iter=[100000],
)

In [None]:
# Grid search for the multi-output SVR (cv_strategy='kfold', 5 splits) over the
# polynomial-kernel grid, fitted on the preprocessed training split
search_options = dict(
    estimator='multi_svr',
    cv_strategy='kfold',
    hyperparameters=svr_hyperparam_cup,
    multi_output=True,
    cv_splits=5,
)
svc_grid_search = CustomGridSearch_cv(**search_options)
svc_grid_search.fit(X_train_poly, y_train)
print()

In [None]:
# Hard-coded polynomial-kernel configuration (presumably copied from the
# grid-search output - verify it still matches after re-running the search).
# NOTE(review): the search grid sets max_iter=100000 but this model uses SVR's
# default max_iter - confirm that is intended.
best_params = dict(C=1000, gamma='scale', kernel='poly', coef0=0.0, degree=2, epsilon=0.01)

# One SVR per output target, fitted on the preprocessed training split
base_svr = SVR(**best_params)
best_svr = MultiOutputRegressor(base_svr)
best_svr.fit(X_train_poly, y_train)

In [None]:
# Training-split error. Predict once and reuse the result for both metrics
# instead of running the model twice on the same input.
train_pred = best_svr.predict(X_train_poly)
mse_train = mean_squared_error(y_train, train_pred)
mee_train = MEE(y_train, train_pred)

print(mse_train, mee_train)

In [None]:
# Internal-test-split error. Predict once and reuse the result for both metrics
# instead of running the model twice on the same input.
internal_pred = best_svr.predict(X_internal_test_poly)
mse_internal = mean_squared_error(y_internal_test, internal_pred)
mee_internal = MEE(y_internal_test, internal_pred)

print(mse_internal, mee_internal)

##### Model retrained on the development set (train + validation) and error estimated on the final test set

In [None]:
# Refit the current best_svr on the whole development set (preprocessed features and targets)
best_svr.fit(X_dev_poly, y_dev)

In [None]:
# Development-set error. Predict once and reuse the result for both metrics
# instead of running the model twice on the same input.
dev_pred = best_svr.predict(X_dev_poly)
mse_dev = mean_squared_error(y_dev, dev_pred)
mee_dev = MEE(y_dev, dev_pred)

print(mse_dev, mee_dev)

In [None]:
# Predict the targets of the final test set (preprocessed features)
test_pred = best_svr.predict(X_final_test_poly)

In [None]:
# Final-test-set error (MSE and MEE) computed from the stored predictions
mse_test, mee_test = (
    mean_squared_error(y_final_test, test_pred),
    MEE(y_final_test, test_pred),
)
print(f"Test loss: {mse_test:.4f}, Test MEE: {mee_test:.4f}")

#### Grid search for rbf kernel

In [1]:
# Search space for the RBF-kernel SVR (keys carry the `estimator__` prefix used
# when the SVR is wrapped in a multi-output regressor)
svr_hyperparam_cup = dict(
    estimator__C=[1, 10, 100, 500, 1000],
    estimator__gamma=['scale', 'auto'],
    estimator__kernel=['rbf'],
    estimator__epsilon=[0.001, 0.01, 0.1],
)

In [None]:
# Grid search for the multi-output SVR (cv_strategy='kfold', 5 splits) over the
# RBF-kernel grid, fitted on the preprocessed training split
search_options = dict(
    estimator='multi_svr',
    cv_strategy='kfold',
    hyperparameters=svr_hyperparam_cup,
    multi_output=True,
    cv_splits=5,
)
svc_grid_search = CustomGridSearch_cv(**search_options)
svc_grid_search.fit(X_train_poly, y_train)
print()

In [None]:
# Hard-coded RBF-kernel configuration (presumably copied from the grid-search
# output - verify it still matches after re-running the search)
best_params = dict(C=1000, gamma='scale', kernel='rbf', epsilon=0.01)

# One SVR per output target, fitted on the preprocessed training split
base_svr = SVR(**best_params)
best_svr = MultiOutputRegressor(base_svr)
best_svr.fit(X_train_poly, y_train)

In [None]:
# Training-split error. Predict once and reuse the result for both metrics
# instead of running the model twice on the same input.
train_pred = best_svr.predict(X_train_poly)
mse_train = mean_squared_error(y_train, train_pred)
mee_train = MEE(y_train, train_pred)

print(mse_train, mee_train)

In [None]:
# Internal-test-split error. Predict once and reuse the result for both metrics
# instead of running the model twice on the same input.
internal_pred = best_svr.predict(X_internal_test_poly)
mse_internal = mean_squared_error(y_internal_test, internal_pred)
mee_internal = MEE(y_internal_test, internal_pred)

print(mse_internal, mee_internal)

##### Model retrained on the development set (train + validation) and error estimated on the final test set

In [None]:
# Refit the current best_svr on the whole development set (preprocessed features and targets)
best_svr.fit(X_dev_poly, y_dev)

In [None]:
# Development-set error. Predict once and reuse the result for both metrics
# instead of running the model twice on the same input.
dev_pred = best_svr.predict(X_dev_poly)
mse_dev = mean_squared_error(y_dev, dev_pred)
mee_dev = MEE(y_dev, dev_pred)

print(mse_dev, mee_dev)

In [None]:
# Predict the targets of the final test set (preprocessed features)
test_pred = best_svr.predict(X_final_test_poly)

In [None]:
# Final-test-set error (MSE and MEE) computed from the stored predictions
mse_test, mee_test = (
    mean_squared_error(y_final_test, test_pred),
    MEE(y_final_test, test_pred),
)
print(f"Test loss: {mse_test:.4f}, Test MEE: {mee_test:.4f}")