In [1]:
# Modify sys.path
import os
import sys

project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(project_root)

In [2]:
import numpy as np
import pandas as pd
import random
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RepeatedKFold

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, InputLayer

import config.config as config
from src.data_processing import read_arff, preprocess_data
from src.evaluation import predictions_h_stepsahead_LSTM, calculate_H, train_evaluate_lstm_model

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
def surrogate_creation(Tx, Ty, modelSurrogate, path, model_type="ML", nSplit=5, nRepeat=1):
    rmseScores, maeScores, ccScores = [], [], []
    train_X_cv, train_Y_cv, test_X_cv, test_Y_cv = [], [], [], []
    
    Tx_array = np.asarray(Tx)
    Ty_array = np.asarray(Ty)
    rkf = RepeatedKFold(n_splits=nSplit, n_repeats=nRepeat, random_state=config.SEED_VALUE)
    
    for idx, (train_index, test_index) in enumerate(rkf.split(Tx_array)):
        X_train, X_test = Tx_array[train_index], Tx_array[test_index]
        y_train, y_test = Ty_array[train_index], Ty_array[test_index]

        train_X_cv.append(pd.DataFrame(X_train, columns=Tx.columns))
        train_Y_cv.append(y_train.ravel())
        test_X_cv.append(pd.DataFrame(X_test, columns=Tx.columns))
        test_Y_cv.append(y_test.ravel())
        
        if model_type == "DL":
            X_train, X_test = X_train.reshape((X_train.shape[0], 1, X_train.shape[1])), X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))
        
        model = modelSurrogate() 
        
        if model_type == "ML":
            model.fit(X_train, np.array(y_train).ravel())
        elif model_type == "DL":
            random.seed(config.SEED_VALUE)
            np.random.seed(config.SEED_VALUE)
            tf.random.set_seed(config.SEED_VALUE)
            
            model.fit(X_train, y_train, batch_size=config.BATCH_SIZE, epochs=config.EPOCHS, verbose=0)
        
        pred = model.predict(X_test).ravel()
        
        rmseScores.append(root_mean_squared_error(y_test, pred))
        maeScores.append(mean_absolute_error(y_test, pred))
        ccScores.append(np.corrcoef(y_test.T, pred.T)[1,0])
        
        if model_type == "ML":
            with open(f"{path}-{idx}.pkl", 'wb') as file:
                pickle.dump(model, file)
        else:
            model.save(f"{path}-{idx}.h5")
        
    print(f"RMSE: {np.mean(rmseScores):.4f} ± {np.std(rmseScores):.4f}")
    print(f"MAE: {np.mean(maeScores):.4f} ± {np.std(maeScores):.4f}")
    print(f"CC: {np.mean(ccScores):.4f} ± {np.std(ccScores):.4f}")

    return train_X_cv, train_Y_cv, test_X_cv, test_Y_cv

In [5]:
def RF_model():
    return RandomForestRegressor(random_state=config.SEED_VALUE)

In [6]:
def LSTM_model():
    
    model = Sequential([
            InputLayer(shape=(1, config.N_ATTRIB)),
            LSTM(units=config.N_NEURONS, activation='tanh', recurrent_activation = 'sigmoid', 
                   return_sequences=True),
            Dropout(0.2),
            Dense(1, activation="linear")
        ])

    model.compile(loss=tf.keras.losses.MeanSquaredError(), optimizer='adam',
                  metrics=[tf.keras.metrics.RootMeanSquaredError()])
    
    return model

In [7]:
# Load dataset
DATA_PATH = os.path.join('..', 'data', config.DATASET_NAME)

dataset = read_arff(DATA_PATH)
df_dict = preprocess_data(dataset)

train_X, train_Y, test_X, test_Y = df_dict['normalized']

In [None]:
train_X_cv_RF, train_Y_cv_RF, test_X_cv_RF, test_Y_cv_RF = surrogate_creation(train_X, train_Y, RF_model, 
                                                                              rf'../models/{config.DATASET_SAVE_NAME}-surrogate-RF', model_type="ML")
train_X_cv_LSTM, train_Y_cv_LSTM, test_X_cv_LSTM, test_Y_cv_LSTM = surrogate_creation(train_X, train_Y, LSTM_model, 
                                                                                      rf'../models/{config.DATASET_SAVE_NAME}-surrogate-LSTM', model_type="DL")

RMSE: 0.0726 ± 0.0056
MAE: 0.0490 ± 0.0028
CC: 0.8699 ± 0.0151
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step




[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step








[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step








[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step




[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step




RMSE: 0.0747 ± 0.0047
MAE: 0.0523 ± 0.0040
CC: 0.8644 ± 0.0241


In [12]:
# Save results
with open(f'../variables/{config.DATASET_SAVE_NAME}-dataset-cv-RF.pickle', 'wb') as f:
     pickle.dump([train_X_cv_RF, train_Y_cv_RF, test_X_cv_RF, test_Y_cv_RF], f)

with open(f'../variables/{config.DATASET_SAVE_NAME}-dataset-cv-LSTM.pickle', 'wb') as f:
     pickle.dump([train_X_cv_LSTM, train_Y_cv_LSTM, test_X_cv_LSTM, test_Y_cv_LSTM], f)