In [1]:
# Modify sys.path
import os
import sys

project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(project_root)

In [2]:
import numpy as np
import pandas as pd
import random
from sklearn.metrics import balanced_accuracy_score, roc_auc_score
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold
import pickle

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, InputLayer

import config.config as config
from src.data_processing import read_arff, preprocess_data_classification
from src.utils import lags

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
def surrogate_creation(Tx, Ty, modelSurrogate, path, model_type="ML", nSplit=3, nRepeat=1):
    baScores, rocScores = [], []
    train_X_cv, train_Y_cv, test_X_cv, test_Y_cv = [], [], [], []
    
    Tx_array = np.asarray(Tx)
    Ty_array = np.asarray(Ty).ravel()
    rskf = RepeatedStratifiedKFold(n_splits=nSplit, n_repeats=nRepeat, random_state=config.SEED_VALUE)
    
    for idx, (train_index, test_index) in enumerate(rskf.split(Tx_array, Ty_array)):
        X_train, X_test = Tx_array[train_index], Tx_array[test_index]
        y_train, y_test = Ty_array[train_index], Ty_array[test_index]

        train_X_cv.append(pd.DataFrame(X_train, columns=Tx.columns))
        train_Y_cv.append(y_train.ravel())
        test_X_cv.append(pd.DataFrame(X_test, columns=Tx.columns))
        test_Y_cv.append(y_test.ravel())
        
        if model_type == "DL":
            X_train, X_test = X_train.reshape((X_train.shape[0], 1, X_train.shape[1])), X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))
        
        model = modelSurrogate() 
        
        if model_type == "ML":
            model.fit(X_train, np.array(y_train).ravel())
        elif model_type == "DL":
            random.seed(config.SEED_VALUE)
            np.random.seed(config.SEED_VALUE)
            tf.random.set_seed(config.SEED_VALUE)
            
            model.fit(X_train, y_train, batch_size=config.BATCH_SIZE, epochs=config.EPOCHS, verbose=0)
        
        pred = model.predict(X_test).ravel()
        
        baScores.append(balanced_accuracy_score(y_test, pred))

        lb = preprocessing.LabelBinarizer()
        lb.fit(y_test)
        y_test = lb.transform(y_test)
        pred = lb.transform(pred)
        rocScores.append(roc_auc_score(y_test, pred, multi_class='ovo', average='weighted'))
        
        if model_type == "ML":
            with open(f"{path}-{idx}.pkl", 'wb') as file:
                pickle.dump(model, file)
        else:
            model.save(f"{path}-{idx}.h5")
        
    print(f"Balanced accuracy: {np.mean(baScores):.4f} ± {np.std(baScores):.4f}")
    print(f"ROC-AUC score: {np.mean(rocScores):.4f} ± {np.std(rocScores):.4f}")

    return train_X_cv, train_Y_cv, test_X_cv, test_Y_cv

In [5]:
def RF_model():
    return RandomForestClassifier(random_state=config.SEED_VALUE)

In [6]:
def SVM_model():
    return svm.SVC(C=10, kernel='poly', random_state=config.SEED_VALUE)

In [7]:
# Load dataset
DATA_PATH = os.path.join('..', 'data', config.DATASET_NAME)

dataset = read_arff(DATA_PATH)

# Apply sliding window transformation
df_lagged = lags(dataset, config.N_STEPS).iloc[config.N_STEPS:,:].reset_index(drop=True)

df_dict = preprocess_data_classification(df_lagged)

train_X, train_Y, test_X, test_Y = df_dict['normalized']

In [7]:
# Load dataset
aljorra = read_arff('../data/LaAljorra-WS7-normalized-classification.arff')

import re
aljorraFixed = aljorra.copy()
for i in range(len(aljorra['NO2'])):
    processedStr = re.sub('\\\\', '', aljorra['NO2'][i].decode("utf-8"), count=0, flags=0)
    finalStr = re.sub('\'\'', '', processedStr, count=0, flags=0)
    aljorraFixed.loc[i, 'NO2'] = finalStr
    
    processedStr = re.sub('\\\\', '', aljorra['Lag_NO2-1'][i].decode("utf-8"), count=0, flags=0)
    finalStr = re.sub('\'\'', '', processedStr, count=0, flags=0)
    aljorraFixed.loc[i, 'Lag_NO2-1'] = finalStr
    
    processedStr = re.sub('\\\\', '', aljorra['Lag_NO2-2'][i].decode("utf-8"), count=0, flags=0)
    finalStr = re.sub('\'\'', '', processedStr, count=0, flags=0)
    aljorraFixed.loc[i, 'Lag_NO2-2'] = finalStr
    
    processedStr = re.sub('\\\\', '', aljorra['Lag_NO2-3'][i].decode("utf-8"), count=0, flags=0)
    finalStr = re.sub('\'\'', '', processedStr, count=0, flags=0)
    aljorraFixed.loc[i, 'Lag_NO2-3'] = finalStr
    
    processedStr = re.sub('\\\\', '', aljorra['Lag_NO2-4'][i].decode("utf-8"), count=0, flags=0)
    finalStr = re.sub('\'\'', '', processedStr, count=0, flags=0)
    aljorraFixed.loc[i, 'Lag_NO2-4'] = finalStr
    
    processedStr = re.sub('\\\\', '', aljorra['Lag_NO2-5'][i].decode("utf-8"), count=0, flags=0)
    finalStr = re.sub('\'\'', '', processedStr, count=0, flags=0)
    aljorraFixed.loc[i, 'Lag_NO2-5'] = finalStr
    
    processedStr = re.sub('\\\\', '', aljorra['Lag_NO2-6'][i].decode("utf-8"), count=0, flags=0)
    finalStr = re.sub('\'\'', '', processedStr, count=0, flags=0)
    aljorraFixed.loc[i, 'Lag_NO2-6'] = finalStr
    
    processedStr = re.sub('\\\\', '', aljorra['Lag_NO2-7'][i].decode("utf-8"), count=0, flags=0)
    finalStr = re.sub('\'\'', '', processedStr, count=0, flags=0)
    aljorraFixed.loc[i, 'Lag_NO2-7'] = finalStr

le = preprocessing.LabelEncoder()
le.fit(['B1of7', 'B2of7', 'B3of7', 'B4of7', 'B5of7', 'B6of7', 'B7of7', 'None'])

aljorraFixed['NO2'] = le.transform(aljorraFixed['NO2'])
aljorraFixed['Lag_NO2-1'] = le.transform(aljorraFixed['Lag_NO2-1'])
aljorraFixed['Lag_NO2-2'] = le.transform(aljorraFixed['Lag_NO2-2'])
aljorraFixed['Lag_NO2-3'] = le.transform(aljorraFixed['Lag_NO2-3'])
aljorraFixed['Lag_NO2-4'] = le.transform(aljorraFixed['Lag_NO2-4'])
aljorraFixed['Lag_NO2-5'] = le.transform(aljorraFixed['Lag_NO2-5'])
aljorraFixed['Lag_NO2-6'] = le.transform(aljorraFixed['Lag_NO2-6'])
aljorraFixed['Lag_NO2-7'] = le.transform(aljorraFixed['Lag_NO2-7'])

aljorraFixed.columns = [c.replace('-', '_') for c in aljorraFixed.columns]

from sklearn.model_selection import train_test_split
train_X, test_X, train_Y, test_Y = train_test_split(aljorraFixed.iloc[:,:-1], aljorraFixed.iloc[:,-1], test_size=0.2, random_state=1234, stratify=aljorraFixed.iloc[:,-1])

In [8]:
train_X_cv_RF, train_Y_cv_RF, test_X_cv_RF, test_Y_cv_RF = surrogate_creation(train_X, train_Y, RF_model, 
                                                                              rf'../models/{config.DATASET_SAVE_NAME}-surrogate-classification-RF', model_type="ML")
train_X_cv_SVM, train_Y_cv_SVM, test_X_cv_SVM, test_Y_cv_SVM = surrogate_creation(train_X, train_Y, SVM_model, 
                                                                                      rf'../models/{config.DATASET_SAVE_NAME}-surrogate-classification-SVM', model_type="ML")

Balanced accuracy: 0.9750 ± 0.0202
ROC-AUC score: 0.9810 ± 0.0154
Balanced accuracy: 0.9505 ± 0.0015
ROC-AUC score: 0.9618 ± 0.0003


In [9]:
# Save results
with open(f'../variables/{config.DATASET_SAVE_NAME}-dataset-cv-classification-RF.pickle', 'wb') as f:
     pickle.dump([train_X_cv_RF, train_Y_cv_RF, test_X_cv_RF, test_Y_cv_RF], f)

with open(f'../variables/{config.DATASET_SAVE_NAME}-dataset-cv-classification-SVM.pickle', 'wb') as f:
     pickle.dump([train_X_cv_SVM, train_Y_cv_SVM, test_X_cv_SVM, test_Y_cv_SVM], f)

In [10]:
Tx_array = np.asarray(Tx)
Ty_array = np.asarray(Ty).ravel()
rkf = RepeatedStratifiedKFold(n_splits=3, n_repeats=1, random_state=config.SEED_VALUE)

print(Tx_array)

for (train_index, test_index), idx in zip(rkf.split(Tx_array), range(3)):
    X_train, X_test = Tx_array[train_index], Tx_array[test_index]
    y_train, y_test = Ty_array[train_index], Ty_array[test_index]

NameError: name 'Tx' is not defined