In [1]:
import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.metrics import f1_score, roc_auc_score, \
    recall_score, accuracy_score, precision_score, confusion_matrix
import json
import mlflow
import pickle
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import KFold
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import TimeSeriesSplit
import statsmodels.stats.api as sms
import numpy as np
import random
from tensorflow import keras
from tensorflow.keras.optimizers import RMSprop
from keras.layers import LSTM, Dense, Dropout
import keras.backend as K
from sklearn.preprocessing import StandardScaler
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import f1_score
import pandas as pd

In [2]:
search_space_lstm = hp.choice('classifier_type', [
    {
        'type': 'lstm',
        'activation': hp.choice('activation', ["relu"]),
        'units': hp.quniform('units', 576, 960, 32),
        'batch': hp.choice('batch', [2048]),
        'epochs': hp.quniform('epochs', 30, 70, 10),
        'dropout': hp.choice('dropout', [True, False]),
        'learning_rate': hp.loguniform('learning_rate', np.log(0.0001), np.log(0.006)),
        'preprocessing': hp.choice('p_lstm', ['scaler', 'filter', 'all', 'none', 'fi_ss',
                                    'fi_sm', 'ss_sm', 'smote'])
    }
])

In [3]:
def f_0(y):
    if y['INDISPONIBILIDADE'] == 0:
        val = 1
    else:
        val = 0
    return val


def f_1(y):
    if y['INDISPONIBILIDADE'] == 1:
        val = 1
    else:
        val = 0
    return val


def ajusta_y(y):
    y['0'] = y.apply(f_0, axis=1)
    y['1'] = y.apply(f_1, axis=1)
    y = y[['0', '1']]
    return y


#def create_sequences(values, time_steps=1):
#    output = []
#    for i in range(len(values) - time_steps + 1):
#        output.append(values[i:(i + time_steps)])
#    return np.stack(output)


def create_sequences(values, time_steps=1):
    return np.asarray([values[i : (i + time_steps)] for i in range(len(values) - time_steps + 1)])


def ajusta_y_timestep(y, time_steps=1):
    new_y = y[time_steps-1:]
    return new_y


def transform_dimension_timesteps(train_x, train_y, time_steps=1):

    train_x = create_sequences(train_x, time_steps)
    train_y = ajusta_y_timestep(train_y, time_steps)
    train_y = train_y.values.reshape(-1, 2)
    
    print(train_y.shape)
    return train_x, train_y


def ajusta_saida(y_pred):
    y_pred_c = []
    for x in y_pred:
        y_pred_c.append(np.argmax(x))
    return y_pred_c

"""def objective_lstm(params):
    units = params['units']
    model = keras.Sequential()
    model.add(LSTM(params['units'], activation=params['activation'],
                   return_sequences=False, input_shape=(1, shape)))
    
    model.add(Dense(params['units'], activation=params['activation'],
                    input_shape=(784,)))
    model.add(Dense(10, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(x_train, y_train, epochs=params['epochs'],
              batch_size=params['batch_size'], verbose=0)
    score = model.evaluate(x_test, y_test, verbose=0)
    return -score[1]  
"""

def get_f1(y_true, y_pred): #taken from old keras source code
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

class MyModel():
    def build(self, **kwargs):
        activation = kwargs.get('activation')
        shape = kwargs.get('shape')
        batch = int(kwargs.get('batch'))
        dropout = kwargs.get('dropout')
        lr = kwargs.get('learning_rate')
        units = int(float(kwargs.get('units')))

        model = keras.Sequential()
        model.add(LSTM(units, activation=activation, return_sequences=False,
                       input_shape=(1, shape)))
        model.add(Dense(units, activation=activation))
        if dropout:
            model.add(Dropout(rate=0.2))
        model.add(Dense(units, activation=activation))
        model.add(Dense(units, activation=activation))
        model.add(Dense(2, activation='sigmoid'))
        model.compile(loss='binary_crossentropy',
                      optimizer=RMSprop(learning_rate=lr),
                      metrics=[get_f1])
        return model

    def fit(self, model, *args, **kwargs):
        return model.fit(
            *args,
            batch_size=batch,
            **kwargs, verbose=0
        )


def predict_keras(model, test_x):
    x = create_sequences(test_x.copy(), 1)
    predicted = model.predict(x)
    predicted = ajusta_saida(predicted)
    return predicted


def test_model(model, l, x_train, y_train, x_val, y_val):
    batch, epochs = l
    # print("Converting training data")
    # x, y = transform_dimension_timesteps(train_x.copy(), train_y)

    print("Training the model")
    model.fit(x_train, y_train, batch_size=batch, epochs=epochs)

    pred = predict_keras(model, x_val)
    f1 = f1_score(y_val, pred)
    print(confusion_matrix(y_val, pred))

    return f1


def split(r, mat):
    i = int(len(mat)*r)

    return mat[:i], mat[i:]


def train_test(train, test):

    x_train = train.drop(["INDISPONIBILIDADE"], axis=1)
    y_train = train[['INDISPONIBILIDADE']]

    x_test = test.drop(["INDISPONIBILIDADE"], axis=1)
    y_test = test[['INDISPONIBILIDADE']]

    return x_train, y_train, x_test, y_test


def find_best_keras(df, evals):
    pass

In [7]:
def eval_metrics(actual, pred):
    f1 = f1_score(actual, pred)
    roc = roc_auc_score(actual, pred)
    rec = recall_score(actual, pred)
    pre = precision_score(actual, pred)
    acc = accuracy_score(actual, pred)
    print("F1-Score:", f1)
    print(confusion_matrix(actual, pred))
    return f1, roc, rec, pre, acc


def model_selection(m, x, y, p, c, clf):
    if m == 'train_test':
        # Ratio train test split
        r = 0.75
        return train_test_selection(r, x, y, p, c, clf)



def objective_keras(params):
    

    mlflow.set_tracking_uri("http://localhost:5000")
    with mlflow.start_run(nested=True):
        mlflow.log_params(params)
        
        x, y = data
        p = params['preprocessing']
        print("Preprocess:", p)
        df = pd.concat([x.reset_index(drop=True),
                    y.reset_index(drop=True)], axis=1) 
        train, test = split(.75, df)
        x_train, y_train, x_val, y_val = train_test(train, test)

        x_train, x_val, y_train = preprocessing(p, x_train, x_val, y_train)
        print("Converting training data")
        y_train = ajusta_y(y_train)
        x_train, y_train = transform_dimension_timesteps(x_train, y_train, time_steps=1)
        
        
        del params['preprocessing']
        del params['type']
        mlflow.log_param("model", 'lstm')
        mlflow.log_param("model_selection", split_strategy)
        mlflow.log_param("stage", 'tuning')

        params['shape'] = x_train.shape[2]
        clf = MyModel().build(**params)
        l = [int(params['batch']), int(params['epochs'])]

        print(params)

        f1 = test_model(clf, l, x_train, y_train, x_val, y_val)
        
        f1 = f1 + test_model(clf, l, x_train, y_train, x_val, y_val)
        
        f1 = f1 + test_model(clf, l, x_train, y_train, x_val, y_val)
        
        f1 = f1/3

        print("Média F1-SCORE", f1)
        mlflow.log_metric("f1_val", f1)

        # Because fmin() tries to minimize the objective,
        # this function must return the negative accuracy.
        return {'loss': -f1, 'status': STATUS_OK}


def get_best(key):
    f = open('params/best_hyper.json')
    data = json.load(f)
    f.close()
    return data[key]


def find_best(x, y, evals, space):
    

    global data
    data = [x, y]
    rstate = np.random.default_rng(42)
    trials = Trials()
    best_result = fmin(
        fn=objective_keras, space=space,
        algo=tpe.suggest, max_evals=evals,
        trials=trials, rstate=rstate)

    result = hyperopt.space_eval(space, best_result)
    print("Best in Search Space:", result)
    print('trials:')
    for trial in trials.trials[:2]:
        print(trial)

    key = result['type']
    del result['type']
    # update_hyper(result, key)

    print(result)

    return result, trials, key


def split(r, mat):
    i = int(len(mat)*r)

    return mat[:i], mat[i:]


def preprocess(filtering, scaler, smote, x_train, x_test, y_train):

    if filtering == 'True':
        print("Filtering")
        with open('../data/params/features.pkl', 'rb') as inp:
            features = pickle.load(inp)
        x_train = x_train[features]
        x_test = x_test[features]

    if scaler == 'True':
        print("Standard Scale")
        ss = StandardScaler() # .set_output(transform="pandas")
        ss.fit(x_train)
        x_train = ss.transform(x_train)
        x_test = ss.transform(x_test)

    if smote == 'True':
        print("SMOTE")

        # if isinstance(x_train, cd.DataFrame):
        #    x_train, y_train = x_train.to_pandas(), y_train.to_pandas()
        
        with open("../data/params/smote.pkl", "rb") as inp:
            samp_strat = pickle.load(inp)
        print("Sampling Strategy: ", samp_strat)
        smote = SMOTE(random_state=42, sampling_strategy=samp_strat)
        x_train, y_train = smote.fit_resample(x_train, y_train)

        # x_train, y_train = cd.from_pandas(x_train), cd.from_pandas(y_train)

    return x_train, x_test, y_train


def preprocessing(p, x_train, x_test,  y_train, ):

    if p == 'all':
        x_train, x_test, y_train = preprocess('True', 'True', 'True',
                                              x_train, x_test, y_train)
    elif p == 'filter':
        x_train, x_test, y_train = preprocess('True', 'False', 'False',
                                              x_train, x_test, y_train)
    elif p == 'scaler':
        x_train, x_test, y_train = preprocess('False', 'True', 'False',
                                              x_train, x_test, y_train)
    elif p == 'smote':
        x_train, x_test, y_train = preprocess('False', 'False', 'True',
                                              x_train, x_test, y_train)
    elif p == 'fi_sm':
        x_train, x_test, y_train = preprocess('True', 'False', 'True',
                                              x_train, x_test, y_train)
    elif p == 'fi_ss':
        x_train, x_test, y_train = preprocess('True', 'True', 'False',
                                              x_train, x_test, y_train)
    elif p == 'ss_sm':
        x_train, x_test, y_train = preprocess('False', 'True', 'True',
                                              x_train, x_test, y_train)

    return x_train, x_test, y_train


def get_data():

    mat = pd.read_csv('../data/raw/matomo.csv', dtype=np.int32)

    return mat


def train_test(train, test):

    x_train = train.drop(["INDISPONIBILIDADE"], axis=1)
    y_train = train[['INDISPONIBILIDADE']]

    x_test = test.drop(["INDISPONIBILIDADE"], axis=1)
    y_test = test[['INDISPONIBILIDADE']]

    return x_train, y_train, x_test, y_test


def get_model(k, params):
    if k == 'knn':
        clf = KNeighborsClassifier(**params)
    elif k == 'svm':
        clf = SVC(**params)
    elif k == 'nb':
        clf = GaussianNB(**params)
    elif k == 'rf':
        clf = RandomForestClassifier(**params)
    elif k == 'ada':
        clf = AdaBoostClassifier(**params)
    elif k == 'dt':
        clf = DecisionTreeClassifier(**params)
    elif k == 'lstm':
        clf = MyModel().build(**params)

    return clf


def test_one(key, params, x_train, y_train, x_test, y_test):
    p = params['preprocessing']
    del params['preprocessing']

    x_train, x_test, y_train = preprocessing(p, x_train,
                                             x_test, y_train)

    model = get_model(key, params)

    if key in ('ada', 'dt', 'nb'):
        model.fit(x_train.to_pandas(), y_train.to_pandas())
        pred = model.predict(x_test.to_pandas())
        f1, roc, rec, pre, acc = eval_metrics(y_test.to_pandas(), pred)
    else:
        y_train = y_train['INDISPONIBILIDADE'].values
        model.fit(x_train, y_train)
        pred = model.predict(x_test)
        f1, roc, rec, pre, acc = eval_metrics(y_test.to_pandas(),
                                              pred.to_pandas())

    return f1, roc, rec, pre, acc


def test_params(x_train, y_train, x_test, y_test, params):
    mlflow.set_tracking_uri("http://localhost:5000")

    for key in params:
        with mlflow.start_run(nested=True):
            mlflow.log_param("model", key)
            mlflow.log_param("model_selection", split_strategy)
            mlflow.log_param("stage", "Testing_algos")
            mlflow.log_params(params[key])

            print(key)
            f1, roc, rec, pre, acc = test_one(key, params[key],
                                              x_train, y_train,
                                              x_test, y_test)

            mlflow.log_metric('f1', f1)
            mlflow.log_metric('roc', roc)
            mlflow.log_metric('recall', rec)
            mlflow.log_metric('precision', pre)
            mlflow.log_metric('accuracy', acc)


def delete_runs():
    mlflow.set_tracking_uri("http://localhost:5000")
    runs = mlflow.search_runs()

    for run in runs.iterrows():
        mlflow.delete_run(run[1].run_id)


def is_sklearn(model):
    if type(model).__module__[:7] == 'sklearn':
        return True
    return False

def ajust_columns(results):
    for c in results.columns:
        if c[:7] == "params.":
            results = results.rename(columns={c: c[7:]})

    results = results.rename(columns={"metrics.f1_val": "f1_val"})
    return results


def correct_parameters(best_results):
    for result in best_results:
        if result != "knn":
            try:
                del best_results[result]["n_neighbors"]
                del best_results[result]["metric"]
            except KeyError:
                pass
    for k in best_results:
        for p in best_results[k]:
            if p in ("n_estimators", "n_neighbors"):
                best_results[k][p] = int(best_results[k][p])
            if p in ("C", "var_smoothing", "learning_rate"):
                best_results[k][p] = float(best_results[k][p])

    return best_results


def get_best_parameters(split_strategy):
    mlflow.set_tracking_uri("http://localhost:5000")
    results = mlflow.search_runs()

    results = ajust_columns(results)
    query = f'model_selection == "{split_strategy}"'
    grouped = results.query(query).groupby("type")
    indices_max = grouped["f1_val"].idxmax()
    best_results = {}

    for modelo, indice in indices_max.items():
        parametros = results.loc[
            indice,
            [
                "preprocessing",
                "C",
                "kernel",
                "n_estimators",
                "n_neighbors",
                "criterion",
                "var_smoothing",
                "learning_rate",
                "metric",
                "units",
                "activation",
                "batch",
                "dropout", 
                "epochs"
            ],
        ]
        parametros = {
            chave: valor
            for chave, valor in parametros.to_dict().items()
            if type(valor) == str
        }
        best_results[modelo] = parametros

    return correct_parameters(best_results)


def run():

    print("Reading data")
    mat = get_data()
    print("Spliting the data into train/test with 75/25 proportion")
    train, test = split(0.75, mat)
    print("Spliting the data into x and y features")
    x_train, y_train, x_test, y_test = train_test(train, test)

   
    print("Find best parameters for LSTM model")
    find_best(x_train, y_train, 50, search_space_lstm)
    


In [5]:
def eval_one_variance(test, model, key):
    results = []
    for i in range(10):
        with mlflow.start_run(nested=True):
            mlflow.log_param("model", key)
            mlflow.log_param("stage", "statistics_analysis")
            # mlflow.log_param("model_selection", split_strategy)
            mlflow.log_param("random_i", i)

            test_shuffle = test.sample(frac=0.5, random_state=i)

            x_test = test_shuffle.drop(["INDISPONIBILIDADE"], axis=1)
            y_test = test_shuffle[["INDISPONIBILIDADE"]]
            
            # y_test = ajusta_y(y_test)
            print("Transforming dimension")
            # x_test, y_test = transform_dimension_timesteps(x_test, y_test, time_steps=1)

            pred = predict_keras(model, x_test) # model.predict(x_test)
            pred = ajusta_saida(pred)
            
            # pred = pred.to_numpy()
            y_test = y_test.values

            f1, roc, rec, pre, acc = eval_metrics(y_test, pred)
            results.append(f1)

            mlflow.log_metric("f1", f1)
            mlflow.log_metric("roc", roc)
            mlflow.log_metric("recall", rec)
            mlflow.log_metric("precision", pre)
            mlflow.log_metric("accuracy", acc)

    media = np.mean(results)
    dp = np.std(results, ddof=1)
    ci = sms.DescrStatsW(results).tconfint_mean()
    return media, dp, ci


def eval_variance(x_train, y_train, x_test, y_test, params):
    metricas = {}
    for k in params:
        if k in ["lstm"]:
            print("Algo:", k)
            p = params[k]["preprocessing"]
            del params[k]["preprocessing"]
            (
                x_train_c,
                x_test_c,
                y_train_c,
            ) = preprocessing(p, x_train, x_test, y_train)
            params[k]["shape"] = x_train_c.shape[1]
            print(params[k])
            model = MyModel().build(**params[k])

            print("Ajusting y")
            y_train_c = ajusta_y(y_train_c)
            print("Transforming dimension")
            x_train_c, y_train_c = transform_dimension_timesteps(x_train_c, y_train_c, time_steps=1)
            # y_train_c = y_train_c["INDISPONIBILIDADE"].values
            batch = int(params[k]["batch"])
            print("Fitting model")
            model.fit(x_train_c, y_train_c, batch_size=batch, epochs=int(float(params[k]["epochs"])))
            
            (
                x_train_c,
                x_test_c,
                y_train_c,
            ) = preprocessing(p, x_train, x_test, y_train)
            
            
            x_test_c = pd.DataFrame(x_test_c)
            test = pd.concat(
                [x_test_c.reset_index(drop=True), y_test.reset_index(drop=True)], axis=1
            )

            media, dp, ci = eval_one_variance(test, model, k)
            metricas[k] = {}
            metricas[k]["mean"] = media
            metricas[k]["stand_dev"] = dp
            metricas[k]["conf_int"] = ci
    return metricas

In [None]:
global split_strategy
split_strategy = 'train_test'
run()

Reading data
Spliting the data into train/test with 75/25 proportion
Spliting the data into x and y features
Find best parameters for LSTM model
Preprocess:                                                                                                            
fi_sm                                                                                                                  
Filtering                                                                                                              
SMOTE                                                                                                                  
Sampling Strategy:                                                                                                     
0.004664936141760721                                                                                                   
Converting training data                                                                                               
  0%|          

In [6]:
print("Reading data")
mat = get_data()
print("Spliting the data into train/test with 75/25 proportion")
train, test = split(0.75, mat)
print("Spliting the data into x and y features")
x_train, y_train, x_test, y_test = train_test(train, test)
split_strategy = "train_test"
params = get_best_parameters(split_strategy)
print(params)
eval_variance(x_train, y_train, x_test, y_test, params)

Reading data
Spliting the data into train/test with 75/25 proportion
Spliting the data into x and y features
{'ada': {'preprocessing': 'ss_sm', 'n_estimators': 50, 'learning_rate': 1.0}, 'dt': {'preprocessing': 'fi_sm', 'criterion': 'entropy'}, 'knn': {'preprocessing': 'all', 'n_neighbors': 1, 'metric': 'manhattan'}, 'lstm': {'preprocessing': 'fi_ss', 'learning_rate': 0.0006299586720690262, 'units': '768.0', 'activation': 'tanh', 'batch': '2048', 'dropout': 'True', 'epochs': '50.0'}, 'nb': {'preprocessing': 'none', 'var_smoothing': 1e-05}, 'rf': {'preprocessing': 'none', 'n_estimators': 100}, 'svm': {'preprocessing': 'filter', 'C': 1.0, 'kernel': 'rbf'}}
Algo: lstm
Filtering
Standard Scale
{'learning_rate': 0.0006299586720690262, 'units': '768.0', 'activation': 'tanh', 'batch': '2048', 'dropout': 'True', 'epochs': '50.0', 'shape': 87}
Ajusting y
Transforming dimension
(600000, 2)
Fitting model
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch

{'lstm': {'mean': 0.0, 'stand_dev': 0.0, 'conf_int': (0.0, 0.0)}}

In [None]:
print("Reading data")
mat = get_data()
print("Spliting the data into train/test with 75/25 proportion")
train, test = split(0.75, mat)
print("Spliting the data into x and y features")
x_train, y_train, x_test, y_test = train_test(train, test)


# print("Find best parameters for LSTM model")
# find_best(x_train, y_train, 10, search_space_lstm)

x_train, y_train, x_test = preprocessing('filter', x_train, y_train, x_test)
print("Converting training data")
y_train = ajusta_y(y_train)
x_train, y_train = transform_dimension_timesteps(x_train, y_train, time_steps=1)

In [None]:
x_test.shape

In [None]:
import random as python_random
# Definir a semente para a geração de números aleatórios do numpy
np.random.seed(42)

# Definir a semente para a geração de números aleatórios do Python
python_random.seed(42)

# Definir a semente para a geração de números aleatórios do TensorFlow
tf.random.set_seed(42)

In [None]:
params= {'activation': 'relu',
        'units': 192,
        'batch': 2516,
        'dropout': True,
        'learning_rate': 0.00015209924599838263,
        'shape': x_train.shape[2]}

model = MyModel().build(**params)

print("Training the model")
model.fit(x_train, y_train, batch_size=4096, epochs=50)

In [None]:
pred = predict_keras(model, x_test)
f1 = f1_score(y_test, pred)
print(confusion_matrix(y_test, pred))
print(f1)

In [None]:
x_test_p = pd.DataFrame(x_test)
test = pd.concat([x_test_p.reset_index(drop=True),
                  y_test.reset_index(drop=True)], axis=1)

In [None]:
results = []
for i in range(10):
    with mlflow.start_run(nested=True):
        mlflow.log_param("model", 'lstm')
        mlflow.log_param("stage", "statistics_analysis")
        mlflow.log_param("model_selection", 'train_test')
        mlflow.log_param("random_i", i)

        test_shuffle = test.sample(frac=.5, random_state=i)

        x_test_n = test_shuffle.drop(["INDISPONIBILIDADE"], axis=1)
        y_test_n = test_shuffle[["INDISPONIBILIDADE"]]

        pred = predict_keras(model, x_test_n)

        f1, roc, rec, pre, acc = eval_metrics(y_test_n, pred)

        results.append(f1)

        mlflow.log_metric('f1', f1)
        mlflow.log_metric('roc', roc)
        mlflow.log_metric('recall', rec)
        mlflow.log_metric('precision', pre)
        mlflow.log_metric('accuracy', acc)
media = np.mean(results)
dp = np.std(results, ddof=1)
ci = sms.DescrStatsW(results).tconfint_mean()
print(media, dp, ci)