In [137]:
from sklearn.preprocessing import MinMaxScaler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (VotingClassifier, RandomForestClassifier)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import sys
from collections import defaultdict, namedtuple
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import (precision_recall_curve, f1_score, fbeta_score, auc, make_scorer, confusion_matrix)

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings raised by the
# cells below are hidden as well.
warnings.filterwarnings("ignore")
# Never truncate the column list when a DataFrame is displayed.
pd.set_option("display.max_columns", None)
# Apply seaborn's default theme and a wide default figure size to all plots.
sns.set()
plt.rcParams["figure.figsize"] = (12, 6)

In [138]:
def read_arqs(pop, path="../../eval/"):
    """Load the access, target and volume tables for one dataset.

    Parameters
    ----------
    pop : str
        Dataset suffix used in the file names (``<name>_<pop>.txt``).
    path : str, optional
        Directory that holds the input files. Defaults to the location the
        notebook has always used; a missing trailing ``/`` is tolerated.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(arq_acessos, arq_classes, arq_vol_bytes)``, each read from a
        space-separated file and indexed by its ``NameSpace`` column.

    Raises
    ------
    FileNotFoundError
        If any of the four expected files is missing.
    ValueError
        If ``nsJanelas_<pop>.txt`` contains a token that is not an integer.
    """
    base = path if path.endswith("/") else path + "/"

    def arq_path(name):
        return f"{base}{name}_{pop}.txt"

    # The namespace/window file is still opened and parsed so that a missing or
    # malformed file fails here exactly as before; the parsed rows are not part
    # of the return value. The context manager closes the handle, which the
    # previous bare open() never did.
    with open(arq_path("nsJanelas"), "r") as arq:
        for line in arq:
            sorted(map(int, line.split()))

    def read_table(name):
        return pd.read_csv(arq_path(name), low_memory=False, sep=" ", index_col="NameSpace")

    arq_acessos = read_table("access")
    arq_classes = read_table("target")
    arq_vol_bytes = read_table("vol_bytes")

    return arq_acessos, arq_classes, arq_vol_bytes

In [139]:
# `pop` must exist before this cell runs: it was previously assigned only in a
# later cell, so a fresh kernel (Restart & Run All) failed here with NameError.
pop = "Pop4"
arq_acessos, arq_classes, arq_vol_bytes = read_arqs(pop)

In [140]:
# region constants
# Tier identifiers accepted by object_cost().
HOT = 0
WARM = 1

# Storage rate applied per GB of object volume.
# NOTE(review): currency and billing period are not stated in this notebook;
# the figures look like published cloud object-storage prices — TODO confirm.
COSTSTORAGEHOT = 0.0230
COSTSTORAGEWARM = 0.0125

# Operation rate applied per 1000 accesses.
COSTOPERATIONHOT = 0.0004
COSTOPERATIONWARM = 0.0010

# Retrieval rate applied per GB for every access.
COSTRETRIEVALHOT = 0.0000
COSTRETRIEVALWARM = 0.0100
# endregion

def object_cost(vol_gb, acc, obj_class):
    """Return the cost of keeping one object in a tier for one period.

    The cost is the sum of three terms: storage (``vol_gb`` times the storage
    rate), operations (accesses in thousands times the operation rate) and
    retrieval (``vol_gb * acc`` times the retrieval rate).

    Parameters
    ----------
    vol_gb : float
        Object volume in GB.
    acc : number
        Number of accesses in the period.
    obj_class : int
        ``HOT`` selects the hot-tier rates; any other value is priced as warm.
    """
    accesses_in_thousands = float(acc) / 1000.0  # operation rate is per 1000 accesses

    if obj_class == HOT:
        storage_rate = COSTSTORAGEHOT
        operation_rate = COSTOPERATIONHOT
        retrieval_rate = COSTRETRIEVALHOT
    else:  # warm
        storage_rate = COSTSTORAGEWARM
        operation_rate = COSTOPERATIONWARM
        retrieval_rate = COSTRETRIEVALWARM

    return (vol_gb * storage_rate + accesses_in_thousands * operation_rate + vol_gb * acc * retrieval_rate)


def threshold_access(vol_gb, obj_class):
    """Return the break-even access count between two tiers, truncated to int.

    For the ``"HW"`` (hot to warm) transition this is the number of accesses
    ``a`` at which ``object_cost(vol_gb, a, HOT) == object_cost(vol_gb, a, WARM)``:

        a = vol_gb * (S_hot - S_warm)
            / ((O_warm - O_hot) / 1000 + vol_gb * (R_warm - R_hot))

    Above that count the hot tier is cheaper; at or below it the warm tier is.

    The previous expression scaled the denominator by 1000 but not the
    numerator, so it returned ``a / 1000``, which truncated to 0 for every
    volume.

    Parameters
    ----------
    vol_gb : float
        Object volume in GB.
    obj_class : str
        Tier transition; only ``"HW"`` is supported.

    Raises
    ------
    ValueError
        If ``obj_class`` is not a supported transition (previously the function
        silently returned ``None``).
    """
    if obj_class != "HW":  # hot to warm is the only transition implemented
        raise ValueError(f"Unsupported tier transition: {obj_class!r}")

    # What is saved per period by storing the object in the warm tier.
    storage_saving = vol_gb * (COSTSTORAGEHOT - COSTSTORAGEWARM)
    # What each additional access costs extra in the warm tier. Operation rates
    # are per 1000 accesses, hence the division, matching object_cost().
    extra_cost_per_access = (
        (COSTOPERATIONWARM - COSTOPERATIONHOT) / 1000.0
        + vol_gb * (COSTRETRIEVALWARM - COSTRETRIEVALHOT)
    )
    return int(storage_saving / extra_cost_per_access)


def get_optimal_cost(acc_fut, vol_gb, costs):
    """Add the cost of the cheaper tier for one object to ``costs["opt"]``.

    The object is priced as HOT when its future access count exceeds the
    hot/warm access threshold for its volume, and as WARM otherwise.
    ``costs`` is updated in place; nothing is returned.
    """
    # Access threshold between the hot and warm tiers.
    hot_warm_threshold = threshold_access(vol_gb, "HW")
    chosen_tier = HOT if acc_fut > hot_warm_threshold else WARM
    costs["opt"] += object_cost(vol_gb, acc_fut, chosen_tier)


def get_classifier_cost(row, costs, vol_gb, acc_fut, FPPenaltyEnabled):
    """Add one object's cost to the confusion-matrix bucket it falls in.

    ``row["label"]`` and ``row["pred"]`` use 0 for warm and 1 for hot (note
    that this is the opposite numbering of the ``HOT``/``WARM`` tier
    constants, which are only used here as arguments to ``object_cost``).
    ``costs`` is updated in place under the keys "TN", "FP", "FN" and "TP";
    rows whose label or prediction is neither 0 nor 1 add nothing.

    NOTE(review): ``FPPenaltyEnabled`` is accepted but never read — the
    false-positive penalty is always applied.
    """
    label = row["label"]

    if label == 0:
        pred = row["pred"]
        if pred == 0:
            # Warm object, correctly predicted: cost of living in the warm tier.
            costs["TN"] += object_cost(vol_gb, acc_fut, WARM)
        elif pred == 1:
            # Warm object predicted hot: warm-tier cost charged as a penalty,
            # plus the cost of its accesses in the hot tier.
            costs["FP"] += object_cost(vol_gb, acc_fut, WARM)  # - penalty
            costs["FP"] += object_cost(vol_gb, acc_fut, HOT)  # - accesses in hot tier
    elif label == 1:
        pred = row["pred"]
        if pred == 0:
            # Hot object predicted warm: warm-tier cost with a single access
            # (the one that brings it back to the hot tier). The remaining
            # hot-tier accesses are not charged here.
            costs["FN"] += object_cost(vol_gb, 1, WARM)
        elif pred == 1:
            # Hot object, correctly predicted: cost of staying in the hot tier.
            costs["TP"] += object_cost(vol_gb, acc_fut, HOT)


def costs_of_all_classifiers(costs):
    """Summarise accumulated costs and relative cost savings (RCS).

    Parameters
    ----------
    costs : mapping
        Must provide the keys "TP", "FN", "TN", "FP", "opt" and "always_H".

    Returns
    -------
    dict
        "cost ml"      – total cost under the classifier (TP + FN + TN + FP)
        "cost opt"     – total cost under the optimal placement
        "cost all hot" – total cost when everything stays hot (the baseline)
        "rcs ml"       – saving of the classifier relative to the baseline
        "rcs opt"      – saving of the optimal placement relative to the baseline

        Both RCS values are NaN when the baseline cost is zero, because a
        relative saving is undefined there (this used to raise
        ZeroDivisionError, e.g. for an empty evaluation frame).
    """
    pred_cost = costs["TP"] + costs["FN"] + costs["TN"] + costs["FP"]
    opt_cost = costs["opt"]
    default_cost = costs["always_H"]  # always hot

    if default_cost == 0:
        opt_rcs = pred_rcs = float("nan")
    else:
        opt_rcs = (default_cost - opt_cost) / default_cost
        pred_rcs = (default_cost - pred_cost) / default_cost

    return {
        "rcs ml": pred_rcs,
        "rcs opt": opt_rcs,
        "cost ml": pred_cost,
        "cost opt": opt_cost,
        "cost all hot": default_cost,
    }


def calculate_costs(df, single_score=False, FPPenaltyEnabled=True):
    """Accumulate optimal, classifier and always-hot costs over all rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns "vol_bytes", "acc_fut", "label" and "pred".
    single_score : bool
        When true, return only the classifier's total cost.
    FPPenaltyEnabled : bool
        Passed through to ``get_classifier_cost``.

    Returns
    -------
    float or dict
        The classifier cost when ``single_score`` is set, otherwise the
        summary produced by ``costs_of_all_classifiers``.
    """
    bytes_per_gb = 1024.0**3
    totals = defaultdict(float)  # missing buckets start at 0

    for _, record in df.iterrows():
        size_gb = float(record["vol_bytes"]) / bytes_per_gb
        future_accesses = record["acc_fut"]

        # Optimal cost: `totals` is mutated in place by the helper.
        get_optimal_cost(future_accesses, size_gb, totals)

        # Cost under the classifier's prediction.
        get_classifier_cost(record, totals, size_gb, future_accesses, FPPenaltyEnabled)

        # Baseline without any optimisation: the object is always hot.
        totals["always_H"] += object_cost(size_gb, future_accesses, HOT)

    classifier_total = totals["TP"] + totals["FN"] + totals["TN"] + totals["FP"]
    if single_score:
        return classifier_total
    return costs_of_all_classifiers(totals)

In [141]:
def get_period_by_window(time_window, windowSize):
    """Return ``[start, end]`` for a period that begins at ``time_window`` and spans ``windowSize`` steps."""
    return [time_window, time_window + windowSize]

def get_period_stamps(time_window, windowSize):
    """Return the bounds of a feature period and of the label period that follows it.

    The label period starts where the feature period ends and has the same
    length. The result is
    ``[feature_start, feature_end, label_start, label_end]``.
    """
    feature_start, feature_end = get_period_by_window(time_window, windowSize)
    label_start, label_end = get_period_by_window(feature_end, windowSize)
    return [feature_start, feature_end, label_start, label_end]

def get_all_windows(time_window, windowSize, stepsToTake):
    """Return the eight column bounds used to train and evaluate one time window.

    The training feature period starts at ``stepsToTake * time_window``. The
    evaluation feature period starts where the training label period starts,
    and the evaluation label period is cut to ``stepsToTake`` columns.

    Returns
    -------
    list
        ``[train_start, train_end, train_label_start, train_label_end,
        eval_start, eval_end, eval_label_start, eval_label_end]``
    """
    offset = stepsToTake * time_window

    train_start, train_end, train_label_start, train_label_end = get_period_stamps(offset, windowSize)
    eval_start, eval_end, eval_label_start, _ = get_period_stamps(train_label_start, windowSize)

    # The evaluation label period covers only the steps taken per window,
    # not a full windowSize.
    eval_label_end = eval_label_start + stepsToTake

    return [
        train_start,
        train_end,
        train_label_start,
        train_label_end,
        eval_start,
        eval_end,
        eval_label_start,
        eval_label_end,
    ]

In [142]:
def get_default_classifiers(probability=False, random_state=42):
    """Build the catalogue of classifiers available to the experiments.

    Parameters
    ----------
    probability : bool
        Forwarded to every ``SVC`` instance.
    random_state : int
        Seed given to the SVC, random-forest and decision-tree estimators.

    Returns
    -------
    dict
        Seven base estimators keyed "SVMR", "SVML", "SVMS", "RF", "KNN",
        "DCT" and "LR", followed by a hard ("HV") and a soft ("SV") voting
        ensemble built from those same seven estimators.

    NOTE(review): soft voting relies on the estimators' probability
    estimates, which SVC only provides when ``probability=True`` — confirm
    that "SV" is never used with the default ``probability=False``.
    """
    svc_options = {"probability": probability, "random_state": random_state}

    base_models = {}
    base_models["SVMR"] = svm.SVC(gamma="auto", **svc_options)  # rbf
    base_models["SVML"] = svm.SVC(kernel="linear", **svc_options)
    base_models["SVMS"] = svm.SVC(kernel="sigmoid", **svc_options)
    base_models["RF"] = RandomForestClassifier(n_jobs=-1, random_state=random_state)
    base_models["KNN"] = KNeighborsClassifier(n_jobs=-1)
    base_models["DCT"] = DecisionTreeClassifier(random_state=random_state)
    base_models["LR"] = LogisticRegression(n_jobs=-1)

    catalogue = dict(base_models)
    # Each ensemble gets its own list of (name, estimator) pairs.
    catalogue["HV"] = VotingClassifier(list(base_models.items()), voting="hard", n_jobs=-1)
    catalogue["SV"] = VotingClassifier(list(base_models.items()), voting="soft", n_jobs=-1)
    return catalogue


def set_classifier(clf_key, clfs_dict):
    """Look up a classifier by key.

    Parameters
    ----------
    clf_key : hashable
        Key of the wanted classifier.
    clfs_dict : dict
        Mapping from key to classifier instance.

    Returns
    -------
    object
        The classifier stored under ``clf_key``.

    Raises
    ------
    ValueError
        If the key is missing or maps to ``None``. The function used to print
        a message and call ``sys.exit(1)``, which in a notebook surfaces as a
        bare ``SystemExit`` with no indication of which key was wrong.
    """
    clf = clfs_dict.get(clf_key)
    if clf is None:
        raise ValueError(
            f"Unknown classifier {clf_key!r}; available: {list(clfs_dict)}"
        )
    return clf

In [143]:
def calculate_metrics(df, window=0, rel=False):
    """Compute classification metrics for one evaluation frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the binary columns "label" (ground truth) and "pred".
    window : hashable
        Index label of the single returned row.
    rel : bool
        When true, the count columns (P, N, TP, FP, FN, TN) are rendered as
        strings of the form ``"<count> (<share of all samples>)"``.

    Returns
    -------
    pandas.DataFrame
        One row with sample count, accuracy, ROC AUC, F1, F-beta (beta=2),
        precision, recall and the confusion-matrix counts.
    """
    y_true, y_pred = df["label"].values, df["pred"].values

    # Fix the label set so the matrix is always 2x2. Without it, a window in
    # which only one class occurs yields a 1x1 matrix and the four-way unpack
    # below fails.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    total_samples = len(y_true)

    # ROC AUC is undefined when the ground truth holds a single class; report
    # NaN for such windows instead of aborting the whole run.
    if len(np.unique(y_true)) > 1:
        auc_roc = roc_auc_score(y_true, y_pred)
    else:
        auc_roc = np.nan

    def _count(value):
        # Raw count, or "count (share)" when relative output is requested.
        return f"{value} ({value / total_samples:.2%})" if rel else value

    return pd.DataFrame(
        data={
            "Qtd obj": total_samples,
            "Accuracy": accuracy_score(y_true, y_pred),
            "auc_roc": auc_roc,
            "f1_score": f1_score(y_true, y_pred),
            "f_beta_2": fbeta_score(y_true, y_pred, beta=2),
            "Precision": precision_score(y_true, y_pred),
            "Recall": recall_score(y_true, y_pred),
            "P": _count(fn + tp),
            "N": _count(fp + tn),
            "TP": _count(tp),
            "FP": _count(fp),
            "FN": _count(fn),
            "TN": _count(tn),
        },
        index=[window],
    )


def generate_results(df, window, log_id, clf_name, FPPenaltyEnabled):
    """Combine metrics and costs for one evaluation frame into a single row.

    Parameters
    ----------
    df : pandas.DataFrame
        Evaluation frame with "label", "pred", "vol_bytes" and "acc_fut".
    window : hashable
        Index label of the returned row.
    log_id : str
        Dataset identifier, stored in the "Dados" column.
    clf_name : str
        Classifier name, stored in the "Model" column.
    FPPenaltyEnabled : bool
        Forwarded to ``calculate_costs`` (it used to be accepted and then
        dropped, so the cost calculation always ran with its default).

    Returns
    -------
    pandas.DataFrame
        One row indexed by ``window``.
    """
    metrics = calculate_metrics(df)
    costs = calculate_costs(df, FPPenaltyEnabled=FPPenaltyEnabled)

    results = {
        "Dados": log_id,
        "Model": clf_name,
        "Qtd obj": df.shape[0],
        # One-element lists per metric; broadcast against index=[window] below.
        **metrics.to_dict(orient="list"),
        **costs,
    }
    return pd.DataFrame(results, index=[window])


def aggregate_results(df_results, gp_objs=5, std=False):
    """Collapse each result frame into one row of formatted column means.

    Parameters
    ----------
    df_results : sequence of pandas.DataFrame
        Frames that contain the columns "P", "N" and "Qtd obj" (dropped
        before averaging); with ``std=True`` also "TP", "FP", "FN", "TN".
    gp_objs : int
        Only the first ``gp_objs`` frames are aggregated.
    std : bool
        When true, each cell reads ``"<mean> (<std as percentage>)"``; cells
        whose standard deviation is NaN — including the four confusion-matrix
        columns, whose deviation is blanked on purpose — show the mean with
        two decimals instead.

    Returns
    -------
    pandas.DataFrame
        One row of strings per aggregated frame, indexed from 1.
    """
    confusion_columns = ["TP", "FP", "FN", "TN"]
    aggregated = pd.DataFrame()

    for position, frame in enumerate(df_results[:gp_objs], start=1):
        numeric = frame.drop(columns=["P", "N", "Qtd obj"])
        averages = numeric.mean()

        if std:
            spread = numeric.std()
            spread.loc[confusion_columns] = np.nan

            cells = []
            for average, deviation in zip(averages, spread):
                if np.isnan(deviation):
                    cells.append(f"{average:.2f}")
                else:
                    cells.append(f"{average:.5f} ({deviation:.2%})")
        else:
            cells = [f"{average:.5f}" for average in averages]

        row = pd.DataFrame(
            data=np.array(cells).reshape(1, -1),
            index=[position],
            columns=numeric.columns,
        )
        # Keep only, and in the order of, the current frame's columns.
        aggregated = pd.concat([aggregated, row]).loc[:, [*numeric.columns]]

    return aggregated

In [144]:
def train_ML(initial_train, end_train, initial_train_label, end_train_label, resample, classifier_name, classifiers_dictionary):
    """Fit the selected classifier on one training window and return it.

    Features are the access counts in columns
    ``initial_train:end_train`` of the global ``arq_acessos``; the label of an
    object is 1 when any column of ``arq_classes`` in
    ``initial_train_label:end_train_label`` is truthy, else 0. Objects whose
    volume in the last training column is not positive (they do not exist yet
    in this window) are dropped.

    ``resample`` is accepted but not applied: no resampling step is executed
    before fitting.
    """
    # Objects that already exist at the end of the training window.
    exists_mask = arq_vol_bytes.iloc[:, end_train - 1].values.ravel() > 0.0

    features = arq_acessos.iloc[:, initial_train:end_train][exists_mask]

    label_window = arq_classes.iloc[:, initial_train_label:end_train_label]
    # 1 if the label window contains any 1 for the object, otherwise 0.
    labels = label_window.apply(lambda row: int(row.any()), axis=1)[exists_mask]

    model = set_classifier(classifier_name, classifiers_dictionary)
    model.fit(features, labels)
    return model

In [145]:
def initialize_training_parameters(arq_acessos, random_state=42, **kwargs):
    """Prepare the objects shared by every window of a train/evaluate run.

    Parameters
    ----------
    arq_acessos : pandas.DataFrame
        Access table; its number of columns is the number of time steps.
    random_state : int
        Seed for the default resampler and default classifiers.
    **kwargs
        Optional overrides:
        ``window_size`` / ``steps_to_take`` – window geometry; when omitted the
        notebook-level globals of the same names are used, as before.
        ``scaler`` / ``resample`` / ``clfs_dict`` – replacements for the
        default MinMaxScaler, RandomUnderSampler and classifier catalogue.

    Returns
    -------
    list
        ``[empty DataFrame, range of window indices, scaler, resample, clfs_dict]``
    """

    def _option(name, make_default):
        # Defaults are built lazily, so supplying an override no longer pays
        # for (or depends on) constructing the object it replaces.
        return kwargs[name] if name in kwargs else make_default()

    num_weeks = arq_acessos.shape[1]

    # Explicit values win; the globals are only read as a fallback so the
    # window count can agree with the geometry the caller actually uses.
    window = _option("window_size", lambda: window_size)
    steps = _option("steps_to_take", lambda: steps_to_take)

    scaler = _option("scaler", lambda: MinMaxScaler(feature_range=(0, 1)))
    resample = _option("resample", lambda: RandomUnderSampler(random_state=random_state))
    clfs_dict = _option("clfs_dict", lambda: get_default_classifiers(True, random_state))

    return [
        pd.DataFrame(),
        range((num_weeks // steps) - ((2 * window) // steps)),
        scaler,
        resample,
        clfs_dict,
    ]

In [146]:
def predict(classifier_name, acc_eval, clf, initial_evaluation, end_evaluation):
    """Return one prediction per object for the evaluation window.

    For the "ONLINE" pseudo-classifier the prediction is derived from the
    global ``arq_classes``: 1 when any column in
    ``initial_evaluation:end_evaluation`` is truthy for the object, else 0.
    For every other name the result is ``clf.predict(acc_eval)``.

    NOTE(review): the "ONLINE" branch returns a row for every object in
    ``arq_classes``; it does not use ``acc_eval`` — confirm callers align it.
    """
    if classifier_name == "ONLINE":
        observed_classes = arq_classes.iloc[:, initial_evaluation:end_evaluation]
        return observed_classes.apply(lambda row: int(row.any()), axis=1)

    return clf.predict(acc_eval)

In [147]:
def evaluate(initial_evaluation, end_evaluation, initial_evaluation_label, end_evaluation_label):
    """Build the evaluation features and labels for one window.

    Features are columns ``initial_evaluation:end_evaluation`` of the global
    ``arq_acessos``. The label of an object is 1 when any column of
    ``arq_classes`` in ``initial_evaluation_label:end_evaluation_label`` is
    truthy, else 0. Objects whose volume in the last evaluation column is not
    positive (they do not exist yet in this window) are dropped from both.

    Returns
    -------
    tuple
        ``(features, labels)``.
    """
    # Objects that already exist at the end of the evaluation window.
    exists_mask = arq_vol_bytes.iloc[:, end_evaluation - 1].values.ravel() > 0.0

    features = arq_acessos.iloc[:, initial_evaluation:end_evaluation][exists_mask]

    label_window = arq_classes.iloc[:, initial_evaluation_label:end_evaluation_label]
    labels = label_window.apply(lambda row: int(row.any()), axis=1)[exists_mask]

    return features, labels

In [148]:
def run_train_eval(classifier_name, random_state, window_size, steps_to_take, pop, print_df=True, FP_penalty_enabled=True, **kwargs):
    """Train and evaluate one classifier over every sliding time window.

    Parameters
    ----------
    classifier_name : str
        Key of the classifier to use, or "ONLINE" to skip training and take
        predictions straight from the class table.
    random_state : int
        Seed forwarded to ``initialize_training_parameters``.
    window_size, steps_to_take : int
        Window geometry; used for the per-window bounds and also forwarded as
        keyword arguments to ``initialize_training_parameters``.
    pop : str
        Dataset identifier recorded in the results.
    print_df : bool
        When true (the default), the filtered label/prediction frame of each
        window is printed. It used to be printed unconditionally.
    FP_penalty_enabled : bool
        Forwarded to ``generate_results``.
    **kwargs
        Forwarded to ``initialize_training_parameters``.

    Returns
    -------
    pandas.DataFrame
        One row per window and per step of the evaluation label period.
    """
    (df_metrics_eval, time_total, scaler, resample, classifiers_dictionary) = initialize_training_parameters(
        arq_acessos, random_state, window_size=window_size, steps_to_take=steps_to_take, **kwargs
    )

    is_online = classifier_name == "ONLINE"
    window_results = []

    for time_window in time_total:
        (initial_train, end_train, initial_train_label, end_train_label, initial_evaluation, end_evaluation, initial_evaluation_label, end_evaluation_label) = get_all_windows(time_window, window_size, steps_to_take)

        clf = None
        if not is_online:
            clf = train_ML(initial_train, end_train, initial_train_label, end_train_label, resample, classifier_name, classifiers_dictionary)

        acc_eval, eval_label = evaluate(initial_evaluation, end_evaluation, initial_evaluation_label, end_evaluation_label)
        y_hat_obj_red = predict(classifier_name, acc_eval, clf, initial_evaluation, end_evaluation)

        first_col, last_col = acc_eval.columns[0], acc_eval.columns[-1]

        # Keep the NameSpace index of the evaluated objects. The frame used to
        # be built on a fresh positional index, so the volume/access tables
        # below were aligned against row positions: whenever an object had
        # been filtered out, or NameSpace was not 0..N-1, rows were matched to
        # the wrong object or could not be aligned at all.
        # Assumes NameSpace values are unique — TODO confirm.
        df = pd.DataFrame({"label": eval_label})
        # An array (classifier output) is taken positionally; a Series (the
        # "ONLINE" predictions over all objects) is reindexed to these objects.
        df["pred"] = pd.Series(y_hat_obj_red, index=eval_label.index)

        # Future volume and accesses over the evaluation label period,
        # restricted to the evaluated objects.
        vol_future = arq_vol_bytes.iloc[:, initial_evaluation_label:end_evaluation_label].loc[df.index]
        acc_future = arq_acessos.iloc[:, initial_evaluation_label:end_evaluation_label].loc[df.index]
        df["total_vol"] = vol_future.sum(axis=1)

        # Drop objects that have no volume anywhere in the label period.
        has_volume = df["total_vol"] > 0
        acc_with_no_zeroes = acc_future[has_volume]
        vol_with_no_zeroes = vol_future[has_volume]
        df_with_no_zeroes = df[has_volume]

        if print_df:
            print(df_with_no_zeroes)

        # One result row per step of the label period.
        for i in range(steps_to_take):
            df_to_evaluate = pd.DataFrame(
                {
                    "vol_bytes": vol_with_no_zeroes.iloc[:, i],
                    "acc_fut": acc_with_no_zeroes.iloc[:, i],
                    "label": df_with_no_zeroes["label"],
                    "pred": df_with_no_zeroes["pred"],
                }
            )
            window_results.append(
                generate_results(
                    df_to_evaluate,
                    f"{first_col}:{last_col}",
                    pop,
                    classifier_name,
                    FP_penalty_enabled,
                )
            )

    if not window_results:
        return df_metrics_eval
    # Concatenate once instead of growing the frame inside the loop.
    return pd.concat([df_metrics_eval, *window_results])

In [149]:
# Experiment configuration: dataset suffix, classifier key and window geometry.
pop = "Pop4"
clf_name = "LR"
window_size = 4
steps_to_take = 4

# TODO: aggregate results? (candidate for the next parameter)
# Enable plotting.
# NOTE(review): print_df is not passed to run_train_eval below.
print_df = False
# Penalty for false positives.
# NOTE(review): FP_penalty_enabled is not passed to run_train_eval below.
FP_penalty_enabled = True  # input parameter

# Load the three tables for the selected dataset.
arq_acessos, arq_classes, arq_vol_bytes = read_arqs(pop)
costs = pd.DataFrame()
results = {}
random_state = 42
progressionBarDescription = None

# Containers for the over-sampling and SMOTE variants; not filled in this cell.
costs_over = pd.DataFrame()
results_over = {}
costs_smote = pd.DataFrame()
results_smote = {}

# Run the sliding-window experiment, then total the three cost columns over
# all result rows for this classifier.
results[clf_name] = run_train_eval(clf_name, random_state, window_size, steps_to_take, pop)
costs[clf_name] = results[clf_name][["cost ml", "cost all hot", "cost opt"]].sum()

   label  pred
0      1     1
1      0     0
2      1     1
3      1     1
   label  pred     total_vol
0      1     1  4.294967e+09
1      0     0  4.294967e+09
2      1     1  4.294967e+09
3      1     1  4.294967e+09
   label  pred
0      0     1
1      0     0
2      1     1
3      1     1
   label  pred     total_vol
0      0     1  4.294967e+09
1      0     0  4.294967e+09
2      1     1  4.294967e+09
3      1     1  4.294967e+09


In [150]:
# Show the summed classifier, always-hot and optimal costs, one column per classifier.
print(costs)

                    LR
cost ml       0.702905
cost all hot  0.736905
cost opt      0.600405
