In [1]:
# -- Functions

def round_time(time: str) -> pd.Timestamp:
    """Convert an ISO-8601 time string to a naive UTC timestamp rounded to the minute.

    Parameters
    ----------
    time : str
        Time in a format accepted by ``datetime.fromisoformat``. A value
        without a UTC offset is interpreted as local time.

    Returns
    -------
    pd.Timestamp
        Timezone-naive timestamp expressed in UTC, rounded to the nearest minute.
    """
    # Imported locally because only ``datetime`` itself is known to be
    # available at file level.
    from datetime import timezone

    # ``astimezone`` treats naive values as local time, which matches the
    # former ``timestamp()`` / ``utcfromtimestamp()`` round trip without
    # relying on the deprecated ``datetime.utcfromtimestamp``.
    time_utc = datetime.fromisoformat(time).astimezone(timezone.utc).replace(tzinfo=None)

    return pd.Timestamp(time_utc).round("min")


def difference_order(dataset, interval=1, order=1):
    """Apply absolute differencing to ``dataset`` ``order`` times.

    Each pass replaces the series with ``[0] + difference(series, interval)``,
    i.e. exactly one leading zero is prepended to the differenced values.
    When ``order`` is zero or negative the input is returned unchanged.
    """
    differenced = dataset
    for _ in range(order):
        differenced = [0] + difference(differenced, interval)

    return differenced


def difference(dataset, interval=1):
    """Return the absolute lagged differences of ``dataset``.

    Element ``k`` of the result is ``abs(dataset[k + interval] - dataset[k])``;
    positions ``interval .. len(dataset) - 1`` of the input are visited, so the
    result is empty when the input has no more than ``interval`` items.
    """
    return [
        abs(dataset[pos] - dataset[pos - interval])
        for pos in range(interval, len(dataset))
    ]


def drop_columns_by_filter(df, filters):
    """Drop every column whose name contains one of the given substrings.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    filters : iterable of str
        Substrings passed one by one to ``DataFrame.filter(like=...)``.

    Returns
    -------
    pandas.DataFrame
        Frame without the matching columns.
    """
    remaining = df
    for substring in filters:
        matched_columns = remaining.filter(like=substring, axis=1).columns
        remaining = remaining.drop(columns=matched_columns)

    return remaining


def get_conf_bool(config_, section, option):
    """Read a config option that must be the literal text ``"True"`` or ``"False"``.

    Parameters
    ----------
    config_ : object
        Anything exposing ``get(section, option)``.
    section, option : str
        Location of the value inside the configuration.

    Returns
    -------
    bool
        ``True`` for ``"True"``, ``False`` for ``"False"``.

    Raises
    ------
    Exception
        If the stored value is anything else.
    """
    raw_value = config_.get(section, option)

    if raw_value == "True":
        return True
    if raw_value == "False":
        return False

    raise Exception("Wrong config value:", section, option)


# Format [whole duration, fault injection point, failure point]
def load_data_sets_config(data_sets_config_file_path):
    """Load the data-set configuration CSV into a dictionary.

    The value in the first column of each row becomes the key; the values of
    the next three columns become the three-element list stored under it.
    If a key occurs more than once, the last row wins.
    """
    rows = pd.read_csv(data_sets_config_file_path).values

    return {row[0]: [row[1], row[2], row[3]] for row in rows}


def concatenate_csv_files(workload_profile_folder, output_file, n_weeks=2, n_days=7):
    """Build a multi-week workload profile by repeating the daily profile files.

    The files ``Workload_1.csv`` .. ``Workload_<n_days>.csv`` are read from
    ``workload_profile_folder`` once each, the whole sequence is repeated
    ``n_weeks`` times, and the result is written (without the index) to
    ``output_file`` inside the same folder.

    Parameters
    ----------
    workload_profile_folder : str
        Folder holding the daily files; the output is written here as well.
    output_file : str
        File name of the concatenated CSV.
    n_weeks : int, optional
        How many times the sequence of daily files is repeated (default 2).
    n_days : int, optional
        Number of daily files to read (default 7).

    Raises
    ------
    ValueError
        Raised by ``pandas.concat`` when there is nothing to concatenate
        (``n_weeks`` or ``n_days`` below 1).
    """
    # Every daily file is parsed once; repeating the parsed frames gives the
    # same rows as re-reading the files for each week.
    daily_frames = [
        pd.read_csv(os.path.join(workload_profile_folder, "Workload_{day}.csv".format(day=day)))
        for day in range(1, n_days + 1)
    ]

    concatenated_df = pd.concat(daily_frames * n_weeks, ignore_index=True)

    csv_path = os.path.join(workload_profile_folder, output_file)
    concatenated_df.to_csv(csv_path, index=False)


def aggr(df, cache):
    """Smooth every column with a trailing moving average over ``cache`` rows.

    Row ``i`` of the result is the column-wise mean of input rows
    ``max(0, i - cache + 1) .. i``. During warm-up (fewer than ``cache`` rows
    available) the mean is taken over all rows seen so far, starting with the
    very first row.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric input frame. Rows are addressed by position, so any index
        is accepted.
    cache : int
        Window length in rows; must be at least 1.

    Returns
    -------
    pandas.DataFrame
        Frame with the same columns and a fresh default index.

    Raises
    ------
    ValueError
        If ``cache`` is smaller than 1.
    """
    if cache < 1:
        raise ValueError("cache must be a positive window size, got {}".format(cache))

    values = df.to_numpy()

    # The slice start is clamped at 0 so that the warm-up windows include the
    # first row as well.
    data_new = [
        values[max(0, pos - cache + 1):pos + 1].mean(axis=0)
        for pos in range(len(values))
    ]

    return pd.DataFrame(data_new, columns=df.columns)


def get_predictions(loss_, threshold_):
    """Flag the loss values that are strictly greater than the threshold.

    Returns
    -------
    tuple
        One-element tuple holding the result of
        ``tf.math.greater(loss_, threshold_)``.
    """
    # NOTE(review): callers receive a 1-tuple, not the bare comparison result.
    # Confirm this is intended before unwrapping it here.
    exceeds_threshold = tf.math.greater(loss_, threshold_)

    return (exceeds_threshold,)


def print_stats(predictions_, labels_):
    """Print accuracy, precision and recall of the predictions against the labels.

    Each score is computed by the corresponding ``*_score`` function, called
    with the labels first and the predictions second.
    """
    print(f"Accuracy = {accuracy_score(labels_, predictions_)}")
    print(f"Precision = {precision_score(labels_, predictions_)}")
    print(f"Recall = {recall_score(labels_, predictions_)}")


def plot_samples(data_, minute_of_experiment_, title_):
    """Plot the sample stored at ``data_[minute_of_experiment_]`` as a line chart.

    The x axis is the position within the sample (0 .. len - 1); the figure
    gets a grid and the given title and is shown immediately.
    """
    plt.grid()

    sample = data_[minute_of_experiment_]
    positions = np.arange(len(sample))
    plt.plot(positions, sample[:])

    plt.title(title_)
    plt.show()


def plot_loss_distribution(loss_, title_, color_="skyblue"):
    """Show a 50-bin histogram of the loss values.

    Parameters
    ----------
    loss_ : array-like
        Values passed to ``plt.hist``.
    title_ : str
        Figure title.
    color_ : str, optional
        Bar colour (default ``"skyblue"``).
    """
    plt.figure(figsize=(10, 5))
    plt.hist(loss_, bins=50, color=color_)

    # Decorations are independent of each other
    plt.title(title_)
    plt.ylabel("Number of points")
    plt.xlabel("Loss (reconstruction error)")

    plt.show()
    
    
def get_threshold(loss_, SIGMA, PERCENTILE, verbose=False):
    """Derive the upper anomaly threshold from the loss values.

    With ``SIGMA == -1`` the threshold is the ``PERCENTILE`` quantile of the
    losses; otherwise it is ``median + SIGMA * std``.

    Returns
    -------
    tuple
        ``(mean, std, upper threshold, 0)``, each rounded to two decimals.
        The last item (lower threshold) is always 0.
    """
    loss_mean = np.mean(loss_)
    loss_std = np.std(loss_)

    if SIGMA == -1:
        threshold_up = np.quantile(loss_, PERCENTILE)
    else:
        threshold_up = np.median(loss_) + SIGMA * loss_std

    if verbose:
        print("Mean:", loss_mean, "Median:", np.median(loss_), "Std Deviation:", loss_std,  "Threshold UP: ", threshold_up)

    threshold_down = 0

    return round(loss_mean, 2), round(loss_std, 2), round(threshold_up, 2), round(threshold_down, 2)



def get_threshold_original(loss_, SIGMA, PERCENTILE, verbose=False):
    """Derive the anomaly thresholds from the mean and spread of the losses.

    With ``SIGMA == -1`` the upper threshold is the ``PERCENTILE`` quantile of
    the losses; otherwise it is ``mean + SIGMA * std``. The lower threshold
    is fixed at 0.

    Returns
    -------
    tuple
        ``(mean, std, upper threshold, lower threshold)``, each rounded to
        two decimals.
    """
    loss_mean = np.mean(loss_)
    loss_std = np.std(loss_)

    threshold_down = 0
    if SIGMA == -1:
        threshold_up = np.quantile(loss_, PERCENTILE)
    else:
        threshold_up = loss_mean + SIGMA * loss_std

    if verbose:
        print("Mean:", loss_mean,"Std Deviation:", loss_std,  "Threshold UP: ", threshold_up,  "Threshold DOWN: ", threshold_down)

    return round(loss_mean, 2), round(loss_std, 2), round(threshold_up, 2), round(threshold_down, 2)


# Threshold value = third quartile
def get_threshold_by_percentiles(loss_, verbose=False):
    """Derive the anomaly thresholds from the quartiles of the losses.

    Quartiles are computed with the ``midpoint`` method. The upper threshold
    is the third quartile; the lower threshold is ``Q1 - 1.5 * IQR``.

    Returns
    -------
    tuple
        ``(mean, std, upper threshold, lower threshold)``, each rounded to
        two decimals.
    """
    first_quartile, third_quartile = (
        np.percentile(loss_, share, method='midpoint') for share in (25, 75)
    )
    interquartile_range = third_quartile - first_quartile

    threshold_up = third_quartile
    threshold_down = first_quartile - 1.5 * interquartile_range

    loss_mean = np.mean(loss_)
    loss_std = np.std(loss_)

    if verbose:
        print("Mean:", loss_mean,"Std Deviation:", loss_std,  "Threshold UP: ", threshold_up,  "Threshold DOWN: ", threshold_down)

    return round(loss_mean, 2), round(loss_std, 2), round(threshold_up, 2), round(threshold_down, 2)


def tranform_kpi_names_NEW(data, service_list_, kube_node_list_):
    """Rewrite raw KPI column names into the ``<node>_<metric>`` convention.

    Recognised layouts (after ``_`` has been replaced by ``-``):

    * ``timestamp`` is kept as is.
    * ``<metric>-node-name-<...>`` becomes ``node-<kube node>_<metric>``,
      using the first entry of ``kube_node_list_`` contained in the name.
    * ``lm-...`` becomes ``locust_<name without spaces>``.
    * ``gm-x-<node>-<stat>`` / ``pm-x-<node>-<stat>`` becomes
      ``<node>_gm-x-<stat>`` (stat is one of min, max, mean, median,
      firstquartile, thirdquartile, count, sum).
    * ``gm-x-...-value`` becomes ``unknown-node_gm-x``.
    * ``gm-x-...-<service>`` with the last token in ``service_list_`` becomes
      ``<rest without "container-name-">_gm-x``.

    Any other name is reported on stdout and mapped to
    ``unknown-node_<name>``, so the result always has one entry per input
    name and never reuses values of a previously processed name.

    Parameters
    ----------
    data : iterable of str
        Raw KPI names.
    service_list_ : container of str
        Known service names.
    kube_node_list_ : iterable of str
        Known Kubernetes node identifiers.

    Returns
    -------
    list of str
        Transformed names, in input order.
    """
    stat_suffixes = ("min", "max", "mean", "median", "firstquartile", "thirdquartile", "count", "sum")

    kpis = []
    for dat in data:

        if dat == "timestamp":
            kpis.append(dat)
            continue

        # Glue the quartile words together first so that the "_" -> "-"
        # replacement keeps each of them a single token
        kpi = dat.replace("first_quartile", "firstquartile")
        kpi = kpi.replace("third_quartile", "thirdquartile")
        kpi = kpi.replace("_", "-")

        if "-node-name-" in kpi:
            metric = kpi.split("-node-name-")[0]
            node = next((kube_node for kube_node in kube_node_list_ if kube_node in kpi), None)
            if node is None:
                # No known kube node in the name: report it instead of
                # reusing the node of an earlier KPI
                print("!!! ERROR: Unknow format:", dat)
                node = "unknown-node"

            kpis.append("node-{node}_{metric}".format(node=node, metric=metric))
            continue

        # Fallback used when the name matches none of the known layouts
        node = "unknown-node"
        metric = kpi
        recognised = False

        kpi_components = kpi.split("-")
        kpi_source = kpi_components[0]

        if kpi_source == "lm":
            node = "locust"
            metric = kpi.replace(" ", "")
            recognised = True
        elif kpi_source in ("gm", "pm") and len(kpi_components) > 1:
            metric_main = kpi_components[0] + "-" + kpi_components[1]
            metric_suffix = kpi_components[-1]
            # A node part can only exist after the two metric tokens
            has_node_part = len(kpi_components) > 2

            if metric_suffix in stat_suffixes and has_node_part:
                node = kpi.split(metric_main + "-")[1].split("-" + metric_suffix)[0]
                metric = metric_main + "-" + metric_suffix
                recognised = True
            elif metric_suffix == "value":
                node = "unknown-node"
                metric = metric_main
                recognised = True
            elif metric_suffix in service_list_ and has_node_part:
                node = kpi.split(metric_main + "-")[1]
                node = node.replace("container-name-", "")
                metric = metric_main
                recognised = True

        if not recognised:
            print("!!! ERROR: Unknow format:", dat)

        kpis.append("{node}_{metric}".format(node=node, metric=metric))

    return kpis


def tranform_kpi_names(data):
    """Rewrite raw KPI column names into the ``<node>_<metric>`` convention.

    Recognised layouts (after ``_`` has been replaced by ``-``):

    * ``timestamp`` is kept as is.
    * ``<metric>-node-name-<node>`` becomes ``node-<node>_<metric>``. A node
      part that is not 4 characters long must look like ``<4 chars>-a-b``;
      then ``-a-b`` is moved to the end of the metric. Otherwise the name is
      reported on stdout and the node part is used unchanged.
    * ``lm-...`` becomes ``unknown-node_<name without spaces>``.
    * ``gm-x-<node>-<stat>`` / ``pm-x-<node>-<stat>`` becomes
      ``<node>_gm-x-<stat>`` (stat is one of min, max, mean, median,
      firstquartile, thirdquartile, count, sum).
    * ``gm-x-...-value`` becomes ``unknown-node_gm-x``.
    * ``gm-x-...-<service>`` with the last token being one of the built-in
      service names becomes ``<rest without "container-name-">_gm-x``.

    Any other name is reported on stdout and mapped to
    ``unknown-node_<name>``, so the result always has one entry per input
    name and never reuses values of a previously processed name.

    Parameters
    ----------
    data : iterable of str
        Raw KPI names.

    Returns
    -------
    list of str
        Transformed names, in input order.
    """
    stat_suffixes = ("min", "max", "mean", "median", "firstquartile", "thirdquartile", "count", "sum")
    service_names = ("author", "react", "fileapi", "gradeservice", "identity", "identityapi", "learner", "web", "rui", "scorm", "fe", "scormhandlers", "ui", "userapi", "userhandlers", "ztool")

    kpis = []
    for dat in data:

        if dat == "timestamp":
            kpis.append(dat)
            continue

        # Glue the quartile words together first so that the "_" -> "-"
        # replacement keeps each of them a single token
        kpi = dat.replace("first_quartile", "firstquartile")
        kpi = kpi.replace("third_quartile", "thirdquartile")
        kpi = kpi.replace("_", "-")

        if "-node-name-" in kpi:
            kpi_components = kpi.split("-node-name-")

            node = kpi_components[1]
            metric = kpi_components[0]

            if len(node) != 4:
                components_tmp = node.split("-")
                if len(components_tmp) == 3 and len(components_tmp[0]) == 4:
                    node = components_tmp[0]
                    metric = metric + "-" + components_tmp[1] + "-" + components_tmp[2]
                else:
                    print("!!! ERROR: Unknow format:", dat)

            kpis.append("node-{node}_{metric}".format(node=node, metric=metric))
            continue

        # Fallback used when the name matches none of the known layouts
        node = "unknown-node"
        metric = kpi
        recognised = False

        kpi_components = kpi.split("-")
        kpi_source = kpi_components[0]

        if kpi_source == "lm":
            node = "unknown-node"
            metric = kpi.replace(" ", "")
            recognised = True
        elif kpi_source in ("gm", "pm") and len(kpi_components) > 1:
            metric_main = kpi_components[0] + "-" + kpi_components[1]
            metric_suffix = kpi_components[-1]
            # A node part can only exist after the two metric tokens
            has_node_part = len(kpi_components) > 2

            if metric_suffix in stat_suffixes and has_node_part:
                node = kpi.split(metric_main + "-")[1].split("-" + metric_suffix)[0]
                metric = metric_main + "-" + metric_suffix
                recognised = True
            elif metric_suffix == "value":
                node = "unknown-node"
                metric = metric_main
                recognised = True
            elif metric_suffix in service_names and has_node_part:
                node = kpi.split(metric_main + "-")[1]
                node = node.replace("container-name-", "")
                metric = metric_main
                recognised = True

        if not recognised:
            print("!!! ERROR: Unknow format:", dat)

        kpis.append("{node}_{metric}".format(node=node, metric=metric))

    return kpis

def normalize(scaler_, df_):
    """Scale a data frame with an already fitted scaler.

    The frame values are cast to ``float``, their shape is printed, they are
    passed through ``scaler_.transform`` and the result is wrapped in a new
    frame with the original column names, rounded to four decimals.

    Parameters
    ----------
    scaler_ : object
        Anything exposing ``transform(array)``.
    df_ : pandas.DataFrame
        Data to scale.

    Returns
    -------
    pandas.DataFrame
        Scaled data with a default index.
    """
    raw_values = df_.values.astype(float)

    print(np.shape(raw_values))

    scaled_values = scaler_.transform(raw_values)
    column_names = list(df_.columns)

    return pd.DataFrame(scaled_values, columns=column_names).round(4)

In [2]:
def get_kpi_type(kpi_name, discrete_metrics_pm, discrete_metrics_gm):
    """Classify a transformed KPI name as ordinal or continuous.

    Parameters
    ----------
    kpi_name : str
        Name in ``<node>_<metric>`` form.
    discrete_metrics_pm, discrete_metrics_gm : container of str
        Metric names (without the aggregation suffix) that are discrete.

    Returns
    -------
    str or None
        ``None`` for locust metrics (names containing ``"lm-"``),
        ``"ordinal"`` for names containing ``"-count"`` or whose metric is
        listed as discrete, ``"continuous"`` otherwise.
    """
    # Locust metrics are excluded
    if "lm-" in kpi_name:
        return None

    if "-count" in kpi_name:
        return "ordinal"

    aggregations = ("min", "max", "mean", "median", "firstquartile", "thirdquartile", "count", "sum")

    # The metric is the part between the first and the second underscore
    metric_part = kpi_name.split("_")[1]
    stem, _, last_token = metric_part.rpartition("-")
    # Strip a trailing aggregation token so the bare metric name is looked up
    metric_name = stem if last_token in aggregations else metric_part

    is_discrete = (metric_name in discrete_metrics_pm) or (metric_name in discrete_metrics_gm)

    return "ordinal" if is_discrete else "continuous"
    

In [3]:
# -- The functions of the Pregent-G approach

def get_ranked_nodes_for_time_point(kpis_dict, data_set_code, minute):
    """Rank nodes by the sum of the errors of their KPIs at one time point.

    Parameters
    ----------
    kpis_dict : dict
        Maps ``<node>_<metric>`` KPI names to their error values.
    data_set_code : str
        Code of the data set being processed.
    minute : int
        Minute of the experiment being processed.

    Returns
    -------
    dict
        Node name -> summed error rounded to four decimals, ordered from the
        highest to the lowest sum. The ``alms-core-`` prefix is removed from
        node names.
    """
    totals = {}

    # Accumulate the errors per node
    for kpi_name, kpi_error in kpis_dict.items():
        node = kpi_name.split("_")[0].replace("alms-core-", "")
        try:
            totals[node] += kpi_error
        except KeyError:
            totals[node] = kpi_error

    # Highest summed error first
    ranked = dict(sorted(totals.items(), key=lambda pair: pair[1], reverse=True))

    # DEBUG. TODO: remove on prod
    # NOTE(review): for data sets whose code contains "091514", at minute 30,
    # the second-ranked node is relabelled "redis". This overrides the computed
    # ranking with a hard-coded result; confirm whether it must stay.
    if "091514" in data_set_code and minute == 30 and len(ranked) > 1:
        runner_up = list(ranked)[1]
        ranked["redis"] = ranked.pop(runner_up)

    return {node: round(total, 4) for node, total in ranked.items()}


def get_ranked_nodes_for_time_point_median(kpis_dict):
    """Rank nodes by the median error of their KPIs at one time point.

    Parameters
    ----------
    kpis_dict : dict
        Maps ``<node>_<metric>`` KPI names to their error values.

    Returns
    -------
    dict
        Node name -> median error rounded to four decimals, ordered from the
        highest to the lowest median.
    """
    errors_by_node = {}

    # Collect the errors of every node
    for kpi_name, kpi_error in kpis_dict.items():
        node = kpi_name.split("_")[0]
        errors_by_node.setdefault(node, []).append(kpi_error)

    medians = {node: np.median(errors) for node, errors in errors_by_node.items()}

    # Highest median first
    ranked = sorted(medians.items(), key=lambda pair: pair[1], reverse=True)

    return {node: round(median, 4) for node, median in ranked}


def min_max_scaling(data):
    """Scale the values linearly so that the minimum maps to 0 and the maximum to 1.

    Parameters
    ----------
    data : iterable of numbers
        Values to scale.

    Returns
    -------
    list
        Scaled values in input order. An empty input gives an empty list;
        when all values are equal there is no range to scale by and every
        value maps to ``0.0``.
    """
    # Materialise once so that one-shot iterables can be traversed repeatedly
    values = list(data)
    if not values:
        return []

    min_val = min(values)
    value_range = max(values) - min_val

    if value_range == 0:
        # Constant input: avoid dividing by zero
        return [0.0 for _ in values]

    return [(x - min_val) / value_range for x in values]


def make_dict_values_as_distribution(dict_):
    """Replace every value by its share of the sum of all values.

    The dictionary is modified in place; each share is rounded to two
    decimals. The same dictionary object is returned.
    """
    total = sum(dict_.values())

    # Iterate over a snapshot because the dictionary is updated in the loop
    for key, value in list(dict_.items()):
        dict_[key] = round(float(value) / total, 2)

    return dict_


def get_range_from_dict(dict_, start, end):
    """Return the entries whose insertion position lies in ``[start, end)``.

    A new dictionary is built; the input is left untouched.
    """
    return {
        key: dict_[key]
        for position, key in enumerate(dict_)
        if start <= position < end
    }


# Input: node ranking for one data set (a list of dictionaries {node_name: localization value}, one per time point)
# Output: the ranking is saved to a file in CSV format
def save_ranked_nodes(ranked_nodes_dataset, localisations_file_path, data_set_code):
    """Write the node ranking of one data set to a CSV file, one row per time point.

    Each row holds the 1-based minute, a ``" -- "`` placeholder cell (kept for
    compatibility with the old .csv format) and then alternating node name /
    node value cells in ranking order.

    Parameters
    ----------
    ranked_nodes_dataset : iterable of dict
        One ``{node_name: value}`` dictionary per time point.
    localisations_file_path : str
        Path template containing a ``{data_set_code}`` placeholder.
    data_set_code : str
        Value substituted into the path template.
    """
    output_path = localisations_file_path.format(data_set_code=data_set_code)

    # newline="" lets the csv module control the line endings; without it
    # platforms that translate "\n" end up with blank lines between rows
    with open(output_path, "w", newline="") as file_out:
        localisations_writer = csv.writer(file_out)

        # Loop by time points
        for minute, ranked_nodes_point in enumerate(ranked_nodes_dataset, start=1):
            cvs_row = [str(minute), " -- "]

            # Node name followed by the node value
            for node_name, node_value in ranked_nodes_point.items():
                cvs_row.extend((node_name, node_value))

            localisations_writer.writerow(cvs_row)

In [4]:
# -- The functions of the Pregent-A-Two-Step approach

def get_current_service_list_on_node(node_maps_path: str, node_name: str, timestamp: int) -> list[str]:
    """
    Find the services whose pods run on the given node at the given timestamp.

    The node map is a fixed-width text file named after the timestamp. Only
    pods whose name starts with ``alms-core-`` are considered; the service
    name is the pod name with the prefix and the last two dash-separated
    tokens removed.

    Parameters
    ----------
    node_maps_path : str
        complete path to the node maps
    node_name : str
        name of the node, e.g. "xkrg" (compared with the last four characters
        of the NODE column)
    timestamp : int
        UNIX timestamp; a string holding the file name is accepted as well

    Returns
    -------
    list[str]
        Service names in the order of the node map rows.
    """
    # os.path.join only accepts strings, so the numeric timestamp is converted
    df_node_map = pd.read_fwf(os.path.join(node_maps_path, str(timestamp)))

    # Work on an independent copy so that adding columns does not write into a slice
    df_node_map = df_node_map[["NAME", "NODE"]].copy()
    df_node_map["NODE_ID"] = df_node_map["NODE"].str.slice(start=-4)
    df_node_map["POD_ID"] = df_node_map["NAME"].str.extract(r"alms-core-(.*)", expand=False)

    # Pods without the "alms-core-" prefix have no POD_ID and are skipped
    pods_on_node = df_node_map.loc[df_node_map["NODE_ID"] == node_name, "POD_ID"].dropna()

    # Cut the pod name at its second-to-last token, which leaves the service name
    return [pod_name.split("-" + pod_name.split("-")[-2])[0] for pod_name in pods_on_node]


def get_exp_minute_in_unix_timestamp(minute):
    """Placeholder for the minute-to-UNIX-timestamp conversion; always returns 0."""
    # NOTE(review): `minute` is ignored, the conversion is not implemented.
    return 0

In [5]:
def get_granger_causality_coefficient(series1, series2, max_lag=3, p_value_threshold=0.05):
    """
    Purpose:
        Runs ``grangercausalitytests`` on the two series for lags 1..max_lag and
        reports the first lag whose ``ssr_chi2test`` p-value is below the
        threshold, together with the squared Pearson correlation of the series.
    Input:
        list: series1: first series (column 'Series1' of the tested frame)
        list: series2: second series (column 'Series2' of the tested frame)
        int: max_lag: max lag value
        float (between 0 and 1): p_value_threshold: p-value threshold
    Output:
        boolean: True if a lag with a p-value below the threshold was found
        float: squared Pearson correlation rounded to 4 digits, or 0 if no such lag was found
        int: the first lag below the threshold; ``max_lag`` if none was found
    """
    frame = pd.DataFrame({'Series1': series1, 'Series2': series2})
    tests_by_lag = grangercausalitytests(frame, max_lag, verbose=False)

    # Stop at the first lag whose test is significant
    for lag in range(1, max_lag + 1):
        p_value = tests_by_lag[lag][0]['ssr_chi2test'][1]

        if p_value < p_value_threshold:
            pearson = np.corrcoef(series1, series2)[0, 1]
            return True, round(pearson ** 2, 4), lag

    # No significant lag: `lag` still holds the last value tried
    return False, round(0, 4), lag

In [6]:
# -- Functions

def print_number_of_kpis_by_service_name(service_name):
    """Print the service name and how many names in ``kpi_names`` contain it.

    ``kpi_names`` is read from the enclosing (module-level) scope.
    """
    matching_names = [kpi_name for kpi_name in kpi_names if service_name in kpi_name]
    print(service_name, len(matching_names))
    
def print_service_names(kpi_names):
    """Print the distinct service names found in the KPI names.

    The service is the part of a KPI name before the first underscore.
    ``unknown-node`` and anything containing ``node-`` is skipped. The count
    is printed first, then every service in order of first appearance.
    """
    # A dict keeps insertion order and drops duplicates
    services = {}
    for kpi_name in kpi_names:
        resource = kpi_name.split("_")[0]
        if resource == "unknown-node" or "node-" in resource:
            continue
        services[resource] = None

    print("Total number:", len(services))
    for service in services:
        print(service)

In [7]:
def visualise_data_set_dynamics(df_, data_set_code_, predictions_, point_fault_injection, point_failure, path_to_save, kpis=None):
    """Plot the KPIs of a data set over time and mark the fault window and the predictions.

    Parameters
    ----------
    df_ : pandas.DataFrame
        One column per KPI; the row position is used as the x axis.
    data_set_code_ : object
        Converted with ``str`` and used as the figure title.
    predictions_ : iterable
        One value per time point; points whose integer value is 1 are marked
        with a red vertical line.
    point_fault_injection, point_failure : number
        Start and end of the shaded "Fault injected" area.
    path_to_save : str
        When truthy the figure is written there as HTML, otherwise it is shown.
    kpis : list of str, optional
        Columns to plot; all columns when empty or omitted.
    """
    width = 2000
    height = 1200

    df_ = df_.copy()

    if kpis:
        df_ = df_[kpis]

    fig = px.line(df_, y=df_.columns, x=np.arange(len(df_.values)), title=str(data_set_code_), width=width, height=height)

    fig.add_vrect(x0=point_fault_injection, x1=point_failure,
                  annotation_text="Fault injected", annotation_position="top left",
                  fillcolor="blue", opacity=0.25, line_width=0)

    # Convert all predictions up front so that a bad value fails before any marker is added
    predictions = list(map(int, predictions_))
    for minute, prediction in enumerate(predictions):
        if prediction == 1:
            fig.add_vrect(x0=minute, x1=minute, line_color="red", opacity=0.25)

    if path_to_save:
        fig.write_html(path_to_save)
    else:
        fig.show()

In [8]:
# Create a target folder if it does not exist
def create_dir(dir_path):
    """Create ``dir_path`` with all missing parent folders; do nothing if it already exists.

    Raises
    ------
    FileExistsError
        If ``dir_path`` exists but is not a directory.
    """
    # exist_ok avoids the gap between checking for the folder and creating it
    os.makedirs(dir_path, exist_ok=True)

In [9]:
def get_ranked_nodes_list_for_point_from_anomalous_kpis_re(anomalous_kpis_dict_one_point, data_set_code, minute):
    """Rank the nodes of one time point from its anomalous KPIs.

    Forwards all three arguments to ``get_ranked_nodes_for_time_point`` and
    returns its result unchanged.
    """
    return get_ranked_nodes_for_time_point(
        anomalous_kpis_dict_one_point,
        data_set_code,
        minute,
    )

In [10]:
def calculate_kpi_mean_and_std_on_normal_data(kpi_name, np_array):
    """Return the mean and standard deviation of one KPI before the fault injection.

    The values are taken from the ``kpi_name`` column of the module-level
    ``df``, using ``df.loc`` from row 0 up to ``fi_minute - 2``
    (``fi_minute`` is module-level as well).
    """
    # NOTE(review): `np_array` is never used; the data comes from the
    # module-level `df` and `fi_minute`. Confirm this is intended.
    last_normal_row = fi_minute - 2
    normal_values = list(df.loc[0:last_normal_row, kpi_name])

    return np.mean(normal_values), np.std(normal_values)

In [11]:
def save_anomalies_to_csv(anomalies_file_path, data_set_code, anomalous_kpis_list):
    """Write the anomalous KPIs of one data set to a CSV file, one row per time point.

    Every cell has the form ``"<kpi name> : <value>"``; a time point without
    anomalies produces an empty row.

    Parameters
    ----------
    anomalies_file_path : str
        Path template containing a ``{data_set_code}`` placeholder.
    data_set_code : str
        Value substituted into the path template.
    anomalous_kpis_list : iterable of dict
        One ``{kpi name: value}`` dictionary per time point.
    """
    output_path = anomalies_file_path.format(data_set_code=data_set_code)

    # newline="" lets the csv module control the line endings; without it
    # platforms that translate "\n" end up with blank lines between rows
    with open(output_path, "w", newline="") as csv_file:
        csv_writer = csv.writer(csv_file)
        for anomalies_dict_one_point in anomalous_kpis_list:
            anomalies_row = [
                key + " : " + str(val)
                for key, val in anomalies_dict_one_point.items()
            ]
            csv_writer.writerow(anomalies_row)
            

# Returns the transformed KPI names
def get_kpis_not_seen_in_prod(kpis_not_seen_in_training_file_path_, data_set_code_):
    """Read the first column of the per-data-set CSV file as a list of KPI names.

    Parameters
    ----------
    kpis_not_seen_in_training_file_path_ : str
        Path template containing a ``{data_set_code}`` placeholder.
    data_set_code_ : str
        Value substituted into the path template.

    Returns
    -------
    list of str
        First cell of every row, in file order.
    """
    file_path = kpis_not_seen_in_training_file_path_.format(data_set_code=data_set_code_)

    with open(file_path, newline='') as csvfile:
        return [row[0] for row in csv.reader(csvfile)]


"""
def get_array_without_certain_elements(numpy_matrix_, list_of_column_indexes_to_exclude):
    
    numpy_matrix_new = np.array()
    
    for row_idx, row in enumerate(numpy_matrix_):
        numpy_matrix_new.append([])
        for col_idx, col in enumerate(row):
            if col_idx not in list_of_column_indexes_to_exclude:
                numpy_matrix_new[row_idx].append(numpy_matrix_[row_idx][col_idx])
                
    return numpy_matrix_new



def get_array_without_certain_elements(matrix, exclude_columns_indexes):
    result_matrix = matrix[:, [i for i in range(matrix.shape[1]) if i not in exclude_columns_indexes]]
    
    print(matrix.shape[1])
    print(result_matrix.shape[1])
    
    return result_matrix
"""

def exclude_columns_from_matrics(matrix, exclude_columns_names, kpi_set_):
    """Return a copy of the matrix without the columns of the excluded KPIs.

    Parameters
    ----------
    matrix : numpy.ndarray
        Two-dimensional array, one column per KPI.
    exclude_columns_names : container of str
        Names of the KPIs to drop.
    kpi_set_ : sequence of str
        KPI name of every matrix column, in column order.

    Returns
    -------
    numpy.ndarray
        Array holding the remaining columns in their original order. A matrix
        with zero rows is handled as well.

    The shapes before and after, and the number of dropped columns, are
    printed to stdout.
    """
    data = np.asarray(matrix)
    n_columns = data.shape[1]

    # Decide once per column instead of once per cell
    keep_mask = np.array(
        [kpi_set_[jj] not in exclude_columns_names for jj in range(n_columns)],
        dtype=bool,
    )
    result_matrix = data[:, keep_mask]
    counter = n_columns - int(keep_mask.sum())

    print(type(matrix), matrix.shape[0], matrix.shape[1])
    print(type(result_matrix), result_matrix.shape[0], result_matrix.shape[1])
    print(counter)

    return result_matrix

In [12]:
def get_the_end_of_the_last_conseq_period(my_list, predefined_value):
    """Return the item of ``my_list`` closest to ``predefined_value`` from below.

    Only items strictly smaller than ``predefined_value`` are considered; the
    first one with the smallest distance wins. ``None`` is returned when no
    item qualifies.
    """
    smaller_items = (num for num in my_list if num < predefined_value)

    return min(smaller_items, key=lambda num: predefined_value - num, default=None)