In [1]:
import os
import numpy as np
import pandas as pd
import csv
from datetime import datetime
import plotly.express as px
from sklearn.feature_selection import VarianceThreshold
from warnings import simplefilter
import matplotlib.pyplot as plt
import configparser

In [2]:
# -- Functions
# Executes the companion notebook functions.ipynb inside this kernel.
# NOTE(review): the helpers called further down (tranform_kpi_names, create_dir,
# round_time, drop_columns_by_filter, difference_order) are not defined in this
# notebook - presumably they are defined in functions.ipynb; confirm.

%run functions.ipynb

In [3]:
# -- Init Configuration Parameters
# Executes predict_notebook_sections/configuration.ipynb inside this kernel.
# NOTE(review): the settings read by the cells below (file path templates, data set
# codes, dataset_alignment_method, the DROP_* / APPLY_* / KEEP_* / REMOVE_* switches,
# ...) are not assigned in this notebook - presumably they are set there; confirm.

%run predict_notebook_sections/configuration.ipynb

In [4]:
# -- Load Training data

# Read the consolidated training CSV, index it by the "timestamp" column
# and replace the missing values with 0
train_csv_path = consolidated_data_set_file_path.format(data_set_code=train_data_set_code)
train_data_df = pd.read_csv(train_csv_path)
train_data_df = train_data_df.set_index("timestamp")
train_data_df = train_data_df.fillna(0)

# Turn the index values into datetimes
train_data_df.index = pd.to_datetime(train_data_df.index)
print(train_data_df.shape)

# Keep the numeric columns only
train_data_df = train_data_df.select_dtypes(include=['number'])
print(train_data_df.shape)

(20074, 4702)
(20074, 4702)


In [5]:
# -- Load Prod data

# One frame per code in data_sets_to_preprocess, stored in the same order as the codes
prod_data_dfs = []
for data_set_code in data_sets_to_preprocess:
    print(data_set_code)

    # Read the CSV, index it by the "timestamp" column, replace the missing values with 0
    df_tmp = (
        pd.read_csv(consolidated_data_set_file_path.format(data_set_code=data_set_code))
        .set_index("timestamp")
        .fillna(0)
    )

    # Turn the index values into datetimes and order the rows by them
    df_tmp.index = pd.to_datetime(df_tmp.index)
    df_tmp = df_tmp.sort_index()

    prod_data_dfs.append(df_tmp)
    print(df_tmp.shape)

linear-cpu-stress-userapi-051516
(181, 4389)
linear-memory-stress-userapi-051218
(181, 4446)
linear-network-loss-userapi-051808
(180, 4393)
linear-network-delay-userapi-051816
(180, 4355)
linear-network-corrupt-userapi-050420
(189, 4336)
linear-network-delay-redis-091409
(151, 5786)
linear-network-loss-redis-091414
(151, 5567)
linear-cpu-stress-redis-091514
(151, 5808)
linear-memory-stress-redis-091522
(151, 6015)
linear-network-corrupt-redis-091622
(151, 5791)
linear-network-delay-userhandlers-082208
(151, 4165)
linear-network-loss-userhandlers-082119
(151, 4161)
linear-network-corrupt-userhandlers-082114
(151, 4363)
linear-network-delay-redis-092016
(151, 5844)
linear-network-loss-redis-092022
(151, 5673)
linear-network-corrupt-redis-092111
(151, 5923)
linear-network-delay-scorm-092511
(151, 6018)
linear-cpu-stress-scorm-092722
(151, 5862)
linear-memory-stress-scorm-092801
(151, 5600)


In [7]:
# -- Align the prod datasets' column sets to the train dataset's column set
# We use outer_by_training_ds

if dataset_alignment_method == "outer_by_training_ds":

    # Create the target folder once, before the loop, if it does not exist
    create_dir(kpis_not_seen_in_prod_dir_path)

    for ii, data_set_code in enumerate(data_sets_to_preprocess):

        # - Save the KPIs that are training columns but not columns of this prod data set
        kpis_not_seen_in_prod_names_transformed = tranform_kpi_names(
            list(set(train_data_df.columns) - set(prod_data_dfs[ii].columns))
        )
        # newline="" is what the csv module asks for when a file is handed to csv.writer;
        # without it, platforms that translate "\n" on write get an extra blank line per row
        with open(kpis_not_seen_in_prod_file_path.format(data_set_code=data_set_code), "w", newline="") as outfile:
            csv_writer = csv.writer(outfile)
            # One KPI name per CSV row
            csv_writer.writerows([kpi_name] for kpi_name in kpis_not_seen_in_prod_names_transformed)

        # join='left' keeps the training columns: prod-only columns are discarded and
        # training columns missing from the prod frame are added to it.
        # NOTE(review): align() is called without fill_value, so the added columns hold NaN
        # even though the frames were loaded with fillna(0) - confirm downstream expects that.
        train_data_df, prod_data_dfs[ii] = train_data_df.align(prod_data_dfs[ii], axis=1, join='left')
        print(data_set_code, ":", train_data_df.shape, prod_data_dfs[ii].shape)

elif dataset_alignment_method == "intersection":
    # Two passes: during the first pass train_data_df keeps shrinking, so a prod frame aligned
    # early can still carry columns that a later prod frame removes from train_data_df.
    # The second pass trims every prod frame to the final, common column set.
    for vv in range(2):
        for ii, data_set_code in enumerate(data_sets_to_preprocess):
            train_data_df, prod_data_dfs[ii] = train_data_df.align(prod_data_dfs[ii], axis=1, join='inner')
            print(data_set_code, ":", train_data_df.shape, prod_data_dfs[ii].shape)

linear-cpu-stress-userapi-051516 : (20074, 4702) (181, 4702)
linear-memory-stress-userapi-051218 : (20074, 4702) (181, 4702)
linear-network-loss-userapi-051808 : (20074, 4702) (180, 4702)
linear-network-delay-userapi-051816 : (20074, 4702) (180, 4702)
linear-network-corrupt-userapi-050420 : (20074, 4702) (189, 4702)
linear-network-delay-redis-091409 : (20074, 4702) (151, 4702)
linear-network-loss-redis-091414 : (20074, 4702) (151, 4702)
linear-cpu-stress-redis-091514 : (20074, 4702) (151, 4702)
linear-memory-stress-redis-091522 : (20074, 4702) (151, 4702)
linear-network-corrupt-redis-091622 : (20074, 4702) (151, 4702)
linear-network-delay-userhandlers-082208 : (20074, 4702) (151, 4702)
linear-network-loss-userhandlers-082119 : (20074, 4702) (151, 4702)
linear-network-corrupt-userhandlers-082114 : (20074, 4702) (151, 4702)
linear-network-delay-redis-092016 : (20074, 4702) (151, 4702)
linear-network-loss-redis-092022 : (20074, 4702) (151, 4702)
linear-network-corrupt-redis-092111 : (2007

In [9]:
# -- Transform the KPI names into the {ServiceName}_{MetricName} format
# We do it

# Compute the transformed names once, from the training frame's columns
transformed_column_names = tranform_kpi_names(train_data_df.columns)

# Give the same names to the training frame and to every prod frame
# (relies on all frames having the same columns in the same order at this point)
for frame in [train_data_df, *prod_data_dfs]:
    frame.columns = transformed_column_names

train_data_df.head()

Unnamed: 0_level_0,unknown-node_lm-UserCount,unknown-node_lm-Requests/s,unknown-node_lm-Failures/s,unknown-node_lm-50%,unknown-node_lm-66%,unknown-node_lm-75%,unknown-node_lm-80%,unknown-node_lm-90%,unknown-node_lm-95%,unknown-node_lm-98%,...,alms-core-rui-dcount_gm-64-count,alms-core-rui-dcount_gm-64-firstquartile,alms-core-rui-dcount_gm-64-thirdquartile,alms-core-rui-dmean_gm-64-min,alms-core-rui-dmean_gm-64-max,alms-core-rui-dmean_gm-64-mean,alms-core-rui-dmean_gm-64-median,alms-core-rui-dmean_gm-64-count,alms-core-rui-dmean_gm-64-firstquartile,alms-core-rui-dmean_gm-64-thirdquartile
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-03-28 12:54:00,1.0,0.0,0.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2023-03-28 12:57:00,1.0,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2023-03-28 12:58:00,1.0,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2023-03-28 12:59:00,1.0,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2023-03-28 13:00:00,1.0,0.359016,0.0,1700.0,1700.0,1700.0,1700.0,1700.0,1700.0,1700.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# -- Create and save the prod data sets configuration from the experimental log

# Load the experimental log
df_experiment_log = pd.read_csv(failure_log)

# Parse the experimental log: one [code, total, fi_start, fi_end] entry per data set
# that is not listed in normal_data_sets
data_sets_configs = []
for data_set_idx, data_set_code in enumerate(data_sets_to_preprocess):

    if data_set_code in normal_data_sets:
        continue

    print(data_set_code)

    # Log rows whose folder_name equals this data set code (filtered once, the first match is used;
    # .iloc[0] raises IndexError when the code is missing from the log)
    experiment_rows = df_experiment_log[df_experiment_log["folder_name"] == data_set_code]

    start_failure_time = round_time(experiment_rows["failure_begin_timestamp"].iloc[0])
    disruption_minute = int(experiment_rows["Disruption"].iloc[0])

    # Row position (not a timestamp) of the first index entry equal to the failure start time;
    # [0][0] raises IndexError when no index entry matches.
    # NOTE(review): this is a position in the frame as it is right now - if rows of the prod
    # frames are removed by a later cell, the stored position no longer points at the same row.
    start_failure_minute = np.where(prod_data_dfs[data_set_idx].index == start_failure_time)[0][0]

    # Number of rows in the data set minus one
    exp_duration = prod_data_dfs[data_set_idx].shape[0] - 1

    the_config = [data_set_code, exp_duration, start_failure_minute, disruption_minute]
    data_sets_configs.append(the_config)

# Save the prod data sets configuration
create_dir(os.path.dirname(data_sets_config_file_path))
# newline='' is what the csv module asks for when a file is handed to csv.writer;
# without it, platforms that translate "\n" on write get an extra blank line per row
with open(data_sets_config_file_path, 'w', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(["code", "total", "fi_start", "fi_end"])
    csv_writer.writerows(data_sets_configs)

linear-cpu-stress-userapi-051516
linear-memory-stress-userapi-051218
linear-network-loss-userapi-051808
linear-network-delay-userapi-051816
linear-network-corrupt-userapi-050420
linear-network-delay-redis-091409
linear-network-loss-redis-091414
linear-cpu-stress-redis-091514
linear-memory-stress-redis-091522
linear-network-corrupt-redis-091622
linear-network-delay-userhandlers-082208
linear-network-loss-userhandlers-082119
linear-network-corrupt-userhandlers-082114
linear-network-delay-redis-092016
linear-network-loss-redis-092022
linear-network-corrupt-redis-092111
linear-network-delay-scorm-092511
linear-cpu-stress-scorm-092722
linear-memory-stress-scorm-092801


In [11]:
# -- Keep only the rows where the workload is >= DROP_LOW_TRAFFIC_POINTS_THRESHOLD
# (rows below the threshold are excluded from the training AND from the prod data sets)
# We do it

if DROP_LOW_TRAFFIC_POINTS_FROM_TRAINING:
    print(DROP_LOW_TRAFFIC_POINTS_FROM_TRAINING)

    # Training data set
    enough_traffic_in_train = train_data_df[column_name_number_of_users] >= DROP_LOW_TRAFFIC_POINTS_THRESHOLD
    train_data_df = train_data_df[enough_traffic_in_train]

    # Prod data sets
    # NOTE(review): removing rows here shifts the row positions of the prod frames;
    # re-check anything that recorded row positions before this cell ran.
    for position, prod_df in enumerate(prod_data_dfs):
        enough_traffic_in_prod = prod_df[column_name_number_of_users] >= DROP_LOW_TRAFFIC_POINTS_THRESHOLD
        prod_data_dfs[position] = prod_df[enough_traffic_in_prod]

True


In [None]:
# -- Drop the constant columns
# We do it

if DROP_CONSTANT_SERIES:

    print(DROP_CONSTANT_SERIES)

    # - Collect the constant KPIs in a list

    # Fit the variance filter on the training data only
    constant_filter = VarianceThreshold(threshold=0.00001)
    constant_filter.fit(train_data_df)

    # Names the filter keeps. Built once, as a set: rebuilding the selection inside the
    # comprehension below would repeat the same work for every single column.
    non_constant_columns = set(train_data_df.columns[constant_filter.get_support()])

    # A column is reported as constant when the filter does not keep it and it is not
    # explicitly protected by columns_to_exclude_from_constant_kpis_search
    constant_columns = [
        column for column in train_data_df.columns
        if column not in non_constant_columns
        and column not in columns_to_exclude_from_constant_kpis_search
    ]
    # De-duplicate; sorted so the listing below is identical from run to run
    columns_to_drop = sorted(set(constant_columns))

    print("Number of the constant columns:", len(columns_to_drop))
    for col_ in columns_to_drop:
        print(col_)

    # Drop the constant columns in training data (rebinding instead of mutating in place)
    train_data_df = train_data_df.drop(columns=columns_to_drop)

    # Drop the same columns in prod data
    for ii in range(len(prod_data_dfs)):
        prod_data_dfs[ii] = prod_data_dfs[ii].drop(columns=columns_to_drop)

# Kept at the top level as the last expression of the cell: Jupyter only renders the
# value of a trailing top-level expression, not of one nested inside the if-block
train_data_df.head()

In [13]:
# -- Drop the columns that do not contain service name or kube node name
# We don't do it

if DROP_KPIS_WITH_UNKNOWN_NODES:
    print(DROP_KPIS_WITH_UNKNOWN_NODES)

    # The resource name is the text before the first "_" of the column name.
    # A column is dropped unless that name is in LMS_SERVICE_LIST or its
    # first four characters are in KUBE_NODE_LIST.
    columns_to_drop = [
        column_name
        for column_name in train_data_df.columns
        if not (
            column_name.split("_")[0] in LMS_SERVICE_LIST
            or column_name.split("_")[0][:4] in KUBE_NODE_LIST
        )
    ]

    # Remove the selected columns from the training frame and from every prod frame
    train_data_df = train_data_df.drop(columns=columns_to_drop)
    for position, prod_df in enumerate(prod_data_dfs):
        prod_data_dfs[position] = prod_df.drop(columns=columns_to_drop)

In [14]:
# -- Drop locust metrics
# We don't do it

if DROP_LOCUST_KPIS:
    print(DROP_LOCUST_KPIS)

    # Collect the columns whose name, up to the first "_", equals locust_node_name
    columns_to_drop = []
    for column_name in train_data_df.columns:
        node_name = column_name.split("_")[0]
        if node_name == locust_node_name:
            columns_to_drop.append(column_name)

    # Remove them from the training frame and from every prod frame
    train_data_df = train_data_df.drop(columns=columns_to_drop)
    for position, prod_df in enumerate(prod_data_dfs):
        prod_data_dfs[position] = prod_df.drop(columns=columns_to_drop)

In [15]:
# -- Drop the columns of the extended set
# We don't do it

if DROP_EXTENDED_KPI_SET:
    print(DROP_EXTENDED_KPI_SET)

    # Pass the training frame, then each prod frame in order, through
    # drop_columns_by_filter with set_of_kpis_to_drop and keep what it returns
    train_data_df = drop_columns_by_filter(train_data_df, set_of_kpis_to_drop)
    for position, prod_df in enumerate(prod_data_dfs):
        prod_data_dfs[position] = drop_columns_by_filter(prod_df, set_of_kpis_to_drop)

In [16]:
# -- Apply the first order differencing to all columns
# We don't do it

if APPLY_DIFFERENCING:
    print(APPLY_DIFFERENCING)

    # The setting arrives as a string: either "all" or a comma-separated list of KPI names.
    # It is turned into a list here (the name is rebound, as before). Parsing only while it is
    # still a string means a second run of this cell - or a setting that already is a list -
    # no longer fails on .split().
    if isinstance(kpis_to_apply_differencing, str):
        if kpis_to_apply_differencing == "all":
            kpis_to_apply_differencing = list(train_data_df.columns)
        else:
            kpis_to_apply_differencing = kpis_to_apply_differencing.split(",")

    # Set for constant-time membership tests; the user-count column is never differenced
    kpis_to_difference = set(kpis_to_apply_differencing) - {column_name_number_of_users}

    # For the training data set
    for col in train_data_df.columns:
        if col in kpis_to_difference:
            train_data_df[col] = difference_order(train_data_df[col], 1, 1)

    # For the production data sets
    for ii in range(len(prod_data_dfs)):
        for col in prod_data_dfs[ii].columns:
            if col in kpis_to_difference:
                prod_data_dfs[ii][col] = difference_order(prod_data_dfs[ii][col], 1, 1)

In [17]:
# -- Keep only the service related KPIs
# We don't do it

if KEEP_ONLY_SERVICE_RELATED_KPIS:
    print(KEEP_ONLY_SERVICE_RELATED_KPIS)

    # Collect the columns whose name, up to the first "_", is not in app_service_list
    columns_to_drop = []
    for column_name in train_data_df.columns:
        service_name = column_name.split("_")[0]
        if service_name not in app_service_list:
            columns_to_drop.append(column_name)

    # Remove them from the training frame and from every prod frame
    train_data_df = train_data_df.drop(columns=columns_to_drop)
    for position, prod_df in enumerate(prod_data_dfs):
        prod_data_dfs[position] = prod_df.drop(columns=columns_to_drop)

In [18]:
# -- Detect & remove upper outliers from the Training data set
# (every KPI whose name does not contain "count" is checked)
# We don't do it

if REMOVE_OUTLIERS:
    print(REMOVE_OUTLIERS)

    for kpi in list(train_data_df.columns):

        if "count" in kpi:
            continue

        # No rows left: there is nothing to compute a percentile from
        if train_data_df.empty:
            break

        # Upper bound: 95th percentile of this KPI over the rows that are still present.
        # (An IQR rule - Q3 + 1.5 * IQR / Q1 - 1.5 * IQR - would be an alternative;
        # only the upper Q95 bound is applied.)
        upper = np.percentile(train_data_df[kpi], 95, method='midpoint')

        # Filter with a boolean mask. np.where() returns row POSITIONS, while drop(index=...)
        # expects index LABELS; with positions nothing matches the frame's index labels and
        # errors='ignore' hides that, so no row would ever be removed.
        # Only values strictly above the bound are removed: with ">=" a KPI whose
        # 95th percentile equals its minimum (e.g. a constant column) would wipe out every row.
        train_data_df = train_data_df[~(train_data_df[kpi] > upper)]

    # NOTE(review): the rows are filtered KPI after KPI, so the removals accumulate over
    # all checked columns - watch the resulting shape.
    print(train_data_df.shape)

In [19]:
# -- Save data sets

# Create a target folder if does not exist
create_dir(tuned_data_set_dir_path)

# Save the training data set (index=False: the index is not written to the file)
train_data_df.to_csv(tuned_data_set_file_path.format(data_set_code=train_data_set_code), encoding='utf-8', index=False, header=True)
print("Train DF saved", train_data_df.shape)

# Save the prod data sets.
# prod_data_dfs was filled by iterating data_sets_to_preprocess, so the frames are paired with
# that same list here; a separately maintained list of codes could drift in order or length
# and write a frame under the wrong name.
for data_set_code, df_tmp in zip(data_sets_to_preprocess, prod_data_dfs):
    df_tmp.to_csv(tuned_data_set_file_path.format(data_set_code=data_set_code), encoding='utf-8', index=False, header=True)

    print("{data_set_code} DF Saved. Shape: ".format(data_set_code=data_set_code), df_tmp.shape)

Train DF saved (14514, 3948)
linear-cpu-stress-userapi-051516 DF Saved. Shape:  (181, 3948)
linear-memory-stress-userapi-051218 DF Saved. Shape:  (181, 3948)
linear-network-loss-userapi-051808 DF Saved. Shape:  (180, 3948)
linear-network-delay-userapi-051816 DF Saved. Shape:  (180, 3948)
linear-network-corrupt-userapi-050420 DF Saved. Shape:  (189, 3948)
linear-network-delay-redis-091409 DF Saved. Shape:  (151, 3948)
linear-network-loss-redis-091414 DF Saved. Shape:  (151, 3948)
linear-cpu-stress-redis-091514 DF Saved. Shape:  (151, 3948)
linear-memory-stress-redis-091522 DF Saved. Shape:  (151, 3948)
linear-network-corrupt-redis-091622 DF Saved. Shape:  (151, 3948)
linear-network-delay-userhandlers-082208 DF Saved. Shape:  (151, 3948)
linear-network-loss-userhandlers-082119 DF Saved. Shape:  (151, 3948)
linear-network-corrupt-userhandlers-082114 DF Saved. Shape:  (151, 3948)
linear-network-delay-redis-092016 DF Saved. Shape:  (151, 3948)
linear-network-loss-redis-092022 DF Saved. Shap