In [5]:
import configparser
import csv
import os
import re
from datetime import datetime
from warnings import simplefilter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [6]:
# --- Functions
# Executes the sibling notebook functions.ipynb from this kernel.
# NOTE(review): later cells call tranform_kpi_names, create_dir, clean_data,
# get_cols_to_diff_by_value_threshold, difference_order and normalize without
# defining them -- presumably they are provided by this notebook; confirm.

%run functions.ipynb

In [7]:
# --- Init Configuration Parameters
# Executes the shared configuration notebook from this kernel.
# NOTE(review): later cells read names that are never assigned in this notebook
# (e.g. consolidated_data_set_file_path, train_data_set_code,
# data_sets_to_preprocess, dataset_alignment_method, DROP_CONSTANT_SERIES,
# preprocess_only_prod, SEED) -- presumably they are set here; confirm.

%run predict_notebook_sections/configuration.ipynb

Configuration file loaded: config-train_ticket.ini


In [8]:
# --- Load training data

# Resolve the consolidated CSV path for the training data set, then read it
train_csv_path = consolidated_data_set_file_path.format(data_set_code=train_data_set_code)
train_data_df = pd.read_csv(train_csv_path)

In [9]:
# --- Format the training data set

# Attach a synthetic, zero-based row counter as the "timestamp" column
n_train_rows = int(train_data_df.shape[0])
train_data_df['timestamp'] = list(range(n_train_rows))

# Promote the counter to the index, then replace missing values with 0
train_data_df = train_data_df.set_index("timestamp")
train_data_df = train_data_df.fillna(0)

# Re-type the index as datetime and keep only the numeric columns
train_data_df.index = pd.to_datetime(train_data_df.index)
train_data_df = train_data_df.select_dtypes(include=['number'])

print("Training data set shape:", train_data_df.shape)

Training data set shape: (19996, 12525)


In [10]:
# --- Load & format production data sets

print("Production data sets (data set name - shape):")

prod_data_dfs = []
for data_set_code in data_sets_to_preprocess:
    prod_csv_path = consolidated_data_set_file_path.format(data_set_code=data_set_code)

    # Index by the file's own "timestamp" column and replace missing values with 0
    prod_df = pd.read_csv(prod_csv_path)
    prod_df = prod_df.set_index("timestamp").fillna(0)

    # Re-type the index as datetime and order the rows chronologically
    prod_df.index = pd.to_datetime(prod_df.index)
    prod_df = prod_df.sort_index()
    prod_data_dfs.append(prod_df)

    print(data_set_code, "-", prod_df.shape)

Production data sets (data set name - shape):
linear-cpu-stress-ts-station-service-020211 - (181, 12615)
linear-cpu-stress-ts-basic-service-020616 - (180, 12195)
linear-cpu-stress-ts-train-service-020713 - (181, 12063)
linear-memory-stress-ts-station-service-021917 - (241, 15186)
linear-memory-stress-ts-train-service-021316 - (240, 14173)
linear-memory-stress-ts-basic-service-022009 - (240, 14706)
linear-network-delay-ts-station-service-013016 - (180, 12652)
linear-network-delay-ts-basic-service-020911 - (181, 13859)
linear-network-delay-ts-train-service-020116 - (181, 12449)
linear-cpu-train-memory-station-020912 - (181, 14141)
linear-cpu-station-memory-train-020810 - (181, 15003)
linear-cpu-train-delay-station-021212 - (181, 13827)
linear-cpu-station-delay-train-021310 - (180, 12619)
linear-memory-train-delay-station-022621 - (240, 16097)
linear-memory-station-delay-train-022622 - (241, 15924)


In [11]:
# --- Align the production datasets' column sets to the train dataset's column set.
# "outer_by_training_ds": left join on the training columns (the training column
#                         set is kept as-is, production frames are reshaped to it).
# "intersection":         inner join, keeping only columns common to all frames.

if dataset_alignment_method == "outer_by_training_ds":
    # Create the report folder once, before any per-data-set file is written
    create_dir(kpis_not_seen_in_prod_dir_path)

    for ii, data_set_code in enumerate(data_sets_to_preprocess):

        # - Save the KPIs that exist in training but are missing from this production data set.
        #   Sorted so that the report file is reproducible between runs (set order is arbitrary).
        kpis_not_seen_in_prod = sorted(set(train_data_df.columns) - set(prod_data_dfs[ii].columns))
        kpis_not_seen_in_prod_names_transformed = tranform_kpi_names(kpis_not_seen_in_prod, statistics)

        # newline="" is required by the csv module; without it the writer can emit
        # an extra blank line after every row on platforms that translate "\n".
        with open(kpis_not_seen_in_prod_file_path.format(data_set_code=data_set_code), "w", newline="") as outfile:
            csv_writer = csv.writer(outfile)
            csv_writer.writerows([row] for row in kpis_not_seen_in_prod_names_transformed)

        # NOTE(review): a left join adds the training-only columns to the production
        # frame as NaN, i.e. after the earlier fillna(0). Nothing visible in this
        # notebook refills them -- confirm that clean_data/normalize or the
        # downstream consumer handles these NaN columns.
        train_data_df, prod_data_dfs[ii] = train_data_df.align(prod_data_dfs[ii], axis=1, join='left')
        print(data_set_code, ":", train_data_df.shape, prod_data_dfs[ii].shape)

elif dataset_alignment_method == "intersection":
    # Two passes: the first shrinks train_data_df to the columns common to every
    # data set; the second re-aligns the production frames that were processed
    # before train_data_df reached its final column set.
    for vv in range(2):
        for ii, data_set_code in enumerate(data_sets_to_preprocess):
            train_data_df, prod_data_dfs[ii] = train_data_df.align(prod_data_dfs[ii], axis=1, join='inner')
            print(data_set_code, ":", train_data_df.shape, prod_data_dfs[ii].shape)

linear-cpu-stress-ts-station-service-020211 : (19996, 12525) (181, 12525)
linear-cpu-stress-ts-basic-service-020616 : (19996, 12525) (180, 12525)
linear-cpu-stress-ts-train-service-020713 : (19996, 12525) (181, 12525)
linear-memory-stress-ts-station-service-021917 : (19996, 12525) (241, 12525)
linear-memory-stress-ts-train-service-021316 : (19996, 12525) (240, 12525)
linear-memory-stress-ts-basic-service-022009 : (19996, 12525) (240, 12525)
linear-network-delay-ts-station-service-013016 : (19996, 12525) (180, 12525)
linear-network-delay-ts-basic-service-020911 : (19996, 12525) (181, 12525)
linear-network-delay-ts-train-service-020116 : (19996, 12525) (181, 12525)
linear-cpu-train-memory-station-020912 : (19996, 12525) (181, 12525)
linear-cpu-station-memory-train-020810 : (19996, 12525) (181, 12525)
linear-cpu-train-delay-station-021212 : (19996, 12525) (181, 12525)
linear-cpu-station-delay-train-021310 : (19996, 12525) (180, 12525)
linear-memory-train-delay-station-022621 : (19996, 125

In [12]:
# --- Transform the KPI names

# Compute the transformed names once, from the training frame's columns
transformed_column_names = tranform_kpi_names(train_data_df.columns, statistics)

# Apply the same names to the training frame and to every production frame
train_data_df.columns = transformed_column_names
for prod_df in prod_data_dfs:
    prod_df.columns = transformed_column_names

In [13]:
# --- Clean KPI set

# Pipe-separated KPI name fragments handed to clean_data for removal
kpis_to_remove = "distribut|POD_PHASE-failed|POD_PHASE-pending|lm_TotalMedianResponseTime|FAULT-TYPE-minor"

# Training frame first
train_data_df = clean_data(train_data_df, kpis_to_remove)

# Then every production frame, one per configured data set
for position in range(len(data_sets_to_preprocess)):
    prod_data_dfs[position] = clean_data(prod_data_dfs[position], kpis_to_remove)

In [14]:
# --- Drop the near-constant columns (variance filtered out by the threshold below)

if DROP_CONSTANT_SERIES:
    print("Drop the columns with relatively constant variance", DROP_CONSTANT_SERIES)

    # - Fit the variance filter on the training data only
    constant_filter = VarianceThreshold(threshold=0.00001)
    constant_filter.fit(train_data_df)

    # Names the filter keeps. Computed once and stored as a set: building this
    # selection inside the comprehension below would repeat it for every column.
    non_constant_columns = set(train_data_df.columns[constant_filter.get_support()])

    # - Collect the constant KPIs, except those explicitly excluded from the search
    constant_columns = [
        column for column in train_data_df.columns
        if column not in non_constant_columns
        and column not in columns_to_exclude_from_constant_kpis_search
    ]
    columns_to_drop = list(set(constant_columns))

    print("Number of the constant columns:", len(columns_to_drop))

    # Drop the constant columns in training data frame
    train_data_df = train_data_df.drop(columns=columns_to_drop)

    # Drop the same columns in production data frames so all frames keep one schema
    for ii in range(len(prod_data_dfs)):
        prod_data_dfs[ii] = prod_data_dfs[ii].drop(columns=columns_to_drop)

Drop the columns with relatively constant variance True
Number of the constant columns: 3007


In [15]:
# --- Differencing: Create a list of KPIs to apply differencing

kpi_indexes_to_diff = []

# - Get KPI indexes by the threshold, taken from one reference production data set
for ii, data_set_code in enumerate(data_sets_to_preprocess):
    if data_set_code == "linear-cpu-station-memory-train-020810":
        kpi_indexes_to_diff = get_cols_to_diff_by_value_threshold(prod_data_dfs[ii], 2)

# - Add the KPIs with the specific indexes
# NOTE(review): these are positions in train_data_df.columns as it stands at this
# point of the pipeline; they silently point at other KPIs if the column set changes.
kpi_indexes_to_diff = set(kpi_indexes_to_diff + [6177, 6178, 6179, 6180, 6181, 6182, 7820, 7794, 7815, 7814, 7811, 7816, 7819, 7793, 7804, 7818, 7789, 7817, 7813, 7812, 7809, 7810, 7811])
# TEST kpi_indexes_to_diff = set(kpi_indexes_to_diff + [6177])

# - Convert the indexes to KPI names
kpi_names_to_diff = [train_data_df.columns[kpi_index] for kpi_index in kpi_indexes_to_diff]

# - Add the KPI names by the metric names
metrics_to_diff = [115, 167, 112, 94]

# Match each metric id as a whole number: a plain substring test for "metric-112"
# would also select "metric-1120", and "metric-94" would also select "metric-940".
metric_patterns = [
    re.compile("metric-" + re.escape(str(metric_to_diff)) + r"(?!\d)")
    for metric_to_diff in metrics_to_diff
]

all_kpis = list(train_data_df.columns)
for element in all_kpis:
    if any(metric_pattern.search(element) for metric_pattern in metric_patterns):
        kpi_names_to_diff.append(element)

# - Keep only the unique KPI names
kpis_to_apply_differencing = set(kpi_names_to_diff)

print("Number of KPIs to apply differencing:", len(kpis_to_apply_differencing))

Number of KPIs to apply differencing: 1194


In [16]:
# --- Differencing: Apply differencing to the training data set

if not preprocess_only_prod:
    # This single KPI is deliberately left un-differenced
    kpi_kept_as_is = "CONTAINER-NAME-ts-travel-service-FAULT-TYPE_metric-112-min"

    for kpi_name in train_data_df.columns:
        if kpi_name not in kpis_to_apply_differencing:
            continue
        if kpi_name == column_name_number_of_users or kpi_name == kpi_kept_as_is:
            continue

        train_data_df[kpi_name] = difference_order(train_data_df[kpi_name], 1, 1)

  value = dataset[i] - dataset[i - interval]


In [17]:
# --- Differencing: Apply differencing to the production data sets

# This single KPI is deliberately left un-differenced
kpi_kept_as_is = "CONTAINER-NAME-ts-travel-service-FAULT-TYPE_metric-112-min"

for data_set_idx in range(len(data_sets_to_preprocess)):
    # Alias of the frame stored in the list; column assignment updates it in place
    prod_df = prod_data_dfs[data_set_idx]

    for kpi_name in prod_df.columns:
        if kpi_name not in kpis_to_apply_differencing:
            continue
        if kpi_name == column_name_number_of_users or kpi_name == kpi_kept_as_is:
            continue

        prod_df[kpi_name] = difference_order(prod_df[kpi_name], 1, 1)

  value = dataset[i] - dataset[i - interval]


In [18]:
# --- Save tuned data: create a folder

# Create the target folder if it does not exist
create_dir(tuned_data_set_dir_path)

In [19]:
# --- Save tuned data: training data set

if not preprocess_only_prod:
    tuned_train_path = tuned_data_set_file_path.format(data_set_code=train_data_set_code)

    # The index is not written to the file (index=False); only the KPI columns are stored
    train_data_df.to_csv(tuned_train_path, encoding='utf-8', index=False, header=True)
    print("Train DF saved. Shape:", train_data_df.shape)

Train DF saved. Shape: (19996, 8164)


In [20]:
# --- Save tuned data: production data sets

for data_set_idx, data_set_code in enumerate(data_sets_to_preprocess):
    prod_df = prod_data_dfs[data_set_idx]
    tuned_prod_path = tuned_data_set_file_path.format(data_set_code=data_set_code)

    # The index is not written to the file (index=False); only the KPI columns are stored
    prod_df.to_csv(tuned_prod_path, encoding='utf-8', index=False, header=True)
    print("{data_set_code} DF Saved. Shape: ".format(data_set_code=data_set_code), prod_df.shape)

linear-cpu-stress-ts-station-service-020211 DF Saved. Shape:  (181, 8164)
linear-cpu-stress-ts-basic-service-020616 DF Saved. Shape:  (180, 8164)
linear-cpu-stress-ts-train-service-020713 DF Saved. Shape:  (181, 8164)
linear-memory-stress-ts-station-service-021917 DF Saved. Shape:  (241, 8164)
linear-memory-stress-ts-train-service-021316 DF Saved. Shape:  (240, 8164)
linear-memory-stress-ts-basic-service-022009 DF Saved. Shape:  (240, 8164)
linear-network-delay-ts-station-service-013016 DF Saved. Shape:  (180, 8164)
linear-network-delay-ts-basic-service-020911 DF Saved. Shape:  (181, 8164)
linear-network-delay-ts-train-service-020116 DF Saved. Shape:  (181, 8164)
linear-cpu-train-memory-station-020912 DF Saved. Shape:  (181, 8164)
linear-cpu-station-memory-train-020810 DF Saved. Shape:  (181, 8164)
linear-cpu-train-delay-station-021212 DF Saved. Shape:  (181, 8164)
linear-cpu-station-delay-train-021310 DF Saved. Shape:  (180, 8164)
linear-memory-train-delay-station-022621 DF Saved. Sha

In [21]:
# --- Load Tuned Training data

# In production-only mode the tuned training set is read back from disk
# instead of being produced by the cells above.
if preprocess_only_prod:
    tuned_train_path = tuned_data_set_file_path.format(data_set_code=train_data_set_code)
    train_data_df = pd.read_csv(tuned_train_path)
    print(train_data_set_code, train_data_df.shape)

In [22]:
# --- Build a scaler

# Float matrix holding the training frame's values
data_arr = train_data_df.values.astype(float)

# Keep only the training part of a seeded 80/20 split; the row-position list is
# just the second array handed to train_test_split and its halves are discarded.
row_positions = list(range(len(data_arr)))
train_data = train_test_split(data_arr, row_positions, test_size=0.2, random_state=SEED)[0]

# Fit the min-max scaler on that training part only
scaler = MinMaxScaler().fit(train_data)

In [23]:
# --- Normalize Training data
# Skipped in production-only mode, in which case train_data_df is left exactly
# as it was loaded from the tuned CSV.
# NOTE(review): normalize is defined outside this notebook -- presumably it
# applies the fitted scaler and prints the resulting shape (the recorded output
# of this cell is a shape tuple); confirm.

if not preprocess_only_prod:
    train_data_df = normalize(scaler, train_data_df)

(19996, 8164)


In [24]:
# --- Normalize Production data

# Every production frame is normalized with the scaler fitted on the training data
for position in range(len(data_sets_to_preprocess)):
    prod_df_to_scale = prod_data_dfs[position]
    prod_data_dfs[position] = normalize(scaler, prod_df_to_scale)

(181, 8164)
(180, 8164)
(181, 8164)
(241, 8164)
(240, 8164)
(240, 8164)
(180, 8164)
(181, 8164)
(181, 8164)
(181, 8164)
(181, 8164)
(181, 8164)
(180, 8164)
(240, 8164)
(241, 8164)


In [25]:
# --- Save normalized data: Create folder

# Create the target folder if it does not exist
create_dir(normalized_data_set_dir_path)

In [26]:
# --- Save normalized data: Training data

if not preprocess_only_prod:
    normalized_train_path = normalized_data_set_file_path.format(data_set_code=train_data_set_code)

    # The index is not written to the file (index=False); only the KPI columns are stored
    train_data_df.to_csv(normalized_train_path, encoding='utf-8', index=False, header=True)
    print("Train DF saved. Shape:", train_data_df.shape)

Train DF saved. Shape: (19996, 8164)


In [27]:
# --- Save normalized data: Production data

for data_set_idx, data_set_code in enumerate(data_sets_to_preprocess):
    prod_df = prod_data_dfs[data_set_idx]
    normalized_prod_path = normalized_data_set_file_path.format(data_set_code=data_set_code)

    # The index is not written to the file (index=False); only the KPI columns are stored
    prod_df.to_csv(normalized_prod_path, encoding='utf-8', index=False, header=True)
    print("{data_set_code} DF saved. Data shape: {data_set_shape}".format(data_set_code=data_set_code, data_set_shape=prod_df.shape))

linear-cpu-stress-ts-station-service-020211 DF saved. Data shape: (181, 8164)
linear-cpu-stress-ts-basic-service-020616 DF saved. Data shape: (180, 8164)
linear-cpu-stress-ts-train-service-020713 DF saved. Data shape: (181, 8164)
linear-memory-stress-ts-station-service-021917 DF saved. Data shape: (241, 8164)
linear-memory-stress-ts-train-service-021316 DF saved. Data shape: (240, 8164)
linear-memory-stress-ts-basic-service-022009 DF saved. Data shape: (240, 8164)
linear-network-delay-ts-station-service-013016 DF saved. Data shape: (180, 8164)
linear-network-delay-ts-basic-service-020911 DF saved. Data shape: (181, 8164)
linear-network-delay-ts-train-service-020116 DF saved. Data shape: (181, 8164)
linear-cpu-train-memory-station-020912 DF saved. Data shape: (181, 8164)
linear-cpu-station-memory-train-020810 DF saved. Data shape: (181, 8164)
linear-cpu-train-delay-station-021212 DF saved. Data shape: (181, 8164)
linear-cpu-station-delay-train-021310 DF saved. Data shape: (180, 8164)
li