In [12]:
import os
import numpy as np
import pandas as pd
import csv
import json
import requests
import matplotlib.pyplot as plt
import plotly.express as px
from warnings import simplefilter
import configparser
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [13]:
# -- Functions
#
# Execute functions.ipynb inside this kernel so that the names it defines become
# available to the cells below.
# NOTE(review): load_data_sets_config, normalize, aggr and create_dir are called
# later but are not defined in the visible cells - presumably they come from this
# notebook; confirm. The path is relative to the kernel's working directory.

%run functions.ipynb

In [14]:
# - Notification filter

simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [15]:
# -- Init Configuration Parameters
#
# Execute the shared configuration notebook inside this kernel so that the
# parameters it defines become available to the cells below.
# NOTE(review): the following names are used later but are not defined in the
# visible cells, so they are presumably set here - confirm:
#   data_sets_config_file_path, tuned_data_set_file_path,
#   normalized_data_set_dir_path, normalized_data_set_file_path,
#   train_data_set_code, data_sets_to_preprocess, SEED,
#   AGGREGATE_BY_SMOOTHING_AVERAGE, AGGREGATE_BY_SMOOTHING_AVERAGE_LENGTH.
# The path is relative to the kernel's working directory.

%run predict_notebook_sections/configuration.ipynb

In [16]:
# -- Load data sets config
#
# Read the data sets configuration from data_sets_config_file_path using the
# load_data_sets_config helper (defined outside the visible cells).
# NOTE(review): data_sets_config is not referenced again in the visible cells;
# it may be read as a global by the helper functions - confirm before removing.

data_sets_config = load_data_sets_config(data_sets_config_file_path)

In [17]:
# -- Load training data
#
# The CSV path is built by filling the {data_set_code} placeholder of
# tuned_data_set_file_path with the training data set code.

train_data_df = pd.read_csv(
    tuned_data_set_file_path.format(data_set_code=train_data_set_code)
)

# Report which data set was loaded and its (rows, columns) shape.
print(train_data_set_code, train_data_df.shape)

normal_1_14 (14514, 3948)


In [18]:
# -- Load the data sets to preprocess
#
# Builds data_set_dfs: data set code -> DataFrame read from the CSV whose path
# is tuned_data_set_file_path with {data_set_code} filled in.

data_set_dfs = {}
for data_set_code in data_sets_to_preprocess:
    data_set_df = pd.read_csv(tuned_data_set_file_path.format(data_set_code=data_set_code))
    data_set_dfs[data_set_code] = data_set_df

    # Report the code and the (rows, columns) shape of the frame just loaded.
    print(data_set_code, data_set_df.shape)

linear-cpu-stress-userapi-051516 (181, 3948)
linear-cpu-stress-redis-091514 (151, 3948)
linear-memory-stress-userapi-051218 (181, 3948)
linear-memory-stress-redis-091522 (151, 3948)
linear-network-delay-userapi-051816 (180, 3948)
linear-network-delay-redis-092016 (151, 3948)


In [19]:
# -- Build a scaler
#
# Fit a MinMaxScaler on the training portion (80%) of a seeded 80/20 split of
# the training data set. The fitted `scaler` is what the following cells use to
# normalize every data set.

# Convert the training DataFrame into a float ndarray.
data_arr = train_data_df.values.astype(float)

# Keep only the training portion of the split. For a single input array
# train_test_split returns [train, test], and the rows it selects depend only on
# the number of samples, test_size and random_state - so there is no need to
# build and split a throwaway list of row indices just to discard it.
# Indexing [0] (rather than unpacking into `_`) also leaves IPython's `_`
# last-output variable untouched.
train_data = train_test_split(data_arr, test_size=0.2, random_state=SEED)[0]

# Fit the scaler on the training portion only.
scaler = MinMaxScaler()
scaler.fit(train_data)

In [20]:
# -- Normalize
#
# Pass the training frame and every data set to preprocess through the
# normalize helper (defined outside the visible cells) together with the fitted
# scaler, and keep whatever it returns in place of the original frame.
# NOTE(review): each frame is replaced under the same name, so re-running this
# cell feeds already-normalized data back into normalize - presumably not
# idempotent; re-run from the load cells instead.

train_data_df = normalize(scaler, train_data_df)

# Entries are replaced one at a time, in the order of data_sets_to_preprocess.
data_set_dfs.update(
    (data_set_code, normalize(scaler, data_set_dfs[data_set_code]))
    for data_set_code in data_sets_to_preprocess
)

(14514, 3948)
(181, 3948)
(151, 3948)
(181, 3948)
(151, 3948)
(180, 3948)
(151, 3948)


In [21]:
# -- Aggregation by smoothing average
#
# Optional step, switched on by AGGREGATE_BY_SMOOTHING_AVERAGE; the window
# length is AGGREGATE_BY_SMOOTHING_AVERAGE_LENGTH. Neither is defined in the
# visible cells. Original note: this step is currently enabled with length = 3.
# NOTE(review): aggr is defined outside the visible cells; going by the section
# title it presumably applies a smoothing (moving) average - confirm.

if AGGREGATE_BY_SMOOTHING_AVERAGE:
    # Echo the flag so the recorded output shows the step was applied.
    print(AGGREGATE_BY_SMOOTHING_AVERAGE)

    train_data_df = aggr(train_data_df, AGGREGATE_BY_SMOOTHING_AVERAGE_LENGTH)

    # Entries are replaced one at a time, in the order of data_sets_to_preprocess.
    data_set_dfs.update(
        (data_set_code, aggr(data_set_dfs[data_set_code], AGGREGATE_BY_SMOOTHING_AVERAGE_LENGTH))
        for data_set_code in data_sets_to_preprocess
    )

True


In [22]:
# -- Save
#
# Write the preprocessed training frame and every preprocessed data set to CSV.
# Each target path is normalized_data_set_file_path with {data_set_code} filled in.

# Create the target folder if it does not exist yet.
create_dir(normalized_data_set_dir_path)

# Training data: written without the index column and with a header row.
train_data_df.to_csv(
    normalized_data_set_file_path.format(data_set_code=train_data_set_code),
    encoding='utf-8',
    index=False,
    header=True,
)

# Remaining data sets: same CSV options, plus a confirmation line per file.
for data_set_code in data_sets_to_preprocess:
    data_set_df = data_set_dfs[data_set_code]
    data_set_df.to_csv(
        normalized_data_set_file_path.format(data_set_code=data_set_code),
        encoding='utf-8',
        index=False,
        header=True,
    )
    print("{data_set_code} DF saved. Data shape: {data_set_shape}".format(data_set_code=data_set_code, data_set_shape=data_set_df.shape))

linear-cpu-stress-userapi-051516 DF saved. Data shape: (181, 3948)
linear-cpu-stress-redis-091514 DF saved. Data shape: (151, 3948)
linear-memory-stress-userapi-051218 DF saved. Data shape: (181, 3948)
linear-memory-stress-redis-091522 DF saved. Data shape: (151, 3948)
linear-network-delay-userapi-051816 DF saved. Data shape: (180, 3948)
linear-network-delay-redis-092016 DF saved. Data shape: (151, 3948)
