# Arabesque Dataset ETL

## Metadata

|Parameter | Description |
|----------|-------------|
|limit_cores| CPU cores limit of the container. Sampled every 60 seconds.|
|limit_utilization| The fraction of the CPU limit that is currently in use on the instance. This value cannot exceed 1 as usage cannot exceed the limit. Sampled every 60 seconds. After sampling, data is not visible for up to 240 seconds.|
|request_cores|Number of CPU cores requested by the container. Sampled every 60 seconds. After sampling, data is not visible for up to 120 seconds.|
|request_utilization|The fraction of the requested CPU that is currently in use on the instance. This value can be greater than 1 as usage can exceed the request. Sampled every 60 seconds. After sampling, data is not visible for up to 240 seconds.|
|limit_bytes|Memory limit of the container in bytes. Sampled every 60 seconds.|
|request_bytes|Memory request of the container in bytes. Sampled every 60 seconds.|
|uptime|Time in seconds that the container has been running. Sampled every 60 seconds.|
|||

In [1]:
import pandas as pd
import numpy as np
import pickle

# Tupled Dataset
## 1. Extract

In [2]:
# Read the pickled export into `dataset`. The lookup further down shows it is
# keyed by 4-tuples whose values are pandas DataFrames indexed by time.
# NOTE: pickle.load can execute arbitrary code -- only open trusted files.
with open('dataset_portfolio_July_all.pkl', mode='rb') as pickle_file:
    dataset = pickle.load(pickle_file)

In [4]:
# Display the raw metrics frame of one container as a sanity check.
dataset[
    'dev-compute-2',
    'qryfolio',
    'qryfolio-data-job-apac-1625110200-3693355398',
    'main',
]

Unnamed: 0_level_0,container/cpu/limit_cores,container/cpu/limit_utilization,container/cpu/request_cores,container/cpu/request_utilization,container/memory/limit_bytes,container/memory/limit_utilization_{'memory_type': 'evictable'},container/memory/limit_utilization_{'memory_type': 'non-evictable'},container/memory/request_bytes,container/memory/request_utilization_{'memory_type': 'evictable'},container/memory/request_utilization_{'memory_type': 'non-evictable'},container/memory/used_bytes_{'memory_type': 'evictable'},container/memory/used_bytes_{'memory_type': 'non-evictable'},container/uptime
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2021-07-01 03:30:00+00:00,0.1,,0.1,,268435456.0,,,134217728.0,,,0.0,71856128.0,15.416095
2021-07-01 03:31:00+00:00,,0.206846,,0.206846,,0.0,0.267685,,0.0,0.53537,,,
2021-07-01 03:32:00+00:00,,,,,,0.0,0.267685,,0.0,0.53537,,,


## 2. Transform

In [35]:
# Number of metric columns a frame must have to be usable; some instances in
# the export have an incomplete feature set and are skipped.
EXPECTED_FEATURE_COUNT = 13

# Keys of `dataset` whose frame carries the complete feature set.
valid_containers = [
    key for key in dataset.keys()
    if len(dataset[key].columns) == EXPECTED_FEATURE_COUNT
]

# Target layout:
#   final[namespace][pod_name][container] = {
#       "timestamp": seconds elapsed since the container's first sample,
#       "workload":  [mem non-evictable, mem evictable, cpu usage],
#       "limit":     [mem limit, cpu limit],
#       "request":   [mem request, cpu request],
#   }
# Every metric is a numpy array in which missing samples are encoded as -1.
final = {}

for key in valid_containers:
    frame = dataset[key]

    # General info: positions 1..3 of the key tuple.
    # NOTE(review): key[0] is not part of the output path, so two keys that
    # differ only in that field would overwrite each other -- confirm that
    # this cannot happen in the export.
    namespace = key[1]
    pod_name = key[2]
    container = key[3]

    # --- CPU -----------------------------------------------------------
    request_cores = frame["container/cpu/request_cores"]
    cpu_limit = frame["container/cpu/limit_cores"].fillna(-1).to_numpy()
    cpu_request = request_cores.fillna(-1).to_numpy()

    # usage (cores) = requested cores * fraction of the request in use.
    # The product is taken on the raw series so the -1 sentinel never takes
    # part in the arithmetic (it used to produce negative usage when the
    # request was missing and "100 % of the request" when the utilisation was
    # missing). Rows that still lack a value become -1 afterwards.
    # The request is forward-filled because, in the sample shown in this
    # notebook, the request and its utilisation are reported on different
    # rows. Assumes the request is unchanged until the next reported value --
    # TODO confirm.
    cpu_utilization = (
        request_cores.ffill() * frame["container/cpu/request_utilization"]
    ).fillna(-1).to_numpy()

    # --- Memory --------------------------------------------------------
    mem_limit = frame["container/memory/limit_bytes"].fillna(-1).to_numpy()
    mem_request = frame["container/memory/request_bytes"].fillna(-1).to_numpy()
    mem_usage_evictable = (
        frame["container/memory/used_bytes_{'memory_type': 'evictable'}"]
        .fillna(-1)
        .to_numpy()
    )
    mem_usage_non = (
        frame["container/memory/used_bytes_{'memory_type': 'non-evictable'}"]
        .fillna(-1)
        .to_numpy()
    )

    # --- Time axis -----------------------------------------------------
    # Index values as int64 are divided by 10**9 to get whole seconds
    # (this assumes nanosecond resolution), then shifted so the earliest
    # sample is at 0.
    epoch_seconds = frame.index.values.astype(np.int64) // 10 ** 9
    elapsed_seconds = epoch_seconds - np.amin(epoch_seconds)

    record = {
        "timestamp": elapsed_seconds,
        "workload": [mem_usage_non, mem_usage_evictable, cpu_utilization],
        "limit": [mem_limit, cpu_limit],
        "request": [mem_request, cpu_request],
    }

    # Insert under the container name instead of replacing the whole pod
    # entry, so pods with several containers keep all of them.
    final.setdefault(namespace, {}).setdefault(pod_name, {})[container] = record

In [39]:
# Pod names stored under the 'qryfolio-daily' namespace of the result.
final["qryfolio-daily"].keys()

dict_keys(['qryfolio-cli-backtest-global-q9m8m-1716119528', 'qryfolio-cli-backtest-global-q9m8m-1177278544', 'qryfolio-cli-backtest-emea-4z5dg-384635719', 'qryfolio-cli-backtest-emea-4z5dg-3118974067', 'qryfolio-cli-backtest-bimb-wbtrt-3376620446', 'qryfolio-cli-backtest-bimb-wbtrt-1628592346', 'qryfolio-rebalance-global-d86j7-537827377', 'qryfolio-rebalance-global-d86j7-3379045925', 'qryfolio-rebalance-oas-h85zl-3154563979', 'qryfolio-rebalance-oas-h85zl-4114233401'])

## 3. Load

In [40]:
# Persist the transformed dictionary with the newest pickle protocol available.
with open('arabesque.pickle', mode='wb') as pickle_file:
    pickle.dump(final, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

# Nested Dataset
## 1. Extract

In [42]:
# Read the second export into `dataset` (this rebinds the name used for the
# tupled dataset). The transform cell accesses it as nested mappings:
# dataset['dev-compute-4']['engine'][pod]['main'].
# NOTE: pickle.load can execute arbitrary code -- only open trusted files.
with open('dataset-2/dataset_engine.pkl', mode='rb') as pickle_file:
    dataset = pickle.load(pickle_file)

## 2. Transform

In [72]:
# Number of metric columns a frame must have to be usable; some instances in
# the export have an incomplete feature set and are skipped.
EXPECTED_FEATURE_COUNT = 13

# This export is nested; only the 'main' container of every pod found under
# dataset['dev-compute-4']['engine'] is transformed.
namespace = 'engine'
container = 'main'
engine_pods = dataset['dev-compute-4'][namespace]

# Pods whose 'main' frame carries the complete feature set.
valid_containers = [
    pod for pod in engine_pods.keys()
    if len(engine_pods[pod][container].columns) == EXPECTED_FEATURE_COUNT
]

# Target layout:
#   final[namespace][pod_name][container] = {
#       "timestamp": seconds elapsed since the container's first sample,
#       "workload":  [mem non-evictable, mem evictable, cpu usage],
#       "limit":     [mem limit, cpu limit],
#       "request":   [mem request, cpu request],
#   }
# Every metric is a numpy array in which missing samples are encoded as -1.
final = {}

# Iterate over the validated pods only; the filter above used to be computed
# but never applied, so incomplete pods were still processed.
for pod_name in valid_containers:
    frame = engine_pods[pod_name][container]

    # --- CPU -----------------------------------------------------------
    request_cores = frame["container/cpu/request_cores"]
    cpu_limit = frame["container/cpu/limit_cores"].fillna(-1).to_numpy()
    cpu_request = request_cores.fillna(-1).to_numpy()

    # usage (cores) = requested cores * fraction of the request in use.
    # The product is taken on the raw series so the -1 sentinel never takes
    # part in the arithmetic (it used to produce negative usage when the
    # request was missing and "100 % of the request" when the utilisation was
    # missing). Rows that still lack a value become -1 afterwards.
    # Assumes the request is unchanged until the next reported value, hence
    # the forward fill -- TODO confirm against the raw export.
    cpu_utilization = (
        request_cores.ffill() * frame["container/cpu/request_utilization"]
    ).fillna(-1).to_numpy()

    # --- Memory --------------------------------------------------------
    mem_limit = frame["container/memory/limit_bytes"].fillna(-1).to_numpy()
    mem_request = frame["container/memory/request_bytes"].fillna(-1).to_numpy()
    mem_usage_evictable = (
        frame["container/memory/used_bytes_{'memory_type': 'evictable'}"]
        .fillna(-1)
        .to_numpy()
    )
    mem_usage_non = (
        frame["container/memory/used_bytes_{'memory_type': 'non-evictable'}"]
        .fillna(-1)
        .to_numpy()
    )

    # --- Time axis -----------------------------------------------------
    # Index values as int64 are divided by 10**9 to get whole seconds
    # (this assumes nanosecond resolution), then shifted so the earliest
    # sample is at 0.
    epoch_seconds = frame.index.values.astype(np.int64) // 10 ** 9
    elapsed_seconds = epoch_seconds - np.amin(epoch_seconds)

    record = {
        "timestamp": elapsed_seconds,
        "workload": [mem_usage_non, mem_usage_evictable, cpu_utilization],
        "limit": [mem_limit, cpu_limit],
        "request": [mem_request, cpu_request],
    }

    # Insert under the container name so the pod entry is extended rather
    # than replaced.
    final.setdefault(namespace, {}).setdefault(pod_name, {})[container] = record

# final['engine']['datasets-qbt8z-4024139702']['main']['workload']

## 3. Load

In [74]:
# Persist the transformed engine dictionary with the newest pickle protocol.
with open('engine.pickle', mode='wb') as pickle_file:
    pickle.dump(final, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)