# Work environment

In [1]:
# Import
import os
import numpy as np
import pickle
from tqdm import tqdm

from scipy.stats import skew
from src.data.functions_preprocessing_data import load_interim_data
from src.data.functions_exploration_data import count_unique_subarrays, count_unique_elements

In [2]:
# Force TensorFlow to run CPU-only: '-1' hides every CUDA device from the process.
# This has to be set before TensorFlow is first loaded
# (the original note: data is stored as tensors even when tf is not used).
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [3]:
# Change working directory to the project root so the relative paths used by
# the following cells resolve.
# The default is the server home directory this notebook was last run from;
# set the REMAINING_TRACE_WORKDIR environment variable to run from another
# checkout (e.g. a local clone) without editing this cell.
working_directory = os.environ.get('REMAINING_TRACE_WORKDIR') or '/home/jupyter-sfaatz/'
os.chdir(working_directory)
print("Working directory: ", os.getcwd())

Working directory:  /home/jupyter-sfaatz


In [4]:
# Folder layout, relative to the working directory.
# Every path keeps its trailing slash because file names are appended to it
# by plain string concatenation.

# Top-level output folders
path_models = 'models/'
path_reports = 'reports/'

# Folders below data/
path_raw = 'data/raw/'
path_interim = 'data/interim/'
path_data = 'data/processed/'
path_benchmark = 'data/benchmark/'
path_control = 'data/control/'
path_predictions = 'data/predictions/'

In [5]:
# Restore the pickled control variables for this dataset from the control folder.
# NOTE(review): pickle.load can execute arbitrary code while loading, so only
# open control files that were produced by this project.
# NOTE(review): the TensorFlow start-up log printed by this cell suggests that
# unpickling pulls in TensorFlow -- confirm against the code that wrote the file.
filename_variables = 'variables_bpi2019_true.pkl'
path_variables_file = path_control + filename_variables
with open(path_variables_file, 'rb') as pickle_file:
    variables = pickle.load(pickle_file)

2023-12-29 21:26:44.216084: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-29 21:26:45.369737: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-12-29 21:26:45.369784: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:168] retrieving CUDA diagnostic information for host: jupyter-ext-wi
2023-12-29 21:26:45.369789: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:175] hostname: jupyter-ext-wi
2023-12-29 21:26:45.369945: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:199] libcuda reported version is: 535.54.3
2023-12-29 21:26:45.369962: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagno

In [6]:
# Load the interim dataset; its file name is taken from the control variables
path_interim_dataset = path_interim + variables['filename_interim_dataset']
mapped_array = load_interim_data(path_interim_dataset)


mapped_array shape:  (227314,)




### Skewness

In [7]:
# Length of every entry of mapped_array, in the original order
trace_length = list(map(len, mapped_array))

In [8]:
# Calculate skewness of the trace-length distribution
# (scipy.stats.skew is called with its default settings).
skewness = skew(trace_length)
# Report the value so it shows up in the cell output, as the sparsity,
# variation and repetitiveness cells do for their metrics.
print("Skewness: ", skewness)

### Sparsity

In [9]:
# Sparsity: number of unique elements (as reported by count_unique_elements)
# divided by the number of entries in mapped_array
num_unique_elements = count_unique_elements(mapped_array)
print("Number of unique elements: ", num_unique_elements)
num_entries = len(mapped_array)
sparsity = num_unique_elements / num_entries
print("Sparsity:", sparsity)

Counting: 100%|██████████| 227314/227314 [00:00<00:00, 1361540.27it/s]

Number of unique elements:  613
Sparsity: 0.0026967102774136218





### Variation

In [10]:
# Calculate variation: number of unique arrays (as reported by
# count_unique_subarrays) divided by the number of entries in mapped_array
num_unique_array = count_unique_subarrays(mapped_array)
print("Number of unique arrays: ", num_unique_array)
# Take the denominator from the array that was just counted, as the sparsity
# cell does, instead of the pickled variables['num_traces']: a stored count can
# silently drift from the data that is actually loaded.
variation = num_unique_array / len(mapped_array)
print("Variation: ", variation)

Counting: 227314it [00:00, 1110706.76it/s]

Number of unique arrays:  35512
Variation:  0.15622442964357672





### Repetitiveness

In [11]:
# Number of distinct values inside each entry of mapped_array
unique_counts = [len(np.unique(trace)) for trace in mapped_array]

# Averages over the whole log; the mean trace length is computed once and
# reused for both the report line and the ratio
average_unique_count = np.mean(unique_counts)
mean_trace_length = np.mean(trace_length)

print("Average number of unique elements:", average_unique_count)
print("Trace length mean:", mean_trace_length)
# Repetitiveness: mean trace length relative to the mean number of distinct
# values per trace (1.0 would mean no value repeats within a trace on average)
print("Repetitiveness: ", mean_trace_length / average_unique_count)

Average number of unique elements: 9.895562965765418
Trace length mean: 10.157218649093325
Repetitiveness:  1.026441717791411
