## Working environment

In [1]:
# Import
import os
import numpy as np
import pickle

# Custom library
from src.general.variables_control import variables
from src.data.functions_preprocessing_data import load_interim_data, create_input_format, train_val_test_split
from src.general.functions_report import report_preprocessing_benchmark
from src.general.functions_time import get_timestamp

In [2]:
# Restrict TensorFlow to CPU-only: '-1' hides all CUDA devices (data is stored as tensors even when tf is not used)
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [3]:
# Change working directory
# The data/model/report paths used in this notebook are relative (e.g. 'data/raw/'),
# so the notebook has to run from the project root. The location can be overridden
# per machine through the PROJECT_WORKING_DIRECTORY environment variable; without
# it the previously hard-coded server path is used.
working_directory = os.environ.get('PROJECT_WORKING_DIRECTORY', '/home/jupyter-sfaatz/')
os.chdir(working_directory)
print("Working directory: ", os.getcwd())

Working directory:  /home/jupyter-sfaatz


## Preprocessing pipeline


Naming Scheme:
- time:timestamp = Timestamp
- concept:name = Activity ID
- case:concept:name = Case ID
- org:resource = Resource

### Parameters

In [4]:
# Project folders, relative to the working directory
path_raw = 'data/raw/'
path_interim = 'data/interim/'
path_benchmark = 'data/benchmark/'
path_data = 'data/processed/'
path_control = 'data/control/'
path_predictions = 'data/predictions/'
path_models = 'models/'
path_reports = 'reports/'


# Initialize variables from the control file of this dataset
filename_variables = 'variables_helpdesk.pkl'

# NOTE: this rebinds the name `variables`, which is also imported from
# src.general.variables_control at the top of the notebook.
# pickle can execute arbitrary code on load, so only open trusted control files.
with open(path_control + filename_variables, 'rb') as variables_file:
    variables = pickle.load(variables_file)

# Timestamp of this run; it is passed to the report at the end of the notebook
variables['timestamp_preprocessing_benchmark'] = get_timestamp()

2023-12-29 21:30:39.916902: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-29 21:30:41.072844: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-12-29 21:30:41.072899: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:168] retrieving CUDA diagnostic information for host: jupyter-ext-wi
2023-12-29 21:30:41.072904: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:175] hostname: jupyter-ext-wi
2023-12-29 21:30:41.073080: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:199] libcuda reported version is: 535.54.3
2023-12-29 21:30:41.073101: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagno

### Load data

In [5]:
# Load interim data from the file named in the control variables
path_interim_dataset = path_interim + variables['filename_interim_dataset']
mapped_array = load_interim_data(path_interim_dataset)


mapped_array shape:  (4255,)




In [6]:
# Quick look at the loaded traces (NumPy abbreviates long arrays in the printout)
print(mapped_array)

[array([4, 5, 5, 6, 7], dtype=object) array([4, 5, 6, 7], dtype=object)
 array([4, 5, 8, 6, 7], dtype=object) ...
 array([4, 5, 6, 7], dtype=object) array([4, 5, 9, 6, 7], dtype=object)
 array([4, 5, 6, 7], dtype=object)]


### Create input format

In [7]:
# Build the model inputs and targets from the mapped traces (benchmark mode)
x_input, y_input = create_input_format(
    mapped_array,
    variables['mapping'],
    variables['num_traces'],
    variables['max_length_trace'],
    variables['num_ex_activities'],
    num_features=variables['num_features'],
    benchmarking=True,
)

Processing Arrays: 100%|██████████| 4255/4255 [00:00<00:00, 7492.62it/s]


### Train-Val-Test split

In [8]:
# Split the samples 70 % / 15 % / 15 % into training, validation and test sets
x_train, y_train, x_val, y_val, x_test, y_test = train_val_test_split(
    x_input,
    y_input,
    train_size=0.7,
    val_size=0.15,
    test_size=0.15,
)

# Training and validation targets keep only their first column, because of the
# training structure; y_test is left unchanged
y_train, y_val = y_train[:, :1], y_val[:, :1]

# Remember the input shapes of each split in the control variables
variables['x_train_shape_benchmark'] = x_train.shape
variables['x_val_shape_benchmark'] = x_val.shape
variables['x_test_shape_benchmark'] = x_test.shape

Number of training samples:    13166
Number of validation samples:  2822
Number of test samples:        2821


### Save data

In [9]:
# Write all six splits into one .npz archive in the benchmark folder
path_benchmark_dataset = path_benchmark + variables['filename_benchmark_dataset']
np.savez(
    path_benchmark_dataset,
    x_train=x_train,
    y_train=y_train,
    x_val=x_val,
    y_val=y_val,
    x_test=x_test,
    y_test=y_test,
)

In [10]:
# Location of the predictions archive
path_predictions_file = path_predictions + variables['filename_predictions']

# Start from the arrays already stored in the archive, if it exists.
# The context manager closes the archive before the same path is overwritten
# below; the arrays are read into memory while it is still open.
if os.path.isfile(path_predictions_file):
    with np.load(path_predictions_file) as data_predictions:
        data_predictions_dict = {key: data_predictions[key] for key in data_predictions.files}
else:
    # No archive yet: it is created by the single write below
    data_predictions_dict = {}

# Add (or replace) the benchmark ground truth
data_predictions_dict['y_test_benchmark'] = y_test

# Save the modified data back to the npz file
np.savez(path_predictions_file, **data_predictions_dict)

In [11]:
# Write the updated variables (timestamp and split shapes) back to the control file
with open(path_control + filename_variables, 'wb') as variables_file:
    pickle.dump(variables, variables_file)

### Summary

In [12]:
# Print the run summary and write the report file to the reports folder
report_preprocessing_benchmark(
    filename_variables,
    variables,
    variables['timestamp_preprocessing_benchmark'],
    path_reports,
)

Summary:


Dataset:                 helpdesk.csv
Filename interim data:   interim_data_helpdesk.npz
Filename benchmark data: benchmark_data_helpdesk.npz
Filename variables:      variables_helpdesk.pkl


Samples in training:     (13166, 6)
Samples in validation:   (2822, 6)
Samples in test:         (2821, 6)

Report has been written to 'reports/preprocessing/benchmark/2023-12-29_21-30-41_report_preprocessing_pdesk.txt'
