## Working environment

In [1]:
# Import
import os
import numpy as np
import pickle

# Custom library
from src.general.variables_control import variables
from src.data.functions_preprocessing_data import load_data, data_cleaning, extract_meta_data, create_traces, tokenizer, create_input_format, train_val_test_split
from src.data.functions_exploration_data import descriptive_statistics
from src.general.functions_report import report_preprocessing
from src.general.functions_time import get_timestamp

In [2]:
# Hide all CUDA devices so TensorFlow runs on the CPU only (data is stored as tensors even when tf is not used)
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [3]:
# Change working directory to the project root.
# The location is machine-specific: set the PROJECT_WORKING_DIRECTORY environment variable
# to run the notebook elsewhere instead of editing this cell. Without the variable the
# JupyterHub home directory is used as before.
working_directory = os.environ.get('PROJECT_WORKING_DIRECTORY', '/home/jupyter-sfaatz/')
os.chdir(working_directory)
print("Working directory: ", os.getcwd())

Working directory:  /home/jupyter-sfaatz


## Preprocessing pipeline


Naming Scheme:
- time:timestamp = timestamp
- concept:name = Activity ID
- case:concept:name = Case ID
- org:resource = Resource

### Parameters

In [4]:
# Set path variables (relative to the working directory)
path_raw = 'data/raw/'
path_interim = 'data/interim/'
path_benchmark = 'data/benchmark/'
path_data = 'data/processed/'
path_control = 'data/control/'
path_predictions = 'data/predictions/'
path_models = 'models/'
path_reports = 'reports/'


# Initiate control file
filename_variables = 'variables_helpdesk_true.pkl'
path_file_control = os.path.join(path_control, filename_variables)

# Run tag shared by all derived filenames: the control filename without its
# 'variables_' prefix and without its file extension (here: 'helpdesk_true').
prefix_variables = 'variables_'
if not filename_variables.startswith(prefix_variables):
    raise ValueError(f"Control filename must start with '{prefix_variables}': {filename_variables}")
run_tag = os.path.splitext(filename_variables)[0][len(prefix_variables):]

if os.path.exists(path_file_control):
    # Load an existing control file so entries stored by later stages (training etc.) are not lost
    print("Control file exists: ", path_file_control)
    # NOTE: pickle.load can execute arbitrary code; only load control files written by this project
    with open(path_file_control, 'rb') as file:
        variables_old = pickle.load(file)
    # Stored values overwrite the defaults; keys that only exist in the current
    # `variables` structure are kept, so an extended structure stays usable
    variables.update(variables_old)
else:
    print(f"Control file not found: {path_file_control} \nNew control file creation initiated.")

# Set filenames
variables['filename_dataset'] = 'helpdesk.csv'
variables['filename_processed_dataset'] = 'preprocessed_data_' + run_tag + '.npz'
variables['filename_predictions'] = 'predictions_' + run_tag + '.npz'
variables['filename_interim_dataset'] = 'interim_data_' + run_tag + '.npz'
variables['filename_benchmark_dataset'] = 'benchmark_data_' + run_tag + '.npz'

# Get timestamp
variables['timestamp_preprocessing'] = get_timestamp()

# Set params
variables['trace_length_min'] = 1
variables['interleave'] = True
variables['features'] = ['concept:name', 'org:resource']

# Input features: column names to read from the raw file
if variables['filename_dataset'] == 'helpdesk.csv':
    # The helpdesk CSV uses its own column headers
    variables['input_features'] = ['Complete Timestamp', 'Case ID', 'Activity', 'Resource']
else:
    # Standard naming scheme (see "Naming Scheme" above)
    variables['input_features'] = ['time:timestamp', 'case:concept:name', 'concept:name', 'org:resource']


Control file exists:  data/control/variables_helpdesk_true.pkl


2023-12-29 21:28:35.251736: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-29 21:28:36.469326: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-12-29 21:28:36.469368: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:168] retrieving CUDA diagnostic information for host: jupyter-ext-wi
2023-12-29 21:28:36.469373: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:175] hostname: jupyter-ext-wi
2023-12-29 21:28:36.469528: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:199] libcuda reported version is: 535.54.3
2023-12-29 21:28:36.469546: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagno

### Load data

In [5]:
# Read the raw dataset, passing the configured input column names
path_file_raw = path_raw + variables['filename_dataset']
data = load_data(path_file_raw, variables['input_features'])

In [6]:
# Show the loaded event log as plain text (long frames are truncated by the repr)
print(data)

          time:timestamp case:concept:name           concept:name org:resource
0    2012-10-09 14:50:17            Case 1     Assign seriousness      Value 1
1    2012-10-09 14:51:01            Case 1  Take in charge ticket      Value 1
2    2012-10-12 15:02:56            Case 1  Take in charge ticket      Value 2
3    2012-10-25 11:54:26            Case 1         Resolve ticket      Value 1
4    2012-11-09 12:54:39            Case 1                 Closed      Value 3
...                  ...               ...                    ...          ...
4611 2013-01-04 16:51:50          Case 998                 Closed      Value 3
4612 2013-02-12 16:06:37          Case 999     Assign seriousness      Value 1
4613 2013-02-25 11:37:20          Case 999  Take in charge ticket     Value 12
4614 2013-03-14 16:24:30          Case 999         Resolve ticket     Value 12
4615 2013-03-29 16:24:45          Case 999                 Closed      Value 3

[21348 rows x 4 columns]


In [7]:
# Calculate descriptive statistics before data cleaning
descriptive_statistics(data, variables['features'])

Number of activites:           21348
Number of resources:           21348
Unique activites:              36
Unique resources:              22
Number of cases:               4580
Unique processes:              226
Maximum case length:           15
Minimum case length:           2
Average case length:           4.66
99.99% percentile:             14.542099999999664
99.00% percentile:             9.0
95.00% percentile:             7.0
75.00% percentile:             5.0
50.00% percentile:             4.0
25.00% percentile:             4.0
concept:name nan values:        0
org:resource nan values:        0
Variance of occurence counts:  26527.315870206494
Variance in process:           25.130576903828306


### Data cleaning

In [8]:
# Data cleaning, using the configured minimum trace length.
# NOTE(review): `data` is rebound to the cleaned result, so re-running this cell
# cleans the already cleaned data again; re-run "Load data" first for a fresh start.
data = data_cleaning(data, variables['trace_length_min'])

Remove traces containing nan:
0 traces removed.
0.0 % of traces removed.

Remove too long traces:
Upper bound of 6.5 applied.
Traces longer than 6 events removed.
325 values removed.

Remove too short traces:
Traces shorter than 1 events removed.
0 values removed.

Spaces in the concept:name column replaced by '-'.
22233 values replaced.



In [9]:
# Calculate descriptive statistics after data cleaning
descriptive_statistics(data, variables['features'])

Number of activites:           18809
Number of resources:           18809
Unique activites:              32
Unique resources:              22
Number of cases:               4255
Unique processes:              95
Maximum case length:           6
Minimum case length:           2
Average case length:           4.42
99.99% percentile:             6.0
99.00% percentile:             6.0
95.00% percentile:             6.0
75.00% percentile:             5.0
50.00% percentile:             4.0
25.00% percentile:             4.0
concept:name nan values:        0
org:resource nan values:        0
Variance of occurence counts:  62417.402015677544
Variance in process:           11.836176938076685


### Create traces

In [10]:
 # Extract the meta data
variables['vocab'], variables['vocab_size'], variables['max_length_trace'], variables['num_traces'], variables['num_ex_activities'], variables['num_features'] = extract_meta_data(data,'case:concept:name',variables['features'])

Summary: 
vocab:                 ['<pad>' '<unk>' '<start>' '<end>' 'Assign-seriousness'
 'Take-in-charge-ticket']
vocab_size:            36
max_length_trace:      6
num_traces:            4255
num_ex_activities:     18809


Features: 
num_features:          2
Feature:               concept:name
Feature:               org:resource


In [11]:
# Turn the cleaned event log into traces over the selected features
use_interleave = variables['interleave']
traces = create_traces(data, variables['features'], interleave=use_interleave)

In [12]:
# Show the created traces as plain text (prints the full repr, which can be long)
print(traces)

[array(['Assign-seriousness', 'Value 1', 'Take-in-charge-ticket',
        'Value 1', 'Take-in-charge-ticket', 'Value 2', 'Resolve-ticket',
        'Value 1', 'Closed', 'Value 3'], dtype=object)
 array(['Assign-seriousness', 'Value 2', 'Take-in-charge-ticket',
        'Value 2', 'Resolve-ticket', 'Value 2', 'Closed', 'Value 5'],
       dtype=object)
 array(['Assign-seriousness', 'Value 1', 'Take-in-charge-ticket',
        'Value 9', 'Require-upgrade', 'Value 9', 'Resolve-ticket',
        'Value 2', 'Closed', 'Value 3'], dtype=object)             ...
 array(['Assign-seriousness', 'Value 1', 'Take-in-charge-ticket',
        'Value 13', 'Resolve-ticket', 'Value 13', 'Closed', 'Value 5'],
       dtype=object)
 array(['Assign-seriousness', 'Value 9', 'Take-in-charge-ticket',
        'Value 2', 'Wait', 'Value 9', 'Resolve-ticket', 'Value 9',
        'Closed', 'Value 3'], dtype=object)
 array(['Assign-seriousness', 'Value 1', 'Take-in-charge-ticket',
        'Value 12', 'Resolve-ticket', 'Valu

### Tokenizing

In [13]:
# Tokenize the traces with the extracted vocabulary and keep the resulting mapping
path_file_interim = path_interim + variables['filename_interim_dataset']
mapped_array, token_mapping = tokenizer(traces, variables['vocab'])
variables['mapping'] = token_mapping

# Store the tokenized traces as interim dataset
np.savez(path_file_interim, mapped_array=mapped_array)


Mapping: 100%|██████████| 4255/4255 [00:00<00:00, 709500.02it/s]
Processing Arrays: 100%|██████████| 4255/4255 [00:00<00:00, 1469353.16it/s]


In [14]:
# Show the token mapping returned by the tokenizer
print(variables['mapping'])

{'<pad>': 0, '<unk>': 1, '<start>': 2, '<end>': 3, 'Assign-seriousness': 4, 'Take-in-charge-ticket': 5, 'Resolve-ticket': 6, 'Closed': 7, 'Require-upgrade': 8, 'Wait': 9, 'Insert-ticket': 10, 'Create-SW-anomaly': 11, 'Schedule-intervention': 12, 'Resolve-SW-anomaly': 13, 'Value 1': 14, 'Value 2': 15, 'Value 3': 16, 'Value 5': 17, 'Value 9': 18, 'Value 4': 19, 'Value 19': 20, 'Value 8': 21, 'Value 12': 22, 'Value 16': 23, 'Value 15': 24, 'Value 14': 25, 'Value 6': 26, 'Value 13': 27, 'Value 10': 28, 'Value 17': 29, 'Value 7': 30, 'Value 18': 31, 'Value 11': 32, 'Value 20': 33, 'Value 21': 34, 'Value 22': 35}


### Create input format

In [15]:
# Build the model inputs and targets from the tokenized traces
x_input, y_input = create_input_format(
    mapped_array,
    variables['mapping'],
    variables['num_traces'],
    variables['max_length_trace'],
    variables['num_ex_activities'],
    num_features=variables['num_features'],
    interleave=variables['interleave'],
)

Processing Arrays: 100%|██████████| 4255/4255 [00:01<00:00, 2848.45it/s]


### Train-Val-Test split

In [16]:
# Split into 70 % training, 15 % validation and 15 % test data (shuffling disabled)
x_train, y_train, x_val, y_val, x_test, y_test = train_val_test_split(
    x_input,
    y_input,
    train_size=0.7,
    val_size=0.15,
    test_size=0.15,
    shuffle=False,
)

# Remember the shapes of the input splits in the control variables
variables.update({
    'x_train_shape': x_train.shape,
    'x_val_shape': x_val.shape,
    'x_test_shape': x_test.shape,
})

Number of training samples:    26332
Number of validation samples:  5643
Number of test samples:        5643


### Save data

In [17]:
# Write all six splits into one npz archive
path_file_processed = path_data + variables['filename_processed_dataset']
processed_splits = {
    'x_train': x_train,
    'y_train': y_train,
    'x_val': x_val,
    'y_val': y_val,
    'x_test': x_test,
    'y_test': y_test,
}
np.savez(path_file_processed, **processed_splits)

In [18]:
# Create the predictions archive, or refresh `y_test` in an existing one while
# keeping every other array that is already stored in it.
path_file_predictions = path_predictions + variables['filename_predictions']

if os.path.isfile(path_file_predictions):
    # np.load returns a lazily reading NpzFile that keeps the file open: copy all
    # arrays into a dict and close the archive before the same path is overwritten
    with np.load(path_file_predictions) as data_predictions:
        data_predictions_dict = dict(data_predictions)
else:
    # No archive yet: start from an empty collection
    data_predictions_dict = {}

# Update predictions with the test targets of this preprocessing run
data_predictions_dict['y_test'] = y_test

# Save the (new or modified) data to the npz file
np.savez(path_file_predictions, **data_predictions_dict)

In [19]:
# Write the control variables to the pickle control file
path_file_variables = path_control + filename_variables
with open(path_file_variables, 'wb') as control_file:
    pickle.dump(variables, control_file)

### Summary

In [24]:
# Print the preprocessing summary and write it to the reports folder.
# NOTE(review): the stored output of this cell comes from an earlier run (execution
# count 24, BPI 2019 dataset, interleave False) and does not match the helpdesk
# configuration above; restart the kernel and run all cells to refresh it.
report_preprocessing(filename_variables, variables, variables['timestamp_preprocessing'], path_reports)

Summary:


Dataset:                 BPI_Challenge_2019.xes
Filename processed data: preprocessed_data_bpi2019_false.npz
Filename variables:      variables_bpi2019_false.pkl


vocab (first 6):         ['<pad>' '<unk>' '<start>' '<end>' 'Create-Purchase-Order-Item'
 'Receive-Order-Confirmation']
vocab_size:              617
max_length_trace:        7
num_traces:              227314
num_ex_activities:       1154439
num_features:            2
features:                ['concept:name', 'org:resource']
interleave:              False


Samples in training:     (1616214, 16)
Samples in validation:   (346332, 16)
Samples in test:         (346332, 16)

Report has been written to 'reports/preprocessing/2023-11-18_19-18-31_report_preprocessing_bpi2019_false.txt'
