## Work environment

In [1]:
# Import
import os
import pickle
import numpy as np

from joblib import load
from src.general.functions_time import tic, toc, get_timestamp
from src.data.functions_training_data import load_processed_data
from src.evaluation.functions_prediction import get_multi_dim_prediction
from src.general.functions_report import report_prediction_random_forest

2023-12-29 21:43:27.717233: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Hide all CUDA devices so TensorFlow runs on the CPU only; no GPU is used
# (the data is stored as tensors even when TensorFlow itself is not used)
os.environ.update({'CUDA_VISIBLE_DEVICES': '-1'})

In [3]:
# Change working directory so that the relative paths used in this notebook resolve against it
# Alternative location (local Windows checkout), kept for reference:
# working_directory = 'c:/Users/Steph/OneDrive - Universität Bayreuth/Masterarbeit/03_Programmierung/remaining_trace_prediction_master_thesis_stephan_faatz'
working_directory = '/home/jupyter-sfaatz/'
os.chdir(working_directory)

# Echo the directory that is actually active after the change
active_directory = os.getcwd()
print("Working directory: ", active_directory)

Working directory:  /home/jupyter-sfaatz


## Parameters

In [4]:
# Set path variables (relative paths; every entry keeps its trailing slash because
# file names are appended by plain string concatenation)
_data_root = 'data/'
path_raw = _data_root + 'raw/'
path_interim = _data_root + 'interim/'
path_benchmark = _data_root + 'benchmark/'
path_data = _data_root + 'processed/'
path_control = _data_root + 'control/'
path_predictions = _data_root + 'predictions/'

# Locations outside the data folder
path_models = 'models/'
path_reports = 'reports/'

# Initialize variables: restore the control variables from their pickle file
# NOTE: pickle.load can execute arbitrary code; only open trusted control files.
filename_variables = 'variables_helpdesk_true.pkl'

with open(path_control + filename_variables, 'rb') as variables_file:
    variables = pickle.load(variables_file)

# Timestamp taken at the start of this prediction run (passed to the report later on)
variables['rf_timestamp_prediction_start'] = get_timestamp()

2023-12-29 21:44:26.976706: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-12-29 21:44:26.976751: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:168] retrieving CUDA diagnostic information for host: jupyter-ext-wi
2023-12-29 21:44:26.976756: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:175] hostname: jupyter-ext-wi
2023-12-29 21:44:26.977001: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:199] libcuda reported version is: 535.54.3
2023-12-29 21:44:26.977019: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:203] kernel reported version is: 535.54.3
2023-12-29 21:44:26.977022: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:309] kernel version seems to match DSO: 535.54.3


## Preprocessing

In [5]:
# Load benchmark data (train / validation / test splits) from the benchmark folder
# tensor=False is forwarded to load_processed_data
# (presumably returns plain arrays instead of tensors -- TODO confirm)
benchmark_dataset_path = path_benchmark + variables['filename_benchmark_dataset']
x_train, y_train, x_val, y_val, x_test, y_test = load_processed_data(
    benchmark_dataset_path,
    tensor=False,
)


x_train_tensor shape:  (26332, 12)
y_train_tensor shape:  (26332, 1)
x_val_tensor shape:    (5643, 12)
y_val_tensor shape:    (5643, 1)
x_test_tensor shape:   (5643, 12)
y_test_tensor shape:  (5643, 12)




## Predicting

In [6]:
# Load the random forest model stored with joblib
# NOTE: joblib.load unpickles the file and can execute arbitrary code; only load trusted model files.
random_forest_dir = path_models + 'random_forest/'
rf = load(random_forest_dir + variables['random_forest_model'])

In [7]:
# Get multidimensional predictions
# tic() / toc() bracket the prediction call; the value returned by toc() is stored in
# `variables` as the elapsed prediction time (presumably seconds, judging by the printed
# "Elapsed time" output -- TODO confirm against functions_time).
tic()
# Predict on the test split with the loaded model, using the stored mapping from `variables`
y_pred = get_multi_dim_prediction(rf, x_test, variables['mapping'])
variables['elapsed_time_predictions_rf'] = toc()

Predicting:   0%|          | 0/12 [00:00<?, ?it/s][Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.1s finished
Predicting:  17%|█▋        | 2/12 [00:00<00:00, 13.20it/s][Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.1s finished
Predicting:  33%|███▎      | 4/1


Elapsed time: 0.849087 seconds





In [8]:
# Save predictions
# Merge the random forest results into the existing predictions archive.
# np.load on an .npz file returns a lazy NpzFile that keeps the archive open. The
# context manager closes it before np.savez rewrites the same path; overwriting a
# file that is still open leaks the handle and can fail on Windows.
predictions_file = path_predictions + variables['filename_predictions']
with np.load(predictions_file) as data_predictions:
    # dict(...) reads every stored array into memory while the archive is still open
    data_predictions_copy = dict(data_predictions)

# Add / overwrite the random forest predictions and the matching benchmark targets
data_predictions_copy['y_pred_rf'] = y_pred
data_predictions_copy['y_test_benchmark'] = y_test

np.savez(predictions_file, **data_predictions_copy)
print("Predictions saved to ", predictions_file)

Predictions saved to  data/predictions/predictions_helpdesk_true.npz


In [9]:
# Generate report for this prediction run; the start timestamp recorded in `variables`
# and the reports folder are handed to the report helper
report_prediction_random_forest(
    filename_variables,
    variables,
    variables['rf_timestamp_prediction_start'],
    path_reports,
)


Summary:


Dataset:                 helpdesk.csv
Filename interim data:   interim_data_helpdesk_true.npz
Filename processed data: preprocessed_data_helpdesk_true.npz
Filename variables:      variables_helpdesk_true.pkl


vocab (first 6):         ['<pad>' '<unk>' '<start>' '<end>' 'Assign-seriousness'
 'Take-in-charge-ticket']
vocab_size:              36
max_length_trace:        6
num_traces:              4255
num_ex_activities:       18809
num_features:            2
features:                ['concept:name', 'org:resource']
interleave:              True


Samples in training:     (26332, 14)
Samples in validation:   (5643, 14)
Samples in test:         (5643, 14)


Prediction Random Forest:
Elapsed time:            0.8102619647979736
Random Forest model:     rf_helpdesk_true.joblib


Parameters Random Forest:
rf_n_estimators:         100
rf_n_jobs:               -1

Report has been written to 'reports/prediction/competing_artifacts/random_forest/2023-12-29_21-44-26_report_prediction_rf_

In [10]:
# Store variables in pickle file: write the updated `variables` back to the same
# control file they were loaded from (the existing file is overwritten)
variables_path = path_control + filename_variables
with open(variables_path, 'wb') as variables_file:
    pickle.dump(variables, variables_file)