## Work environment

In [11]:
# Import
import os
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from joblib import dump
from src.general.functions_report import report_training_random_forest
from src.general.functions_time import tic, toc, get_timestamp
from src.data.functions_training_data import load_processed_data

In [2]:
# Force TensorFlow to run CPU-only: CUDA_VISIBLE_DEVICES='-1' hides every GPU.
# TensorFlow is still involved because the data is stored as tensors, even
# though the model trained in this notebook is a scikit-learn random forest.
# Keep this cell ahead of any cell that makes TensorFlow initialize CUDA.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [3]:
# Change working directory to the project root so that the relative data/,
# models/ and reports/ paths defined below resolve.
# The default is the JupyterHub home used for the recorded run; set the
# RTP_WORKING_DIRECTORY environment variable to run from another checkout
# without editing this cell.
# NOTE(review): the import cell above already imports from `src.*` before this
# chdir; on a fresh kernel that presumably only works when the kernel starts in
# the project root -- confirm with Restart & Run All.
working_directory = os.environ.get('RTP_WORKING_DIRECTORY', '/home/jupyter-sfaatz/')
os.chdir(working_directory)
print("Working directory: ", os.getcwd())

Working directory:  /home/jupyter-sfaatz


## Parameters

In [4]:
# Set path variables (relative to the working directory chosen above)
path_raw = 'data/raw/'
path_interim = 'data/interim/'
path_benchmark = 'data/benchmark/'
path_data = 'data/processed/'
path_control = 'data/control/'
path_predictions = 'data/predictions/'
path_models = 'models/'
path_reports = 'reports/'

# Initialize variables
filename_variables = 'variables_helpdesk.pkl'

# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load control files that were produced by this project.
with open(path_control + filename_variables, 'rb') as file:
    variables = pickle.load(file)

# Timestamp
variables['rf_timestamp_training_start'] = get_timestamp()

# Set model name: "variables_<dataset>.pkl" -> "rf_<dataset>.joblib".
# The extension and prefix are stripped explicitly instead of slicing with
# fixed indices, so a different extension or a missing prefix cannot silently
# cut characters out of the dataset name.
dataset_name = os.path.splitext(filename_variables)[0]
if dataset_name.startswith('variables_'):
    dataset_name = dataset_name[len('variables_'):]
variables['random_forest_model'] = "rf_" + dataset_name + ".joblib"

2023-12-29 21:32:48.951570: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-12-29 21:32:48.951608: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:168] retrieving CUDA diagnostic information for host: jupyter-ext-wi
2023-12-29 21:32:48.951613: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:175] hostname: jupyter-ext-wi
2023-12-29 21:32:48.951737: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:199] libcuda reported version is: 535.54.3
2023-12-29 21:32:48.951753: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:203] kernel reported version is: 535.54.3
2023-12-29 21:32:48.951756: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:309] kernel version seems to match DSO: 535.54.3


## Preprocessing

In [5]:
# Load the train / validation / test splits of the benchmark dataset.
# tensor=False presumably returns array-like data instead of TensorFlow
# tensors (y_train.flatten() is called on the result later) -- confirm in
# load_processed_data.
x_train, y_train, x_val, y_val, x_test, y_test = load_processed_data(
    path_benchmark + variables['filename_benchmark_dataset'],
    tensor=False,
)


x_train_tensor shape:  (13166, 6)
y_train_tensor shape:  (13166, 1)
x_val_tensor shape:    (2822, 6)
y_val_tensor shape:    (2822, 1)
x_test_tensor shape:   (2821, 6)
y_test_tensor shape:  (2821, 6)




In [6]:
# Model parameters
# Stored in `variables` so they are passed to RandomForestClassifier in the
# training cell and show up under "Parameters Random Forest" in the report.
# Number of trees in the forest.
variables['rf_n_estimators'] = 100
# -1 lets scikit-learn use all available processors (16 workers in the recorded run).
variables['rf_n_jobs'] = -1

## Training

In [7]:
# Initialize the model; the fixed random_state keeps the fitted forest reproducible
rf = RandomForestClassifier(
    n_estimators=variables['rf_n_estimators'],
    max_depth=12,
    random_state=29061998,
    verbose=1,
    n_jobs=variables['rf_n_jobs'],
)

# Train the model and time only the fit call.
# y_train is loaded with shape (n, 1); flatten() turns it into the 1-D label
# vector passed to fit().
tic()
rf.fit(x_train, y_train.flatten())
# toc() reports the elapsed time, which is kept for the training report.
variables['rf_elapsed_time'] = toc()

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.2s



Elapsed time: 0.402420 seconds


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.4s finished


In [8]:
# Save model
# Create the target folder first: joblib.dump does not create missing
# directories, so a fresh checkout would otherwise fail with FileNotFoundError.
# exist_ok=True keeps the cell safe to re-run.
model_dir = path_models + 'random_forest/'
os.makedirs(model_dir, exist_ok=True)
# Last expression of the cell: dump() returns the list of written file names.
dump(rf, model_dir + variables['random_forest_model'])

['models/random_forest/rf_helpdesk.joblib']

## Evaluation

In [9]:
# Predict on the validation split and record the accuracy for the report
y_val_pred = rf.predict(x_val)
val_accuracy = accuracy_score(y_val.flatten(), y_val_pred)
variables['rf_acc'] = val_accuracy
print('acc: ', val_accuracy)

acc:  0.8773919206236711


[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.1s finished


In [12]:
# Generate the training report (printed below and written under path_reports).
# Arguments are passed positionally, in the order the helper is called with
# elsewhere in this notebook's original cell.
report_training_random_forest(
    filename_variables,
    variables,
    variables['rf_timestamp_training_start'],
    path_reports,
)


Summary:


Dataset:                 helpdesk.csv
Filename interim data:   interim_data_helpdesk.npz
Filename variables:      variables_helpdesk.pkl


vocab (first 6):         ['<pad>' '<unk>' '<start>' '<end>' 'Assign-seriousness'
 'Take-in-charge-ticket']
vocab_size:              14
max_length_trace:        6
num_traces:              4255
num_ex_activities:       18809
num_features:            1
features:                ['concept:name']
interleave:              True


Samples in training:     (13166, 8)
Samples in validation:   (2822, 8)
Samples in test:         (2821, 8)


Training Random Forest:
Elapsed time:            0.4024195671081543
Random forest model:     rf_helpdesk.joblib


Parameters Random Forest:
rf_n_estimators:         100
rf_n_jobs:               -1


Training-Evaluation:
rf_acc          :        0.8773919206236711

Report has been written to 'reports/training/competing_artifacts/random_forest/2023-12-29_21-32-48_report_training_rf_helpdesk.txt'


In [9]:
# Persist the updated variables (including the rf_* entries set in this
# notebook) back to the control file they were loaded from
variables_path = path_control + filename_variables
with open(variables_path, 'wb') as file:
    pickle.dump(variables, file)