## Work environment

In [2]:
# Import
import os
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from joblib import dump
from src.general.functions_report import report_training_logistical_regression
from src.general.functions_time import tic, toc, get_timestamp
from src.data.functions_training_data import load_processed_data

2023-12-29 21:38:26.456441: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Hide all CUDA devices so that TensorFlow runs on the CPU only
# (TensorFlow is still loaded here because the data is stored as tensors,
# even when tf itself is not used for training)
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [4]:
# Move into the project root so that the relative paths defined in the
# parameter section resolve against it.
# Alternative root for the local Windows setup:
# working_directory = 'c:/Users/Steph/OneDrive - Universität Bayreuth/Masterarbeit/03_Programmierung/remaining_trace_prediction_master_thesis_stephan_faatz'
working_directory = '/home/jupyter-sfaatz/'

os.chdir(working_directory)

# Echo the directory that is actually active after the change
print("Working directory: ", os.getcwd())

Working directory:  /home/jupyter-sfaatz


## Parameters

In [5]:
# Project folders, all relative to the working directory
path_raw = 'data/raw/'
path_interim = 'data/interim/'
path_benchmark = 'data/benchmark/'
path_data = 'data/processed/'
path_control = 'data/control/'
path_predictions = 'data/predictions/'
path_models = 'models/'
path_reports = 'reports/'

# Control file that carries the run variables of the selected dataset setup
filename_variables = 'variables_helpdesk_true.pkl'

# Initialize variables from the control file
# NOTE: pickle can execute arbitrary code while loading; only open control
# files that were produced by this project.
variables_file = path_control + filename_variables
with open(variables_file, 'rb') as file:
    variables = pickle.load(file)

# Remember when this training run was started
variables['logreg_timestamp_training_start'] = get_timestamp()

# Model name = control file name without its 'variables_' prefix and '.pkl'
# suffix, wrapped as 'logreg_<tag>.joblib'
dataset_tag = filename_variables[len('variables_'):-len('.pkl')]
variables['regression_model'] = "logreg_" + dataset_tag + ".joblib"

2023-12-29 21:38:31.947386: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-12-29 21:38:31.947427: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:168] retrieving CUDA diagnostic information for host: jupyter-ext-wi
2023-12-29 21:38:31.947432: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:175] hostname: jupyter-ext-wi
2023-12-29 21:38:31.947555: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:199] libcuda reported version is: 535.54.3
2023-12-29 21:38:31.947571: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:203] kernel reported version is: 535.54.3
2023-12-29 21:38:31.947575: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:309] kernel version seems to match DSO: 535.54.3


## Preprocessing

In [6]:
# Load the train / validation / test splits of the benchmark dataset.
# tensor=False presumably makes the loader return NumPy arrays instead of
# tensors (the later cells call .flatten() on the targets) — TODO confirm.
# NOTE(review): the shape printout of this cell reports y_test as (5643, 12)
# while y_train / y_val have a single column. Confirm whether this is only a
# labelling slip inside load_processed_data or a real issue with the test targets.
benchmark_file = path_benchmark + variables['filename_benchmark_dataset']
x_train, y_train, x_val, y_val, x_test, y_test = load_processed_data(
    benchmark_file,
    tensor=False,
)


x_train_tensor shape:  (26332, 12)
y_train_tensor shape:  (26332, 1)
x_val_tensor shape:    (5643, 12)
y_val_tensor shape:    (5643, 1)
x_test_tensor shape:   (5643, 12)
y_test_tensor shape:  (5643, 12)




In [7]:
# Hyperparameters of the logistic regression; kept in `variables` so that they
# are available to the report and to the stored control file
variables.update({
    'logreg_max_iter': 1000,  # upper bound on solver iterations
    'logreg_n_jobs': -1,      # -1 = use all available CPU cores
})

## Training

In [8]:
# Initialize model (multinomial logistic regression fitted with lbfgs)
# NOTE(review): `multi_class` is deprecated in scikit-learn >= 1.5; lbfgs already
# uses the multinomial loss for multiclass targets there — verify against the
# installed version before removing the argument.
logreg = LogisticRegression(
    max_iter=variables['logreg_max_iter'],
    random_state=29061998,
    verbose=1,
    n_jobs=variables['logreg_n_jobs'],
    multi_class='multinomial',
    solver='lbfgs',
)

# Train model; only the fit itself sits between tic() and toc()
tic()
logreg.fit(x_train, y_train.flatten())
variables['logreg_elapsed_time'] = toc()

# Record how many iterations the solver used and whether it ran into the
# max_iter limit, so that a non-converged benchmark is visible in the stored
# variables and not only in the solver log.
variables['logreg_n_iter'] = int(logreg.n_iter_.max())
variables['logreg_max_iter_reached'] = (
    variables['logreg_n_iter'] >= variables['logreg_max_iter']
)
if variables['logreg_max_iter_reached']:
    print(
        "Note: lbfgs stopped at the iteration limit (max_iter =",
        variables['logreg_max_iter'],
        ") - the model may not have converged.",
    )

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          416     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  9.12598D+04    |proj g|=  5.23762D+04


 This problem is unconstrained.



At iterate   50    f=  4.15856D+04    |proj g|=  2.17578D+03

At iterate  100    f=  3.94304D+04    |proj g|=  2.38739D+03

At iterate  150    f=  3.84361D+04    |proj g|=  1.55533D+03

At iterate  200    f=  3.79076D+04    |proj g|=  4.71993D+02

At iterate  250    f=  3.76291D+04    |proj g|=  1.10622D+03

At iterate  300    f=  3.73792D+04    |proj g|=  3.96479D+02

At iterate  350    f=  3.72449D+04    |proj g|=  1.00346D+03

At iterate  400    f=  3.71561D+04    |proj g|=  2.23781D+02

At iterate  450    f=  3.70883D+04    |proj g|=  3.51921D+02

At iterate  500    f=  3.70310D+04    |proj g|=  1.44973D+02

At iterate  550    f=  3.69866D+04    |proj g|=  6.32614D+02

At iterate  600    f=  3.69474D+04    |proj g|=  1.82144D+02

At iterate  650    f=  3.69220D+04    |proj g|=  2.64570D+02

At iterate  700    f=  3.69025D+04    |proj g|=  2.59698D+02

At iterate  750    f=  3.68798D+04    |proj g|=  3.63007D+02

At iterate  800    f=  3.68570D+04    |proj g|=  2.14442D+02

At iter

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
# Save the model
# Create the target folder first so that the dump does not fail with a
# FileNotFoundError when the folder is missing (e.g. on a fresh checkout).
model_dir = path_models + 'logistic_regression/'
os.makedirs(model_dir, exist_ok=True)

# Keep dump() as the last expression: the cell displays the list of written files
dump(logreg, model_dir + variables['regression_model'])

['models/logistic_regression/logreg_helpdesk_true.joblib']

## Evaluation

In [10]:
# Evaluate model: accuracy of the predictions on the validation split
y_val_pred = logreg.predict(x_val)

# The targets are flattened before being compared with the predictions
val_targets = y_val.flatten()
variables['logreg_acc'] = accuracy_score(val_targets, y_val_pred)

print('acc: ', variables['logreg_acc'])

acc:  0.5842636895268474


In [11]:
# Generate the training report for this run; the start timestamp recorded in
# the parameter section is passed along together with the report folder
training_start = variables['logreg_timestamp_training_start']
report_training_logistical_regression(
    filename_variables,
    variables,
    training_start,
    path_reports,
)


Summary:


Dataset:                 helpdesk.csv
Filename interim data:   interim_data_helpdesk_true.npz
Filename variables:      variables_helpdesk_true.pkl


vocab (first 6):         ['<pad>' '<unk>' '<start>' '<end>' 'Assign-seriousness'
 'Take-in-charge-ticket']
vocab_size:              36
max_length_trace:        6
num_traces:              4255
num_ex_activities:       18809
num_features:            2
features:                ['concept:name', 'org:resource']
interleave:              True


Samples in training:     (26332, 14)
Samples in validation:   (5643, 14)
Samples in test:         (5643, 14)


Training Logistical Regression:
Elapsed time:            14.357698917388916
Regression model:        logreg_helpdesk_true.joblib


Parameters Logistical Regression:
logreg_max_iter:         1000
logreg_n_jobs:           -1


Training-Evaluation:
logreg_acc:              0.5842636895268474

Report has been written to 'reports/training/competing_artifacts/logistical_regression/2023-12-29

In [12]:
# Write the updated variables back to the pickle control file
variables_file = path_control + filename_variables
with open(variables_file, 'wb') as file:
    pickle.dump(variables, file)

print("Variables saved to ", variables_file)

Variables saved to  data/control/variables_helpdesk_true.pkl
