In [None]:
import os
import re
import shutil
import statistics
import time
from dataclasses import dataclass, field
from typing import Optional
from tqdm import tqdm
import logging
from IPython.display import clear_output
import numpy as np
import pandas as pd

import torch
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, BitsAndBytesConfig, set_seed
from peft import AutoPeftModelForCausalLM
from huggingface_hub import login
import wandb
import argparse
import pm4py

# Authenticate against the Hugging Face Hub.
# The access token is read from the HF_TOKEN environment variable so that no
# credential is stored in (or committed with) the notebook. When the variable
# is unset, `token` is None and `login` asks for the token interactively
# (see the huggingface_hub `login` documentation).
login(token=os.environ.get("HF_TOKEN"), add_to_git_credential=True)

def secure_mkdir(directory_path):
    """Create ``directory_path`` unless it already exists.

    Missing parent directories are created as well, and an existing directory
    is left untouched, so the call can safely be repeated.

    Args:
        directory_path: Path of the directory to create.

    Raises:
        FileExistsError: If ``directory_path`` exists but is not a directory.
    """
    # exist_ok=True replaces the former isdir() + mkdir() pair, which could
    # fail when the directory appeared between the check and the creation.
    os.makedirs(directory_path, exist_ok=True)

def secure_listdir(path, rm_dirs=(".ipynb_checkpoints",)):
    """List the entries of ``path`` while leaving out unwanted names.

    Args:
        path: Directory whose entries are listed.
        rm_dirs: Entry names to drop from the listing. Names that are not
            present in ``path`` are ignored. The default hides the
            ``.ipynb_checkpoints`` folder.

    Returns:
        list: The names returned by ``os.listdir(path)``, in the same order,
        without the names given in ``rm_dirs``.
    """
    # The default is an immutable tuple so it can never be modified between calls.
    excluded = set(rm_dirs)
    return [entry for entry in os.listdir(path) if entry not in excluded]

# Calculate Accuracy

In [None]:
# Fine-tuned checkpoints to evaluate: one inner list per dataset, one entry per fold.
models = [
    [
        "Meta-Llama-3-8B-bpi13_closed_problem-199900595",
        "Meta-Llama-3-8B-bpi13_closed_problem-534895718",
        "Meta-Llama-3-8B-bpi13_closed_problem-787846414",
        "Meta-Llama-3-8B-bpi13_closed_problem-862061404",
        "Meta-Llama-3-8B-bpi13_closed_problem-996406378"
    ],
    [
        "Meta-Llama-3-8B-bpi13_incidents-199900595",
        "Meta-Llama-3-8B-bpi13_incidents-534895718",
        "Meta-Llama-3-8B-bpi13_incidents-787846414",
        "Meta-Llama-3-8B-bpi13_incidents-862061404",
        "Meta-Llama-3-8B-bpi13_incidents-996406378"
    ],
    [
        "Meta-Llama-3-8B-env_permit-199900595",
        "Meta-Llama-3-8B-env_permit-534895718",
        "Meta-Llama-3-8B-env_permit-787846414",
        "Meta-Llama-3-8B-env_permit-862061404",
        "Meta-Llama-3-8B-env_permit-996406378"
    ],
    [
        "Meta-Llama-3-8B-sepsis_cases-199900595",
        "Meta-Llama-3-8B-sepsis_cases-534895718",
        "Meta-Llama-3-8B-sepsis_cases-787846414",
        "Meta-Llama-3-8B-sepsis_cases-862061404",
        "Meta-Llama-3-8B-sepsis_cases-996406378"
    ],
    [
        "Meta-Llama-3-8B-helpdesk-199900595",
        "Meta-Llama-3-8B-helpdesk-534895718",
        "Meta-Llama-3-8B-helpdesk-787846414",
        "Meta-Llama-3-8B-helpdesk-862061404",
        "Meta-Llama-3-8B-helpdesk-996406378"
    ],
    [
        "Meta-Llama-3-8B-bpi12-199900595",
        "Meta-Llama-3-8B-bpi12-534895718",
        #"Meta-Llama-3-8B-bpi12-787846414",
        "Meta-Llama-3-8B-bpi12-862061404",
        #"Meta-Llama-3-8B-bpi12-996406378", # results are not ready for these two folds
    ]
]
results_path = "results"
RESULT_COLUMNS = ["model_name", "case", "concept_name_pred", "concept_name_true", "right"]

# Textual repairs applied, in this exact order, to a predicted XES file before
# it is parsed. Whitespace is significant: "\t" is the tab indentation and "\n"
# the line break of the XES text being matched.
TRACE_TAG_COMMENT = "\t\t</event>\n\t<!-- </trace> unnecessary trace tag -->\n\t\t<event>"
XES_REPAIRS = (
    # A </trace> that is directly followed by another <event> is commented out ...
    ("\t\t</event>\n\t</trace>\n\t\t<event>", TRACE_TAG_COMMENT),
    # ... also when one or two blank lines separate it from the next <event>.
    ("</event>\n\t</trace>\n\n\t\t<event>", TRACE_TAG_COMMENT),
    ("</event>\n\t</trace>\n\n\n\t\t<event>", TRACE_TAG_COMMENT),
    # A second </event> right before </trace> is commented out.
    ("</event>\n\t</event>\n\t</trace>", "</event>\n\t<!-- </event> unnecessary event tag -->\n\t</trace>"),
    # Finally the trace-tag marker is dropped again, leaving the two events adjacent.
    # NOTE(review): after this step only the "unnecessary event tag" markers stay
    # in the file - confirm that this is intended.
    (TRACE_TAG_COMMENT, "\t\t</event>\n\t\t<event>"),
)

logger = logging.getLogger(__name__)
fold_frames = []  # per-event result table of every fold evaluated so far
model_results = pd.DataFrame([], columns=RESULT_COLUMNS)

for model_names in models:
    all_res = []  # accuracy of each fold of the current dataset
    for model_name in model_names:
        # basicConfig() does nothing once the root logger has a handler, so
        # force=True is needed to switch to this model's own log file.
        logging.basicConfig(filename=f'{model_name}.log', encoding='utf-8', level=logging.INFO, force=True)

        target_true_path = os.path.join(results_path, f'True_{model_name}.xes')
        log_true = pm4py.read_xes(target_true_path)

        # Repair the predicted log in place (the file on disk is overwritten),
        # then parse the repaired file.
        target_predict_path = os.path.join(results_path, f'Pred_{model_name}.xes')
        with open(target_predict_path, "r", encoding="utf-8") as read_file:
            xes_content = read_file.read()
        for broken, fixed in XES_REPAIRS:
            xes_content = xes_content.replace(broken, fixed)
        with open(target_predict_path, "w", encoding="utf-8") as write_file:
            write_file.write(xes_content)
        log_pred = pm4py.read_xes(target_predict_path)

        # Compare the i-th predicted activity of every case with the i-th true one.
        rows = []
        for concept in log_true.loc[:, "case:concept:name"].unique():
            # Filter each log once per case instead of once per event.
            true_events = log_true.loc[log_true["case:concept:name"] == concept, "concept:name"].reset_index(drop=True)
            pred_events = log_pred.loc[log_pred["case:concept:name"] == concept, "concept:name"].reset_index(drop=True)
            for i in range(len(true_events)):
                # Raises KeyError when the prediction has fewer events than the ground truth.
                pred_name = pred_events.loc[i]
                true_name = true_events.loc[i]
                rows.append(
                    [
                        model_name,
                        concept,
                        pred_name,
                        true_name,
                        str(pred_name).strip() == str(true_name).strip(),
                    ]
                )
        results = pd.DataFrame(rows, columns=RESULT_COLUMNS)
        # Keep the rows of every fold; previously only the last fold of each
        # dataset reached the CSV.
        fold_frames.append(results)

        accuracy = round(results.right.sum() / len(results.right), 4)
        logger.info(f"{accuracy}")
        all_res.append(accuracy)
        print(f"Accuracy for {model_name}: {accuracy}")

    # Mean accuracy over the folds of this dataset (written to the last fold's log file).
    mean_accuracy = sum(all_res) / len(all_res)
    logger.info(f"{mean_accuracy}")
    print(f"Total accuracy: {round(mean_accuracy, 3)}")

    # Re-write the CSV after every dataset so partial results survive a later failure.
    model_results = pd.concat(fold_frames, ignore_index=True)
    res_path = os.path.join(results_path, "results.csv")
    model_results.to_csv(res_path, index=False)

# Calculate Errors

In [None]:
# Checkpoints whose predicted logs are inspected: one inner list per dataset, one entry per fold.
models = [
    [
        "Meta-Llama-3-8B-bpi13_closed_problem-199900595",
        "Meta-Llama-3-8B-bpi13_closed_problem-534895718",
        "Meta-Llama-3-8B-bpi13_closed_problem-787846414",
        "Meta-Llama-3-8B-bpi13_closed_problem-862061404",
        "Meta-Llama-3-8B-bpi13_closed_problem-996406378"
    ],
    [
        "Meta-Llama-3-8B-bpi13_incidents-199900595",
        "Meta-Llama-3-8B-bpi13_incidents-534895718",
        "Meta-Llama-3-8B-bpi13_incidents-787846414",
        "Meta-Llama-3-8B-bpi13_incidents-862061404",
        "Meta-Llama-3-8B-bpi13_incidents-996406378"
    ],
    [
        "Meta-Llama-3-8B-sepsis_cases-199900595",
        "Meta-Llama-3-8B-sepsis_cases-534895718",
        "Meta-Llama-3-8B-sepsis_cases-787846414",
        "Meta-Llama-3-8B-sepsis_cases-862061404",
        "Meta-Llama-3-8B-sepsis_cases-996406378"
    ],
    [
        "Meta-Llama-3-8B-helpdesk-199900595",
        "Meta-Llama-3-8B-helpdesk-534895718",
        "Meta-Llama-3-8B-helpdesk-787846414",
        "Meta-Llama-3-8B-helpdesk-862061404",
        "Meta-Llama-3-8B-helpdesk-996406378"
    ],
    [
        "Meta-Llama-3-8B-bpi12-534895718",
    ]
]
results_path = "results"

# Header block removed from each predicted file before counting, so that its
# own comment lines are not counted as errors.
OPENXES_HEADER = """<?xml version="1.0" encoding="UTF-8" ?>
<!-- This file has been generated with the OpenXES library. It conforms -->
<!-- to the XML serialization of the XES standard for log storage and -->
<!-- management. -->
<!-- XES standard version: 1.0 -->
<!-- OpenXES library version: 1.0RC7 -->
<!-- OpenXES is available from http://www.openxes.org/ -->"""

errors_names = ["Model", "Dataset", "Total errors",]
error_rows = []
for model_names in models:
    for model_name in model_names:
        # Names look like "Meta-Llama-3-8B-<dataset>-<number>", which makes the
        # dataset the fifth "-"-separated field.
        dataset = model_name.split("-")[4]
        target_predict_path = os.path.join(results_path, f'Pred_{model_name}.xes')
        with open(target_predict_path, "r", encoding="utf-8") as read_file:
            xes_content = read_file.read()
        xes_content = xes_content.replace(OPENXES_HEADER, "")
        # Each remaining line containing an XML comment counts as one error
        # ("." does not match a newline, so a match never spans lines).
        total_errors = len(re.findall(r"<!-- .* -->", xes_content))
        error_rows.append([model_name, dataset, total_errors])
errors = pd.DataFrame(error_rows, columns=errors_names)

# Mean number of errors per dataset; the same table is shown and saved.
mean_errors = errors.drop(columns="Model").groupby("Dataset").agg("mean").reset_index()
display(mean_errors)
mean_errors.to_csv("errors.csv", index=False)

# Calculate Time

Timing is not computed in this notebook: we calculate it by hand from the log files.