In [None]:
import os
import re
import logging
import numpy as np
import pandas as pd

from huggingface_hub import login
import pm4py
from sklearn.metrics import f1_score

# Authenticate against the Hugging Face Hub.
# The token is read from the HF_TOKEN environment variable so that no
# credential is stored in the notebook. If the variable is unset, None is
# passed; login() is then expected to ask for the token interactively
# (see the huggingface_hub documentation).
login(token=os.environ.get("HF_TOKEN"), add_to_git_credential=True)

def secure_mkdir(directory_path):
    """Create ``directory_path`` unless it already exists.

    Missing parent directories are created as well, and an existing
    directory is left untouched. Creation and the existence check happen in
    one call, so a directory appearing in between does not raise.

    Args:
        directory_path: Path of the directory to create.

    Raises:
        FileExistsError: If ``directory_path`` exists but is not a directory.
    """
    os.makedirs(directory_path, exist_ok=True)

def secure_listdir(path, rm_dirs=(".ipynb_checkpoints",)):
    """List the entries of ``path`` without the names given in ``rm_dirs``.

    Args:
        path: Directory whose entries are listed with ``os.listdir``.
        rm_dirs: Iterable of entry names to leave out of the result. ``None``
            or an empty iterable disables the filtering.

    Returns:
        list[str]: Entry names in the order returned by ``os.listdir``,
        without the excluded names.
    """
    # A set gives one membership test per entry instead of one list scan per
    # excluded name; the immutable default avoids a shared mutable argument.
    excluded = set(rm_dirs or ())
    return [entry for entry in os.listdir(path) if entry not in excluded]

# Calculate Accuracy

## Generate Results

In [None]:
# Names of the evaluated checkpoints: one inner list per event log, holding
# one "<base model>-<log>-<fold id>" name for each of the five fold ids.
models = [
    [
        f"Meta-Llama-3-8B-{log_name}-{fold_id}"
        for fold_id in (199900595, 534895718, 787846414, 862061404, 996406378)
    ]
    for log_name in (
        "bpi13_closed_problem",
        "bpi13_incidents",
        "sepsis_cases",
        "helpdesk",
        "bpi12",
    )
]
# Build one long table with a row per event: the true label, the predicted
# label and whether they match, for every model/log/fold. Written to
# results/results.csv so that the cells below can reload it.
results_path = "results"
results_csv_path = os.path.join(results_path, "results.csv")
logs_results_colums = ["model", "log", "fold", "fold_index", "case", "concept_name_pred", "concept_name_true", "similar", ]
logs_results = []
for model_names in models:
    for model_name in model_names:
        # Every fold comes as a pair of XES files: ground truth and prediction.
        target_true_path = os.path.join(results_path, f'True_{model_name}.xes')
        target_predict_path = os.path.join(results_path, f'Pred_{model_name}.xes')
        log_true = pm4py.read_xes(target_true_path)
        log_pred = pm4py.read_xes(target_predict_path)

        # Keep case id and activity label only. The previous row index is
        # kept as "fold_index" and later used, together with the case id, to
        # align true and predicted rows.
        log_true = (
            log_true[["case:concept:name", "concept:name"]]
            .reset_index()
            .rename(columns={"case:concept:name": "case", "concept:name": "concept_name_true", "index": "fold_index"})
        )
        log_true["case"] = log_true["case"].astype(str)
        log_true["concept_name_true"] = log_true["concept_name_true"].fillna("").astype(str).str.strip()

        log_pred = (
            log_pred[["case:concept:name", "concept:name"]]
            .reset_index()
            .rename(columns={"case:concept:name": "case", "concept:name": "concept_name_pred", "index": "fold_index"})
            .fillna("")
        )
        log_pred["case"] = log_pred["case"].astype(str)
        log_pred["concept_name_pred"] = log_pred["concept_name_pred"].fillna("").astype(str).str.strip()

        log_merged = log_true.merge(log_pred, on=["case", "fold_index"])
        # Every row must find exactly one partner; otherwise the two logs are
        # not aligned and the accuracy would be computed on a subset.
        assert len(log_merged) == len(log_true) == len(log_pred), (
            f"{model_name}: merge produced {len(log_merged)} rows from "
            f"{len(log_true)} true and {len(log_pred)} predicted rows"
        )
        log_merged["similar"] = log_merged["concept_name_true"] == log_merged["concept_name_pred"]

        # "<base model>-<log>-<fold id>": the last two dash-separated parts are
        # the log name and the fold id, everything before is the base model.
        model_info = model_name.split("-")
        log_merged[["model", "log", "fold"]] = "-".join(model_info[:-2]), model_info[-2], model_info[-1]

        logs_results.append(log_merged[logs_results_colums].copy())
logs_results = pd.concat(logs_results).reset_index(drop=True)
logs_results.to_csv(results_csv_path, index=False)

In [None]:
# Reload the per-event results written by the generation cell.
results_csv_path = os.path.join(results_path, "results.csv")
results = pd.read_csv(results_csv_path, low_memory=False)

# Accuracy per fold: mean of the "similar" column, rounded to 4 decimals.
results_by_fold = (
    results[["model", "log", "fold", "similar"]]
    .groupby(["model", "log", "fold"])
    .agg(lambda matches: round(matches.mean(), 4))
    .reset_index()
    .rename(columns={"similar": "accuracy"})
)
display(results_by_fold)

# Accuracy per log: mean of the fold accuracies, rounded to 3 decimals.
results_by_log = (
    results_by_fold[["model", "log", "accuracy"]]
    .groupby(["model", "log"])
    .agg(lambda fold_scores: round(fold_scores.mean(), 3))
    .reset_index()
)
display(results_by_log)

## Investigate Hallucinations

In [None]:
results_csv_path = os.path.join(results_path, "results.csv")
results = pd.read_csv(results_csv_path, low_memory=False)

# A wrong prediction counts as a hallucination when its label never occurs
# among the true labels of the whole results table.
concept_names = results["concept_name_true"].unique()
res_false = results[results["similar"] == False].copy()
is_unknown_label = res_false["concept_name_pred"].apply(lambda label: label not in concept_names)
hallucinations = res_false[is_unknown_label].copy()
display(hallucinations)

# Hallucinations whose prediction is missing (NaN after the CSV round trip),
# counted per model and log.
nans = hallucinations[hallucinations["concept_name_pred"].isna()].copy()
nans = nans[["model", "log", "similar"]].groupby(["model", "log"]).agg("count")
display(nans)
# NOTE(review): the divisor 5 presumably is the number of folds per log - confirm.
display(nans.apply(lambda counts: round(counts / 5, 4), axis=0))

# Remaining hallucinations. dropna() removes every row with a NaN in any
# column, not only in the prediction column.
hallucinations = (
    hallucinations.dropna()[["model", "log", "similar"]]
    .groupby(["model", "log"])
    .agg("count")
)
display(hallucinations)
display(hallucinations.apply(lambda counts: round(counts / 5, 4), axis=0))

## F1-Score

In [None]:
# Prepare the results for F1 scoring: every predicted label that never occurs
# as a true label is replaced by an existing label that is still wrong for
# that row, so the row stays a miss and no extra class is introduced.
results_csv_path = os.path.join(results_path, "results.csv")
results = pd.read_csv(results_csv_path, low_memory=False)
concept_names = results["concept_name_true"].unique()


def _unknown_predictions(frame):
    """Return a boolean mask of rows whose prediction is not a known true label.

    Missing predictions (NaN) always count as unknown.
    """
    pred = frame["concept_name_pred"]
    return ~pred.isin(concept_names) | pred.isna()


# Remember the affected rows before patching so they can be inspected below.
ind = results.index[(results["similar"] == False) & _unknown_predictions(results)]

# sepsis_cases: use "ER Triage" wherever it is not the true label; the rows
# that are still unknown afterwards (true label "ER Triage") get "CRP".
# .loc is required here: chained assignment (results[col][mask] = ...) writes
# to a temporary object and is not guaranteed to modify `results`.
for fallback_label in ("ER Triage", "CRP"):
    needs_fix = (
        _unknown_predictions(results)
        & results["log"].eq("sepsis_cases")
        & results["concept_name_true"].ne(fallback_label)
    )
    results.loc[needs_fix, "concept_name_pred"] = fallback_label

# Manual fixes for the remaining rows, addressed by row label.
# NOTE(review): these labels presumably refer to the results.csv produced by
# the generation cell - re-check them whenever that file is regenerated.
results.loc[1860, "concept_name_pred"] = "Completed"
results.loc[[88405, 93425, 95076, ], "concept_name_pred"] = "Closed"
results.loc[101338, "concept_name_pred"] = "Closed"
results.loc[108678, "concept_name_pred"] = "Require upgrade"
results.loc[281677, "concept_name_pred"] = "A_DECLINED"
display(results.loc[ind])

# No wrong prediction may carry an unknown label any more.
false_results = results[results["similar"] == False].copy()
assert not _unknown_predictions(false_results).any()

In [None]:
# Weighted F1 per fold: collect each fold's true and predicted labels into
# lists and score them with sklearn's f1_score.
results_by_fold = (
    results[["model", "log", "fold", "concept_name_true", "concept_name_pred"]]
    .groupby(["model", "log", "fold"])
    .agg(list)
    .reset_index()
    # No-op here: "similar" is not among the selected columns.
    .rename(columns={"similar": "accuracy"})
)
results_by_fold["f1_score"] = results_by_fold.apply(
    lambda fold_row: round(
        f1_score(fold_row["concept_name_true"], fold_row["concept_name_pred"], average="weighted"),
        4,
    ),
    axis=1,
)
display(results_by_fold)

# Mean of the fold scores per log, rounded to 3 decimals.
results_by_log = (
    results_by_fold[["model", "log", "f1_score"]]
    .groupby(["model", "log"])
    .agg(lambda fold_scores: round(fold_scores.mean(), 3))
    .reset_index()
)
display(results_by_log)

In [None]:
# Macro F1 per fold: collect each fold's true and predicted labels into
# lists and score them with sklearn's f1_score.
results_by_fold = (
    results[["model", "log", "fold", "concept_name_true", "concept_name_pred"]]
    .groupby(["model", "log", "fold"])
    .agg(list)
    .reset_index()
    # No-op here: "similar" is not among the selected columns.
    .rename(columns={"similar": "accuracy"})
)
results_by_fold["f1_score"] = results_by_fold.apply(
    lambda fold_row: round(
        f1_score(fold_row["concept_name_true"], fold_row["concept_name_pred"], average="macro"),
        4,
    ),
    axis=1,
)
display(results_by_fold)

# Mean of the fold scores per log, rounded to 3 decimals.
results_by_log = (
    results_by_fold[["model", "log", "f1_score"]]
    .groupby(["model", "log"])
    .agg(lambda fold_scores: round(fold_scores.mean(), 3))
    .reset_index()
)
display(results_by_log)

In [None]:
# Names of the evaluated checkpoints: one inner list per dataset, holding one
# "<base model>-<dataset>-<random seed>" name for each of the five seeds.
models = [
    [
        f"Meta-Llama-3-8B-{dataset_key}-{seed}"
        for seed in (199900595, 534895718, 787846414, 862061404, 996406378)
    ]
    for dataset_key in (
        "bpi13_closed_problem",
        "bpi13_incidents",
        "sepsis_cases",
        "helpdesk",
        "bpi12",
    )
]

import os
import shutil
import statistics
import time
from dataclasses import dataclass, field
from typing import Optional
from tqdm import tqdm
import logging

import numpy as np
import pandas as pd

import torch
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, BitsAndBytesConfig, set_seed
from peft import AutoPeftModelForCausalLM
from huggingface_hub import login
import wandb
import argparse
import pm4py
# Authenticate against the Hugging Face Hub.
# The token is read from the HF_TOKEN environment variable so that no
# credential is stored in the notebook. If the variable is unset, None is
# passed; login() is then expected to ask for the token interactively
# (see the huggingface_hub documentation).
login(token=os.environ.get("HF_TOKEN"), add_to_git_credential=True)

def prepare_sample_text(example, tokenizer, remove_indent=False, start=None, end=None, pred=False):
    """Prepare the text from a sample of the dataset.

    Args:
        example: Mapping with an "event_list" entry holding the sample's
            events in order.
        tokenizer: Object exposing ``eos_token``; that token is appended
            after every event.
        remove_indent: Unused; kept so existing calls keep working.
        start: Optional first index of the events to use.
        end: Optional end index (exclusive) of the events to use. ``start``
            and ``end`` may be given together or alone.
        pred: Unused; kept so existing calls keep working.

    Returns:
        str: The selected events, limited to the last 20, each followed by
        the tokenizer's EOS token and a newline. Empty string for no events.
    """
    thread = example["event_list"]
    # Slicing accepts None for an open bound, so a single bound is honoured
    # instead of being silently ignored.
    if start is not None or end is not None:
        thread = thread[start:end]
    # Only the 20 most recent events are kept.
    thread = thread[-20:]
    return "".join(f"{message}{tokenizer.eos_token}\n" for message in thread)

# For every evaluated model, collect the token count of each prepared test
# sample. One entry per model: [base model, dataset name, seed, token counts].
test_input_ids_all = []
for model_names in models:
    for model_name in model_names:
        # "<base model>-<dataset>-<seed>": the last two dash-separated parts
        # are the dataset name and the random seed.
        model_info = model_name.split("-")
        model = "-".join(model_info[:-2])
        dataset_name = model_info[-2]
        random_seed = int(model_info[-1])
        set_seed(random_seed)
        model_path = "skaltenp/" + model_name
        dataset_path = "skaltenp/" + dataset_name

        # Split into 80% train+valid and 20% test, then split the first part
        # 80/20 again, all seeded with the model's seed.
        # NOTE(review): presumably mirrors the split used for training - confirm.
        dataset = load_dataset(dataset_path)
        train_data = dataset["train"].train_test_split(train_size=0.8, shuffle=True, seed=random_seed)
        test_data = train_data["test"]
        train_data = train_data["train"].train_test_split(train_size=0.8, shuffle=True, seed=random_seed)
        valid_data = train_data["test"]
        train_data = train_data["train"]
        dataset = DatasetDict(
            {
                "train": train_data,
                "valid": valid_data,
                "test": test_data
            }
        )

        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            use_fast=True,
        )
        tokenizer.pad_token_id = tokenizer.eos_token_id
        tokenizer.truncation_side = "left"

        # Only the number of tokens is needed, so the ids are taken as a plain
        # list; no truncation is applied.
        test_input_ids = []
        for example in dataset["test"]:
            res = prepare_sample_text(example, tokenizer)
            test_input_ids.append(len(tokenizer(res)["input_ids"]))
        test_input_ids_all.append([model, dataset_name, random_seed, test_input_ids])
print(test_input_ids_all)

In [None]:
import plotly.express as px
from matplotlib import pyplot as plt
# Box plot of the token counts (last element of an entry) for the first
# collected model only.
for _, _, _, token_counts in test_input_ids_all[:1]:
    px.box(token_counts).show()

In [None]:
# One row per evaluated model; "test_input_ids" holds that model's list of
# token counts. `results` is a working copy that receives the summary columns.
test_input_ids_all_new = pd.DataFrame(
    data=test_input_ids_all,
    columns=["model", "dataset_name", "random_seed", "test_input_ids"],
)
results = test_input_ids_all_new.copy()
def boxpl(x):
    """Return the five-number summary of a row's "test_input_ids" values.

    Args:
        x: Row-like object whose "test_input_ids" entry is a sequence of numbers.

    Returns:
        list: ``[min, 25% quantile, median, 75% quantile, max]``.
    """
    values = np.array(x["test_input_ids"])
    lowest, highest = min(values), max(values)
    q25, median, q75 = np.quantile(values, [0.25, 0.5, 0.75])
    return [lowest, q25, median, q75, highest]
# Expand the five values returned by boxpl for each row into five new columns.
results[["min", "q25", "median", "q75", "max"]] = results.apply(
    boxpl,
    axis=1,
    result_type="expand",
)
display(results)

# Calculate Errors

In [None]:
# Names of the evaluated checkpoints: one inner list per dataset, holding one
# "<base model>-<dataset>-<run id>" name for each of the five run ids.
models = [
    [
        f"Meta-Llama-3-8B-{dataset_key}-{run_id}"
        for run_id in (199900595, 534895718, 787846414, 862061404, 996406378)
    ]
    for dataset_key in (
        "bpi13_closed_problem",
        "bpi13_incidents",
        "sepsis_cases",
        "helpdesk",
        "bpi12",
    )
]
# Count the XML comments in every predicted XES log and average them per
# dataset.
# NOTE(review): the comments presumably mark errors recorded while the
# predictions were written ("Total errors") - confirm.
results_path = "results"
errors = []
errors_names = ["Model", "Dataset", "Total errors",]

# Comment header found at the top of the files; it is removed first so that
# its own comment lines are not counted.
openxes_header = """<?xml version="1.0" encoding="UTF-8" ?>
<!-- This file has been generated with the OpenXES library. It conforms -->
<!-- to the XML serialization of the XES standard for log storage and -->
<!-- management. -->
<!-- XES standard version: 1.0 -->
<!-- OpenXES library version: 1.0RC7 -->
<!-- OpenXES is available from http://www.openxes.org/ -->"""

for model_names in models:
    for model_name in model_names:
        target_predict_path = os.path.join(results_path, f'Pred_{model_name}.xes')
        with open(target_predict_path, "r", encoding="utf-8") as read_file:
            xes_content = read_file.read().replace(openxes_header, "")
        # "." does not match newlines, so a match never spans lines; because
        # ".*" is greedy, several comments on one line count as one match.
        comment_count = len(re.findall(r"<!-- .* -->", xes_content))
        # The dataset name is the second-to-last dash-separated part.
        errors.append([model_name, model_name.split("-")[-2], comment_count])
errors = pd.DataFrame(errors, columns=errors_names)

# Aggregate once so that the displayed table and the CSV cannot diverge.
errors_by_dataset = errors.drop(columns="Model").groupby("Dataset").agg("mean").reset_index()
display(errors_by_dataset)
errors_by_dataset.to_csv("errors.csv", index=False)

# Calculate time

The time is not computed in this notebook; we calculate it by hand from the logs.