In [None]:
import pandas as pd
import numpy as np
from pm4py.objects.log.util import dataframe_utils
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.util.xes_constants import DEFAULT_NAME_KEY, DEFAULT_TIMESTAMP_KEY

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.util import dataframe_utils



#### Baseline

In [None]:
# 1. Load and preprocess the data
df = pd.read_csv('data/original/Credit.csv')

# Map the CSV headers onto the XES attribute names pm4py works with.
# The timestamp attribute is spelled 'time:timestamp' (all lowercase), the same
# key the other cells of this notebook use; 'time:Timestamp' would only create
# an unrelated column that pm4py never looks at.
rename_map = {
    'Case': 'case:concept:name',
    'Activity': 'concept:name',
}
# NOTE(review): the header is presumably 'Timestamp' (as read by the other cells);
# the lowercase spelling is still accepted, and wins if both are present.
for ts_col in ('timestamp', 'Timestamp'):
    if ts_col in df.columns:
        rename_map[ts_col] = 'time:timestamp'
        break

df = df.rename(columns=rename_map)

# Best-effort fallback: create any still-missing standard column as all-NaN so
# the conversion below does not fail on a missing key.
for col in ['case:concept:name', 'concept:name', 'time:timestamp']:
    if col not in df.columns:
        df[col] = np.nan

df = dataframe_utils.convert_timestamp_columns_in_df(df)

# Group events into traces by the case identifier column.
parameters = {log_converter.Variants.TO_EVENT_LOG.value.Parameters.CASE_ID_KEY: 'case:concept:name'}
event_log = log_converter.apply(df, parameters=parameters, variant=log_converter.Variants.TO_EVENT_LOG)

# 2. Build the (prefix, next activity) samples
def generate_prefix_data(log, max_prefix_len=5):
    """Turn every trace into (prefix, next_activity) pairs.

    Args:
        log: iterable of traces; each trace is an iterable of events that can
            be indexed with 'concept:name'.
        max_prefix_len: exclusive upper bound on the prefix length, i.e. the
            emitted prefixes hold 1 .. max_prefix_len - 1 activities.

    Returns:
        A list of (prefix, next_activity) tuples where ``prefix`` is a list of
        activity names and ``next_activity`` is the activity that follows it.
        Traces with fewer than two events contribute nothing.
    """
    samples = []
    for trace in log:
        names = [ev['concept:name'] for ev in trace]
        # The cut point must leave at least one event after the prefix.
        stop = min(len(names), max_prefix_len)
        samples.extend((names[:cut], names[cut]) for cut in range(1, stop))
    return samples

# Window length handed to generate_prefix_data (as max_prefix_len).
MAX_LEN = 5
prefix_data = generate_prefix_data(event_log, max_prefix_len=MAX_LEN)

# Activity vocabulary. It is sorted so that the activity -> index mapping is the
# same on every kernel run; iterating a bare set of strings yields an order that
# changes between interpreter sessions, which made the encoded features (and
# therefore the trained models) non-reproducible. key=str keeps the sort
# well-defined even if a non-string label slipped in.
unique_acts = sorted(
    {act for prefix, label in prefix_data for act in prefix + [label]},
    key=str,
)
act2idx = {act: i for i, act in enumerate(unique_acts)}
idx2act = {i: act for act, i in act2idx.items()}

def vectorize_prefix(prefix, max_len=4):
    """Encode a prefix as a list of activity indices, right-padded with -1.

    Args:
        prefix: sequence of activity names; each must be a key of ``act2idx``.
        max_len: target length. Shorter prefixes are padded with -1 on the
            right; longer prefixes are returned unpadded and untruncated.

    Returns:
        List of integer codes.

    Raises:
        KeyError: if an activity is not present in ``act2idx``.
    """
    codes = [act2idx[name] for name in prefix]
    # A negative shortfall multiplies to an empty list, so nothing is appended.
    shortfall = max_len - len(codes)
    return codes + [-1] * shortfall

# Integer-encoded prefixes (one column per position) and next-activity labels.
X = np.array([vectorize_prefix(prefix) for prefix, label in prefix_data])
y = np.array([act2idx[label] for prefix, label in prefix_data])

# Split BEFORE fitting the encoder so that the category vocabulary is learned
# from the training rows only (no information from the test rows leaks into
# the features). The split indices depend only on the sample count and
# random_state, so the same rows end up in each partition as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# handle_unknown='ignore' encodes a category unseen during fit as all zeros.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_train = ohe.fit_transform(X_train_raw)
X_test = ohe.transform(X_test_raw)

# Kept so that later code referring to the fully encoded matrix still works.
X_encoded = ohe.transform(X)

# 3. Candidate classifiers (fixed seeds wherever the estimator accepts one)
models = [
    ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)),
    ("Logistic Regression", LogisticRegression(max_iter=1000, random_state=42)),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("SVM", SVC(random_state=42)),
    ("Naive Bayes", MultinomialNB()),
]

# 4. Fit every model on the training split and report its test accuracy.
#    A failing model is reported and the loop moves on to the next one.
for name, model in models:
    try:
        y_pred = model.fit(X_train, y_train).predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        print(f"\n[{name}]\nAccuracy: {acc:.4f}")
    except Exception as err:
        print(f"\n[{name}] 모델은 에러 발생: {err}")


[Random Forest]
Accuracy: 0.8300

[Logistic Regression]
Accuracy: 0.8300

[Decision Tree]
Accuracy: 0.8300

[SVM]
Accuracy: 0.8300

[Naive Bayes]
Accuracy: 0.8300


#### Baseline LLM Augmented Version

In [30]:

# Load the log that holds both original rows (Aug == 0) and augmented rows (Aug == 1).
df = pd.read_csv('data/augmented/Credit_augmented_random_0.3.csv')

# Rename the CSV headers, then let pm4py parse the timestamp column(s).
df = dataframe_utils.convert_timestamp_columns_in_df(
    df.rename(columns={
        'Case': 'case_id:concept:name',
        'Activity': 'concept:name',
        'Timestamp': 'time:timestamp'
    })
)

# The case identifier lives in a non-default column, so it is passed explicitly.
to_event_log = log_converter.Variants.TO_EVENT_LOG
params = {to_event_log.value.Parameters.CASE_ID_KEY: 'case_id:concept:name'}

# One event log per partition of the 'Aug' flag.
log_orig = log_converter.apply(df.loc[df['Aug'] == 0], parameters=params, variant=to_event_log)
log_aug = log_converter.apply(df.loc[df['Aug'] == 1], parameters=params, variant=to_event_log)

# (prefix, next-activity label) extraction
def generate_prefix_data(log, max_pre_len):
    """Turn every trace into (prefix, next_activity) pairs.

    Args:
        log: iterable of traces; each trace is an iterable of events that can
            be indexed with 'concept:name'.
        max_pre_len: largest prefix length to emit (inclusive).

    Returns:
        A list of (prefix, next_activity) tuples. A prefix is only emitted
        when an event follows it and that event's activity name is truthy
        (so a missing or empty label is skipped).
    """
    samples = []
    for trace in log:
        names = [event['concept:name'] for event in trace]
        longest = min(len(names), max_pre_len)
        for cut in range(1, longest + 1):
            if cut >= len(names):
                # The prefix already covers the whole trace: nothing to predict.
                continue
            target = names[cut]
            if not target:
                continue
            samples.append((names[:cut], target))
    return samples


# Longest prefix (in events) fed to the models.
MAX_LEN = 5
prefix_orig = generate_prefix_data(log_orig, MAX_LEN)
prefix_aug = generate_prefix_data(log_aug, MAX_LEN)

# Shared vocabulary over original and augmented samples, sorted for a stable mapping.
all_prefixes = [*prefix_orig, *prefix_aug]
unique_acts = sorted({act for pre, nxt in all_prefixes for act in (*pre, nxt)})

# Index 0 is reserved for padding, so real activities are numbered from 1.
PAD = 0
act2idx = {act: idx for idx, act in enumerate(unique_acts, start=1)}

def vectorize_prefix(prefix, max_len=MAX_LEN):
    """Encode a prefix as a list of activity indices, right-padded with PAD.

    Args:
        prefix: sequence of activity names; each must be a key of ``act2idx``.
        max_len: target length. Shorter prefixes are padded on the right;
            longer prefixes are returned unpadded and untruncated.

    Returns:
        List of integer codes.
    """
    encoded = [act2idx[name] for name in prefix]
    # A negative shortfall multiplies to an empty list, so nothing is appended.
    shortfall = max_len - len(encoded)
    return encoded + [PAD] * shortfall

# Train/test split: the test set is drawn from the ORIGINAL samples only.
X_orig = np.array([vectorize_prefix(p, MAX_LEN) for p, _ in prefix_orig])
y_orig = np.array([act2idx[n] for _, n in prefix_orig])

X_train_o, X_test, y_train_o, y_test = train_test_split(
    X_orig, y_orig, test_size=0.2, random_state=42, shuffle=True
)

# Training set = training part of the original samples + every augmented sample.
X_aug = np.array([vectorize_prefix(p, MAX_LEN) for p, _ in prefix_aug])
y_aug = np.array([act2idx[n] for _, n in prefix_aug])
X_train = np.vstack([X_train_o, X_aug])
y_train = np.hstack([y_train_o, y_aug])

#  One-Hot  encoding
ct = ColumnTransformer(
    [(f"pos{i}",
      OneHotEncoder(handle_unknown='ignore'),
      [i])
     for i in range(MAX_LEN)],
    remainder='drop' 
)

X_train_enc = ct.fit_transform(X_train)
X_test_enc  = ct.transform(X_test)

# Same classifier line-up as the baseline cell.
models = [
    ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)),
    ("Logistic Regression", LogisticRegression(max_iter=1000, random_state=42)),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("SVM", SVC(random_state=42)),
    ("Naive Bayes", MultinomialNB()),
]

# Train on original + augmented samples, score on the held-out original samples.
for name, model in models:
    y_pred = model.fit(X_train_enc, y_train).predict(X_test_enc)
    acc = accuracy_score(y_test, y_pred)
    print(f"[{name}] Accuracy: {acc:.4f}")


[Random Forest] Accuracy: 0.6730
[Logistic Regression] Accuracy: 0.6720
[Decision Tree] Accuracy: 0.6580
[SVM] Accuracy: 0.6870
[Naive Bayes] Accuracy: 0.4940


#### Random Augmentation

In [None]:
import pandas as pd
import numpy as np
import random
from datetime import timedelta

# Configuration: `dataset` names the output file, `aug_ratio` sets the
# number of augmentation operations per case relative to its length.
dataset = "BPIC15_1"
aug_ratio = 0.3

# Read the log, order it by case and timestamp, and discard fully empty rows.
df = (
    pd.read_csv("BPIC15_1.csv")
    .sort_values(by=['Case', 'Timestamp'])
    .dropna(how='all')
)
# Unparseable timestamps become NaT instead of raising.
df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')

# Sorted pool of activity names that the 'add' operation samples from.
activity_names = df['Activity'].dropna().astype(str)
activity_set = sorted(activity_names.unique().tolist())

# Materialise the (case id, events) groups and prepare the output buffer.
case_groups = list(df.groupby('Case'))
augmented_rows = []

# Seed the stdlib RNG so the augmentation is repeatable.
random.seed(42)

# Build the augmented log case by case. Every original event is kept with
# Aug=0; each sampled operation appends one or two extra rows flagged Aug=1.
for case_id, group in case_groups:
    group = group.sort_values('Timestamp').reset_index(drop=True)
    original_rows = group.to_dict('records')
    new_rows = [{**row, "Aug": 0, "augmented_activity": "", "semantic_reason": ""} for row in original_rows]

    seq_len = len(original_rows)
    # At least one operation per case, but never more than there are events:
    # random.sample raises ValueError when k exceeds the population size.
    n_aug = min(seq_len, max(1, int(seq_len * aug_ratio)))

    # Operations are drawn with replacement, target positions without.
    aug_choices = random.choices(['add', 'delete', 'swap'], k=n_aug)
    aug_indices = random.sample(range(seq_len), k=n_aug)

    for op, idx in zip(aug_choices, aug_indices):
        if op == 'add':
            # Insert a random activity halfway between this event and the next
            # one (or 30 seconds after the last event of the case).
            new_activity = random.choice(activity_set)
            t_cur = original_rows[idx]['Timestamp']
            t_next = original_rows[idx+1]['Timestamp'] if idx+1 < seq_len else t_cur + timedelta(minutes=1)
            t_new = t_cur + (t_next - t_cur)/2
            aug_row = {
                # Must use the same 'Case' column the frame is grouped and
                # sorted by; a differently-cased key would create a second
                # column and leave the inserted event without a case id.
                "Case": case_id,
                "Activity": new_activity,
                "resource": "",
                "Timestamp": t_new,
                "diagnose": original_rows[idx].get("diagnose", ""),
                "Aug": 1,
                "augmented_activity": new_activity,
                "semantic_reason": "Random add"
            }
            new_rows.append(aug_row)
        elif op == 'delete' and seq_len > 2:
            # The event is not removed; a flagged copy marks it as the
            # deletion target.
            del_row = {
                **original_rows[idx],
                "Aug": 1,
                "augmented_activity": original_rows[idx]['Activity'],
                "semantic_reason": "Random delete (for augmentation only)",
                "resource": original_rows[idx].get("resource", "")
            }
            new_rows.append(del_row)
        elif op == 'swap' and idx < seq_len - 1:
            # Flag both neighbours; each copy records the partner's activity
            # in 'augmented_activity'.
            for swap_idx in [idx, idx+1]:
                partner_idx = idx+1 if swap_idx == idx else idx
                swap_row = {
                    **original_rows[swap_idx],
                    "Aug": 1,
                    "augmented_activity": original_rows[partner_idx]['Activity'],
                    "semantic_reason": "Random swap",
                    "resource": original_rows[swap_idx].get("resource", "")
                }
                new_rows.append(swap_row)

    augmented_rows.extend(new_rows)


# Assemble all rows, order them by case and time, and store timestamps as text.
result_df = (
    pd.DataFrame(augmented_rows)
    .sort_values(by=['Case', 'Timestamp'])
    .reset_index(drop=True)
)
result_df['Timestamp'] = result_df['Timestamp'].astype(str)

result_df.to_csv(f"{dataset}_augmented_random_{aug_ratio}.csv", index=False)

# Row counts per 'Aug' flag and their ratio (augmented / original).
aug_flag = result_df['Aug']
num_original = (aug_flag == 0).sum()
num_augmented = (aug_flag == 1).sum()
augmentation_rate = num_augmented / num_original

# Printed labels are Korean: "augmentation statistics", "number of original
# activities", "number of augmented activities", "augmentation ratio".
print(f"\n[증강 통계]")
print(f"원본 Activity 수: {num_original}")
print(f"증강 Activity 수: {num_augmented}")
print(f"증강 비율: {augmentation_rate:.2%}")



[증강 통계]
원본 Activity 수: 26836
증강 Activity 수: 10309
증강 비율: 38.41%


#### PM4PY Stats.

In [None]:
import pandas as pd
from pm4py.objects.log.util import dataframe_utils
from pm4py.objects.conversion.log import converter as log_converter

# case 통계
# end activity
from pm4py.statistics.end_activities.log import get as end_activities_get
# start activity
from pm4py.statistics.start_activities.log import get as start_activities_get


# Read the log and rename its headers to the XES attribute names.
df = pd.read_csv('sepsis.csv').rename(columns={
    'case_id': 'case:concept:name',
    'activity': 'concept:name',
    'timestamp': 'time:timestamp'
})

# Any standard column that is still absent is created as all-missing.
for col in ('case:concept:name', 'concept:name', 'time:timestamp'):
    if col not in df.columns:
        df[col] = pd.NA

df = dataframe_utils.convert_timestamp_columns_in_df(df)

# Convert the frame to an event log, grouping events by the case id column.
parameters = {log_converter.Variants.TO_EVENT_LOG.value.Parameters.CASE_ID_KEY: 'case:concept:name'}
event_log = log_converter.apply(df, parameters=parameters, variant=log_converter.Variants.TO_EVENT_LOG)


# (B) End activities and (C) start activities, each with its count
end_activities = end_activities_get.get_end_activities(event_log)
start_activities = start_activities_get.get_start_activities(event_log)

# Print both tables with the same layout: a header, then one "name: count" line per entry.
for title, counts in (
    ("\n=== End Activities ===", end_activities),
    ("\n=== Start Activities ===", start_activities),
):
    print(title)
    for act, cnt in counts.items():
        print(f"{act}: {cnt}")



=== End Activities ===
Release A: 392
Return ER: 291
LacticAcid: 3
Leucocytes: 5
Release B: 51
Release E: 5
Release C: 18
Release D: 14
IV Antibiotics: 1
Admission NC: 8
CRP: 4
IV Liquid: 3

=== Start Activities ===
ER Registration: 752
IV Liquid: 13
Leucocytes: 16
ER Sepsis Triage: 5
ER Triage: 4
CRP: 5
