In [2]:
from IPython.display import clear_output
%load_ext autoreload
%autoreload 2
import os
print(os.getcwd())
os.chdir('/home/vco/Projects/pm4py-dcr') # working directory should be pm4py-dcr (the one behind notebooks)
print(os.getcwd())

/home/vco/Projects/pm4py-dcr/notebooks
/home/vco/Projects/pm4py-dcr


In [3]:
import pm4py
import time
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import numpy as np

from math import sqrt
from pathlib import Path
from copy import deepcopy
from scipy import stats
from pm4py.algo.discovery.dcr_discover import algorithm as alg
from pm4py.objects.dcr.exporter import exporter as dcr_exporter
from pm4py.objects.dcr.importer import importer as dcr_importer
from pm4py.util.benchmarking import *
from pm4py.algo.evaluation.simplicity.variants import dcr_relations as dcr_simplicity
from pm4py.algo.evaluation.confusion_matrix.algorithm import fitness
from pm4py.objects.dcr import semantics as dcr_semantics
from pm4py.objects.dcr import sp_semantics
import networkx as nx

In [None]:
# Toy graph holding only an event set and an 'excludesTo' mapping.
# The adjacency-matrix cell further down reads it through `dcr = dcr1`.
A,B,X,Y,Z = 'A','B','X','Y','Z'
dcr1 = {'events': {A,B,X,Y,Z},
       'excludesTo': {
           X: {X,Y,A,B},
           Y: {Y,X,A},
           Z: {A},
           A: {X}
       }}
# Expected result: {X,Y} or {A,X}; note that A does not list itself in its own 'excludesTo' entry.

In [None]:
# Second toy graph; reuses the A, B, X, Y, Z names bound in the dcr1 cell.
# Here X, Y, Z all list each other (and themselves), and so do A and B.
dcr2 = {'events': {A,B,X,Y,Z},
       'excludesTo': {
           X: {Y,X,Z},
           Y: {X,Z,Y},
           Z: {Y,Z,X},
           A: {B,A,Z},
           B: {B,A}
       }}
# Expected result: {X,Y,Z}, {A,B}

# Scoring

In [None]:
def run_on_log(event_log_file, dcr_title, result_file_prefix, config):
    """Benchmark one event log and summarise the simplicity of its models.

    Runs ``benchmark_event_log`` and, when
    ``models/<result_file_prefix>_manual_optimization.xml`` exists, imports
    that manually optimised model as well.

    Args:
        event_log_file: path of the event log, forwarded to ``benchmark_event_log``.
        dcr_title: title forwarded to the benchmark and to the summary.
        result_file_prefix: prefix used for the benchmark and the manual model file.
        config: discovery options forwarded to ``benchmark_event_log``.

    Returns:
        Whatever ``dcr_simplicity.simplicity_summary`` returns for the
        subprocess model and the manual model (``None`` when the file is
        missing), measured against the reference model.
    """
    benchmark = benchmark_event_log(event_log_file,result_file_prefix,dcr_title,config)
    reference_dcr, _event_log, sp_dcr, _sp_log = benchmark

    manual_path = f'models/{result_file_prefix}_manual_optimization.xml'
    manual_dcr = (
        dcr_importer.apply(manual_path,dcr_importer.Variants.DCRXML)
        if os.path.isfile(manual_path)
        else None
    )
    return dcr_simplicity.simplicity_summary(dcr_title,[sp_dcr, manual_dcr],reference_dcr)

In [None]:
# Benchmark the Sepsis log and compare the discovered models with a manual one.
event_log_file = '/home/vco/Datasets/Sepsis Cases - Event Log.xes'
dcr_title = 'Sepsis Cases'
result_file_prefix = 'sepsis'
config = {
    'findAdditionalConditions' : True,
    'inBetweenRels' : True,
    'timed' : False,
    'discardSelfInPredecessors': True,
    'usePredecessors': False
}
# The benchmark yields the reference model, the event log, the subprocess model and its log.
sepsis_reference_dcr, sepsis_event_log, sepsis_sp_dcr, sepsis_sp_log = \
    benchmark_event_log(event_log_file,result_file_prefix,dcr_title,config)
# Manual model is read from models/sepsis_manual_optimization.xml (no existence check here).
sepsis_manual_dcr = dcr_importer.apply(f'models/{result_file_prefix}_manual_optimization.xml',dcr_importer.Variants.DCRXML)

# Only the first returned value is kept; presumably the variant without in-between relations - TODO confirm.
sepsis_no_i2e_or_e2i, _ = benchmark_subprocess_no_i2e_e2i(event_log_file,result_file_prefix,dcr_title)

# Subprocess membership is filled in by hand on the imported model.
# Nesting: S2 contains S1, S4 contains S3, S0 contains S4.
sepsis_manual_dcr['subprocesses']['S1'] = {'Leucocytes','CRP'}
sepsis_manual_dcr['subprocesses']['S2'] = {'LacticAcid','S1'}
sepsis_manual_dcr['subprocesses']['S3'] = {'Release C','Release D','Release E'}
sepsis_manual_dcr['subprocesses']['S4'] = {'Release B','S3'}
sepsis_manual_dcr['subprocesses']['S0'] = {'Release A','S4'}
# Summarise the three candidate models against the reference model.
dcr_simplicity.simplicity_summary('Sepsis',
                   [sepsis_sp_dcr,sepsis_manual_dcr,sepsis_no_i2e_or_e2i],sepsis_reference_dcr)

In [None]:
# Fitness of each Sepsis model. The reference and manual models are checked
# against the original event log, the two subprocess models against sepsis_sp_log.
print(fitness(sepsis_event_log,sepsis_reference_dcr))
print(fitness(sepsis_sp_log,sepsis_sp_dcr))
print(fitness(sepsis_sp_log,sepsis_no_i2e_or_e2i))
print(fitness(sepsis_event_log,sepsis_manual_dcr))

In [3]:
def pdcFscore(tp, fp, tn, fn):
    """Harmonic mean of positive accuracy and negative accuracy.

    Positive accuracy is ``tp / (tp + fn)`` and negative accuracy is
    ``tn / (tn + fp)``.

    Args:
        tp: number of true positives.
        fp: number of false positives.
        tn: number of true negatives.
        fn: number of false negatives.

    Returns:
        The harmonic mean of the two accuracies, or 0 when any of the
        divisions has a zero denominator.
    """
    try:
        pos_acc = tp / (tp + fn)
        neg_acc = tn / (tn + fp)
        return 2 * pos_acc * neg_acc / (pos_acc + neg_acc)
    except ZeroDivisionError:
        # The score is undefined here; keep reporting it as 0. Other errors
        # (wrong argument types, KeyboardInterrupt) now propagate.
        return 0


def fscore(tp, fp, tn, fn):
    """F1 score: harmonic mean of recall and precision.

    Recall is ``tp / (tp + fn)`` and precision is ``tp / (tp + fp)``.
    ``tn`` is accepted so that all metric functions share one signature; it
    is not used in the computation.

    Args:
        tp: number of true positives.
        fp: number of false positives.
        tn: number of true negatives (unused).
        fn: number of false negatives.

    Returns:
        The F1 score, or 0 when any of the divisions has a zero denominator.
    """
    try:
        recall = tp / (tp + fn)
        precision = tp / (tp + fp)
        return 2 * recall * precision / (recall + precision)
    except ZeroDivisionError:
        # The score is undefined here; keep reporting it as 0. Other errors
        # (wrong argument types, KeyboardInterrupt) now propagate.
        return 0


def balancedAccuracy(tp, fp, tn, fn):
    """Arithmetic mean of positive accuracy and negative accuracy.

    Positive accuracy is ``tp / (tp + fn)`` and negative accuracy is
    ``tn / (tn + fp)``.

    Args:
        tp: number of true positives.
        fp: number of false positives.
        tn: number of true negatives.
        fn: number of false negatives.

    Returns:
        The mean of the two accuracies, or 0 when either denominator is zero.
    """
    try:
        pos_acc = tp / (tp + fn)
        neg_acc = tn / (tn + fp)
        return (pos_acc + neg_acc) / 2
    except ZeroDivisionError:
        # One class is absent, so the measure is undefined; keep reporting 0.
        # Other errors (wrong argument types, KeyboardInterrupt) now propagate.
        return 0


def mcc(tp, fp, tn, fn):
    """Matthews correlation coefficient of a binary confusion matrix.

    Computed as ``(tp*tn - fp*fn) / sqrt((tp+fp)(tp+fn)(tn+fp)(tn+fn))``.

    Args:
        tp: number of true positives.
        fp: number of false positives.
        tn: number of true negatives.
        fn: number of false negatives.

    Returns:
        The coefficient, or 0 when the denominator is zero (one of the four
        marginal sums is empty).
    """
    try:
        numerator = tp * tn - fp * fn
        marginals = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
        return numerator / sqrt(marginals)
    except ZeroDivisionError:
        # The coefficient is undefined here; keep reporting it as 0. Other
        # errors (wrong argument types, KeyboardInterrupt) now propagate.
        return 0

def train_basic_model(train,config):
    """Discover a model from ``train`` with the ``DCR_BASIC`` variant.

    ``config`` is unpacked into keyword arguments of ``alg.apply``. The call
    returns a pair; only its first element (the model) is handed back.
    """
    discovery_result = alg.apply(train,alg.Variants.DCR_BASIC,**config)
    model, _ignored = discovery_result
    return model


def train_subprocess_model(train, config):
    """Discover a model from ``train`` with the ``DCR_SUBPROCESS_ME`` variant.

    ``config`` is unpacked into keyword arguments of ``alg.apply``. The call
    returns a pair; only its first element (the model) is handed back.
    """
    discovery_result = alg.apply(train, alg.Variants.DCR_SUBPROCESS_ME, **config)
    model, _ignored = discovery_result
    return model


def score_one_model(dcr_model, ground_truth_log):
    """Replay every ground-truth trace on ``dcr_model`` and count the outcomes.

    A trace is predicted positive when every one of its events can be
    executed on a fresh copy of the model and the copy is accepting
    afterwards. The prediction is compared with the trace's
    ``case:pdc:isPos`` label.

    Args:
        dcr_model: the model to evaluate; it is deep-copied per trace and
            never mutated.
        ground_truth_log: legacy pm4py event log whose cases carry the
            ``pdc:isPos`` attribute.

    Returns:
        Tuple ``(tp, fp, tn, fn)``. The same counts are also printed.
    """
    # Ground-truth label per case id, taken from the first row of each case.
    gt_cases = (
        pm4py.convert_to_dataframe(ground_truth_log)
        .groupby('case:concept:name')
        .first()['case:pdc:isPos']
    )
    tp = fp = tn = fn = 0

    for trace in ground_truth_log:  # the trace is without subprocesses
        gt_is_pos = gt_cases[trace.attributes['concept:name']]
        dcr = deepcopy(dcr_model)  # every trace starts from the initial marking
        can_execute = True
        for event in trace:
            # The graph is with subprocesses; stop at the first rejected event.
            if sp_semantics.sp_execute(event['concept:name'], dcr) is False:
                can_execute = False
                break
        accepting = sp_semantics.is_sp_accepting(dcr)

        if can_execute and accepting:
            if gt_is_pos:
                tp += 1
            else:
                fp += 1
        elif gt_is_pos:
            fn += 1
        else:
            tn += 1
    print(f'tp: {tp}| fp: {fp} | tn: {tn} | fn: {fn}')
    return tp, fp, tn, fn

# Do F1 score based on PDC challenges
location: Datasets/PDC19,PDC20,PDC21,PDC22
inside: Ground Truth Logs | Test Logs | Training Logs
exceptions: PDC20 also has Models (pnml) and PDC22 also has Base Logs

In [17]:
def _score_algorithm(train, gt, trainer, config, counts_subprocesses):
    """Discover one model on ``train`` and score it against ``gt``.

    Args:
        train: training log handed to ``trainer`` and to ``fitness``.
        gt: ground-truth log handed to ``score_one_model``.
        trainer: callable ``(train, config) -> dcr_model``.
        config: discovery options forwarded to ``trainer``.
        counts_subprocesses: when true the 'Subprocesses' column is
            ``len(dcr_model['subprocesses'])``; otherwise it is 0.

    Returns:
        dict with every result column except 'PDC Year', 'Log name' and
        'Algorithm'.
    """
    start_time = time.time()
    dcr_model = trainer(train, config)
    elapsed = time.time() - start_time  # only the discovery step is timed

    sim = dcr_simplicity.get_simplicity(dcr_model)
    fit = fitness(train, dcr_model)
    tp, fp, tn, fn = score_one_model(dcr_model, gt)
    return {
        'TP': tp, 'FP': fp, 'TN': tn, 'FN': fn,
        'Fscore (PDC)': pdcFscore(tp, fp, tn, fn),
        'Fscore': fscore(tp, fp, tn, fn),
        'balancedAccuracy': balancedAccuracy(tp, fp, tn, fn),
        'mcc': mcc(tp, fp, tn, fn),
        'Fitness': fit[0] / fit[1],  # fitness is on training
        'Simplicity': sim[0],
        'Subprocesses': len(dcr_model['subprocesses']) if counts_subprocesses else 0,
        'Events': len(dcr_model['events']),
        'Runtime': elapsed,
    }


def score_everything(base_dir='/home/vco/Datasets', folders=('PDC19','PDC20','PDC21','PDC22')):
    """Benchmark three discovery set-ups on every PDC log.

    For each file in ``<base_dir>/<folder>/Ground Truth Logs`` the matching
    training log is read, then DisCoveR, Sp-DisCoveR and Sp-DisCoveR without
    in-between relations are trained and scored. The 'Test Logs' folder is
    not read.

    Rows are collected in a list and turned into a DataFrame once at the end:
    ``DataFrame.append`` no longer exists in pandas 2.x and copied the whole
    frame on every call.

    Args:
        base_dir: directory that contains the PDC folders.
        folders: PDC folder names to process, in order.

    Returns:
        pandas.DataFrame with one row per (log, algorithm) and the columns
        'PDC Year', 'Log name', 'Algorithm', 'TP', 'FP', 'TN', 'FN',
        'Fscore (PDC)', 'Fscore', 'balancedAccuracy', 'mcc', 'Fitness',
        'Simplicity', 'Subprocesses', 'Events', 'Runtime'.
    """
    special_folders = ('PDC21','PDC22')
    gt_folder = 'Ground Truth Logs'
    train_folder = 'Training Logs'
    columns = ['PDC Year','Log name','Algorithm','TP','FP','TN','FN','Fscore (PDC)','Fscore','balancedAccuracy','mcc','Fitness','Simplicity','Subprocesses','Events','Runtime']
    # (label, trainer, discovery config, whether the model has a 'subprocesses' entry to count)
    algorithms = [
        ('DisCoveR', train_basic_model,
         {'findAdditionalConditions': True, 'timed': False}, False),
        ('Sp-DisCoveR', train_subprocess_model,
         {'findAdditionalConditions': True, 'inBetweenRels': True, 'timed': False}, True),
        ('Sp-DisCoveR_no_i2e_e2i', train_subprocess_model,
         {'findAdditionalConditions': True, 'inBetweenRels': False, 'timed': False}, True),
    ]

    records = []
    for folder in folders:
        print(f'[i] Started for {folder}')
        for log_name in os.listdir(os.path.join(base_dir,folder,gt_folder)):
            print(f'[i] Log {log_name}')
            gt = pm4py.read_xes(os.path.join(base_dir,folder,gt_folder,log_name), return_legacy_log_object=True)

            if folder in special_folders:
                # In these folders the training file name is the ground-truth
                # stem followed by an extra '0'; presumably this selects one
                # specific training variant - TODO confirm.
                train_name = f'{Path(log_name).stem}0.xes'
            else:
                train_name = log_name
            train = pm4py.read_xes(os.path.join(base_dir,folder,train_folder,train_name),return_legacy_log_object=True)

            for label, trainer, config, counts_subprocesses in algorithms:
                record = {'PDC Year': folder, 'Log name': log_name, 'Algorithm': label}
                record.update(_score_algorithm(train, gt, trainer, config, counts_subprocesses))
                records.append(record)
            clear_output(wait=True)
        print(f'[i] Done for {folder}')
    return pd.DataFrame(records, columns=columns)

    # train on the training logs
    # get the isPos for each trace in the Ground Truth log
    # compare isPos with the model's prediction; the ground-truth log is the test log plus the isPos tag, so the test log is not read separately
    # do aggregated results too

# Run the whole PDC benchmark and show the resulting table.
results = score_everything()
results

[i] Done for PDC22


Unnamed: 0,PDC Year,Log name,Algorithm,TP,FP,TN,FN,Fscore (PDC),Fscore,balancedAccuracy,mcc,Fitness,Simplicity,Subprocesses,Events,Runtime
0,PDC19,pdc_2019_4.xes,DisCoveR,47,4,38,1,0.940495,0.949495,0.941964,0.889908,1.000000,381,0,34,0.039879
1,PDC19,pdc_2019_4.xes,Sp-DisCoveR,45,4,38,3,0.920840,0.927835,0.921131,0.843728,0.951429,321,3,37,0.480446
2,PDC19,pdc_2019_4.xes,Sp-DisCoveR_no_i2e_e2i,48,4,38,0,0.950000,0.960000,0.952381,0.913874,1.000000,125,3,37,0.465687
3,PDC19,pdc_2019_1.xes,DisCoveR,45,0,45,0,1.000000,1.000000,1.000000,1.000000,1.000000,255,0,45,0.116100
4,PDC19,pdc_2019_1.xes,Sp-DisCoveR,32,0,45,13,0.831169,0.831169,0.855556,0.742781,0.740000,206,7,52,1.747886
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1177,PDC22,pdc2022_120010.xes,Sp-DisCoveR,142,0,500,358,0.442368,0.442368,0.642000,0.406818,1.000000,88,3,20,0.462132
1178,PDC22,pdc2022_120010.xes,Sp-DisCoveR_no_i2e_e2i,154,13,487,346,0.468006,0.461769,0.641000,0.378041,1.000000,61,3,20,0.464494
1179,PDC22,pdc2022_120001.xes,DisCoveR,175,30,470,325,0.510078,0.496454,0.645000,0.359176,1.000000,64,0,14,0.031254
1180,PDC22,pdc2022_120001.xes,Sp-DisCoveR,163,25,475,337,0.485423,0.473837,0.638000,0.353201,1.000000,57,2,16,0.323794


In [22]:
# Save the benchmark table to models/results.csv (relative to the working directory).
results.to_csv('models/results.csv',index_label=False)

In [None]:
# Load the ground-truth, test and training logs of one PDC19 log for manual inspection.
i = 9  # number used in the pdc_2019_<i>.xes file names
gt = pm4py.read_xes(f'/home/vco/Datasets/PDC19/Ground Truth Logs/pdc_2019_{i}.xes',return_legacy_log_object=True)
test = pm4py.read_xes(f'/home/vco/Datasets/PDC19/Test Logs/pdc_2019_{i}.xes',return_legacy_log_object=True)
train = pm4py.read_xes(f'/home/vco/Datasets/PDC19/Training Logs/pdc_2019_{i}.xes',return_legacy_log_object=True)

In [None]:
# NOTE(review): `dcr_model` is not assigned at top level by any visible cell;
# on a fresh kernel a model has to be trained first (e.g. from `train`).
dcr_simplicity.get_simplicity(dcr_model)

In [None]:
# Fitness of the model on the training log loaded above.
# NOTE(review): `dcr_model` is not assigned at top level by any visible cell.
fitness(train, dcr_model)

In [None]:
# Replay the start of a failing trace by hand on a copy of the model.
# NOTE(review): `failing_trace` is never read; only 'q' and 'e' are executed, 'k' is not.
failing_trace = ['q', 'e', 'k']
dcr = deepcopy(dcr_model)
sp_semantics.sp_execute('q',dcr,cmd_print=True)
sp_semantics.sp_execute('e',dcr,cmd_print=True)

In [None]:
# Hand-built graph with no relations and three nested subprocesses:
# S0 = {A, B}, S1 = {C, D, S0}, S2 = {S1, E}. F belongs to no subprocess.
A,B,C,D,E,F,S0,S1,S2 = 'A','B','C','D','E','F','S0','S1','S2'
dcr_test = {
    'events': {A, B, C, D, E, F, S0, S1, S2},
    'conditionsFor': {},
    'milestonesFor': {},
    'responseTo': {},
    'includesTo': {},
    'excludesTo': {},
    # Initial marking: everything included, nothing executed or pending.
    'marking': {'executed': set(),
                'included': {A, B, C, D, E, F, S0, S1, S2},
                'pending': set()
                },
    'subprocesses': {
        S0:{A,B},
        S1:{C,D,S0},
        S2:{S1,E}
    }
}

In [None]:
# Execute C, which sits in S1 (itself nested in S2); this mutates dcr_test in place
# if sp_execute updates the marking - TODO confirm. cmd_print=True presumably prints details.
sp_semantics.sp_execute(C,dcr_test,cmd_print=True)

# BPIC and RTFMP Other datasets

In [None]:
# Benchmark the BPI Challenge 2017 offer log; there is no manual model for this one.
# NOTE: rebinds the shared globals event_log_file, dcr_title, result_file_prefix and config.
event_log_file = '/home/vco/Datasets/BPI Challenge 2017 - Offer log.xes'
dcr_title = 'BPIC2017'
result_file_prefix = 'bpic2017'
config = {
    'findAdditionalConditions' : True,
    'inBetweenRels' : True,
    'timed' : False,
    'discardSelfInPredecessors': True,
    'usePredecessors': False
}
bpic17_reference_dcr, bpic17_event_log, bpic17_sp_dcr, bpic17_sp_log =\
    benchmark_event_log(event_log_file,result_file_prefix,dcr_title,config)
# Only the subprocess model is summarised against the reference model.
dcr_simplicity.simplicity_summary('BPIC17',[bpic17_sp_dcr],bpic17_reference_dcr)

In [None]:
# Benchmark the Road Traffic Fine Management Process log and compare with a manual model.
# NOTE: rebinds the shared globals event_log_file, dcr_title, result_file_prefix and config.
event_log_file = '/home/vco/Datasets/12683249/Road_Traffic_Fine_Management_Process.xes'
dcr_title = 'Traffic Fine'
result_file_prefix = 'rtfmp'
config = {
    'findAdditionalConditions' : True,
    'inBetweenRels' : True,
    'timed' : False,
    'discardSelfInPredecessors': True,
    'usePredecessors': False
}
rtfmp_reference_dcr, rtfmp_event_log, rtfmp_sp_dcr, rtfmp_sp_log =\
benchmark_event_log(event_log_file,result_file_prefix,dcr_title,config)
# Manual model is read from models/rtfmp_manual_optimization.xml (no existence check here).
rtfmp_manual_dcr = dcr_importer.apply(f'models/{result_file_prefix}_manual_optimization.xml',dcr_importer.Variants.DCRXML)
# Subprocess membership is filled in by hand. S1 refers to S0, which is only
# assigned on the last of these four lines; the assignment order does not matter for the dict.
rtfmp_manual_dcr['subprocesses']['S1'] = {'S0','Payment'}
rtfmp_manual_dcr['subprocesses']['S2'] = {'Receive Result Appeal from Prefecture','Insert Fine Notification'}
rtfmp_manual_dcr['subprocesses']['S3'] = {'Notify Result Appeal to Offender','Add penalty'}
rtfmp_manual_dcr['subprocesses']['S0'] = {'Appeal to Judge','Insert Date Appeal to Prefecture','Send Appeal to Prefecture','Send Fine'}
dcr_simplicity.simplicity_summary('RTFMP',[rtfmp_sp_dcr, rtfmp_manual_dcr],rtfmp_reference_dcr)

## Massive benchmarking

In [6]:
# Collect the logs to benchmark: two fixed logs plus every .xes file in logs_folder.
logs_list = {'Dreyers':'/home/vco/Datasets/Dreyers Foundation.xes',
              'BPIC19' :'/home/vco/Datasets/BPI_Challenge_2019.xes'}
logs_folder = '/home/vco/Datasets/data/TKDE_Benchmark'
for file in os.listdir(logs_folder):
    if file.endswith(".xes"):
        # The key is the file name up to its FIRST dot, so 'a.b.xes' and 'a.c.xes' would collide.
        name = os.path.basename(file.split('.')[0])
        logs_list[name] = os.path.join(logs_folder, file)
print(f'[i] Started benchmarking on {len(logs_list)} logs')

config = {
    'findAdditionalConditions' : True,
    'inBetweenRels' : True,
    'timed' : False,
    'discardSelfInPredecessors': True,
    'usePredecessors': False
}
i = 1  # 1-based progress counter for the message below
res = {}  # log name -> whatever run_discover_config returns
for k,v in logs_list.items():
    event_log_file = v
    dcr_title = k
    result_file_prefix = k
    # NOTE(review): other cells spell the variant alg.Variants.DCR_BASIC; here it is alg.DCR_BASIC.
    res[k] = run_discover_config(event_log_file,alg.DCR_BASIC,result_file_prefix,dcr_title,config)
    print(f'[i] Done for {i}/{len(logs_list)} name: {k}')
    i += 1

[i] Started benchmarking on 14 logs
[i] Started with config: {'findAdditionalConditions': True, 'inBetweenRels': True, 'timed': False, 'discardSelfInPredecessors': True, 'usePredecessors': False}


parsing log, completed traces :: 100%|██████████| 700/700 [00:00<00:00, 4053.38it/s]


[i] Mining a DCR Model with DisCoveR!
[!] Model saved in models/Dreyers.xml
[i] Done for 1/14 name: Dreyers
[i] Started with config: {'findAdditionalConditions': True, 'inBetweenRels': True, 'timed': False, 'discardSelfInPredecessors': True, 'usePredecessors': False}


parsing log, completed traces ::  16%|█▋        | 41384/251734 [00:09<00:47, 4438.61it/s]

KeyboardInterrupt: 

# JSON for Morten

In [None]:
import json, pickle
class SetEncoder(json.JSONEncoder):
    """JSON encoder that writes ``set`` and ``frozenset`` values as JSON arrays.

    Elements are emitted in sorted order when they can be compared with each
    other, so repeated exports of the same data produce the same file. Sets
    with elements that cannot be ordered fall back to iteration order.
    Any other unsupported type is delegated to the base class, which raises
    ``TypeError``.
    """
    def default(self, obj):
        if isinstance(obj, (set, frozenset)):
            try:
                return sorted(obj)
            except TypeError:
                # Mixed or unorderable elements: keep them, order unspecified.
                return list(obj)
        return super().default(obj)

# Export the manual Sepsis model; SetEncoder turns its sets into JSON arrays.
with open('models/example.json', 'w') as fp:
    json.dump(sepsis_manual_dcr, fp, cls=SetEncoder)
# Read it back. No decoder is passed, so values that were sets come back as lists.
with open('models/example.json') as f:
    sepsis_manual_sp_from_json = json.load(f)#, cls=SetDecoder)

## Future work on simplicity

In [None]:
from itertools import product

# Build and draw a complete undirected graph with one integer node (0..l_events-1)
# per event of the Sepsis reference model; event names themselves are not used.
l_events = len(sepsis_reference_dcr['events'])
G = nx.Graph()
G.add_edges_from((a, b) for a, b in product(range(l_events), range(l_events)) if a != b)
len(G.edges)  # value is discarded: a cell only echoes its last expression
pos = nx.circular_layout(G)
# NOTE(review): G is undirected; confirm that arrows=True is intended here.
nx.draw(G, pos, with_labels=True, arrows=True, node_size=700)

# TWO PROBLEMS TO SOLVE:
Only add an edge -- in G if there is a <--> between the nodes in the digraph.
All nodes must be connected to each other.
If you want to be strict intersect with the atMostOnce
If there are cuts to be discovered in the mutual exclusion algorithm then it becomes a choice in relation to some optimization as to which subprocess to create.

Nothing works anymore.

TODO: implement my own simplicity and control-flow complexity measures, plus precision, recall, F1 score, etc.

In [None]:
# Adjacency matrix of the 'excludesTo' mapping:
# adj_df.loc[e1, e2] == 1 exactly when e2 is listed in dcr['excludesTo'][e1].
dcr = dcr1
# Start from an all-zero integer frame instead of a NaN-filled one that has to
# be patched with fillna/astype afterwards.
adj_df = pd.DataFrame(0, index=sorted(dcr['events']), columns=sorted(dcr['events']), dtype=int)
for e1 in dcr['events']:
    for e2 in dcr['events']:
        # The double loop visits every ordered pair, so the mirrored check
        # (e1 listed under e2) is covered when the roles are swapped.
        if e1 in dcr['excludesTo'] and e2 in dcr['excludesTo'][e1]:
            adj_df.loc[e1,e2] = 1

In [None]:
# Show the 0/1 adjacency matrix (rows: key event, columns: listed event).
adj_df

In [None]:
# Same matrix with rows and columns swapped.
adj_df.T

In [None]:
# Off the diagonal: 2 where two events list each other, 1 where only one lists the other, 0 otherwise.
# On the diagonal: 2 where an event lists itself.
adj_df + adj_df.T