In [96]:
from os.path import join as path_join

from itertools import product

import pandas as pd
import pm4py

import ordinor.constants as const

## Import original event log data file

In [97]:
DIRPATH = './data/DATA_csv'
LOGNAME = 'bpic2015_disco'

from ordinor.io import read_disco_csv
from ordinor.utils.log_preprocessing import append_case_duration

fn = path_join(DIRPATH, f'{LOGNAME}.csv')
print(f'Import source event log {fn}')

# Case-level columns to keep, mapped from the "(case) X" names found in the
# CSV to "case:X" names.
# Deliberately not kept: Includes_subCases, caseProcedure, caseStatus,
# requestComplete.
case_columns = {
    '(case) Responsible_actor': 'case:Responsible_actor',
    '(case) SUMleges': 'case:SUMleges',
    '(case) last_phase': 'case:last_phase',
    '(case) parts': 'case:parts',
}

# Event-level columns to keep (monitoringResource is deliberately not kept).
event_columns = [
    'case:concept:name',
    'concept:name', 'lifecycle:transition', 'action_code',
    'org:resource',
    'time:timestamp',
]

log = read_disco_csv(fn)
log = log[list(case_columns) + event_columns]

# use action code as activity label
log[const.ACTIVITY] = log['action_code']

# filter out events with no `concept:name`
# (either a missing value or the literal string 'nan')
has_label = log['concept:name'].notna() & (log['concept:name'] != 'nan')
log = log[has_label]

log = log.rename(columns=case_columns)

print(sorted(log.columns))

Import source event log ./data/DATA_csv/bpic2015_disco.csv
Importing from CSV file ./data/DATA_csv/bpic2015_disco.csv


  df = pd.read_csv(filepath, sep=sep, **pdkwargs)


Scanned 262628 events from "./data/DATA_csv/bpic2015_disco.csv".
--------------------------------------------------------------------------------
Number of events:		262628
Number of cases:		5647
--------------------------------------------------------------------------------
['action_code', 'case:Responsible_actor', 'case:SUMleges', 'case:concept:name', 'case:last_phase', 'case:parts', 'concept:name', 'lifecycle:transition', 'org:resource', 'time:timestamp']


## Data preprocessing

- ~~Keep only events from the main process, i.e., with `action_code` started with "01_HOOFD"~~

- Check if selected attributes are case attributes

    Only a few cases (fewer than 3) have more than one value for these attributes, so we exclude those cases rather than dropping the attributes:
    
    {4020737, 6038724}

- Derive attributes

  - Case attribute:
  
    `case:parts_Bouw`: whether the `case:parts` value contains the string "Bouw" (matched case-insensitively)
    
  - Event attribute:
    `subprocess`: the part of the `action_code` before its trailing three-digit number (e.g., "01_HOOFD" for "01_HOOFD_010")

    `phase`: the subprocess prefix plus the first digit of that three-digit number (e.g., "01_HOOFD_0" for "01_HOOFD_010")
    

In [98]:
# Check case attributes: a case attribute must hold a single value within
# each case. Cases violating this for any checked attribute are collected
# in `cases_rm` and removed from the log.
cases_rm = set()
for attr in [
    'case:Responsible_actor',
    'case:parts',
]:
    # Number of distinct (non-null) values per case, computed in one
    # vectorized pass instead of iterating over every case group in Python.
    values_per_case = log.groupby(const.CASE_ID)[attr].nunique()
    offending_cases = values_per_case[values_per_case > 1].index
    case_count = len(offending_cases)
    cases_rm.update(offending_cases)
    if case_count > 0:
        print(f'{attr} is not case attribute!')
        print(f'{case_count} cases have more than 1 value for this attribute')

print(cases_rm)
# Take an explicit copy so the filtered log is an independent frame; columns
# assigned to it afterwards then do not write to a slice of the unfiltered log.
log = log[~log[const.CASE_ID].isin(cases_rm)].copy()

case:Responsible_actor is not case attribute!
2 cases have more than 1 value for this attribute
case:parts is not case attribute!
1 cases have more than 1 value for this attribute
{'4020737', '6038724'}


In [99]:
# Derive attributes
# case attributes
def _mentions_bouw(parts):
    """Return True if the text form of `parts` contains "bouw", ignoring case."""
    return 'bouw' in str(parts).lower()


log['case:parts_Bouw'] = log['case:parts'].map(_mentions_bouw)
print(log['case:parts_Bouw'].unique())

[ True False]


In [100]:
# Inspect the log, which now includes the derived `case:parts_Bouw` column
log

Unnamed: 0,case:Responsible_actor,case:SUMleges,case:last_phase,case:parts,case:concept:name,concept:name,lifecycle:transition,action_code,org:resource,time:timestamp,case:parts_Bouw
0,1550894.0,398.92447,Besluit onherroepelijk,Bouw,10002357,01_HOOFD_010,complete,01_HOOFD_010,560752,2014-08-05 08:00:00+00:00,True
1,1550894.0,398.92447,Besluit onherroepelijk,Bouw,10002357,01_HOOFD_011,complete,01_HOOFD_011,560752,2014-08-06 08:00:00+00:00,True
2,1550894.0,398.92447,Besluit onherroepelijk,Bouw,10002357,01_HOOFD_015,complete,01_HOOFD_015,560752,2014-08-06 08:00:00+00:00,True
3,1550894.0,398.92447,Besluit onherroepelijk,Bouw,10002357,01_HOOFD_020,complete,01_HOOFD_020,560752,2014-08-06 08:00:00+00:00,True
4,1550894.0,398.92447,Besluit onherroepelijk,Bouw,10002357,01_HOOFD_061,complete,01_HOOFD_061,560752,2014-08-06 08:00:00+00:00,True
...,...,...,...,...,...,...,...,...,...,...,...
262623,560600.0,718.44680,Zaak afgehandeld,"Bouw,Handelen in strijd met regels RO",9998898,01_HOOFD_810,complete,01_HOOFD_810,560600,2014-03-12 09:00:00+00:00,True
262624,560600.0,718.44680,Zaak afgehandeld,"Bouw,Handelen in strijd met regels RO",9998898,01_HOOFD_811,complete,01_HOOFD_811,560600,2014-03-12 09:00:00+00:00,True
262625,560600.0,718.44680,Zaak afgehandeld,"Bouw,Handelen in strijd met regels RO",9998898,01_HOOFD_814,complete,01_HOOFD_814,560600,2014-03-12 09:00:00+00:00,True
262626,560600.0,718.44680,Zaak afgehandeld,"Bouw,Handelen in strijd met regels RO",9998898,01_HOOFD_815,complete,01_HOOFD_815,560600,2014-03-12 09:00:00+00:00,True


In [101]:
# event attribute
# subprocess
import re
import numpy as np

# Activity labels are expected to start with "<2 digits>_<name>_<3 digits>",
# e.g. "01_HOOFD_010".
patt = r'\d{2}_\w+_\d{3}'
prog = re.compile(patt)

# Match the labels straight from the column; this avoids DataFrame.iterrows(),
# which constructs a full Series for every single event.
matches = [prog.match(label) for label in log[const.ACTIVITY]]

# Every event needs a subprocess/phase value. Report all events whose label
# does not match and stop with an explicit error, instead of failing later on
# a length mismatch between the derived lists and the log.
unmatched = np.array([m is None for m in matches], dtype=bool)
if unmatched.any():
    for i, row in log[unmatched].iterrows():
        print(row)
        print(row[const.ACTIVITY])
    raise ValueError(
        f'{int(unmatched.sum())} events have an activity label '
        f'that does not match the pattern {patt!r}'
    )

codes = [m[0] for m in matches]
# subprocess: matched code without its trailing "_ddd", e.g. "01_HOOFD"
subprocess = [code[0:-4] for code in codes]
# phase: matched code without its last two digits, e.g. "01_HOOFD_0"
phase = [code[0:-2] for code in codes]

log.loc[:, 'subprocess'] = subprocess
log.loc[:, 'phase'] = phase

# List every (subprocess, phase) combination present in the log
for name, _ in log.groupby(['subprocess', 'phase']):
    print(name)

print(log[const.CASE_ID].nunique())
print(len(log))

('01_BB', '01_BB_5')
('01_BB', '01_BB_6')
('01_BB', '01_BB_7')
('01_HOOFD', '01_HOOFD_0')
('01_HOOFD', '01_HOOFD_1')
('01_HOOFD', '01_HOOFD_2')
('01_HOOFD', '01_HOOFD_3')
('01_HOOFD', '01_HOOFD_4')
('01_HOOFD', '01_HOOFD_5')
('01_HOOFD', '01_HOOFD_6')
('01_HOOFD', '01_HOOFD_7')
('01_HOOFD', '01_HOOFD_8')
('01_OLO', '01_OLO_1')
('01_OLO', '01_OLO_2')
('02_DRZ', '02_DRZ_0')
('02_OLO', '02_OLO_1')
('03_GBH', '03_GBH_0')
('03_VD', '03_VD_0')
('04_BPT', '04_BPT_0')
('05_EIND', '05_EIND_0')
('06_OLO', '06_OLO_1')
('06_VD', '06_VD_0')
('06_VD', '06_VD_1')
('07_OPS', '07_OPS_0')
('08_AWB45', '08_AWB45_0')
('08_AWB45', '08_AWB45_1')
('08_AWB45_WAW', '08_AWB45_WAW_0')
('08_OLO', '08_OLO_1')
('09_AH_I', '09_AH_I_0')
('09_AWB45', '09_AWB45_0')
('10_OLO', '10_OLO_1')
('10_UOV', '10_UOV_0')
('10_UOV', '10_UOV_1')
('10_UOV', '10_UOV_2')
('11_AH_II', '11_AH_II_0')
('11_AH_II', '11_AH_II_1')
('11_OLO', '11_OLO_1')
('12_AP', '12_AP_0')
('12_AP_UOV', '12_AP_UOV_0')
('13_CRD', '13_CRD_0')
('14_VRIJ', '14_

In [102]:
# Persist the preprocessed log. With pandas' default settings the DataFrame
# index is written as the first (unnamed) CSV column.
log.to_csv('./bpic2015.preprocessed.csv')