### Make top-level imports

* PM4Py
* OrdinoR (*note: from local repo*)

In [11]:
from os.path import join as path_join

import pandas as pd
import pm4py

import ordinor.constants as const
from ordinor.io import read_csv
from ordinor.utils.log_preprocessing import append_case_duration

### Import original event log data file

In [12]:
# Directory and base name of the source event log CSV file.
DIRPATH = './data/DATA_csv'
LOGNAME = 'bpic2018'

fn = path_join(DIRPATH, f'{LOGNAME}.csv')
print(f'Import source event log {fn}')

# Deliberately no try/except here: every later cell needs `log`, so a failed
# import must stop the notebook at this point. Swallowing the error would
# either raise a misleading NameError on the next line or, worse, silently
# reuse a stale `log` left in the kernel by an earlier run.
log = read_csv(fn)

# List the available columns as a reference for the cells below.
print(sorted(log.columns))

Import source event log ./data/DATA_csv/bpic2018.csv
Importing from CSV file ./data/DATA_csv/bpic2018.csv
Scanned 2514266 events from "./data/DATA_csv/bpic2018.csv".
--------------------------------------------------------------------------------
Number of events:		2514266
Number of cases:		43809
--------------------------------------------------------------------------------
['@@case_index', '@@index', 'Unnamed: 0', 'activity', 'case:amount_applied0', 'case:amount_applied1', 'case:amount_applied2', 'case:amount_applied3', 'case:applicant', 'case:application', 'case:area', 'case:basic payment', 'case:concept:name', 'case:cross_compliance', 'case:department', 'case:greening', 'case:identity:id', 'case:number_parcels', 'case:payment_actual0', 'case:payment_actual1', 'case:payment_actual2', 'case:payment_actual3', 'case:penalty_ABP', 'case:penalty_AGP', 'case:penalty_AJLP', 'case:penalty_AUVP', 'case:penalty_AVBP', 'case:penalty_AVGP', 'case:penalty_AVJLP', 'case:penalty_AVUVP', 'case:pen

### Data preprocessing (generic)

##### Preprocessing
- Keep only cases that started on or after 2017/01/01 (reason: according to the data description, there have been no further changes to document types since 2017, so we assume the process exhibits no concept drift from then on)
- Keep only events from the subprocesses "Main" and "Application"
- Keep only events with `success` marked as "true" (Boolean True)
- Construct "actual" activity labels (concept:name)

In [13]:
# Keep only cases that started on or after 2017/01/01.
# A case's start time is the earliest timestamp among its events; a single
# grouped `min` gives the same value as sorting each trace and taking the
# first event, without a Python-level loop over every case.
case_start_times = log.groupby(const.CASE_ID)[const.TIMESTAMP].min()
# Drop any timezone information so the comparison with the naive cutoff
# timestamp is well-defined (no-op when the timestamps are already naive).
started_in_scope = (
    case_start_times.dt.tz_localize(None) >= pd.Timestamp(2017, 1, 1)
)
cases_to_keep = set(case_start_times.index[started_in_scope])
print(len(cases_to_keep))
log = log[log[const.CASE_ID].isin(cases_to_keep)]

# Keep only events from subprocess "Main" and "Application"
print(log['subprocess'].unique())
log = log[log['subprocess'].isin(['Main', 'Application'])]

# Keep only successful events
print(log['success'].unique())
log = log[log['success'].eq(True)]

# Construct "actual" activity labels (concept:name) by joining the document
# type, subprocess and activity of each event.
# `assign` returns a new frame rather than writing into the filtered slice,
# which avoids pandas' SettingWithCopyWarning.
log = log.assign(**{
    'concept:name': log[['doctype', 'subprocess', 'activity']].agg(' - '.join, axis=1)
})

# Filter out meaningless resource labels: `0;n/a`
log = log[log[const.RESOURCE] != '0;n/a']

# Report how many distinct values remain for each label component and for the
# combined activity label.
print(log['doctype'].nunique())
print(log['subprocess'].nunique())
print(log['activity'].nunique())
print(log[const.ACTIVITY].nunique())

log

14507
['Application' 'Main' 'Declared' 'Reported' 'On-Site' 'Objection' 'Remote'
 'Change']
[ True False]
5
2
18
44


Unnamed: 0.1,Unnamed: 0,success,org:resource,docid_uuid,doctype,subprocess,docid,activity,note,eventid,...,case:payment_actual1,case:amount_applied1,case:penalty_amount2,case:payment_actual2,case:amount_applied2,case:penalty_amount3,case:payment_actual3,case:amount_applied3,@@index,@@case_index
517,1759685,True,Document processing automaton,7F845C5F-ADB6-47AC-8510-4B5F3810F6FF,Geo parcel document,Main,-18008418266845870,initialize,none,-1.800842e+16,...,,,,,,,,,517,7
521,1759689,True,Document processing automaton,EDEF57D1-CAC9-41CB-867A-C948C8672C6A,Control summary,Main,-18008418266838151,initialize,none,-1.800842e+16,...,,,,,,,,,521,7
522,1759690,True,Document processing automaton,EDEF57D1-CAC9-41CB-867A-C948C8672C6A,Control summary,Main,-18008418266838151,begin editing,none,-1.800842e+16,...,,,,,,,,,522,7
523,1759691,True,Document processing automaton,EDEF57D1-CAC9-41CB-867A-C948C8672C6A,Control summary,Main,-18008418266838151,finish editing,none,-1.800842e+16,...,,,,,,,,,523,7
524,1759692,True,d0f451,7F845C5F-ADB6-47AC-8510-4B5F3810F6FF,Geo parcel document,Main,-18008418266845870,save,none,-9.006600e+16,...,,,,,,,,,524,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2514195,2383942,True,727350,A148E2FE-85DA-44E2-8A6C-824EDF068291,Payment application,Application,-54037160734135903,finish editing,automatic,-1.800835e+16,...,,,,,,,,,2514195,43807
2514196,2383943,True,155add,A148E2FE-85DA-44E2-8A6C-824EDF068291,Payment application,Application,-54037160734135903,decide,automatic,-9.006594e+16,...,,,,,,,,,2514196,43807
2514197,2383944,True,DP-Z,A148E2FE-85DA-44E2-8A6C-824EDF068291,Payment application,Application,-54037160734135903,begin payment,automatic during payment,-9.006594e+16,...,,,,,,,,,2514197,43807
2514198,2383945,True,Notification automaton,A148E2FE-85DA-44E2-8A6C-824EDF068291,Payment application,Application,-54037160734135903,insert document,notification for applicant,-9.006594e+16,...,,,,,,,,,2514198,43807


### Experiments

##### Preprocessing
- Filter out meaningless resource labels: `0;n/a` (already applied in the generic preprocessing cell above)

##### Config
- Determine case types (CT) based on both the original trace attributes and the provided "derived attributes" (as described in the dataset description):
    
    case:redistribution, case:small farmer, case:young farmer, case:penalty_{xxx}, case:selected_risk, case:selected_manually, case:rejected

- Determine activity types (AT) in 3 ways, by:
    
    activity (note that this is not "concept:name"), doctype, subprocess
    
- Determine time types (TT) in 3 ways, by:
    
    year, month, weekday

- Discover resource groups using agglomerative hierarchical clustering (AHC), with the number of groups determined automatically via cross-validation (CV)

- Apply OverallScore, parameters automatically determined (via Grid Search)


In [14]:
# Persist the preprocessed event log so the experiments can start from it.
# NOTE(review): the DataFrame index is written as an extra unnamed column
# (pandas default) -- confirm downstream readers expect that.
preprocessed_fn = f'data/{LOGNAME}.preprocessed.csv'
log.to_csv(preprocessed_fn)

# Show which departments occur in the preprocessed log.
departments = log['case:department'].unique()
print(departments)

# Sanity check kept for reference (disabled):
# is "department" really a case-level attribute?
# for case, events in log.groupby(const.CASE_ID):
#     if events['case:department'].nunique() > 1:
#         print('Not case-level attribute')
#         break
# Result when it was run: YES -- no case records more than one department.

# Optional split into one sublog per department (disabled):
# log_d4 = log[log['case:department'] == 'd4']
# log_4e = log[log['case:department'] == '4e']
# log_e7 = log[log['case:department'] == 'e7']
# log_6b = log[log['case:department'] == '6b']

# log_d4.to_csv(f'input/{LOGNAME}_d4.preprocessed.csv')
# log_4e.to_csv(f'input/{LOGNAME}_4e.preprocessed.csv')
# log_e7.to_csv(f'input/{LOGNAME}_e7.preprocessed.csv')
# log_6b.to_csv(f'input/{LOGNAME}_6b.preprocessed.csv')

['6b' 'e7' '4e' 'd4']
