# Data preprocessing

The raw datasets are stored as CSV files. They were converted from their original formats: .dsc (Disco project file, BPIC 2015), .xes (BPIC 2017), and .xlsx (the proprietary data).

In [1]:
import os

import pandas as pd

In [4]:
# Key of the event log to preprocess; must be one of the keys of `fn_logs`.
log_name = 'bpic15'

# Location of the raw CSV export for every supported event log.
fn_logs = dict(
    bpic15='./data/bpic2015_disco.csv',
    bpic17='./data/bpic2017.csv',
    propr='./data/proprietary.csv',
)

In [5]:
# Load the raw CSV selected by `log_name` and discard rows in which every
# value is NA. Only rows are dropped (axis=0); columns are left untouched.
log = pd.read_csv(fn_logs[log_name]).dropna(axis=0, how='all')
log

Unnamed: 0,Case ID,activity,Resource,Complete Timestamp,(case) IDofConceptCase,(case) Includes_subCases,(case) Responsible_actor,(case) SUMleges,(case) caseProcedure,(case) caseStatus,...,(case) landRegisterID,(case) last_phase,(case) parts,(case) requestComplete,Activity,activityNameNL,lifecycle:transition,monitoringResource,question,municipality
0,4284682,enter senddate procedure confirmation,560852,18/11/2009 9:00,,J,560852.0,1438.34350,,G,...,,Besluit genomen,"Bouw,Sloop",True,01_HOOFD_190_2,invoeren verzenddatum procedurebevestiging,complete,560852,EMPTY,muni-4
1,4511735,register submission date request,1254625,23/11/2009 9:00,,J,560604.0,258.80100,,G,...,4504496.0,Vergunning onherroepelijk,Brandveilig gebruik (vergunning),True,01_HOOFD_010,registratie datum binnenkomst aanvraag,complete,1254625,EMPTY,muni-5
2,3090877,enter senddate procedure confirmation,3148844,1/1/2010 9:00,,J,3442724.0,,Uitgebreid,G,...,,Vergunning verleend,Milieu (vergunning),True,01_HOOFD_065_2,invoeren verzenddatum procedurebevestiging,complete,560922,EMPTY,muni-3
3,5726442,enter senddate acknowledgement,2013365,13/5/2010 8:00,5726449.0,N,2013365.0,23847.62700,Uitgebreid,G,...,5726441.0,Ontwerpbesluit genomen,Bouw,False,08_AWB45_051_2,invoeren verzenddatum ontvangstbevestiging,complete,2013365,EMPTY,muni-3
4,3871304,date for inspection MER,560532,29/6/2010 8:00,,N,560458.0,,Uitgebreid,G,...,,Procedure afgebroken,Milieu (vergunning),True,01_HOOFD_080,datum ter inzage MER,complete,560458,29/6/2010 0:00,muni-2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262623,10069611,date draft decision for inspection,560752,5/3/2015 9:00,10069618.0,,560431.0,,Uitgebreid,O,...,,Ontwerpbesluit genomen,Milieu (omgevingsvergunning beperkte milieutoe...,False,10_UOV_030,datum ontwerpbeschikking ter inzage,complete,560431,EMPTY,muni-4
262624,12663845,start decision phase decision permitting sent,560912,5/3/2015 9:00,12663987.0,,560912.0,5019.30600,,O,...,12663844.0,Beschikking verzonden,Bouw,False,01_HOOFD_510_2a,instellen besluitfase:besluit vergunnen verzonden,complete,2670601,EMPTY,muni-1
262625,12999831,start decision phase decision permitting sent,560872,9/3/2015 9:00,13000092.0,,560872.0,220.82385,,O,...,717776.0,Beschikking verzonden,Kap,True,01_HOOFD_510_2a,instellen besluitfase:besluit vergunnen verzonden,complete,560872,EMPTY,muni-1
262626,12999831,enter senddate decision environmental permit,560872,9/3/2015 9:00,13000092.0,,560872.0,220.82385,,O,...,717776.0,Beschikking verzonden,Kap,True,01_HOOFD_510_2,invoeren verzenddatum beschikking omgevingsver...,complete,560872,EMPTY,muni-1


In [6]:
# Show the column labels of the loaded log.
log.columns

Index(['Case ID', 'activity', 'Resource', 'Complete Timestamp',
       '(case) IDofConceptCase', '(case) Includes_subCases',
       '(case) Responsible_actor', '(case) SUMleges', '(case) caseProcedure',
       '(case) caseStatus', '(case) case_type', '(case) landRegisterID',
       '(case) last_phase', '(case) parts', '(case) requestComplete',
       'Activity', 'activityNameNL', 'lifecycle:transition',
       'monitoringResource', 'question', 'municipality'],
      dtype='object')

In [7]:
# Raw column names holding the case, activity and resource identifiers
# (in that order) for each supported log.
id_columns = {
    'bpic15': ('Case ID', 'activity', 'Resource'),
    'bpic17': ('case:concept:name', 'concept:name', 'org:resource'),
    'propr': ('PI_ID', 'Activity', 'Originator'),
}

# Print the number of distinct cases, activities and resources of the selected log.
if log_name in id_columns:
    count_templates = ('#Cases:\t{}', '#Activities:\t{}', '#Resources:\t{}')
    for template, column in zip(count_templates, id_columns[log_name]):
        print(template.format(log[column].nunique()))

# For BPIC 2017 the distinct lifecycle transitions are listed as well.
if log_name == 'bpic17':
    print(log['lifecycle:transition'].unique())

#Cases:	5647
#Activities:	356
#Resources:	72


In [8]:
# Canonical column names used in the preprocessed logs.
COL_CASE_ID = 'case_id'
COL_ACTIVITY = 'activity'
COL_START_TS = 'start timestamp'
COL_COMPLETE_TS = 'complete timestamp'
COL_RESOURCE = 'resource'

# Target file of the preprocessed log; stays None when nothing is exported.
fn_out = None

if log_name == 'bpic15':
    # Only a completion timestamp is mapped for this log (no start timestamp).
    log = log.rename(columns={
        'Case ID': COL_CASE_ID,
        'activity': COL_ACTIVITY,
        'Complete Timestamp': COL_COMPLETE_TS,
        'Resource': COL_RESOURCE
    })
    fn_out = './data/preproc/bpic15.csv'

if log_name == 'bpic17':
    # No renaming or export is performed for BPIC 2017 in this cell.
    pass

if log_name == 'propr':
    log = log.rename(columns={
        'PI_ID': COL_CASE_ID,
        'Activity': COL_ACTIVITY,
        'Start': COL_START_TS,
        'End': COL_COMPLETE_TS,
        'Originator': COL_RESOURCE
    })
    fn_out = './data/preproc/proprietary.csv'

if fn_out is not None:
    # DataFrame.to_csv does not create missing folders, so make sure the
    # output folder exists; otherwise a fresh checkout fails with OSError.
    os.makedirs(os.path.dirname(fn_out), exist_ok=True)
    log.to_csv(fn_out, index=False)