# Split a log into "train" and "test" sets 
This is used when a reference model is missing. 

The "train" set will be used to discover a model, which will then be used as an
artificial reference model to test the detection methods.

Splitting will be applied to cases.

In [1]:
import pandas as pd
import numpy as np
import altair as alt

In [2]:
log_name = 'bpic15'

fn_logs = {
    'bpic15': './data/preproc/bpic15.csv',
    'bpic17': './data/preproc/bpic2017.csv',
    'propr': './data/preproc/proprietary.csv'
}

In [3]:
log = pd.read_csv(fn_logs[log_name])
log

Unnamed: 0,case_id,activity,resource,complete timestamp,(case) IDofConceptCase,(case) Includes_subCases,(case) Responsible_actor,(case) SUMleges,(case) caseProcedure,(case) caseStatus,...,(case) landRegisterID,(case) last_phase,(case) parts,(case) requestComplete,Activity,activityNameNL,lifecycle:transition,monitoringResource,question,municipality
0,4284682,enter senddate procedure confirmation,560852,18/11/2009 9:00,,J,560852.0,1438.34350,,G,...,,Besluit genomen,"Bouw,Sloop",True,01_HOOFD_190_2,invoeren verzenddatum procedurebevestiging,complete,560852,EMPTY,muni-4
1,4511735,register submission date request,1254625,23/11/2009 9:00,,J,560604.0,258.80100,,G,...,4504496.0,Vergunning onherroepelijk,Brandveilig gebruik (vergunning),True,01_HOOFD_010,registratie datum binnenkomst aanvraag,complete,1254625,EMPTY,muni-5
2,3090877,enter senddate procedure confirmation,3148844,1/1/2010 9:00,,J,3442724.0,,Uitgebreid,G,...,,Vergunning verleend,Milieu (vergunning),True,01_HOOFD_065_2,invoeren verzenddatum procedurebevestiging,complete,560922,EMPTY,muni-3
3,5726442,enter senddate acknowledgement,2013365,13/5/2010 8:00,5726449.0,N,2013365.0,23847.62700,Uitgebreid,G,...,5726441.0,Ontwerpbesluit genomen,Bouw,False,08_AWB45_051_2,invoeren verzenddatum ontvangstbevestiging,complete,2013365,EMPTY,muni-3
4,3871304,date for inspection MER,560532,29/6/2010 8:00,,N,560458.0,,Uitgebreid,G,...,,Procedure afgebroken,Milieu (vergunning),True,01_HOOFD_080,datum ter inzage MER,complete,560458,29/6/2010 0:00,muni-2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262623,10069611,date draft decision for inspection,560752,5/3/2015 9:00,10069618.0,,560431.0,,Uitgebreid,O,...,,Ontwerpbesluit genomen,Milieu (omgevingsvergunning beperkte milieutoe...,False,10_UOV_030,datum ontwerpbeschikking ter inzage,complete,560431,EMPTY,muni-4
262624,12663845,start decision phase decision permitting sent,560912,5/3/2015 9:00,12663987.0,,560912.0,5019.30600,,O,...,12663844.0,Beschikking verzonden,Bouw,False,01_HOOFD_510_2a,instellen besluitfase:besluit vergunnen verzonden,complete,2670601,EMPTY,muni-1
262625,12999831,start decision phase decision permitting sent,560872,9/3/2015 9:00,13000092.0,,560872.0,220.82385,,O,...,717776.0,Beschikking verzonden,Kap,True,01_HOOFD_510_2a,instellen besluitfase:besluit vergunnen verzonden,complete,560872,EMPTY,muni-1
262626,12999831,enter senddate decision environmental permit,560872,9/3/2015 9:00,13000092.0,,560872.0,220.82385,,O,...,717776.0,Beschikking verzonden,Kap,True,01_HOOFD_510_2,invoeren verzenddatum beschikking omgevingsver...,complete,560872,EMPTY,muni-1


## Split based on time

The train set will contain X% cases that happened first, the remaining (100 -
X)% of cases will constitute the test set.

NB: This method assumes no concept dript happening in the process.

In [5]:
if log_name in {'propr'}:
    log['_start timestamp'] = pd.to_datetime(log['start timestamp'], format='mixed', dayfirst=True)
if log_name in {'bpic15', 'propr'}:
    log['_complete timestamp'] = pd.to_datetime(log['complete timestamp'], format='mixed', dayfirst=True)

df_case_time = list()
if log_name == 'bpic15':
    for case_id, trace in log.groupby('case_id'):
        df_case_time.append({
            'case_id': case_id,
            'case_start': trace['_complete timestamp'].min(),
            'case_end': trace['_complete timestamp'].max()
        })
if log_name == 'propr':
    for case_id, trace in log.groupby('case_id'):
        df_case_time.append({
            'case_id': case_id,
            'case_start': trace['_start timestamp'].min(),
            'case_end': trace['_complete timestamp'].max()
        })
df_case_time = pd.DataFrame(df_case_time)
df_case_time['case_duration_minutes'] = (
    df_case_time['case_end'] - df_case_time['case_start']
).dt.total_seconds() / 60
df_case_time

Unnamed: 0,case_id,case_start,case_end,case_duration_minutes
0,2742737,2011-06-01 08:00:00,2011-06-16 20:17:00,22337.0
1,2760925,2010-10-05 08:00:00,2010-10-07 22:57:00,3777.0
2,2771451,2010-10-06 08:00:00,2010-12-15 09:00:00,100860.0
3,2782209,2010-10-06 08:00:00,2010-11-24 09:00:00,70620.0
4,2783345,2010-10-07 08:00:00,2010-12-15 09:00:00,99420.0
...,...,...,...,...
5642,24129041,2015-02-13 09:00:00,2015-02-20 18:59:00,10679.0
5643,24129720,2015-02-16 09:00:00,2015-02-20 20:18:00,6438.0
5644,24130200,2015-02-16 09:00:00,2015-02-20 20:17:00,6437.0
5645,24153462,2015-02-19 09:00:00,2015-02-23 23:49:00,6649.0


In [6]:
df_case_time = df_case_time.sort_values(by='case_start')

num_train = int(len(df_case_time) * 0.7)
case_id_train = df_case_time.iloc[:(num_train+1)]['case_id']
case_id_test = df_case_time.iloc[(num_train+1):]['case_id']

print(log.loc[log['case_id'].isin(case_id_train), 'case_id'].nunique())
print(log.loc[log['case_id'].isin(case_id_test), 'case_id'].nunique())

fnout_train = f'{log_name}_train-70.csv'
fnout_test = f'{log_name}_test-30.csv'
log.loc[log['case_id'].isin(case_id_train)].to_csv(
    fnout_train, index=False
)
log.loc[log['case_id'].isin(case_id_test)].to_csv(
    fnout_test, index=False
)

3953
1694
