# Split a log into "train" and "test" sets 
This is used when a reference model is missing. 

The "train" set will be used to discover a model, which will then be used as an
artificial reference model to test the detection methods.

Splitting is done at the case level: all events of a case end up in the same set.

In [80]:
import pandas as pd
import numpy as np
import altair as alt

In [81]:
# Key of the event log to split; it selects the entry of `fn_logs` to use.
log_name = 'propr'

# Maps each available log key to the path of its preprocessed CSV file.
fn_logs = dict(propr='./data/preproc/proprietary.csv')

In [82]:
# Read the preprocessed event log selected by `log_name` (one row per event).
log_path = fn_logs[log_name]
log = pd.read_csv(log_path)
log

Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,Cost,"School holidays; 0 = no, 1 = yes",Approval; 1 = low risk to 4 = high risk,Type of loan; 0 = normal; 1 = special
0,Approval_Branch,ID-1,18/11/2010 15:27,18/11/2010 15:58,004-9-1,Thursday,"0,87 € per minute",0.0,2.0,0.0
1,Precheck,ID-1,19/11/2010 12:45,19/11/2010 12:46,000-3-01,Friday,"0,87 € per minute",0.0,2.0,0.0
2,Precheck,ID-1,24/11/2010 8:18,24/11/2010 8:26,000-2-01,Wednesday,"0,87 € per minute",0.0,2.0,0.0
3,Check_of_Processing_Applications,ID-1,24/11/2010 10:35,24/11/2010 10:35,000-2-01,Wednesday,"0,87 € per minute",0.0,2.0,0.0
4,Processing_of_Applications,ID-1,2/12/2010 10:46,2/12/2010 10:46,010-23-11,Thursday,"1,02 € per minute",0.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...
18440,Precheck,ID-1731,13/12/2010 11:38,13/12/2010 11:40,000-2-01,Monday,"0,87 € per minute",0.0,2.0,0.0
18441,Check_of_Processing_Applications,ID-1731,13/12/2010 12:06,13/12/2010 12:14,000-2-01,Monday,"0,87 € per minute",0.0,2.0,0.0
18442,Processing_of_Applications,ID-1731,22/12/2010 11:48,22/12/2010 11:48,010-23-07,Wednesday,"1,02 € per minute",1.0,2.0,0.0
18443,Archieving,ID-1731,22/12/2010 11:49,22/12/2010 13:19,010-23-07,Wednesday,"1,02 € per minute",1.0,2.0,0.0


## Split based on time

The train set will contain the X% of cases that started first; the remaining
(100 - X)% of cases will constitute the test set.

NB: This method assumes that no concept drift occurs in the process.

In [83]:
# Parse the raw timestamp strings into datetime helper columns (prefixed with
# "_") so the original columns stay untouched. Dates are day-first.
if log_name in {'propr'}:
    log['_start timestamp'] = pd.to_datetime(log['start timestamp'], format='mixed', dayfirst=True)
if log_name in {'bpic15', 'propr'}:
    log['_complete timestamp'] = pd.to_datetime(log['complete timestamp'], format='mixed', dayfirst=True)

# Column whose minimum marks the start of a case, per supported log.
# For 'bpic15' only the complete timestamp is parsed above, so it is used for
# both the start and the end of a case.
case_start_column = {
    'bpic15': '_complete timestamp',
    'propr': '_start timestamp',
}
if log_name not in case_start_column:
    # Fail early with a clear message instead of an obscure KeyError further down.
    raise ValueError(
        f'Unsupported log_name {log_name!r}; expected one of {sorted(case_start_column)}'
    )

# One row per case: earliest start and latest completion over all its events.
df_case_time = (
    log.groupby('case_id')
    .agg(
        case_start=(case_start_column[log_name], 'min'),
        case_end=('_complete timestamp', 'max'),
    )
    .reset_index()
)
df_case_time['case_duration_minutes'] = (
    df_case_time['case_end'] - df_case_time['case_start']
).dt.total_seconds() / 60
df_case_time

Unnamed: 0,case_id,case_start,case_end,case_duration_minutes
0,ID-1,2010-11-18 15:27:00,2010-12-02 12:20:00,19973.0
1,ID-10,2011-04-15 14:33:00,2011-04-21 12:26:00,8513.0
2,ID-100,2011-01-26 14:17:00,2011-02-04 10:12:00,12715.0
3,ID-1000,2011-02-18 15:55:00,2011-02-25 14:36:00,10001.0
4,ID-1001,2010-11-04 09:57:00,2010-11-12 09:16:00,11479.0
...,...,...,...,...
1726,ID-995,2011-04-29 10:15:00,2011-05-03 15:40:00,6085.0
1727,ID-996,2011-01-11 11:27:00,2011-01-21 10:58:00,14371.0
1728,ID-997,2010-12-06 15:35:00,2011-01-11 09:41:00,51486.0
1729,ID-998,2010-10-19 14:50:00,2010-11-30 09:08:00,60138.0


In [84]:
# Fraction of cases (the earliest ones by start time) assigned to the train set.
# The output file names are derived from this value.
train_fraction = 0.7

# Order the cases chronologically by the time of their first event.
df_case_time = df_case_time.sort_values(by='case_start')

# Size of the train set: the integer nearest to train_fraction * #cases.
# The slices below use `num_train` directly, so `num_train` is exactly the
# number of train cases (no extra "+1" case slips into the train set).
num_train = round(len(df_case_time) * train_fraction)
case_id_train = df_case_time.iloc[:num_train]['case_id']
case_id_test = df_case_time.iloc[num_train:]['case_id']

# A case must never be in both sets.
assert not set(case_id_train) & set(case_id_test), 'train/test cases overlap'

# Event-level masks, computed once and reused for the report and the export.
is_train_event = log['case_id'].isin(case_id_train)
is_test_event = log['case_id'].isin(case_id_test)

# Report the number of distinct cases found in each split.
print(log.loc[is_train_event, 'case_id'].nunique())
print(log.loc[is_test_event, 'case_id'].nunique())

# Write each split to a CSV named after the log and the split percentages
# (e.g. "<log>_train-70.csv" / "<log>_test-30.csv" for a fraction of 0.7).
train_percent = round(train_fraction * 100)
fnout_train = f'{log_name}_train-{train_percent}.csv'
fnout_test = f'{log_name}_test-{100 - train_percent}.csv'
log.loc[is_train_event].to_csv(fnout_train, index=False)
log.loc[is_test_event].to_csv(fnout_test, index=False)

1212
519


In [85]:
# Show the start and end time of every case assigned to the train set.
in_train_set = df_case_time['case_id'].isin(case_id_train)
df_case_time.loc[in_train_set, ['case_id', 'case_start', 'case_end']]

Unnamed: 0,case_id,case_start,case_end
1596,ID-878,2010-08-02 08:21:00,2010-08-12 15:40:00
684,ID-1614,2010-08-02 08:33:00,2010-08-26 09:01:00
329,ID-1295,2010-08-02 08:37:00,2010-08-10 14:41:00
932,ID-28,2010-08-02 09:02:00,2010-08-04 15:51:00
393,ID-1352,2010-08-02 09:12:00,2010-08-19 17:19:00
...,...,...,...
914,ID-263,2011-02-25 10:38:00,2011-03-10 11:49:00
538,ID-1483,2011-02-25 11:16:00,2011-03-10 09:03:00
404,ID-1362,2011-02-25 11:58:00,2011-02-28 15:56:00
883,ID-235,2011-02-25 13:06:00,2011-03-21 12:13:00


In [86]:
# Show the start and end time of every case assigned to the test set.
in_test_set = df_case_time['case_id'].isin(case_id_test)
df_case_time.loc[in_test_set, ['case_id', 'case_start', 'case_end']]

Unnamed: 0,case_id,case_start,case_end
1508,ID-799,2011-02-25 16:01:00,2011-05-09 16:00:00
1666,ID-940,2011-02-25 16:06:00,2011-03-04 15:11:00
88,ID-1078,2011-02-28 08:28:00,2011-04-05 16:14:00
689,ID-1619,2011-02-28 09:28:00,2011-03-08 09:32:00
1237,ID-554,2011-02-28 09:38:00,2011-03-15 14:01:00
...,...,...,...
402,ID-1360,2011-05-31 10:48:00,2011-05-31 11:50:00
1676,ID-95,2011-05-31 11:03:00,2011-05-31 15:55:00
1572,ID-856,2011-05-31 14:34:00,2011-05-31 14:46:00
1172,ID-496,2011-05-31 14:41:00,2011-05-31 16:30:00
