# Split a log into "train" and "test" sets 
This is used when a reference model is missing. 

The "train" set will be used to discover a model, which will then be used as an
artificial reference model to test the detection methods.

Splitting will be applied to cases.

In [42]:
import pandas as pd
import numpy as np
import altair as alt

In [43]:
log_name = 'propr'

fn_logs = {
    'bpic15': './data/preproc/bpic15.csv',
    'bpic17': './data/preproc/bpic2017.csv',
    'propr': './data/preproc/proprietary.csv'
}

In [44]:
log = pd.read_csv(fn_logs[log_name])
log

Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,"School holidays; 0 = no, 1 = yes",Approval; 1 = low sum to 4 = high sum,Type of loan; 0 = normal; 1 = special,Cost
0,Approval_Branch,005141EEB1240B31C12577DF004F6A77,18/11/2010 15:27,18/11/2010 15:58,004-9-1,Thursday,0.0,2.0,0.0,"0,87 € per minute"
1,Precheck,005141EEB1240B31C12577DF004F6A77,19/11/2010 12:45,19/11/2010 12:46,000-3-01,Friday,0.0,2.0,0.0,"0,87 € per minute"
2,Precheck,005141EEB1240B31C12577DF004F6A77,24/11/2010 8:18,24/11/2010 8:26,000-2-01,Wednesday,0.0,2.0,0.0,"0,87 € per minute"
3,Check_of_Processing_Applications,005141EEB1240B31C12577DF004F6A77,24/11/2010 10:35,24/11/2010 10:35,000-2-01,Wednesday,0.0,2.0,0.0,"0,87 € per minute"
4,Processing_of_Applications,005141EEB1240B31C12577DF004F6A77,2/12/2010 10:46,2/12/2010 10:46,010-23-11,Thursday,0.0,2.0,0.0,"1,02 € per minute"
...,...,...,...,...,...,...,...,...,...,...
18440,Precheck,FFFF329EF772D73CC12577EA00534A1C,13/12/2010 11:38,13/12/2010 11:40,000-2-01,Monday,0.0,2.0,0.0,"0,87 € per minute"
18441,Check_of_Processing_Applications,FFFF329EF772D73CC12577EA00534A1C,13/12/2010 12:06,13/12/2010 12:14,000-2-01,Monday,0.0,2.0,0.0,"0,87 € per minute"
18442,Processing_of_Applications,FFFF329EF772D73CC12577EA00534A1C,22/12/2010 11:48,22/12/2010 11:48,010-23-07,Wednesday,1.0,2.0,0.0,"1,02 € per minute"
18443,Archieving,FFFF329EF772D73CC12577EA00534A1C,22/12/2010 11:49,22/12/2010 13:19,010-23-07,Wednesday,1.0,2.0,0.0,"1,02 € per minute"


## Split based on time

The train set will contain X% cases that happened first, the remaining (100 -
X)% of cases will constitute the test set.

NB: This method assumes no concept dript happening in the process.

In [45]:
if log_name in {'propr'}:
    log['_start timestamp'] = pd.to_datetime(log['start timestamp'], format='mixed', dayfirst=True)
if log_name in {'bpic15', 'propr'}:
    log['_complete timestamp'] = pd.to_datetime(log['complete timestamp'], format='mixed', dayfirst=True)

df_case_time = list()
if log_name == 'bpic15':
    for case_id, trace in log.groupby('case_id'):
        df_case_time.append({
            'case_id': case_id,
            'case_start': trace['_complete timestamp'].min(),
            'case_end': trace['_complete timestamp'].max()
        })
if log_name == 'propr':
    for case_id, trace in log.groupby('case_id'):
        df_case_time.append({
            'case_id': case_id,
            'case_start': trace['_start timestamp'].min(),
            'case_end': trace['_complete timestamp'].max()
        })
df_case_time = pd.DataFrame(df_case_time)
df_case_time['case_duration_minutes'] = (
    df_case_time['case_end'] - df_case_time['case_start']
).dt.total_seconds() / 60
df_case_time

Unnamed: 0,case_id,case_start,case_end,case_duration_minutes
0,005141EEB1240B31C12577DF004F6A77,2010-11-18 15:27:00,2010-12-02 12:20:00,19973.0
1,0051574BDF30D681C1257888004DC40D,2011-05-06 16:09:00,2011-05-18 10:28:00,16939.0
2,00FAC674CB682852C125783900354482,2011-02-16 10:41:00,2011-05-20 11:16:00,133955.0
3,01014611DCAF6000C12577B400256F79,2010-10-06 08:49:00,2010-10-22 08:45:00,23036.0
4,01100413B9061A56C1257864003CCA34,2011-03-31 13:04:00,2011-04-07 11:16:00,9972.0
...,...,...,...,...
1726,FF8174DDD2F1542BC12577FB00496ED4,2010-12-16 14:22:00,2010-12-20 14:54:00,5792.0
1727,FFAB25BE48C9B474C12577EE0034A3EA,2010-12-03 10:35:00,2010-12-10 15:07:00,10352.0
1728,FFAE4641B2711A82C125779200404B1C,2010-09-02 13:42:00,2010-09-14 11:38:00,17156.0
1729,FFDB7AA6D5B34B2DC12577F6004ADC13,2011-01-03 21:37:00,2011-01-14 09:55:00,15138.0


In [46]:
df_case_time = df_case_time.sort_values(by='case_start', ascending=True)

num_train = int(len(df_case_time) * 0.7)
case_id_train = df_case_time.iloc[:(num_train+1)]['case_id']
case_id_test = df_case_time.iloc[(num_train+1):]['case_id']

print(log.loc[log['case_id'].isin(case_id_train), 'case_id'].nunique())
print(log.loc[log['case_id'].isin(case_id_test), 'case_id'].nunique())

fnout_train = f'{log_name}_train-70.csv'
fnout_test = f'{log_name}_test-30.csv'
# log.loc[log['case_id'].isin(case_id_train)].to_csv(
#     fnout_train, index=False
# )
# log.loc[log['case_id'].isin(case_id_test)].to_csv(
#     fnout_test, index=False
# )

1212
519


In [47]:
df_case_time.loc[df_case_time['case_id'].isin(case_id_train), ['case_id', 'case_start', 'case_end']]

Unnamed: 0,case_id,case_start,case_end
877,80C8BA21F38CD0D5C12577730022EB50,2010-08-02 08:21:00,2010-08-12 15:40:00
1613,ED96EB26C865B9D3C125777300240137,2010-08-02 08:33:00,2010-08-26 09:01:00
1294,BED0B0E13FC745B8C12577700035483E,2010-08-02 08:37:00,2010-08-10 14:41:00
27,0544ACBCEAA1126FC125776D003B0633,2010-08-02 09:02:00,2010-08-04 15:51:00
1351,C7C78EA3897FE998C125776F0047898F,2010-08-02 09:12:00,2010-08-19 17:19:00
...,...,...,...
262,284A7FD07BC0ED3AC12578420034F1A9,2011-02-25 10:38:00,2011-03-10 11:49:00
1482,DB4EAF1F0BECFBFCC125784200387660,2011-02-25 11:16:00,2011-03-10 09:03:00
1361,C8856F24853AD616C1257842003C3E66,2011-02-25 11:58:00,2011-02-28 15:56:00
234,24B20825B8DCA991C1257842004275B7,2011-02-25 13:06:00,2011-03-21 12:13:00


In [48]:
df_case_time.loc[df_case_time['case_id'].isin(case_id_test), ['case_id', 'case_start', 'case_end']]

Unnamed: 0,case_id,case_start,case_end
798,7389AEB4BBE73512C125784200527D84,2011-02-25 16:01:00,2011-05-09 16:00:00
939,89031090D8B79148C12578420052F4F6,2011-02-25 16:06:00,2011-03-04 15:11:00
1077,9C7A67B58C33BC60C125784500290BD1,2011-02-28 08:28:00,2011-04-05 16:14:00
1618,EEBDCCE3EDF9A9BAC1257845002E8C51,2011-02-28 09:28:00,2011-03-08 09:32:00
553,4E9BA47E42469777C1257845002F7B0F,2011-02-28 09:38:00,2011-03-15 14:01:00
...,...,...,...
1359,C85C2DC477E39164C12578A100305E69,2011-05-31 10:48:00,2011-05-31 11:50:00
94,0E0191727EFD3B97C12578A10031B8DC,2011-05-31 11:03:00,2011-05-31 15:55:00
855,7D3112C85E39EB02C12578A100451349,2011-05-31 14:34:00,2011-05-31 14:46:00
495,4785DF959AF563EEC12578A10045BD94,2011-05-31 14:41:00,2011-05-31 16:30:00


In [49]:
log_test = log.loc[log['case_id'].isin(case_id_test)]
print(len(log_test))
print(log_test['case_id'].nunique())
print(log_test['activity'].nunique())
print(log_test['resource'].nunique())
print(log_test['start timestamp'].min())
print(log_test['start timestamp'].max())
print(log_test['complete timestamp'].min())
print(log_test['complete timestamp'].max())


5290
519
25
166
1/03/2011 10:00
9/05/2011 9:51
1/03/2011 10:05
9/05/2011 9:56
