# Performance Masking

## Description
When agents act in such a way, through their dealing with or knowledge of IT systems, that their true performance cannot (easily) be established.

In [81]:
import pandas as pd
import numpy as np
import altair as alt
alt.data_transformers.enable('vegafusion')

DataTransformerRegistry.enable('vegafusion')

In [82]:
log_name = 'propr_test'

fn_logs = {
    'bpic15': '../data/preproc/bpic15.csv',
    'bpic15_ref': '../data/preproc/bpic15_train-70.csv',
    'bpic15_test': '../data/preproc/bpic15_test-30.csv',

    'bpic17': '../data/preproc/bpic2017.csv',

    'propr': '../data/preproc/proprietary.csv',
    'propr_ref': '../data/preproc/proprietary_train-70.csv',
    'propr_test': '../data/preproc/proprietary_test-30.csv'
}

In [83]:
log = pd.read_csv(fn_logs[log_name])
log

Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,"School holidays; 0 = no, 1 = yes",Approval; 1 = low sum to 4 = high sum,Type of loan; 0 = normal; 1 = special,Cost,_start timestamp,_complete timestamp
0,Precheck,0051574BDF30D681C1257888004DC40D,6/05/2011 16:09,6/05/2011 16:27,090-10-02,Friday,0.0,1.0,0.0,"0,87 € per minute",2011-05-06 16:09:00,2011-05-06 16:27:00
1,Application_Processing_Branches,0051574BDF30D681C1257888004DC40D,9/05/2011 11:14,9/05/2011 11:16,010-23-13,Monday,0.0,1.0,0.0,"1,02 € per minute",2011-05-09 11:14:00,2011-05-09 11:16:00
2,Precheck,0051574BDF30D681C1257888004DC40D,9/05/2011 15:44,9/05/2011 15:58,090-10-02,Monday,0.0,1.0,0.0,"0,87 € per minute",2011-05-09 15:44:00,2011-05-09 15:58:00
3,Processing_Incomplete_Orders,0051574BDF30D681C1257888004DC40D,9/05/2011 16:10,9/05/2011 16:10,010-23-13,Monday,0.0,1.0,0.0,"1,02 € per minute",2011-05-09 16:10:00,2011-05-09 16:10:00
4,Application_Processing_Branches,0051574BDF30D681C1257888004DC40D,10/05/2011 8:52,10/05/2011 9:32,010-23-13,Tuesday,0.0,1.0,0.0,"1,02 € per minute",2011-05-10 08:52:00,2011-05-10 09:32:00
...,...,...,...,...,...,...,...,...,...,...,...,...
5285,Precheck,FE3E1E0D3928202EC1257896004B8F00,20/05/2011 17:24,20/05/2011 17:26,000-3-01,Friday,0.0,2.0,0.0,"0,87 € per minute",2011-05-20 17:24:00,2011-05-20 17:26:00
5286,Check_of_Processing_Applications,FE3E1E0D3928202EC1257896004B8F00,23/05/2011 15:53,23/05/2011 16:06,000-2-01,Monday,0.0,2.0,0.0,"0,87 € per minute",2011-05-23 15:53:00,2011-05-23 16:06:00
5287,Processing_of_Applications,FE3E1E0D3928202EC1257896004B8F00,26/05/2011 9:25,26/05/2011 9:25,010-23-07,Thursday,0.0,2.0,0.0,"1,02 € per minute",2011-05-26 09:25:00,2011-05-26 09:25:00
5288,Archieving,FE3E1E0D3928202EC1257896004B8F00,26/05/2011 10:38,26/05/2011 10:47,010-23-07,Thursday,0.0,2.0,0.0,"1,02 € per minute",2011-05-26 10:38:00,2011-05-26 10:47:00


In [84]:
# encode activity labels as short names for presentation purpose

print('There exist {} unique activity labels'.format(
    log['activity'].nunique()
))

# use initial letters for encoding
act_labels_short = dict()
for label in log['activity'].unique():
    label_short = ''.join([w[0] for w in label.split('_')])
    act_labels_short[label] = label_short
print('There are {} short labels after encoding'.format(
    len(act_labels_short.values())
))
for k in sorted(act_labels_short.keys()):
    print('{:>50} -> {:>30}'.format(k, act_labels_short[k]))

log['activity_short'] = log['activity'].apply(
    lambda x: act_labels_short[x]
)

# add activity duration
log['start timestamp'] = pd.to_datetime(log['start timestamp'], format='mixed', dayfirst=True)
log['complete timestamp'] = pd.to_datetime(log['complete timestamp'], 
format='mixed', dayfirst=True)
# duration measured in minutes
log['activity duration'] = (
    log['complete timestamp'] - log['start timestamp']
).dt.total_seconds() / 60

log

There exist 25 unique activity labels
There are 25 short labels after encoding
                              Administrative_Tasks ->                             AT
                   Application_Processing_Branches ->                            APB
                                          Approval ->                              A
                                   Approval_Branch ->                             AB
                          Approval_Executive_Board ->                            AEB
                                        Archieving ->                              A
                                 Check_of_Approval ->                            CoA
                                Check_of_Documents ->                            CoD
                  Check_of_Processing_Applications ->                           CoPA
                                               End ->                              E
                                   Further_inquiry ->                  

Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,"School holidays; 0 = no, 1 = yes",Approval; 1 = low sum to 4 = high sum,Type of loan; 0 = normal; 1 = special,Cost,_start timestamp,_complete timestamp,activity_short,activity duration
0,Precheck,0051574BDF30D681C1257888004DC40D,2011-05-06 16:09:00,2011-05-06 16:27:00,090-10-02,Friday,0.0,1.0,0.0,"0,87 € per minute",2011-05-06 16:09:00,2011-05-06 16:27:00,P,18.0
1,Application_Processing_Branches,0051574BDF30D681C1257888004DC40D,2011-05-09 11:14:00,2011-05-09 11:16:00,010-23-13,Monday,0.0,1.0,0.0,"1,02 € per minute",2011-05-09 11:14:00,2011-05-09 11:16:00,APB,2.0
2,Precheck,0051574BDF30D681C1257888004DC40D,2011-05-09 15:44:00,2011-05-09 15:58:00,090-10-02,Monday,0.0,1.0,0.0,"0,87 € per minute",2011-05-09 15:44:00,2011-05-09 15:58:00,P,14.0
3,Processing_Incomplete_Orders,0051574BDF30D681C1257888004DC40D,2011-05-09 16:10:00,2011-05-09 16:10:00,010-23-13,Monday,0.0,1.0,0.0,"1,02 € per minute",2011-05-09 16:10:00,2011-05-09 16:10:00,PIO,0.0
4,Application_Processing_Branches,0051574BDF30D681C1257888004DC40D,2011-05-10 08:52:00,2011-05-10 09:32:00,010-23-13,Tuesday,0.0,1.0,0.0,"1,02 € per minute",2011-05-10 08:52:00,2011-05-10 09:32:00,APB,40.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5285,Precheck,FE3E1E0D3928202EC1257896004B8F00,2011-05-20 17:24:00,2011-05-20 17:26:00,000-3-01,Friday,0.0,2.0,0.0,"0,87 € per minute",2011-05-20 17:24:00,2011-05-20 17:26:00,P,2.0
5286,Check_of_Processing_Applications,FE3E1E0D3928202EC1257896004B8F00,2011-05-23 15:53:00,2011-05-23 16:06:00,000-2-01,Monday,0.0,2.0,0.0,"0,87 € per minute",2011-05-23 15:53:00,2011-05-23 16:06:00,CoPA,13.0
5287,Processing_of_Applications,FE3E1E0D3928202EC1257896004B8F00,2011-05-26 09:25:00,2011-05-26 09:25:00,010-23-07,Thursday,0.0,2.0,0.0,"1,02 € per minute",2011-05-26 09:25:00,2011-05-26 09:25:00,PoA,0.0
5288,Archieving,FE3E1E0D3928202EC1257896004B8F00,2011-05-26 10:38:00,2011-05-26 10:47:00,010-23-07,Thursday,0.0,2.0,0.0,"1,02 € per minute",2011-05-26 10:38:00,2011-05-26 10:47:00,A,9.0


In [85]:
# proprietary
log['department'] = log['resource'].apply(
    lambda x: x.split('-')[0]
)
log['department_role'] = log['resource'].apply(
    lambda x: x.split('-')[0] + '-' + x.split('-')[1]
)
log

Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,"School holidays; 0 = no, 1 = yes",Approval; 1 = low sum to 4 = high sum,Type of loan; 0 = normal; 1 = special,Cost,_start timestamp,_complete timestamp,activity_short,activity duration,department,department_role
0,Precheck,0051574BDF30D681C1257888004DC40D,2011-05-06 16:09:00,2011-05-06 16:27:00,090-10-02,Friday,0.0,1.0,0.0,"0,87 € per minute",2011-05-06 16:09:00,2011-05-06 16:27:00,P,18.0,090,090-10
1,Application_Processing_Branches,0051574BDF30D681C1257888004DC40D,2011-05-09 11:14:00,2011-05-09 11:16:00,010-23-13,Monday,0.0,1.0,0.0,"1,02 € per minute",2011-05-09 11:14:00,2011-05-09 11:16:00,APB,2.0,010,010-23
2,Precheck,0051574BDF30D681C1257888004DC40D,2011-05-09 15:44:00,2011-05-09 15:58:00,090-10-02,Monday,0.0,1.0,0.0,"0,87 € per minute",2011-05-09 15:44:00,2011-05-09 15:58:00,P,14.0,090,090-10
3,Processing_Incomplete_Orders,0051574BDF30D681C1257888004DC40D,2011-05-09 16:10:00,2011-05-09 16:10:00,010-23-13,Monday,0.0,1.0,0.0,"1,02 € per minute",2011-05-09 16:10:00,2011-05-09 16:10:00,PIO,0.0,010,010-23
4,Application_Processing_Branches,0051574BDF30D681C1257888004DC40D,2011-05-10 08:52:00,2011-05-10 09:32:00,010-23-13,Tuesday,0.0,1.0,0.0,"1,02 € per minute",2011-05-10 08:52:00,2011-05-10 09:32:00,APB,40.0,010,010-23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5285,Precheck,FE3E1E0D3928202EC1257896004B8F00,2011-05-20 17:24:00,2011-05-20 17:26:00,000-3-01,Friday,0.0,2.0,0.0,"0,87 € per minute",2011-05-20 17:24:00,2011-05-20 17:26:00,P,2.0,000,000-3
5286,Check_of_Processing_Applications,FE3E1E0D3928202EC1257896004B8F00,2011-05-23 15:53:00,2011-05-23 16:06:00,000-2-01,Monday,0.0,2.0,0.0,"0,87 € per minute",2011-05-23 15:53:00,2011-05-23 16:06:00,CoPA,13.0,000,000-2
5287,Processing_of_Applications,FE3E1E0D3928202EC1257896004B8F00,2011-05-26 09:25:00,2011-05-26 09:25:00,010-23-07,Thursday,0.0,2.0,0.0,"1,02 € per minute",2011-05-26 09:25:00,2011-05-26 09:25:00,PoA,0.0,010,010-23
5288,Archieving,FE3E1E0D3928202EC1257896004B8F00,2011-05-26 10:38:00,2011-05-26 10:47:00,010-23-07,Thursday,0.0,2.0,0.0,"1,02 € per minute",2011-05-26 10:38:00,2011-05-26 10:47:00,A,9.0,010,010-23


## Performance-PM-2

In [86]:
sel_resources = log['resource'].unique()
print('{} resources are selected'.format(len(sel_resources)))

166 resources are selected


### For each resource, extract a set of activity sequences performed by this resource

In [87]:
# sort events by complete timestamps
log = log.sort_values(by='complete timestamp')

seq_log = []

for res, events in log.groupby('resource'):
    if res in sel_resources:
        for case_id, trace in events.groupby('case_id'):
            # ignore singletons
            if len(trace) <= 1:
                pass
            else:
                for i in range(len(trace) - 1):
                    for j in range(i + 1, len(trace)):
                        prefix = trace.iloc[i:j+1]
                        seq = ','.join(prefix['activity_short'])
                        length = len(prefix['activity_short'])
                        total_duration = prefix['activity duration'] = (
                            prefix.iloc[-1]['complete timestamp'] -
                            prefix.iloc[0]['start timestamp']
                        ).total_seconds() / 60
                        seq_log.append({
                            'seq': seq,
                            'resource': res,
                            'case_id': case_id,
                            'total_duration': total_duration
                        })

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  total_duration = prefix['activity duration'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  total_duration = prefix['activity duration'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  total_duration = prefix['activity duration'] = (
A value is trying to be set on a copy of a slice from a Da

In [88]:
seq_log = pd.DataFrame(seq_log)
seq_log['length'] = seq_log['seq'].apply(lambda x: len(x.split(',')))
seq_log

Unnamed: 0,seq,resource,case_id,total_duration,length
0,"APB,CoPA",000-2-01,14BB54577FDB7BDBC12578500033174A,5757.0,2
1,"PIO,PIO",000-2-01,179E57A178EEE99DC1257854005037A0,8382.0,2
2,"PIO,APB",000-2-01,2957F9167168C7A6C125784C002F28E8,1412.0,2
3,"PIO,APB,CoPA",000-2-01,2957F9167168C7A6C125784C002F28E8,4428.0,3
4,"APB,CoPA",000-2-01,2957F9167168C7A6C125784C002F28E8,3025.0,2
...,...,...,...,...,...
8628,"T,E,APB",111-9-1,5363FE723BEB494FC125785C00525EE0,0.0,3
8629,"E,APB",111-9-1,5363FE723BEB494FC125785C00525EE0,0.0,2
8630,"CoPA,Fi",33-01,635A8C14C5F4B799C1257872002D1BC9,43.0,2
8631,"CoPA,Fi,PR",33-01,635A8C14C5F4B799C1257872002D1BC9,83.0,3


In [89]:
print(seq_log['seq'].nunique())
print(seq_log['case_id'].nunique())
print(seq_log['resource'].nunique())

print(seq_log['length'].min())
print(seq_log['length'].max())
print(seq_log['total_duration'].min())
print(seq_log['total_duration'].max())


2318
499
145
2
18
0.0
105119.0


### Identify sequences that are long and performed in a short time

In [90]:
seq_low_threshold = []
for seq, stats in seq_log.groupby('seq'):
    lb = stats['total_duration'].quantile(0.25) - 1.5 * (
        stats['total_duration'].quantile(0.75) -
        stats['total_duration'].quantile(0.25)
    )
    seq_low_threshold.append({'seq': seq, 'lb': lb})

seq_low_threshold = pd.DataFrame(seq_low_threshold)

seq_log = seq_log.merge(seq_low_threshold, how='left', on='seq')
seq_log['is_short_duration'] = seq_log['total_duration'] < seq_log['lb']

seq_log

Unnamed: 0,seq,resource,case_id,total_duration,length,lb,is_short_duration
0,"APB,CoPA",000-2-01,14BB54577FDB7BDBC12578500033174A,5757.0,2,-3637.375,False
1,"PIO,PIO",000-2-01,179E57A178EEE99DC1257854005037A0,8382.0,2,-2169.000,False
2,"PIO,APB",000-2-01,2957F9167168C7A6C125784C002F28E8,1412.0,2,-2785.625,False
3,"PIO,APB,CoPA",000-2-01,2957F9167168C7A6C125784C002F28E8,4428.0,3,-7926.625,False
4,"APB,CoPA",000-2-01,2957F9167168C7A6C125784C002F28E8,3025.0,2,-3637.375,False
...,...,...,...,...,...,...,...
8628,"T,E,APB",111-9-1,5363FE723BEB494FC125785C00525EE0,0.0,3,0.000,False
8629,"E,APB",111-9-1,5363FE723BEB494FC125785C00525EE0,0.0,2,0.000,False
8630,"CoPA,Fi",33-01,635A8C14C5F4B799C1257872002D1BC9,43.0,2,-609.000,False
8631,"CoPA,Fi,PR",33-01,635A8C14C5F4B799C1257872002D1BC9,83.0,3,83.000,False


In [91]:
short_seqs = set(
    seq_log.loc[
        (seq_log['is_short_duration'] == True) &
        (seq_log['total_duration'] < 60),
        'seq'
    ].unique()
)
print(len(short_seqs))

4


In [92]:
# alt.layer(
#     alt.Chart(seq_log[seq_log['seq'] == 'GSoC,PoA,A']).mark_boxplot(outliers={'size': 0}).encode(
#         x='total_duration:Q',
#         y='seq:N',
#         color=alt.value('grey'),
#         opacity=alt.value(0.5)
#     ),
#     alt.Chart(seq_log[seq_log['seq'] == 'GSoC,PoA,A']).mark_circle().encode(
#         x='total_duration:Q',
#         y='seq:N',
#         color='is_short_duration:N'
#     ),
# )

charts = []
for seq in short_seqs:
    data = seq_log[seq_log['seq'] == seq]
    charts.append(
        alt.layer(
            alt.Chart(data).mark_boxplot(outliers={'size': 0}).encode(
                x=alt.X('total_duration:Q').title('total duration (minutes)'),
                y=alt.Y('seq:N').title(['activity', 'sequence']),
                color=alt.value('grey'),
                opacity=alt.value(0.5)
            ),
            alt.Chart(data).mark_circle().encode(
                x=alt.X('total_duration:Q').title('total duration (minutes)'),
                y=alt.Y('seq:N').title(['activity', 'sequence']),
                color=alt.Color('is_short_duration:N').title([
                    'is significantly', 'short duration?'
                ]),
                tooltip=['resource', 'case_id', 'total_duration']
            )
        )
    )

alt.vconcat(*charts).resolve_scale(x='independent')

In [93]:
charts = []
for seq in short_seqs:
    data = seq_log[seq_log['seq'] == seq].groupby('seq').agg(
        num_res=pd.NamedAgg('resource', 'nunique')
    )
    charts.append(
        alt.Chart(data).mark_bar().encode(
            x=alt.X('num_res:Q').title('number of resources'),
            y=alt.Y('seq:N').title(['activity', 'sequence'])
        )
    )

alt.vconcat(*charts).resolve_scale(x='shared')