# Performance Masking

## Description
When agents act in such a way, through their dealing with or knowledge of IT systems, that their true performance cannot (easily) be established.

In [122]:
import pandas as pd
import numpy as np
import altair as alt

In [123]:
log_name = 'propr_test'

fn_logs = {
    'bpic15': '../data/preproc/bpic15.csv',
    'bpic15_ref': '../data/preproc/bpic15_train-70.csv',
    'bpic15_test': '../data/preproc/bpic15_test-30.csv',

    'bpic17': '../data/preproc/bpic2017.csv',

    'propr': '../data/preproc/proprietary.csv',
    'propr_ref': '../data/preproc/proprietary_train-70.csv',
    'propr_test': '../data/preproc/proprietary_test-30.csv'
}

In [124]:
log = pd.read_csv(fn_logs[log_name])
log

Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,"School holidays; 0 = no, 1 = yes",Approval; 1 = low sum to 4 = high sum,Type of loan; 0 = normal; 1 = special,Cost,_start timestamp,_complete timestamp
0,Precheck,0051574BDF30D681C1257888004DC40D,6/05/2011 16:09,6/05/2011 16:27,090-10-02,Friday,0.0,1.0,0.0,"0,87 € per minute",2011-05-06 16:09:00,2011-05-06 16:27:00
1,Application_Processing_Branches,0051574BDF30D681C1257888004DC40D,9/05/2011 11:14,9/05/2011 11:16,010-23-13,Monday,0.0,1.0,0.0,"1,02 € per minute",2011-05-09 11:14:00,2011-05-09 11:16:00
2,Precheck,0051574BDF30D681C1257888004DC40D,9/05/2011 15:44,9/05/2011 15:58,090-10-02,Monday,0.0,1.0,0.0,"0,87 € per minute",2011-05-09 15:44:00,2011-05-09 15:58:00
3,Processing_Incomplete_Orders,0051574BDF30D681C1257888004DC40D,9/05/2011 16:10,9/05/2011 16:10,010-23-13,Monday,0.0,1.0,0.0,"1,02 € per minute",2011-05-09 16:10:00,2011-05-09 16:10:00
4,Application_Processing_Branches,0051574BDF30D681C1257888004DC40D,10/05/2011 8:52,10/05/2011 9:32,010-23-13,Tuesday,0.0,1.0,0.0,"1,02 € per minute",2011-05-10 08:52:00,2011-05-10 09:32:00
...,...,...,...,...,...,...,...,...,...,...,...,...
5285,Precheck,FE3E1E0D3928202EC1257896004B8F00,20/05/2011 17:24,20/05/2011 17:26,000-3-01,Friday,0.0,2.0,0.0,"0,87 € per minute",2011-05-20 17:24:00,2011-05-20 17:26:00
5286,Check_of_Processing_Applications,FE3E1E0D3928202EC1257896004B8F00,23/05/2011 15:53,23/05/2011 16:06,000-2-01,Monday,0.0,2.0,0.0,"0,87 € per minute",2011-05-23 15:53:00,2011-05-23 16:06:00
5287,Processing_of_Applications,FE3E1E0D3928202EC1257896004B8F00,26/05/2011 9:25,26/05/2011 9:25,010-23-07,Thursday,0.0,2.0,0.0,"1,02 € per minute",2011-05-26 09:25:00,2011-05-26 09:25:00
5288,Archieving,FE3E1E0D3928202EC1257896004B8F00,26/05/2011 10:38,26/05/2011 10:47,010-23-07,Thursday,0.0,2.0,0.0,"1,02 € per minute",2011-05-26 10:38:00,2011-05-26 10:47:00


In [125]:
# encode activity labels as short names for presentation purpose

print('There exist {} unique activity labels'.format(
    log['activity'].nunique()
))

# use initial letters for encoding
act_labels_short = dict()
for label in log['activity'].unique():
    label_short = ''.join([w[0] for w in label.split('_')])
    act_labels_short[label] = label_short
print('There are {} short labels after encoding'.format(
    len(act_labels_short.values())
))
for k in sorted(act_labels_short.keys()):
    print('{:>50} -> {:>30}'.format(k, act_labels_short[k]))

log['activity_short'] = log['activity'].apply(
    lambda x: act_labels_short[x]
)

# add activity duration
log['start timestamp'] = pd.to_datetime(log['start timestamp'], format='mixed', dayfirst=True)
log['complete timestamp'] = pd.to_datetime(log['complete timestamp'], 
format='mixed', dayfirst=True)
# duration measured in minutes
log['activity duration'] = (
    log['complete timestamp'] - log['start timestamp']
).dt.total_seconds() / 60

log

There exist 25 unique activity labels
There are 25 short labels after encoding
                              Administrative_Tasks ->                             AT
                   Application_Processing_Branches ->                            APB
                                          Approval ->                              A
                                   Approval_Branch ->                             AB
                          Approval_Executive_Board ->                            AEB
                                        Archieving ->                              A
                                 Check_of_Approval ->                            CoA
                                Check_of_Documents ->                            CoD
                  Check_of_Processing_Applications ->                           CoPA
                                               End ->                              E
                                   Further_inquiry ->                  

Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,"School holidays; 0 = no, 1 = yes",Approval; 1 = low sum to 4 = high sum,Type of loan; 0 = normal; 1 = special,Cost,_start timestamp,_complete timestamp,activity_short,activity duration
0,Precheck,0051574BDF30D681C1257888004DC40D,2011-05-06 16:09:00,2011-05-06 16:27:00,090-10-02,Friday,0.0,1.0,0.0,"0,87 € per minute",2011-05-06 16:09:00,2011-05-06 16:27:00,P,18.0
1,Application_Processing_Branches,0051574BDF30D681C1257888004DC40D,2011-05-09 11:14:00,2011-05-09 11:16:00,010-23-13,Monday,0.0,1.0,0.0,"1,02 € per minute",2011-05-09 11:14:00,2011-05-09 11:16:00,APB,2.0
2,Precheck,0051574BDF30D681C1257888004DC40D,2011-05-09 15:44:00,2011-05-09 15:58:00,090-10-02,Monday,0.0,1.0,0.0,"0,87 € per minute",2011-05-09 15:44:00,2011-05-09 15:58:00,P,14.0
3,Processing_Incomplete_Orders,0051574BDF30D681C1257888004DC40D,2011-05-09 16:10:00,2011-05-09 16:10:00,010-23-13,Monday,0.0,1.0,0.0,"1,02 € per minute",2011-05-09 16:10:00,2011-05-09 16:10:00,PIO,0.0
4,Application_Processing_Branches,0051574BDF30D681C1257888004DC40D,2011-05-10 08:52:00,2011-05-10 09:32:00,010-23-13,Tuesday,0.0,1.0,0.0,"1,02 € per minute",2011-05-10 08:52:00,2011-05-10 09:32:00,APB,40.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5285,Precheck,FE3E1E0D3928202EC1257896004B8F00,2011-05-20 17:24:00,2011-05-20 17:26:00,000-3-01,Friday,0.0,2.0,0.0,"0,87 € per minute",2011-05-20 17:24:00,2011-05-20 17:26:00,P,2.0
5286,Check_of_Processing_Applications,FE3E1E0D3928202EC1257896004B8F00,2011-05-23 15:53:00,2011-05-23 16:06:00,000-2-01,Monday,0.0,2.0,0.0,"0,87 € per minute",2011-05-23 15:53:00,2011-05-23 16:06:00,CoPA,13.0
5287,Processing_of_Applications,FE3E1E0D3928202EC1257896004B8F00,2011-05-26 09:25:00,2011-05-26 09:25:00,010-23-07,Thursday,0.0,2.0,0.0,"1,02 € per minute",2011-05-26 09:25:00,2011-05-26 09:25:00,PoA,0.0
5288,Archieving,FE3E1E0D3928202EC1257896004B8F00,2011-05-26 10:38:00,2011-05-26 10:47:00,010-23-07,Thursday,0.0,2.0,0.0,"1,02 € per minute",2011-05-26 10:38:00,2011-05-26 10:47:00,A,9.0


In [126]:
# proprietary
log['department'] = log['resource'].apply(
    lambda x: x.split('-')[0]
)
log['department_role'] = log['resource'].apply(
    lambda x: x.split('-')[0] + '-' + x.split('-')[1]
)
log

Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,"School holidays; 0 = no, 1 = yes",Approval; 1 = low sum to 4 = high sum,Type of loan; 0 = normal; 1 = special,Cost,_start timestamp,_complete timestamp,activity_short,activity duration,department,department_role
0,Precheck,0051574BDF30D681C1257888004DC40D,2011-05-06 16:09:00,2011-05-06 16:27:00,090-10-02,Friday,0.0,1.0,0.0,"0,87 € per minute",2011-05-06 16:09:00,2011-05-06 16:27:00,P,18.0,090,090-10
1,Application_Processing_Branches,0051574BDF30D681C1257888004DC40D,2011-05-09 11:14:00,2011-05-09 11:16:00,010-23-13,Monday,0.0,1.0,0.0,"1,02 € per minute",2011-05-09 11:14:00,2011-05-09 11:16:00,APB,2.0,010,010-23
2,Precheck,0051574BDF30D681C1257888004DC40D,2011-05-09 15:44:00,2011-05-09 15:58:00,090-10-02,Monday,0.0,1.0,0.0,"0,87 € per minute",2011-05-09 15:44:00,2011-05-09 15:58:00,P,14.0,090,090-10
3,Processing_Incomplete_Orders,0051574BDF30D681C1257888004DC40D,2011-05-09 16:10:00,2011-05-09 16:10:00,010-23-13,Monday,0.0,1.0,0.0,"1,02 € per minute",2011-05-09 16:10:00,2011-05-09 16:10:00,PIO,0.0,010,010-23
4,Application_Processing_Branches,0051574BDF30D681C1257888004DC40D,2011-05-10 08:52:00,2011-05-10 09:32:00,010-23-13,Tuesday,0.0,1.0,0.0,"1,02 € per minute",2011-05-10 08:52:00,2011-05-10 09:32:00,APB,40.0,010,010-23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5285,Precheck,FE3E1E0D3928202EC1257896004B8F00,2011-05-20 17:24:00,2011-05-20 17:26:00,000-3-01,Friday,0.0,2.0,0.0,"0,87 € per minute",2011-05-20 17:24:00,2011-05-20 17:26:00,P,2.0,000,000-3
5286,Check_of_Processing_Applications,FE3E1E0D3928202EC1257896004B8F00,2011-05-23 15:53:00,2011-05-23 16:06:00,000-2-01,Monday,0.0,2.0,0.0,"0,87 € per minute",2011-05-23 15:53:00,2011-05-23 16:06:00,CoPA,13.0,000,000-2
5287,Processing_of_Applications,FE3E1E0D3928202EC1257896004B8F00,2011-05-26 09:25:00,2011-05-26 09:25:00,010-23-07,Thursday,0.0,2.0,0.0,"1,02 € per minute",2011-05-26 09:25:00,2011-05-26 09:25:00,PoA,0.0,010,010-23
5288,Archieving,FE3E1E0D3928202EC1257896004B8F00,2011-05-26 10:38:00,2011-05-26 10:47:00,010-23-07,Thursday,0.0,2.0,0.0,"1,02 € per minute",2011-05-26 10:38:00,2011-05-26 10:47:00,A,9.0,010,010-23


## Performance-PM-2

In [127]:
sel_resources = log.loc[log['department_role'] == '010-23', 'resource'].unique()
print('{} resources are selected'.format(len(sel_resources)))

10 resources are selected


### For each resource, extract a set of activity sequences performed by this resource

In [128]:
# sort events by complete timestamps
log = log.sort_values(by='complete timestamp')

from collections import defaultdict
seq_counters = defaultdict(
    lambda: {
        'length': 0,
        'frequency': 0,
        'resources': set(),
        'cases': set(),
    }
)
seq_log = []

for res, events in log.groupby('resource'):
    if res in sel_resources:
        for case_id, trace in events.groupby('case_id'):
            seq = ','.join(trace['activity_short'])
            length = len(trace['activity_short'])
            act_duration = trace['activity duration'].mean()
            # only count for sequences with length > 0
            if length > 0:
                seq_counters[seq]['length'] = len(trace['activity_short'])
                seq_counters[seq]['frequency'] += 1
                seq_counters[seq]['resources'].add(res)
                seq_counters[seq]['cases'].add(case_id)
                seq_counters[seq]['avg_activity_duration'] = act_duration
                seq_log.append({
                    'seq': seq,
                    'res': res,
                    'case_id': case_id,
                    'avg_activity_duration': act_duration
                })

seq_stats = []
for seq, stats in seq_counters.items():
    seq_stats.append({
        'seq': seq,
        'length': stats['length'],
        'frequency': stats['frequency'],
        'res': list(stats['resources']),
        'num_res': len(stats['resources']),
        'num_cases': len(stats['cases']),
        'avg_activity_duration': stats['avg_activity_duration'].mean()
    })
seq_stats = pd.DataFrame(seq_stats)

print('There are in total {} selected resources'.format(len(sel_resources)))
print('These resources executed {} unique sequences'.format(seq_stats['seq'].nunique()))

seq_stats

There are in total 10 selected resources
These resources executed 109 unique sequences


Unnamed: 0,seq,length,frequency,res,num_res,num_cases,avg_activity_duration
0,"PoAS,AT,AT,PoAS,CoPA",5,1,[010-23-01],1,1,13.800000
1,"PSP,PSP,AT",3,1,[010-23-01],1,1,30.333333
2,"PoAS,PoAS,PoAS",3,1,[010-23-01],1,1,2.666667
3,"PoAS,CoPA",2,1,[010-23-01],1,1,15.500000
4,"PoA,CoD",2,14,"[010-23-06, 010-23-02, 010-23-07]",3,14,71.500000
...,...,...,...,...,...,...,...
104,"P,APB,CoPA",3,1,[010-23-13],1,1,7.666667
105,"APB,CoPA,PIO,CoPA",4,1,[010-23-13],1,1,5.750000
106,AT,1,1,[010-23-15],1,1,1.000000
107,"PSP,PoAS,PoAS,PoAS,CoPA",5,1,[010-23-15],1,1,3.400000


In [129]:
seq_log = pd.DataFrame(seq_log)
seq_log

Unnamed: 0,seq,res,case_id,avg_activity_duration
0,"PoAS,AT,AT,PoAS,CoPA",010-23-01,5856B187BE5DC2BFC1257870004314A7,13.800000
1,"PSP,PSP,AT",010-23-01,ABBEAE5FCB39B150C125789C004525B4,30.333333
2,"PoAS,PoAS,PoAS",010-23-01,D5573864F7B60C16C1257856005A9F8C,2.666667
3,"PoAS,CoPA",010-23-01,F7D7B4B5186E19A4C1257850004D65BE,15.500000
4,"PoA,CoD",010-23-02,017FA12C13EE5A75C12578730044F7C4,17.000000
...,...,...,...,...
460,"APB,CoPA,PIO,CoPA",010-23-13,FBEDD3F7E98727F8C1257887004A768A,5.750000
461,CoPA,010-23-15,2A518CBDC3DB248DC125788D003752EF,0.000000
462,AT,010-23-15,5856B187BE5DC2BFC1257870004314A7,1.000000
463,"PSP,PoAS,PoAS,PoAS,CoPA",010-23-15,D5573864F7B60C16C1257856005A9F8C,3.400000


### Identify sequences that are unique to certain resources

In [130]:
# find sequences that
# have a length > 1
# appeared in more than x cases
seq_stats = seq_stats[
    (seq_stats['length'] > 1)
]
seq_log = seq_log[
    (seq_log['seq'].isin(seq_stats['seq'].unique()))
]

In [131]:
alt.hconcat(
    alt.Chart(seq_stats).mark_bar().encode(
        x=alt.X('seq:N').sort(
            alt.EncodingSortField('num_res', order='ascending')
        ).title('Activity sequences executed by resources'),
        y=alt.Y('num_res:Q'),
        tooltip=['seq', 'num_res', 'res']
    ),
    alt.Chart(seq_stats).mark_bar().encode(
        x=alt.X('seq:N').sort(
            alt.EncodingSortField('num_res', order='ascending')
        ).title('Activity sequences executed by resources'),
        y=alt.Y('frequency:Q')
    )
)

In [132]:
data = seq_log.groupby(['seq', 'res']).agg(
    avg_activity_duration=pd.NamedAgg('avg_activity_duration', 'mean')
)
alt.Chart(seq_log).mark_boxplot().encode(
    x='avg_activity_duration:Q'
)