# Performance Masking

## Description
When agents act in such a way, through their dealing with or knowledge of IT systems, that their true performance cannot (easily) be established.

In [288]:
import pandas as pd
import numpy as np
import altair as alt

In [289]:
log_name = 'propr'

fn_logs = {
    'bpic15': '../data/preproc/bpic2015_disco.csv',
    'bpic17': '../data/preproc/bpic2017.csv',
    'propr': '../data/preproc/proprietary.csv'
}

In [290]:
log = pd.read_csv(fn_logs[log_name])
log

Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,"School holidays; 0 = no, 1 = yes",Approval; 1 = low sum to 4 = high sum,Type of loan; 0 = normal; 1 = special,Cost
0,Approval_Branch,005141EEB1240B31C12577DF004F6A77,18/11/2010 15:27,18/11/2010 15:58,004-9-1,Thursday,0.0,2.0,0.0,"0,87 € per minute"
1,Precheck,005141EEB1240B31C12577DF004F6A77,19/11/2010 12:45,19/11/2010 12:46,000-3-01,Friday,0.0,2.0,0.0,"0,87 € per minute"
2,Precheck,005141EEB1240B31C12577DF004F6A77,24/11/2010 8:18,24/11/2010 8:26,000-2-01,Wednesday,0.0,2.0,0.0,"0,87 € per minute"
3,Check_of_Processing_Applications,005141EEB1240B31C12577DF004F6A77,24/11/2010 10:35,24/11/2010 10:35,000-2-01,Wednesday,0.0,2.0,0.0,"0,87 € per minute"
4,Processing_of_Applications,005141EEB1240B31C12577DF004F6A77,2/12/2010 10:46,2/12/2010 10:46,010-23-11,Thursday,0.0,2.0,0.0,"1,02 € per minute"
...,...,...,...,...,...,...,...,...,...,...
18440,Precheck,FFFF329EF772D73CC12577EA00534A1C,13/12/2010 11:38,13/12/2010 11:40,000-2-01,Monday,0.0,2.0,0.0,"0,87 € per minute"
18441,Check_of_Processing_Applications,FFFF329EF772D73CC12577EA00534A1C,13/12/2010 12:06,13/12/2010 12:14,000-2-01,Monday,0.0,2.0,0.0,"0,87 € per minute"
18442,Processing_of_Applications,FFFF329EF772D73CC12577EA00534A1C,22/12/2010 11:48,22/12/2010 11:48,010-23-07,Wednesday,1.0,2.0,0.0,"1,02 € per minute"
18443,Archieving,FFFF329EF772D73CC12577EA00534A1C,22/12/2010 11:49,22/12/2010 13:19,010-23-07,Wednesday,1.0,2.0,0.0,"1,02 € per minute"


In [291]:
# encode activity labels as short names for presentation purpose

print('There exist {} unique activity labels'.format(
    log['activity'].nunique()
))

# use initial letters for encoding
act_labels_short = dict()
for label in log['activity'].unique():
    label_short = ''.join([w[0] for w in label.split('_')])
    act_labels_short[label] = label_short
print('There are {} short labels after encoding'.format(
    len(act_labels_short.values())
))
for k in sorted(act_labels_short.keys()):
    print('{:>50} -> {:>30}'.format(k, act_labels_short[k]))

log['activity_short'] = log['activity'].apply(
    lambda x: act_labels_short[x]
)
log

There exist 25 unique activity labels
There are 25 short labels after encoding
                              Administrative_Tasks ->                             AT
                   Application_Processing_Branches ->                            APB
                                          Approval ->                              A
                                   Approval_Branch ->                             AB
                          Approval_Executive_Board ->                            AEB
                                        Archieving ->                              A
                                 Check_of_Approval ->                            CoA
                                Check_of_Documents ->                            CoD
                  Check_of_Processing_Applications ->                           CoPA
                                               End ->                              E
                                   Further_inquiry ->                  

Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,"School holidays; 0 = no, 1 = yes",Approval; 1 = low sum to 4 = high sum,Type of loan; 0 = normal; 1 = special,Cost,activity_short
0,Approval_Branch,005141EEB1240B31C12577DF004F6A77,18/11/2010 15:27,18/11/2010 15:58,004-9-1,Thursday,0.0,2.0,0.0,"0,87 € per minute",AB
1,Precheck,005141EEB1240B31C12577DF004F6A77,19/11/2010 12:45,19/11/2010 12:46,000-3-01,Friday,0.0,2.0,0.0,"0,87 € per minute",P
2,Precheck,005141EEB1240B31C12577DF004F6A77,24/11/2010 8:18,24/11/2010 8:26,000-2-01,Wednesday,0.0,2.0,0.0,"0,87 € per minute",P
3,Check_of_Processing_Applications,005141EEB1240B31C12577DF004F6A77,24/11/2010 10:35,24/11/2010 10:35,000-2-01,Wednesday,0.0,2.0,0.0,"0,87 € per minute",CoPA
4,Processing_of_Applications,005141EEB1240B31C12577DF004F6A77,2/12/2010 10:46,2/12/2010 10:46,010-23-11,Thursday,0.0,2.0,0.0,"1,02 € per minute",PoA
...,...,...,...,...,...,...,...,...,...,...,...
18440,Precheck,FFFF329EF772D73CC12577EA00534A1C,13/12/2010 11:38,13/12/2010 11:40,000-2-01,Monday,0.0,2.0,0.0,"0,87 € per minute",P
18441,Check_of_Processing_Applications,FFFF329EF772D73CC12577EA00534A1C,13/12/2010 12:06,13/12/2010 12:14,000-2-01,Monday,0.0,2.0,0.0,"0,87 € per minute",CoPA
18442,Processing_of_Applications,FFFF329EF772D73CC12577EA00534A1C,22/12/2010 11:48,22/12/2010 11:48,010-23-07,Wednesday,1.0,2.0,0.0,"1,02 € per minute",PoA
18443,Archieving,FFFF329EF772D73CC12577EA00534A1C,22/12/2010 11:49,22/12/2010 13:19,010-23-07,Wednesday,1.0,2.0,0.0,"1,02 € per minute",A


## Performance-PM-2

In [292]:
# sel_resources = log['resource'].unique()

# optional: select only the most active resources
# proprietary: select resources who
# were involved in at least 100 cases
sel_resources = set()
for res, events in log.groupby('resource'):
    total_num_cases = events['case_id'].nunique()
    if (
        total_num_cases >= 100 and
        res.startswith('010-23')
        # 
        # sorted(events['activity_short'].unique()) == sorted([
        #     #'AB', 'APB', 'E', 'P', 'RB', 'T'
        # ])
        # 
        # 'A' in events['activity_short'].unique()
    ):
        sel_resources.add(res)
print('{} resources are selected'.format(len(sel_resources)))

5 resources are selected


### For each resource, extract a set of activity sequences performed by this resource

In [293]:
# sort events by complete timestamps
log = log.sort_values(by='complete timestamp')

from collections import defaultdict
seq_counters = defaultdict(
    lambda: {
        'length': 0,
        'frequency': 0,
        'resources': set(),
        'cases': set(),
    }
)
for res, events in log.groupby('resource'):
    if res in sel_resources:
        for case_id, trace in events.groupby('case_id'):
            seq = ','.join(trace['activity_short'])
            length = len(trace['activity_short'])
            # only count for sequences with length > 0
            if length > 0:
                seq_counters[seq]['length'] = len(trace['activity_short'])
                seq_counters[seq]['frequency'] += 1
                seq_counters[seq]['resources'].add(res)
                seq_counters[seq]['cases'].add(case_id)

seq_stats = []
for seq, stats in seq_counters.items():
    seq_stats.append({
        'seq': seq,
        'length': stats['length'],
        'frequency': stats['frequency'],
        'res': list(stats['resources']),
        'num_res': len(stats['resources']),
        'num_cases': len(stats['cases'])
    })
seq_stats = pd.DataFrame(seq_stats)

print('There are in total {} selected resources'.format(len(sel_resources)))
print('These resources executed {} unique sequences'.format(seq_stats['seq'].nunique()))

There are in total 5 selected resources
These resources executed 233 unique sequences


### Identify sequences that are unique to certain resources

In [294]:
# discover resource grouping based on activity types (labels)
res_act_matrix = []
for res, events in log.groupby('resource'):
    if res in sel_resources:
        v = events.groupby('activity_short').size().reset_index().rename(columns={0: 'count'})
        v['count'] /= v['count'].sum()
        v['resource'] = res
        res_act_matrix.append(v)
res_act_matrix = pd.concat(res_act_matrix)
res_act_matrix

Unnamed: 0,activity_short,count,resource
0,A,0.236943,010-23-02
1,CoD,0.075159,010-23-02
2,E,0.236943,010-23-02
3,GSoC,0.003822,010-23-02
4,P,0.008917,010-23-02
5,PoA,0.407643,010-23-02
6,R,0.002548,010-23-02
7,RB,0.002548,010-23-02
8,SvK,0.025478,010-23-02
0,A,0.238189,010-23-06


In [295]:
ra_mat_wide = res_act_matrix.pivot(
    index='resource', 
    columns='activity_short', values='count'
).fillna(0)
# ra_mat_wide.values
from sklearn.cluster import AgglomerativeClustering
clustering = AgglomerativeClustering(
    n_clusters=len(ra_mat_wide),
    metric='euclidean',
).fit(ra_mat_wide.values).labels_
print(clustering)
ra_mat_wide['label'] = clustering
# ra_mat_wide
labels = ra_mat_wide['label'].reset_index()
res_act_matrix = res_act_matrix.merge(labels, on='resource')
res_act_matrix

[4 3 1 2 0]


Unnamed: 0,activity_short,count,resource,label
0,A,0.236943,010-23-02,4
1,CoD,0.075159,010-23-02,4
2,E,0.236943,010-23-02,4
3,GSoC,0.003822,010-23-02,4
4,P,0.008917,010-23-02,4
5,PoA,0.407643,010-23-02,4
6,R,0.002548,010-23-02,4
7,RB,0.002548,010-23-02,4
8,SvK,0.025478,010-23-02,4
9,A,0.238189,010-23-06,3


In [296]:
# NOTE: resource ids seem to have encoding role information (group id is the
# first two parts of the code? e.g., xxx-xx)
# NOTE2: in each group, there seems to be a special resource for "CoPA"

alt.Chart(res_act_matrix).mark_rect().encode(
    y=alt.Y('resource:N').sort(alt.EncodingSortField('label', order='ascending')),
    x='activity_short:N',
    color='count',
    tooltip=['count', 'resource', 'activity_short']
)

In [297]:
# find sequences that
# have a length > 1
# appeared in more than 10 cases
seq_stats = seq_stats[
    (seq_stats['length'] > 1) & 
    (seq_stats['num_cases'] > 10)
]

In [298]:
alt.vconcat(
    alt.Chart(seq_stats).mark_bar().encode(
        x=alt.X('seq:N').sort(
            alt.EncodingSortField('num_res', order='ascending')
        ).title('Activity sequences executed by resources'),
        y=alt.Y('frequency:Q')
    ),
    alt.Chart(seq_stats).mark_bar().encode(
        x=alt.X('seq:N').sort(
            alt.EncodingSortField('num_res', order='ascending')
        ).title('Activity sequences executed by resources'),
        y=alt.Y('num_res:Q'),
        tooltip=['seq', 'num_res', 'res']
    ),
)