# Preferential Work Selection

## Description
Agents specifically choose process instances or work items with particular characteristics.

In [241]:
import pandas as pd
import numpy as np
import altair as alt
# Switch Altair's data transformer to 'vegafusion'. As the last expression of
# the cell, the returned registry object is echoed as the cell output.
alt.data_transformers.enable('vegafusion')

DataTransformerRegistry.enable('vegafusion')

In [242]:
# Key of the event log to analyse; it is used to look up a path in `fn_logs`.
log_name = 'propr'

# Lookup table: event-log key -> path of the corresponding preprocessed CSV.
# The `_ref` / `_test` keys point at the files named `train-70` / `test-30`.
fn_logs = dict(
    bpic15='../data/preproc/bpic15.csv',
    bpic15_ref='../data/preproc/bpic15_train-70.csv',
    bpic15_test='../data/preproc/bpic15_test-30.csv',

    bpic17='../data/preproc/bpic2017.csv',

    propr='../data/preproc/proprietary.csv',
    propr_ref='../data/preproc/proprietary_train-70.csv',
    propr_test='../data/preproc/proprietary_test-30.csv',
)

In [243]:
# Read the preprocessed event log selected by `log_name`.
log_path = fn_logs[log_name]
log = pd.read_csv(log_path)

# Report how many distinct case identifiers the log contains.
n_cases = log['case_id'].nunique()
print(n_cases)

# Keep the frame as the cell's last expression so the notebook renders it.
log

1731


Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,"School holidays; 0 = no, 1 = yes",Approval; 1 = low sum to 4 = high sum,Type of loan; 0 = normal; 1 = special,Cost
0,Approval_Branch,005141EEB1240B31C12577DF004F6A77,18/11/2010 15:27,18/11/2010 15:58,004-9-1,Thursday,0.0,2.0,0.0,"0,87 € per minute"
1,Precheck,005141EEB1240B31C12577DF004F6A77,19/11/2010 12:45,19/11/2010 12:46,000-3-01,Friday,0.0,2.0,0.0,"0,87 € per minute"
2,Precheck,005141EEB1240B31C12577DF004F6A77,24/11/2010 8:18,24/11/2010 8:26,000-2-01,Wednesday,0.0,2.0,0.0,"0,87 € per minute"
3,Check_of_Processing_Applications,005141EEB1240B31C12577DF004F6A77,24/11/2010 10:35,24/11/2010 10:35,000-2-01,Wednesday,0.0,2.0,0.0,"0,87 € per minute"
4,Processing_of_Applications,005141EEB1240B31C12577DF004F6A77,2/12/2010 10:46,2/12/2010 10:46,010-23-11,Thursday,0.0,2.0,0.0,"1,02 € per minute"
...,...,...,...,...,...,...,...,...,...,...
18440,Precheck,FFFF329EF772D73CC12577EA00534A1C,13/12/2010 11:38,13/12/2010 11:40,000-2-01,Monday,0.0,2.0,0.0,"0,87 € per minute"
18441,Check_of_Processing_Applications,FFFF329EF772D73CC12577EA00534A1C,13/12/2010 12:06,13/12/2010 12:14,000-2-01,Monday,0.0,2.0,0.0,"0,87 € per minute"
18442,Processing_of_Applications,FFFF329EF772D73CC12577EA00534A1C,22/12/2010 11:48,22/12/2010 11:48,010-23-07,Wednesday,1.0,2.0,0.0,"1,02 € per minute"
18443,Archieving,FFFF329EF772D73CC12577EA00534A1C,22/12/2010 11:49,22/12/2010 13:19,010-23-07,Wednesday,1.0,2.0,0.0,"1,02 € per minute"


### Apply the given criteria to categorize case types

Select cases (process instances) with particular characteristics

* For the proprietary data, select the loans marked as special

In [244]:
# proprietary log: turn the loan-type flag into a readable case-type label.
# Every value other than 1 (including missing values) is labelled 'normal loan'.
loan_type_column = 'Type of loan; 0 = normal; 1 = special'
is_special_loan = log[loan_type_column].eq(1)
log['case_type'] = is_special_loan.map(
    {True: 'special loan', False: 'normal loan'}
)

# Distinct case-type labels, in order of first appearance in the log.
case_types = log['case_type'].drop_duplicates().tolist()
log

Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,"School holidays; 0 = no, 1 = yes",Approval; 1 = low sum to 4 = high sum,Type of loan; 0 = normal; 1 = special,Cost,case_type
0,Approval_Branch,005141EEB1240B31C12577DF004F6A77,18/11/2010 15:27,18/11/2010 15:58,004-9-1,Thursday,0.0,2.0,0.0,"0,87 € per minute",normal loan
1,Precheck,005141EEB1240B31C12577DF004F6A77,19/11/2010 12:45,19/11/2010 12:46,000-3-01,Friday,0.0,2.0,0.0,"0,87 € per minute",normal loan
2,Precheck,005141EEB1240B31C12577DF004F6A77,24/11/2010 8:18,24/11/2010 8:26,000-2-01,Wednesday,0.0,2.0,0.0,"0,87 € per minute",normal loan
3,Check_of_Processing_Applications,005141EEB1240B31C12577DF004F6A77,24/11/2010 10:35,24/11/2010 10:35,000-2-01,Wednesday,0.0,2.0,0.0,"0,87 € per minute",normal loan
4,Processing_of_Applications,005141EEB1240B31C12577DF004F6A77,2/12/2010 10:46,2/12/2010 10:46,010-23-11,Thursday,0.0,2.0,0.0,"1,02 € per minute",normal loan
...,...,...,...,...,...,...,...,...,...,...,...
18440,Precheck,FFFF329EF772D73CC12577EA00534A1C,13/12/2010 11:38,13/12/2010 11:40,000-2-01,Monday,0.0,2.0,0.0,"0,87 € per minute",normal loan
18441,Check_of_Processing_Applications,FFFF329EF772D73CC12577EA00534A1C,13/12/2010 12:06,13/12/2010 12:14,000-2-01,Monday,0.0,2.0,0.0,"0,87 € per minute",normal loan
18442,Processing_of_Applications,FFFF329EF772D73CC12577EA00534A1C,22/12/2010 11:48,22/12/2010 11:48,010-23-07,Wednesday,1.0,2.0,0.0,"1,02 € per minute",normal loan
18443,Archieving,FFFF329EF772D73CC12577EA00534A1C,22/12/2010 11:49,22/12/2010 13:19,010-23-07,Wednesday,1.0,2.0,0.0,"1,02 € per minute",normal loan


## Rerouting-PWS-1

### Profile resource workload by the frequency of cases they were involved in

In [245]:
# Build one record per (resource, case type): the number of distinct cases of
# that type the resource took part in, next to the resource's overall number of
# distinct cases. Case types the resource never handled get a frequency of 0.
workload_rows = []
for resource_id, resource_events in log.groupby('resource'):
    n_cases_total = resource_events['case_id'].nunique()
    workload_rows.extend(
        {
            'resource': resource_id,
            'case_type': case_type,
            'frequency': resource_events.loc[
                resource_events['case_type'] == case_type, 'case_id'
            ].nunique(),
            'total_num_cases': n_cases_total,
        }
        for case_type in case_types
    )

# Share of each case type within a resource's own set of cases.
res_workload = pd.DataFrame(workload_rows).assign(
    rel_frequency=lambda df: df['frequency'] / df['total_num_cases']
)

# Keep only resources that
#   - took part in at least one 'special loan' case, and
#   - were involved in at least 100 cases overall.
is_special_row = res_workload['case_type'] == 'special loan'
has_special_cases = res_workload['frequency'] > 0
is_active_enough = res_workload['total_num_cases'] >= 100
sel_resources = set(
    res_workload.loc[
        is_special_row & has_special_cases & is_active_enough, 'resource'
    ]
)

# Restrict the profile (all case types) to the selected resources.
res_workload = res_workload[res_workload['resource'].isin(sel_resources)]
res_workload

Unnamed: 0,resource,case_type,frequency,total_num_cases,rel_frequency
4,000-2-01,normal loan,530,540,0.981481
5,000-2-01,special loan,10,540,0.018519
6,000-3-01,normal loan,396,397,0.997481
7,000-3-01,special loan,1,397,0.002519
98,010-21-01,normal loan,91,112,0.8125
99,010-21-01,special loan,21,112,0.1875
104,010-23-02,normal loan,247,251,0.984064
105,010-23-02,special loan,4,251,0.015936
110,010-23-06,normal loan,152,157,0.968153
111,010-23-06,special loan,5,157,0.031847


In [246]:
# Plot, per selected resource, the total number of cases (top chart) and the
# relative frequency of each case type among them (bottom chart).
# Both charts share one sort specification: resources ordered by descending
# total number of cases.
resource_order = alt.EncodingSortField(field='total_num_cases', order='descending')
alt.vconcat(
    # `total_num_cases` is repeated on every case-type row of a resource, so
    # duplicates are dropped to draw a single bar per resource.
    alt.Chart(
        res_workload[['resource', 'total_num_cases']].drop_duplicates()
    ).mark_bar().encode(
        x=alt.X('resource:N', sort=resource_order).title('Resource'),
        y=alt.Y('total_num_cases:Q').title('Total number of participated cases')
    ),
    alt.Chart(res_workload).mark_bar().encode(
        x=alt.X('resource:N', sort=resource_order).title('Resource'),
        y=alt.Y('rel_frequency:Q').title('Relative frequency of cases'),
        color=alt.Color('case_type:N').title('Case type')
    )
)

### Perform a statistical test to identify resources with a significantly higher workload (on special cases)

In [247]:
# Case type whose share in each resource's workload is examined.
sel_ct = 'special loan'
data = res_workload[res_workload['case_type'] == sel_ct]

# Baseline: among all cases that involve at least one selected resource, the
# fraction of cases that are of the selected case type.
# A case is "involved" if any of its events was executed by a selected
# resource; events without a case id are ignored.
involved_cases = set(
    log.loc[log['resource'].isin(sel_resources), 'case_id'].dropna()
)
# Number of distinct involved cases per case type.
cases_per_type = (
    log[log['case_id'].isin(involved_cases)]
    .groupby('case_type')['case_id']
    .nunique()
)
overall_rel_frequency = cases_per_type.loc[sel_ct] / cases_per_type.sum()
print(overall_rel_frequency)

# Scatter plot of the selected resources: share of `sel_ct` cases (x) against
# the total number of cases the resource took part in (y).
alt.Chart(
    data
).mark_circle().encode(
    x=alt.X('rel_frequency:Q').title(f'Relative frequency of {sel_ct} cases'),
    y=alt.Y('total_num_cases:Q').title('Total number of participated cases'),
    color=alt.Color('resource:N').title('Selected resources (most active)')
).properties(width=500)

0.05385556915544676


## Rerouting-PWS-2