# Preferential Work Selection

## Description
Agents specifically choose process instances or work items with particular characteristics.

In [1]:
import pandas as pd
import numpy as np
import altair as alt
alt.data_transformers.enable('vegafusion')
import editdistance

In [2]:
# Key of the event log to analyse; it must be one of the keys of `fn_logs`.
log_name = 'propr_test'

# Relative path of the preprocessed CSV file for every available event log.
# The '_ref' / '_test' entries point at the 70 % / 30 % split files.
fn_logs = dict([
    ('bpic15', '../data/preproc/bpic15.csv'),
    ('bpic15_ref', '../data/preproc/bpic15_train-70.csv'),
    ('bpic15_test', '../data/preproc/bpic15_test-30.csv'),
    ('bpic17', '../data/preproc/bpic2017.csv'),
    ('propr', '../data/preproc/proprietary.csv'),
    ('propr_ref', '../data/preproc/proprietary_train-70.csv'),
    ('propr_test', '../data/preproc/proprietary_test-30.csv'),
])

In [3]:
# Load the event log chosen via `log_name` from its preprocessed CSV file.
log_path = fn_logs[log_name]
log = pd.read_csv(log_path)

# Number of distinct cases (process instances) contained in the log.
print(log['case_id'].nunique())

# Show the loaded events.
log

519


Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,"School holidays; 0 = no, 1 = yes",Approval; 1 = low sum to 4 = high sum,Type of loan; 0 = normal; 1 = special,Cost,_start timestamp,_complete timestamp
0,Precheck,0051574BDF30D681C1257888004DC40D,6/05/2011 16:09,6/05/2011 16:27,090-10-02,Friday,0.0,1.0,0.0,"0,87 € per minute",2011-05-06 16:09:00,2011-05-06 16:27:00
1,Application_Processing_Branches,0051574BDF30D681C1257888004DC40D,9/05/2011 11:14,9/05/2011 11:16,010-23-13,Monday,0.0,1.0,0.0,"1,02 € per minute",2011-05-09 11:14:00,2011-05-09 11:16:00
2,Precheck,0051574BDF30D681C1257888004DC40D,9/05/2011 15:44,9/05/2011 15:58,090-10-02,Monday,0.0,1.0,0.0,"0,87 € per minute",2011-05-09 15:44:00,2011-05-09 15:58:00
3,Processing_Incomplete_Orders,0051574BDF30D681C1257888004DC40D,9/05/2011 16:10,9/05/2011 16:10,010-23-13,Monday,0.0,1.0,0.0,"1,02 € per minute",2011-05-09 16:10:00,2011-05-09 16:10:00
4,Application_Processing_Branches,0051574BDF30D681C1257888004DC40D,10/05/2011 8:52,10/05/2011 9:32,010-23-13,Tuesday,0.0,1.0,0.0,"1,02 € per minute",2011-05-10 08:52:00,2011-05-10 09:32:00
...,...,...,...,...,...,...,...,...,...,...,...,...
5285,Precheck,FE3E1E0D3928202EC1257896004B8F00,20/05/2011 17:24,20/05/2011 17:26,000-3-01,Friday,0.0,2.0,0.0,"0,87 € per minute",2011-05-20 17:24:00,2011-05-20 17:26:00
5286,Check_of_Processing_Applications,FE3E1E0D3928202EC1257896004B8F00,23/05/2011 15:53,23/05/2011 16:06,000-2-01,Monday,0.0,2.0,0.0,"0,87 € per minute",2011-05-23 15:53:00,2011-05-23 16:06:00
5287,Processing_of_Applications,FE3E1E0D3928202EC1257896004B8F00,26/05/2011 9:25,26/05/2011 9:25,010-23-07,Thursday,0.0,2.0,0.0,"1,02 € per minute",2011-05-26 09:25:00,2011-05-26 09:25:00
5288,Archieving,FE3E1E0D3928202EC1257896004B8F00,26/05/2011 10:38,26/05/2011 10:47,010-23-07,Thursday,0.0,2.0,0.0,"1,02 € per minute",2011-05-26 10:38:00,2011-05-26 10:47:00


### Apply the given criteria to categorize case types

Select cases (process instances) with particular characteristics


In [4]:
# proprietary log: a case type is derived from the loan-type flag of each event.
# Every value other than 1 (including missing values) is labelled 'normal loan'.
#
# Alternative categorisation (not active): use the column
# 'Approval; 1 = low sum to 4 = high sum' instead, either unchanged or binned
# into 'low-med' (value <= 2) and 'med-high'.
is_special_loan = log['Type of loan; 0 = normal; 1 = special'] == 1
log['case_type'] = np.where(is_special_loan, 'special loan', 'normal loan')

# Distinct case types in order of first appearance in the log.
case_types = log['case_type'].unique().tolist()
log

Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,"School holidays; 0 = no, 1 = yes",Approval; 1 = low sum to 4 = high sum,Type of loan; 0 = normal; 1 = special,Cost,_start timestamp,_complete timestamp,case_type
0,Precheck,0051574BDF30D681C1257888004DC40D,6/05/2011 16:09,6/05/2011 16:27,090-10-02,Friday,0.0,1.0,0.0,"0,87 € per minute",2011-05-06 16:09:00,2011-05-06 16:27:00,normal loan
1,Application_Processing_Branches,0051574BDF30D681C1257888004DC40D,9/05/2011 11:14,9/05/2011 11:16,010-23-13,Monday,0.0,1.0,0.0,"1,02 € per minute",2011-05-09 11:14:00,2011-05-09 11:16:00,normal loan
2,Precheck,0051574BDF30D681C1257888004DC40D,9/05/2011 15:44,9/05/2011 15:58,090-10-02,Monday,0.0,1.0,0.0,"0,87 € per minute",2011-05-09 15:44:00,2011-05-09 15:58:00,normal loan
3,Processing_Incomplete_Orders,0051574BDF30D681C1257888004DC40D,9/05/2011 16:10,9/05/2011 16:10,010-23-13,Monday,0.0,1.0,0.0,"1,02 € per minute",2011-05-09 16:10:00,2011-05-09 16:10:00,normal loan
4,Application_Processing_Branches,0051574BDF30D681C1257888004DC40D,10/05/2011 8:52,10/05/2011 9:32,010-23-13,Tuesday,0.0,1.0,0.0,"1,02 € per minute",2011-05-10 08:52:00,2011-05-10 09:32:00,normal loan
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5285,Precheck,FE3E1E0D3928202EC1257896004B8F00,20/05/2011 17:24,20/05/2011 17:26,000-3-01,Friday,0.0,2.0,0.0,"0,87 € per minute",2011-05-20 17:24:00,2011-05-20 17:26:00,normal loan
5286,Check_of_Processing_Applications,FE3E1E0D3928202EC1257896004B8F00,23/05/2011 15:53,23/05/2011 16:06,000-2-01,Monday,0.0,2.0,0.0,"0,87 € per minute",2011-05-23 15:53:00,2011-05-23 16:06:00,normal loan
5287,Processing_of_Applications,FE3E1E0D3928202EC1257896004B8F00,26/05/2011 9:25,26/05/2011 9:25,010-23-07,Thursday,0.0,2.0,0.0,"1,02 € per minute",2011-05-26 09:25:00,2011-05-26 09:25:00,normal loan
5288,Archieving,FE3E1E0D3928202EC1257896004B8F00,26/05/2011 10:38,26/05/2011 10:47,010-23-07,Thursday,0.0,2.0,0.0,"1,02 € per minute",2011-05-26 10:38:00,2011-05-26 10:47:00,normal loan


### Add the department and role information on resources

In [5]:
# proprietary log: resource IDs are '-'-separated. The first part is taken as
# the department, the first two parts together as the department-role.
resource_parts = log['resource'].apply(lambda resource_id: resource_id.split('-'))

log['department'] = resource_parts.apply(lambda parts: parts[0])
log['department_role'] = resource_parts.apply(
    lambda parts: parts[0] + '-' + parts[1]
)
log

Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,"School holidays; 0 = no, 1 = yes",Approval; 1 = low sum to 4 = high sum,Type of loan; 0 = normal; 1 = special,Cost,_start timestamp,_complete timestamp,case_type,department,department_role
0,Precheck,0051574BDF30D681C1257888004DC40D,6/05/2011 16:09,6/05/2011 16:27,090-10-02,Friday,0.0,1.0,0.0,"0,87 € per minute",2011-05-06 16:09:00,2011-05-06 16:27:00,normal loan,090,090-10
1,Application_Processing_Branches,0051574BDF30D681C1257888004DC40D,9/05/2011 11:14,9/05/2011 11:16,010-23-13,Monday,0.0,1.0,0.0,"1,02 € per minute",2011-05-09 11:14:00,2011-05-09 11:16:00,normal loan,010,010-23
2,Precheck,0051574BDF30D681C1257888004DC40D,9/05/2011 15:44,9/05/2011 15:58,090-10-02,Monday,0.0,1.0,0.0,"0,87 € per minute",2011-05-09 15:44:00,2011-05-09 15:58:00,normal loan,090,090-10
3,Processing_Incomplete_Orders,0051574BDF30D681C1257888004DC40D,9/05/2011 16:10,9/05/2011 16:10,010-23-13,Monday,0.0,1.0,0.0,"1,02 € per minute",2011-05-09 16:10:00,2011-05-09 16:10:00,normal loan,010,010-23
4,Application_Processing_Branches,0051574BDF30D681C1257888004DC40D,10/05/2011 8:52,10/05/2011 9:32,010-23-13,Tuesday,0.0,1.0,0.0,"1,02 € per minute",2011-05-10 08:52:00,2011-05-10 09:32:00,normal loan,010,010-23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5285,Precheck,FE3E1E0D3928202EC1257896004B8F00,20/05/2011 17:24,20/05/2011 17:26,000-3-01,Friday,0.0,2.0,0.0,"0,87 € per minute",2011-05-20 17:24:00,2011-05-20 17:26:00,normal loan,000,000-3
5286,Check_of_Processing_Applications,FE3E1E0D3928202EC1257896004B8F00,23/05/2011 15:53,23/05/2011 16:06,000-2-01,Monday,0.0,2.0,0.0,"0,87 € per minute",2011-05-23 15:53:00,2011-05-23 16:06:00,normal loan,000,000-2
5287,Processing_of_Applications,FE3E1E0D3928202EC1257896004B8F00,26/05/2011 9:25,26/05/2011 9:25,010-23-07,Thursday,0.0,2.0,0.0,"1,02 € per minute",2011-05-26 09:25:00,2011-05-26 09:25:00,normal loan,010,010-23
5288,Archieving,FE3E1E0D3928202EC1257896004B8F00,26/05/2011 10:38,26/05/2011 10:47,010-23-07,Thursday,0.0,2.0,0.0,"1,02 € per minute",2011-05-26 10:38:00,2011-05-26 10:47:00,normal loan,010,010-23


In [6]:
# IDs of the resources that belong to department-role '010-24'.
is_selected_role = log['department_role'].eq('010-24')
sel_resources = log.loc[is_selected_role, 'resource'].unique()

## Rerouting-PWS-1

### Profile resource workload by the number of cases they were involved in

In [7]:
# One record per (resource, case type) pair: the number of distinct cases of
# that type the resource took part in, next to the resource's overall number
# of distinct cases.
workload_records = [
    {
        'resource': resource_id,
        'case_type': case_type,
        'frequency': (
            resource_events
            .loc[resource_events['case_type'] == case_type, 'case_id']
            .nunique()
        ),
        'total_num_cases': resource_events['case_id'].nunique(),
    }
    for resource_id, resource_events in log.groupby('resource')
    for case_type in case_types
]

# Add the share of each case type and keep only the selected resources.
res_workload = (
    pd.DataFrame(workload_records)
    .assign(rel_frequency=lambda df: df['frequency'] / df['total_num_cases'])
    .loc[lambda df: df['resource'].isin(sel_resources)]
)
res_workload

Unnamed: 0,resource,case_type,frequency,total_num_cases,rel_frequency
98,010-24-01,normal loan,53,61,0.868852
99,010-24-01,special loan,8,61,0.131148
100,010-24-02,normal loan,32,34,0.941176
101,010-24-02,special loan,2,34,0.058824
102,010-24-03,normal loan,1,1,1.0
103,010-24-03,special loan,0,1,0.0
104,010-24-04,normal loan,30,30,1.0
105,010-24-04,special loan,0,30,0.0
106,010-24-11,normal loan,8,8,1.0
107,010-24-11,special loan,0,8,0.0


In [15]:
# Plot, per resource, the share of each case type as a normalised stacked bar.
data = res_workload
workload_chart = (
    alt.Chart(data, title='Workload of resources in role 010-24 by case types')
    .mark_bar()
    .encode(
        y=alt.Y('resource:N', title='resource ID'),
        x=alt.X('frequency:Q', title='Percentage of cases', stack='normalize'),
        color=alt.Color('case_type:N', title='case type'),
    )
    .properties(height=150, width=300)
)
workload_chart


### Perform a statistical test to identify resources whose workload across case types deviates significantly from the reference

In [9]:
from scipy.stats import chisquare

# Significance level used to flag a resource.
ALPHA = 0.05

# Case types in one fixed (sorted) order, computed once, so that observed and
# expected frequency vectors are aligned position by position.
sorted_case_types = sorted(log['case_type'].unique())


def count_cases_by_type(events):
    """Return the number of distinct cases per case type found in `events`.

    The returned array follows the order of `sorted_case_types`.
    """
    return np.array([
        events.loc[events['case_type'] == ct, 'case_id'].nunique()
        for ct in sorted_case_types
    ])


# log level reference: share of every case type among all cases of the log
# (kept as an alternative reference; the test below uses the department-role one)
freq_ct_exp = log[['case_id', 'case_type']].drop_duplicates().groupby('case_type').size()
freq_ct_exp = (freq_ct_exp / log['case_id'].nunique()).to_numpy()

# department-role level reference: share of every case type among the cases
# handled by each department-role
freq_ct_exp_dept = {
    department_role: count_cases_by_type(events) / events['case_id'].nunique()
    for department_role, events in log.groupby('department_role')
}

# Chi-square goodness-of-fit test per selected resource: does the resource's
# case-type mix deviate from the mix of its department-role?
# NOTE: the chi-square approximation is unreliable when expected counts are
# small (a common rule of thumb is at least 5 per category), so flagged
# resources with few cases should be interpreted with care.
for res, events in log.groupby('resource'):
    if res not in sel_resources:
        continue
    department_role = '-'.join(res.split('-')[0:2])
    freq_ct_res = count_cases_by_type(events)
    # expected counts = reference shares scaled to this resource's case total
    freq_ref_ct_exp = freq_ct_exp_dept[department_role] * events['case_id'].nunique()

    # A case type the department-role never handled has an expected count of
    # zero (and therefore an observed count of zero for each of its resources);
    # it would turn the statistic into 0/0 = NaN. Drop such categories and skip
    # the resource when fewer than two categories remain to be compared.
    testable = freq_ref_ct_exp > 0
    if testable.sum() < 2:
        continue

    test_result = chisquare(
        f_obs=freq_ct_res[testable],
        f_exp=freq_ref_ct_exp[testable]
    )
    if test_result.pvalue < ALPHA:
        print(department_role)
        print(res)
        print('This resource: {}'.format(freq_ct_res))
        print('Reference: {}'.format(freq_ref_ct_exp))
        print(freq_ct_exp_dept[department_role])
        print(test_result)
        print('*' * 80)

010-24
010-24-12
This resource: [2 5]
Reference: [6.34080717 0.65919283]
[0.9058296 0.0941704]
Power_divergenceResult(statistic=31.556004580049844, pvalue=1.93770274447675e-08)
********************************************************************************


## Rerouting-PWS-2

Conclusion: The different ordering of cases by resources does not seem to be
related to the different case types (normal vs. special loan).

### Extract the order of case initiation

In [10]:
# Parse the textual, day-first timestamps into datetime values.
for ts_col in ('start timestamp', 'complete timestamp'):
    log[ts_col] = pd.to_datetime(log[ts_col], format='mixed', dayfirst=True)

# Case initiation order: case IDs sorted by the start time of their earliest event.
first_events = (
    log
    .sort_values(by='start timestamp', ascending=True)
    .drop_duplicates(subset='case_id')
)
order_case_init = first_events['case_id'].tolist()

### Extract the case ordering of each resource
### Measure the difference between each resource's case ordering and the case initiation order

In [11]:
# Position of every case in the initiation order. A dict lookup replaces the
# repeated linear `order_case_init.index(...)` scans.
case_init_position = {
    case_id: position for position, case_id in enumerate(order_case_init)
}

res_order_edit_dist = list()

for res, events in log.groupby('resource'):
    # order in which this resource first touched each of its cases
    order_case_res = (
        events
        .sort_values(by='start timestamp', ascending=True)
        .drop_duplicates(subset='case_id')['case_id']
        .to_list()
    )
    if len(order_case_res) > 1:
        indices = [case_init_position[x] for x in order_case_res]

        # A case counts as prioritized when the resource handled it before
        # some case that was initiated earlier, i.e. when its initiation index
        # is larger than the smallest index among the cases handled after it.
        # Scanning backwards keeps that running minimum in a single pass.
        num_prioritized = 0
        earliest_later = float('inf')
        for i in reversed(indices):
            if i > earliest_later:
                num_prioritized += 1
            earliest_later = min(earliest_later, i)

        res_order_edit_dist.append({
            'resource': res,
            # edit distance between the resource's order and the same cases
            # in initiation order, normalized by the number of cases
            'normalized_edit_dist': (
                editdistance.eval(sorted(indices), indices)
                / len(order_case_res)
            ),
            'pct_prioritized_cases': num_prioritized / len(order_case_res)
        })

# Explicit columns keep the frame usable even when no resource qualified.
res_order_edit_dist = pd.DataFrame(
    res_order_edit_dist,
    columns=['resource', 'normalized_edit_dist', 'pct_prioritized_cases']
)
res_order_edit_dist = res_order_edit_dist[res_order_edit_dist['resource'].isin(sel_resources)]

### Identify resources with the most significantly different case ordering

In [12]:
# One box plot per ordering metric, stacked vertically.
metric_boxplots = [
    alt.Chart(res_order_edit_dist).mark_boxplot().encode(x=alt.X(metric))
    for metric in ('normalized_edit_dist', 'pct_prioritized_cases')
]
alt.vconcat(*metric_boxplots)

In [13]:
def upper_whisker_end(values):
    """Return Q3 + 1.5 * IQR of `values`, the usual box-plot outlier bound."""
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    return q3 + 1.5 * (q3 - q1)


# use Q3 + 1.5IQR (upper whisker end) as threshold
threshold_ned = upper_whisker_end(res_order_edit_dist['normalized_edit_dist'])
threshold_ppc = upper_whisker_end(res_order_edit_dist['pct_prioritized_cases'])

# Resources whose case ordering deviates unusually strongly from the
# initiation order. The edit distance must be compared against its own
# threshold (`threshold_ned`); it was previously compared against the
# threshold of the other metric. `threshold_ppc` is kept for inspection of
# the 'pct_prioritized_cases' metric.
outlier_resources = res_order_edit_dist.loc[
    res_order_edit_dist['normalized_edit_dist'] > threshold_ned,
    'resource'
].tolist()
outlier_resources

[]

### Test if the prioritized cases are statistically dependent on case type

In [14]:
# Case type of every case, keyed by case ID.
case_types_mapping = (
    log[['case_id', 'case_type']]
    .drop_duplicates()
    .set_index('case_id')['case_type']
    .to_dict()
)

# Position of every case in the initiation order. A dict lookup replaces the
# repeated linear `order_case_init.index(...)` scans.
case_init_position = {
    case_id: position for position, case_id in enumerate(order_case_init)
}

# For every outlier resource, print the case types of its prioritized cases.
for res, events in log.groupby('resource'):
    if res in outlier_resources:
        print('*' * 80)
        print(res)
        # order in which this resource first touched each of its cases
        order_case_res = (
            events
            .sort_values(by='start timestamp', ascending=True)
            .drop_duplicates(subset='case_id')['case_id']
            .to_list()
        )
        indices = [case_init_position[x] for x in order_case_res]

        # A case counts as prioritized when the resource handled it before
        # some case that was initiated earlier. Scan backwards while tracking
        # the smallest initiation index seen so far, then restore the
        # resource's handling order.
        prioritized_case_ids = []
        earliest_later = float('inf')
        for i in reversed(indices):
            if i > earliest_later:
                prioritized_case_ids.append(order_case_init[i])
            earliest_later = min(earliest_later, i)
        prioritized_case_ids.reverse()

        print([case_types_mapping[c] for c in prioritized_case_ids])