# Social Borrowing

## Description

When agents manage to get other agents to do (some of) their work for them without them being credited or acknowledged for it and without it being part of the work that these agents are expected to contribute to.

In [66]:
import pandas as pd
import numpy as np
import altair as alt
alt.data_transformers.enable('vegafusion')

DataTransformerRegistry.enable('vegafusion')

In [67]:
log_name = 'propr'

fn_logs = {
    'bpic15': '../data/preproc/bpic15.csv',
    'bpic17': '../data/preproc/bpic17.csv',
    'propr': '../data/preproc/proprietary.csv'
}

In [68]:
log = pd.read_csv(fn_logs[log_name])
log

Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,"School holidays; 0 = no, 1 = yes",Approval; 1 = low sum to 4 = high sum,Type of loan; 0 = normal; 1 = special,Cost
0,Approval_Branch,005141EEB1240B31C12577DF004F6A77,18/11/2010 15:27,18/11/2010 15:58,004-9-1,Thursday,0.0,2.0,0.0,"0,87 € per minute"
1,Precheck,005141EEB1240B31C12577DF004F6A77,19/11/2010 12:45,19/11/2010 12:46,000-3-01,Friday,0.0,2.0,0.0,"0,87 € per minute"
2,Precheck,005141EEB1240B31C12577DF004F6A77,24/11/2010 8:18,24/11/2010 8:26,000-2-01,Wednesday,0.0,2.0,0.0,"0,87 € per minute"
3,Check_of_Processing_Applications,005141EEB1240B31C12577DF004F6A77,24/11/2010 10:35,24/11/2010 10:35,000-2-01,Wednesday,0.0,2.0,0.0,"0,87 € per minute"
4,Processing_of_Applications,005141EEB1240B31C12577DF004F6A77,2/12/2010 10:46,2/12/2010 10:46,010-23-11,Thursday,0.0,2.0,0.0,"1,02 € per minute"
...,...,...,...,...,...,...,...,...,...,...
18440,Precheck,FFFF329EF772D73CC12577EA00534A1C,13/12/2010 11:38,13/12/2010 11:40,000-2-01,Monday,0.0,2.0,0.0,"0,87 € per minute"
18441,Check_of_Processing_Applications,FFFF329EF772D73CC12577EA00534A1C,13/12/2010 12:06,13/12/2010 12:14,000-2-01,Monday,0.0,2.0,0.0,"0,87 € per minute"
18442,Processing_of_Applications,FFFF329EF772D73CC12577EA00534A1C,22/12/2010 11:48,22/12/2010 11:48,010-23-07,Wednesday,1.0,2.0,0.0,"1,02 € per minute"
18443,Archieving,FFFF329EF772D73CC12577EA00534A1C,22/12/2010 11:49,22/12/2010 13:19,010-23-07,Wednesday,1.0,2.0,0.0,"1,02 € per minute"


## Social-SB-1

In [69]:
# convert to timestamps to calculate case/activity duration

log = log.sort_values(by='start timestamp')
log['start timestamp'] = pd.to_datetime(log['start timestamp'], format='mixed', dayfirst=True)
log['complete timestamp'] = pd.to_datetime(log['complete timestamp'], format='mixed', dayfirst=True)
log['activity_duration_minutes'] = (
    log['complete timestamp'] - log['start timestamp']
).dt.total_seconds() / 60

### Determine the attendance of resources


In [70]:
res_attendance = dict()
for res, events in log.groupby('resource'):
    res_attendance[res] = [
        pd.Interval(e['start timestamp'], e['complete timestamp'], closed='both')
        for i, e in events.iterrows()
    ]

### Discover co-presence of resources

- Direct co-presence: resources worked in the same case
- Indirect co-presence: working time of resources overlap

In [71]:
# Calculate direct co-presence
if False:
    direct_coapp = []
    for case, events in log.groupby('case_id'):
        case_team = set(events['resource'].unique())
        for i, e in events.iterrows():
            direct_coapp.append((
                i, 
                ','.join(sorted(case_team.difference({e['resource']})))
            ))
    direct_coapp = pd.DataFrame(
        direct_coapp, columns=['index', 'direct_copresenting_resources']
    )
    direct_coapp = direct_coapp.set_index(keys='index', drop=True)
    log = pd.merge(log, direct_coapp, left_index=True, right_index=True)

In [72]:
# Calculate indirect co-presence:
if False:
    all_resources = set(log['resource'].unique())
    indirect_coapp = []
    for res, events in log.groupby('resource'):
        for i, e in events.iterrows():
            overlapped_resources = set()
            interval = pd.Interval(
                e['start timestamp'], e['complete timestamp'], closed='both'
            )
            for other in all_resources.difference({res}):
                for x in res_attendance[other]:
                    if interval.overlaps(x):
                        overlapped_resources.add(other)
                        break
            indirect_coapp.append((i, ','.join(sorted(overlapped_resources))))
    indirect_coapp = pd.DataFrame(
        indirect_coapp, columns=['index', 'indirect_copresenting_resources']
    )
    indirect_coapp = indirect_coapp.set_index(keys='index', drop=True)
    log = pd.merge(log, indirect_coapp, left_index=True, right_index=True)

In [73]:
# log.to_csv(fn_logs[log_name] + '.coapp.csv', index=False)
log = pd.read_csv(fn_logs[log_name] + '.coapp.csv')
log

Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,"School holidays; 0 = no, 1 = yes",Approval; 1 = low sum to 4 = high sum,Type of loan; 0 = normal; 1 = special,Cost,activity_duration_minutes,direct_copresenting_resources,indirect_copresenting_resources
0,Preparation_Draft_Executive_Board,6C9B95FFEBE57A0DC125782A002A3229,2011-02-01 10:01:00,2011-02-01 10:02:00,001-17-02,Tuesday,0.0,4.0,0.0,"2,04 € per minute",1.0,"010-23-07,010-23-13,070-15-02","010-23-11,010-24-01,010-24-23,010-24-26,070-10..."
1,Administrative_Tasks,08BC7C266A535237C1257826002ADD6E,2011-02-01 10:02:00,2011-02-01 10:51:00,010-24-26,Tuesday,0.0,1.0,1.0,"1,13 € per minute",49.0,"001-9-01,010-24-23","000-2-01,001-17-02,010-23-11,010-24-01,010-24-..."
2,Processing_of_Applications_Sonderkredit,08BC7C266A535237C1257826002ADD6E,2011-02-01 10:02:00,2011-02-01 10:02:00,010-24-26,Tuesday,0.0,1.0,1.0,"1,13 € per minute",0.0,"001-9-01,010-24-23","001-17-02,010-23-11,010-24-01,010-24-23,070-10..."
3,End,14050B07C84527C5C1257824002E58A8,2011-02-01 10:03:00,2011-02-01 10:03:00,010-23-11,Tuesday,0.0,3.0,0.0,"1,02 € per minute",0.0,"000-2-01,000-3-01,010-25-01,110-8-3","001-17-02,010-24-01,010-24-23,010-24-26,050-8-..."
4,Approval_Branch,3C3A3DDEF08D2225C1257829004E319B,2011-02-01 10:03:00,2011-02-01 10:09:00,050-8-1,Tuesday,0.0,2.0,0.0,"0,87 € per minute",6.0,"000-3-01,010-23-02,010-23-06,010-23-13","000-2-01,001-17-02,010-23-11,010-24-01,010-24-..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18440,Precheck,AB750FD25DEABED1C12577F300518367,2010-12-09 09:26:00,2010-12-09 10:13:00,070-7-06,Thursday,0.0,1.0,0.0,"1,02 € per minute",47.0,"010-23-13,010-24-26","010-24-01,010-24-17,010-24-19,010-7-04,010-8-1..."
18441,Precheck,059B8D0F8B58DFDEC12577F4002FAB94,2010-12-09 09:40:00,2010-12-09 14:18:00,093-18-1,Thursday,0.0,1.0,0.0,"0,87 € per minute",278.0,"010-24-04,010-24-26","001-15-03,001-5-01,001-6-02,001-7-02,010-23-02..."
18442,Precheck,401C68F8E2854EA7C12577F400309E68,2010-12-09 09:51:00,2010-12-09 09:54:00,020-9-1,Thursday,0.0,1.0,0.0,"0,87 € per minute",3.0,"010-23-13,010-24-26","010-24-17,010-24-19,010-8-1,070-7-06,093-18-1"
18443,Second_vote_KfW,DB060AF533AF5D13C1257735002E3378,2010-12-09 09:55:00,2010-12-09 09:55:00,010-24-01,Thursday,0.0,1.0,0.0,"1,13 € per minute",0.0,"010-24-17,010-24-19,010-24-23","010-24-17,010-24-19,010-8-1,070-7-06,093-18-1"


In [74]:
# determine the activity duration distribution stats
act_dur_stats = []
for act, events in log.groupby('activity'):
    act_dur_stats.append({
        'activity': act,
        'duration_q3': events['activity_duration_minutes'].quantile(0.75),
        'duration_q1': events['activity_duration_minutes'].quantile(0.25),
    })
act_dur_stats = pd.DataFrame(act_dur_stats)
act_dur_stats['duration_ub_outlier'] = (
    act_dur_stats['duration_q3'] + 
    1.5 * (act_dur_stats['duration_q3'] - act_dur_stats['duration_q1'])
)
act_dur_stats

Unnamed: 0,activity,duration_q3,duration_q1,duration_ub_outlier
0,Administrative_Tasks,33.0,5.0,75.0
1,Application_Processing_Branches,30.0,2.0,72.0
2,Approval,11.0,2.0,24.5
3,Approval_Branch,39.0,3.0,93.0
4,Approval_Executive_Board,2.0,1.0,3.5
5,Archieving,64.0,6.0,151.0
6,Check_of_Approval,16.5,2.0,38.25
7,Check_of_Documents,73.25,30.0,138.125
8,Check_of_Processing_Applications,10.0,2.0,22.0
9,End,0.0,0.0,0.0


In [75]:
log = log.merge(act_dur_stats, how='left', on='activity')
log['is_low_performance'] = (
    log['activity_duration_minutes'] > log['duration_ub_outlier']
)
log

Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,"School holidays; 0 = no, 1 = yes",Approval; 1 = low sum to 4 = high sum,Type of loan; 0 = normal; 1 = special,Cost,activity_duration_minutes,direct_copresenting_resources,indirect_copresenting_resources,duration_q3,duration_q1,duration_ub_outlier,is_low_performance
0,Preparation_Draft_Executive_Board,6C9B95FFEBE57A0DC125782A002A3229,2011-02-01 10:01:00,2011-02-01 10:02:00,001-17-02,Tuesday,0.0,4.0,0.0,"2,04 € per minute",1.0,"010-23-07,010-23-13,070-15-02","010-23-11,010-24-01,010-24-23,010-24-26,070-10...",16.75,2.0,38.875,False
1,Administrative_Tasks,08BC7C266A535237C1257826002ADD6E,2011-02-01 10:02:00,2011-02-01 10:51:00,010-24-26,Tuesday,0.0,1.0,1.0,"1,13 € per minute",49.0,"001-9-01,010-24-23","000-2-01,001-17-02,010-23-11,010-24-01,010-24-...",33.00,5.0,75.000,False
2,Processing_of_Applications_Sonderkredit,08BC7C266A535237C1257826002ADD6E,2011-02-01 10:02:00,2011-02-01 10:02:00,010-24-26,Tuesday,0.0,1.0,1.0,"1,13 € per minute",0.0,"001-9-01,010-24-23","001-17-02,010-23-11,010-24-01,010-24-23,070-10...",1.00,0.0,2.500,False
3,End,14050B07C84527C5C1257824002E58A8,2011-02-01 10:03:00,2011-02-01 10:03:00,010-23-11,Tuesday,0.0,3.0,0.0,"1,02 € per minute",0.0,"000-2-01,000-3-01,010-25-01,110-8-3","001-17-02,010-24-01,010-24-23,010-24-26,050-8-...",0.00,0.0,0.000,False
4,Approval_Branch,3C3A3DDEF08D2225C1257829004E319B,2011-02-01 10:03:00,2011-02-01 10:09:00,050-8-1,Tuesday,0.0,2.0,0.0,"0,87 € per minute",6.0,"000-3-01,010-23-02,010-23-06,010-23-13","000-2-01,001-17-02,010-23-11,010-24-01,010-24-...",39.00,3.0,93.000,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18440,Precheck,AB750FD25DEABED1C12577F300518367,2010-12-09 09:26:00,2010-12-09 10:13:00,070-7-06,Thursday,0.0,1.0,0.0,"1,02 € per minute",47.0,"010-23-13,010-24-26","010-24-01,010-24-17,010-24-19,010-7-04,010-8-1...",16.50,1.0,39.750,True
18441,Precheck,059B8D0F8B58DFDEC12577F4002FAB94,2010-12-09 09:40:00,2010-12-09 14:18:00,093-18-1,Thursday,0.0,1.0,0.0,"0,87 € per minute",278.0,"010-24-04,010-24-26","001-15-03,001-5-01,001-6-02,001-7-02,010-23-02...",16.50,1.0,39.750,True
18442,Precheck,401C68F8E2854EA7C12577F400309E68,2010-12-09 09:51:00,2010-12-09 09:54:00,020-9-1,Thursday,0.0,1.0,0.0,"0,87 € per minute",3.0,"010-23-13,010-24-26","010-24-17,010-24-19,010-8-1,070-7-06,093-18-1",16.50,1.0,39.750,False
18443,Second_vote_KfW,DB060AF533AF5D13C1257735002E3378,2010-12-09 09:55:00,2010-12-09 09:55:00,010-24-01,Thursday,0.0,1.0,0.0,"1,13 € per minute",0.0,"010-24-17,010-24-19,010-24-23","010-24-17,010-24-19,010-8-1,070-7-06,093-18-1",24.00,1.0,58.500,False


### Profile resource performance workload and productivity

### Find correlation between co-presence periods and decreased performance (and increased workload) of resources

In [76]:
all_resources = sorted(log['resource'].unique())
col_copresenting_res = 'direct_copresenting_resources'
coapp_normal = np.zeros((len(all_resources), len(all_resources)))
coapp_low_perf = np.zeros((len(all_resources), len(all_resources)))
log = log.fillna('')
for res, events in log.groupby('resource'):
    px = all_resources.index(res)
    for i, e in events.iterrows():
        if e[col_copresenting_res] != '':
            for r in e[col_copresenting_res].split(','):
                py = all_resources.index(r)
                if e['is_low_performance']:
                    coapp_low_perf[px,py] += 1
                else:
                    coapp_normal[px,py] += 1

coapp_all = coapp_normal + coapp_low_perf
coapp_low_perf /= coapp_all
# convert matrices into dataframe
coapp_all = pd.DataFrame.from_dict(coapp_all)
coapp_all['resource'] = all_resources
coapp_all = coapp_all.set_index('resource')
coapp_all = coapp_all.rename(columns=dict(
    (k, all_resources[k])
    for k in range(len(all_resources))
))
coapp_all = coapp_all.reset_index().melt(id_vars='resource').rename(columns={'variable': 'copresenting_resource', 'value': 'num_while_copresenting'})
coapp_all = coapp_all.replace(0, np.nan)
# coapp_all

coapp_low_perf = pd.DataFrame.from_dict(coapp_low_perf)
coapp_low_perf['resource'] = all_resources
coapp_low_perf = coapp_low_perf.set_index('resource')
coapp_low_perf = coapp_low_perf.rename(columns=dict(
    (k, all_resources[k]) 
    for k in range(len(all_resources))
))
coapp_low_perf = coapp_low_perf.reset_index().melt(id_vars='resource').rename(columns={'variable': 'copresenting_resource', 'value': 'pct_low_perf_while_copresenting'})
coapp_low_perf = coapp_low_perf.replace(0, np.nan)
# coapp_low_perf

  coapp_low_perf /= coapp_all


In [77]:
sel_resources = [
    r for r in log['resource'].unique()
    if r.startswith('010-24')
]
coapp_all = coapp_all[
    (coapp_all['resource'].isin(sel_resources)) & 
    (coapp_all['copresenting_resource'].isin(sel_resources))
]
coapp_low_perf = coapp_low_perf[
    (coapp_low_perf['resource'].isin(sel_resources)) & 
    (coapp_low_perf['copresenting_resource'].isin(sel_resources))
]

In [78]:
alt.hconcat(
    alt.Chart(coapp_all).mark_rect().encode(
        x='resource:N',
        y='copresenting_resource:N',
        color=alt.Color('num_while_copresenting:Q').scale(scheme='lightgreyred'),
        tooltip=['resource', 'copresenting_resource', 'num_while_copresenting']
    ),
    alt.Chart(coapp_low_perf).mark_rect().encode(
        x='resource:N',
        y='copresenting_resource:N',
        color=alt.Color('pct_low_perf_while_copresenting:Q').scale(scheme='lightgreyred'),
        tooltip=['resource', 'copresenting_resource', 'pct_low_perf_while_copresenting']
    )
).resolve_scale(color='independent')