# Social Borrowing

## Description

When agents manage to get other agents to do (some of) their work for them without them being credited or acknowledged for it and without it being part of the work that these agents are expected to contribute to.

In [1]:
import pandas as pd
import numpy as np
import altair as alt
alt.data_transformers.enable('vegafusion')

DataTransformerRegistry.enable('vegafusion')

In [2]:
log_name = 'propr_test'

fn_logs = {
    'propr': '../data/preproc/proprietary.csv',
    'propr_ref': '../data/preproc/propr_train-70.csv',
    'propr_test': '../data/preproc/propr_test-30.csv'
}

In [3]:
log = pd.read_csv(fn_logs[log_name])
log

Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,Cost,"School holidays; 0 = no, 1 = yes",Approval; 1 = low risk to 4 = high risk,Type of loan; 0 = normal; 1 = special,_start timestamp,_complete timestamp
0,Precheck,ID-2,6/05/2011 16:09,6/05/2011 16:27,090-10-02,Friday,"0,87 € per minute",0.0,1.0,0.0,2011-05-06 16:09:00,2011-05-06 16:27:00
1,Application_Processing_Branches,ID-2,9/05/2011 11:14,9/05/2011 11:16,010-23-13,Monday,"1,02 € per minute",0.0,1.0,0.0,2011-05-09 11:14:00,2011-05-09 11:16:00
2,Precheck,ID-2,9/05/2011 15:44,9/05/2011 15:58,090-10-02,Monday,"0,87 € per minute",0.0,1.0,0.0,2011-05-09 15:44:00,2011-05-09 15:58:00
3,Processing_Incomplete_Orders,ID-2,9/05/2011 16:10,9/05/2011 16:10,010-23-13,Monday,"1,02 € per minute",0.0,1.0,0.0,2011-05-09 16:10:00,2011-05-09 16:10:00
4,Application_Processing_Branches,ID-2,10/05/2011 8:52,10/05/2011 9:32,010-23-13,Tuesday,"1,02 € per minute",0.0,1.0,0.0,2011-05-10 08:52:00,2011-05-10 09:32:00
...,...,...,...,...,...,...,...,...,...,...,...,...
5285,Precheck,ID-1720,20/05/2011 17:24,20/05/2011 17:26,000-3-01,Friday,"0,87 € per minute",0.0,2.0,0.0,2011-05-20 17:24:00,2011-05-20 17:26:00
5286,Check_of_Processing_Applications,ID-1720,23/05/2011 15:53,23/05/2011 16:06,000-2-01,Monday,"0,87 € per minute",0.0,2.0,0.0,2011-05-23 15:53:00,2011-05-23 16:06:00
5287,Processing_of_Applications,ID-1720,26/05/2011 9:25,26/05/2011 9:25,010-23-07,Thursday,"1,02 € per minute",0.0,2.0,0.0,2011-05-26 09:25:00,2011-05-26 09:25:00
5288,Archieving,ID-1720,26/05/2011 10:38,26/05/2011 10:47,010-23-07,Thursday,"1,02 € per minute",0.0,2.0,0.0,2011-05-26 10:38:00,2011-05-26 10:47:00


In [4]:
# proprietary
log['department'] = log['resource'].apply(
    lambda x: x.split('-')[0]
)
log['department_role'] = log['resource'].apply(
    lambda x: x.split('-')[0] + '-' + x.split('-')[1]
)
log

Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,Cost,"School holidays; 0 = no, 1 = yes",Approval; 1 = low risk to 4 = high risk,Type of loan; 0 = normal; 1 = special,_start timestamp,_complete timestamp,department,department_role
0,Precheck,ID-2,6/05/2011 16:09,6/05/2011 16:27,090-10-02,Friday,"0,87 € per minute",0.0,1.0,0.0,2011-05-06 16:09:00,2011-05-06 16:27:00,090,090-10
1,Application_Processing_Branches,ID-2,9/05/2011 11:14,9/05/2011 11:16,010-23-13,Monday,"1,02 € per minute",0.0,1.0,0.0,2011-05-09 11:14:00,2011-05-09 11:16:00,010,010-23
2,Precheck,ID-2,9/05/2011 15:44,9/05/2011 15:58,090-10-02,Monday,"0,87 € per minute",0.0,1.0,0.0,2011-05-09 15:44:00,2011-05-09 15:58:00,090,090-10
3,Processing_Incomplete_Orders,ID-2,9/05/2011 16:10,9/05/2011 16:10,010-23-13,Monday,"1,02 € per minute",0.0,1.0,0.0,2011-05-09 16:10:00,2011-05-09 16:10:00,010,010-23
4,Application_Processing_Branches,ID-2,10/05/2011 8:52,10/05/2011 9:32,010-23-13,Tuesday,"1,02 € per minute",0.0,1.0,0.0,2011-05-10 08:52:00,2011-05-10 09:32:00,010,010-23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5285,Precheck,ID-1720,20/05/2011 17:24,20/05/2011 17:26,000-3-01,Friday,"0,87 € per minute",0.0,2.0,0.0,2011-05-20 17:24:00,2011-05-20 17:26:00,000,000-3
5286,Check_of_Processing_Applications,ID-1720,23/05/2011 15:53,23/05/2011 16:06,000-2-01,Monday,"0,87 € per minute",0.0,2.0,0.0,2011-05-23 15:53:00,2011-05-23 16:06:00,000,000-2
5287,Processing_of_Applications,ID-1720,26/05/2011 9:25,26/05/2011 9:25,010-23-07,Thursday,"1,02 € per minute",0.0,2.0,0.0,2011-05-26 09:25:00,2011-05-26 09:25:00,010,010-23
5288,Archieving,ID-1720,26/05/2011 10:38,26/05/2011 10:47,010-23-07,Thursday,"1,02 € per minute",0.0,2.0,0.0,2011-05-26 10:38:00,2011-05-26 10:47:00,010,010-23


## Social-SB-1

In [5]:
# convert to timestamps to calculate case/activity duration

log = log.sort_values(by='start timestamp')
log['start timestamp'] = pd.to_datetime(log['start timestamp'], format='mixed', dayfirst=True)
log['complete timestamp'] = pd.to_datetime(log['complete timestamp'], format='mixed', dayfirst=True)
log['activity_duration_minutes'] = (
    log['complete timestamp'] - log['start timestamp']
).dt.total_seconds() / 60

### Determine the attendance of resources


In [6]:
res_attendance = dict()
for res, events in log.groupby('resource'):
    res_attendance[res] = [
        pd.Interval(e['start timestamp'], e['complete timestamp'], closed='both')
        for i, e in events.iterrows()
    ]

### Discover co-presence of resources

- Direct co-presence: resources worked in the same case
- Indirect co-presence: working time of resources overlap

In [7]:
# Calculate direct co-presence
if True:
    direct_coapp = []
    for case, events in log.groupby('case_id'):
        case_team = set(events['resource'].unique())
        for i, e in events.iterrows():
            direct_coapp.append((
                i, 
                ','.join(sorted(case_team.difference({e['resource']})))
            ))
    direct_coapp = pd.DataFrame(
        direct_coapp, columns=['index', 'direct_copresenting_resources']
    )
    direct_coapp = direct_coapp.set_index(keys='index', drop=True)
    log = pd.merge(log, direct_coapp, left_index=True, right_index=True)

In [8]:
# Calculate indirect co-presence:
if True:
    all_resources = set(log['resource'].unique())
    indirect_coapp = []
    for res, events in log.groupby('resource'):
        for i, e in events.iterrows():
            overlapped_resources = set()
            interval = pd.Interval(
                e['start timestamp'], e['complete timestamp'], closed='both'
            )
            for other in all_resources.difference({res}):
                for x in res_attendance[other]:
                    if interval.overlaps(x):
                        overlapped_resources.add(other)
                        break
            indirect_coapp.append((i, ','.join(sorted(overlapped_resources))))
    indirect_coapp = pd.DataFrame(
        indirect_coapp, columns=['index', 'indirect_copresenting_resources']
    )
    indirect_coapp = indirect_coapp.set_index(keys='index', drop=True)
    log = pd.merge(log, indirect_coapp, left_index=True, right_index=True)

In [9]:
# log.to_csv(fn_logs[log_name] + '.coapp.csv', index=False)
# log = pd.read_csv(fn_logs[log_name] + '.coapp.csv')
log

Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,Cost,"School holidays; 0 = no, 1 = yes",Approval; 1 = low risk to 4 = high risk,Type of loan; 0 = normal; 1 = special,_start timestamp,_complete timestamp,department,department_role,activity_duration_minutes,direct_copresenting_resources,indirect_copresenting_resources
811,Precheck,ID-270,2011-03-01 10:00:00,2011-03-01 10:05:00,001-7-03,Tuesday,"0,87 € per minute",0.0,1.0,0.0,2011-03-01 10:00:00,2011-03-01 10:05:00,001,001-7,5.0,"000-1-01,001-5-01,010-24-01,010-24-12,010-24-1...",
4980,Check_of_Processing_Applications,ID-1619,2011-03-01 10:06:00,2011-03-01 10:07:00,000-2-01,Tuesday,"0,87 € per minute",0.0,2.0,0.0,2011-03-01 10:06:00,2011-03-01 10:07:00,000,000-2,1.0,"000-4-01,010-25-06,020-11-01",
1338,Approval_Branch,ID-427,2011-03-01 10:34:00,2011-03-01 10:36:00,010-9-1,Tuesday,"0,87 € per minute",0.0,2.0,0.0,2011-03-01 10:34:00,2011-03-01 10:36:00,010,010-9,2.0,"010-23-07,010-23-13,010-8-1",
1803,Application_Processing_Branches,ID-546,2011-03-01 10:38:00,2011-03-01 16:27:00,080-10-03,Tuesday,"0,87 € per minute",0.0,1.0,0.0,2011-03-01 10:38:00,2011-03-01 16:27:00,080,080-10,349.0,010-23-13,"000-1-01,000-2-01,000-3-01,000-4-02,010-24-13,..."
1339,Refusal_Branches,ID-427,2011-03-01 10:39:00,2011-03-01 10:43:00,010-9-1,Tuesday,"0,87 € per minute",0.0,2.0,0.0,2011-03-01 10:39:00,2011-03-01 10:43:00,010,010-9,4.0,"010-23-07,010-23-13,010-8-1",080-10-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1485,Get_Signature_of_Customer,ID-456,2011-05-09 09:42:00,2011-05-09 09:44:00,010-24-23,Monday,"1,13 € per minute",0.0,1.0,1.0,2011-05-09 09:42:00,2011-05-09 09:44:00,010,010-24,2.0,"010-24-26,070-6-04","010-23-13,010-8-1"
2620,Precheck,ID-774,2011-05-09 09:43:00,2011-05-09 09:45:00,010-8-1,Monday,"0,87 € per minute",0.0,1.0,0.0,2011-05-09 09:43:00,2011-05-09 09:45:00,010,010-8,2.0,"010-23-06,010-23-13","010-23-13,010-24-23"
1292,Processing_of_Applications,ID-398,2011-05-09 09:49:00,2011-05-09 09:49:00,010-25-07,Monday,"1,02 € per minute",0.0,3.0,0.0,2011-05-09 09:49:00,2011-05-09 09:49:00,010,010-25,0.0,"000-3-01,010-23-13,010-30-01,041-18-1",010-23-13
1293,Archieving,ID-398,2011-05-09 09:50:00,2011-05-09 09:51:00,010-25-07,Monday,"1,02 € per minute",0.0,3.0,0.0,2011-05-09 09:50:00,2011-05-09 09:51:00,010,010-25,1.0,"000-3-01,010-23-13,010-30-01,041-18-1",010-23-13


In [10]:
log = log[log['department_role'] == '010-24']
sel_resources = log['resource'].unique()

### Profile resource performance workload and productivity

In [11]:
col_copresence = 'direct_copresenting_resources'

def is_resource_in(r, str_resources):
    resources = str_resources.split(',')
    if len(resources) > 0:
        return r in resources
    return np.nan

In [12]:
res_cores_stats = []
from itertools import permutations
for (res, cores) in permutations(sel_resources, r=2):
    # compute stats when with cores
    res_cores_stats.append({
        'copresence': 'w',
        'res': res,
        'cores': cores,
        'mean_act_duration': (
            log.loc[
                log.apply(
                    lambda row: (
                        row['resource'] == res and
                        is_resource_in(cores, row[col_copresence])
                    ),
                    axis=1
                ),
                'activity_duration_minutes'
            ].mean()
        ),
        # number of events normalized by number of cases
        'workload': (
            len(log.loc[
                log.apply(
                    lambda row: (
                        row['resource'] == res and
                        is_resource_in(cores, row[col_copresence])
                    ),
                    axis=1
                ),
                :
            ]) /
            log.loc[
                log.apply(
                    lambda row: (
                        row['resource'] == res and
                        is_resource_in(cores, row[col_copresence])
                    ),
                    axis=1
                ),
                'case_id'
            ].nunique() 
            if 
            len(log.loc[
                log.apply(
                    lambda row: (
                        row['resource'] == res and
                        is_resource_in(cores, row[col_copresence])
                    ),
                    axis=1
                ),
                :
            ]) > 0
            else 0
        )
    })
    # compute stats when without cores
    res_cores_stats.append({
        'copresence': 'wo',
        'res': res,
        'cores': cores,
        'mean_act_duration': (
            log.loc[
                log.apply(
                    lambda row: (
                        row['resource'] == res and
                        not is_resource_in(cores, row[col_copresence])
                    ),
                    axis=1
                ),
                'activity_duration_minutes'
            ].mean()
        ),
        # number of events normalized by number of cases
        'workload': (
            len(log.loc[
                log.apply(
                    lambda row: (
                        row['resource'] == res and
                        not is_resource_in(cores, row[col_copresence])
                    ),
                    axis=1
                ),
                :
            ]) /
            log.loc[
                log.apply(
                    lambda row: (
                        row['resource'] == res and
                        not is_resource_in(cores, row[col_copresence])
                    ),
                    axis=1
                ),
                'case_id'
            ].nunique()
            if 
            len(log.loc[
                log.apply(
                    lambda row: (
                        row['resource'] == res and
                        not is_resource_in(cores, row[col_copresence])
                    ),
                    axis=1
                ),
                :
            ]) > 0
            else 0
        )
    })
res_cores_stats = pd.DataFrame(res_cores_stats)
res_cores_stats['productivity'] = 1 / res_cores_stats['mean_act_duration']
res_cores_stats

Unnamed: 0,copresence,res,cores,mean_act_duration,workload,productivity
0,w,010-24-13,010-24-26,14.777778,2.500000,0.067669
1,wo,010-24-13,010-24-26,3.428571,1.750000,0.291667
2,w,010-24-13,010-24-02,1.000000,1.333333,1.000000
3,wo,010-24-13,010-24-02,11.822581,2.296296,0.084584
4,w,010-24-13,010-24-17,4.750000,2.400000,0.210526
...,...,...,...,...,...,...
215,wo,010-24-03,010-24-23,2.000000,1.000000,0.500000
216,w,010-24-03,010-24-11,,0.000000,
217,wo,010-24-03,010-24-11,2.000000,1.000000,0.500000
218,w,010-24-03,010-24-16,,0.000000,


### Find correlation between co-presence periods and decreased performance (and increased workload) of resources

In [13]:
# compute workload and productivity difference
res_cores_stats = res_cores_stats.pivot(
    index=['res', 'cores'], 
    columns=['copresence'], values=['workload', 'productivity']
).reset_index()
res_cores_stats.columns = res_cores_stats.columns.map('_'.join)
res_cores_stats = res_cores_stats.rename(columns={
    'res_': 'resource',
    'cores_': 'copresent_resource'
})
res_cores_stats

Unnamed: 0,resource,copresent_resource,workload_w,workload_wo,productivity_w,productivity_wo
0,010-24-01,010-24-02,4.333333,5.724138,0.095588,0.057810
1,010-24-01,010-24-03,0.000000,5.655738,,0.058683
2,010-24-01,010-24-04,2.000000,5.716667,1.000000,0.058363
3,010-24-01,010-24-11,0.000000,5.655738,,0.058683
4,010-24-01,010-24-12,8.000000,5.576271,0.073733,0.058107
...,...,...,...,...,...,...
105,010-24-26,010-24-12,1.714286,2.597015,0.139535,0.108569
106,010-24-26,010-24-13,3.444444,2.484211,0.085635,0.113189
107,010-24-26,010-24-16,1.000000,2.574879,0.111111,0.109110
108,010-24-26,010-24-17,2.062500,2.659091,0.110368,0.108939


In [14]:
# calculate workload differences
res_cores_stats['workload_diff_pct'] = (
    (res_cores_stats['workload_w'] - res_cores_stats['workload_wo']) / 
    res_cores_stats['workload_wo']
)
# calculate productivity differences
res_cores_stats['productivity_diff_pct'] = (
    (res_cores_stats['productivity_w'] - res_cores_stats['productivity_wo']) / 
    res_cores_stats['productivity_wo']
)

In [15]:
# focus on only when productivity decreases
workload_chart = alt.Chart(
    res_cores_stats[res_cores_stats['productivity_diff_pct'] < 0]
).mark_rect().encode(
    x='resource',
    y='copresent_resource',
    color=alt.Color('workload_diff_pct').scale(scheme='lightgreyred'),
    tooltip=['resource', 'copresent_resource', 'workload_w', 'workload_wo', 'workload_diff_pct']
)
# focus on only the decreases
productivity_chart_neg = alt.layer(
    alt.Chart(
        res_cores_stats[res_cores_stats['productivity_diff_pct'] < 0],
        title=[
            'decrease (%) in resource productivity', 
            'when other resources are present'
        ]
    ).mark_rect().encode(
        x=alt.X('resource').title('resource ID'),
        y=alt.Y('copresent_resource').title('co-present resource ID'),
        color=alt.Color('productivity_diff_pct:Q').scale(scheme='lightgreyred', reverse=True).legend(None),
        tooltip=['resource', 'copresent_resource', 'productivity_w', 'productivity_wo', 'productivity_diff_pct']
    ),
    alt.Chart(
        res_cores_stats[res_cores_stats['productivity_diff_pct'] < 0]
    ).mark_text(baseline='middle').encode(
        x=alt.X('resource').title('resource ID'),
        y=alt.Y('copresent_resource').title('co-present resource ID'),
        text=alt.Text('productivity_diff_pct:Q').format('+.0%')
    ),
).properties(height=200, width=300)


In [16]:
alt.hconcat(
    # workload_chart, 
    productivity_chart_neg
).resolve_scale(x='independent', y='independent', color='independent')

In [17]:
# perform stats test (pairwise between resources)
from scipy.stats import mannwhitneyu
charts = []
for (res, cores) in permutations(sel_resources, r=2):
    # compute stats when with cores
    arr_w = log.loc[
        log.apply(
            lambda row: (
                row['resource'] == res and
                is_resource_in(cores, row[col_copresence])
            ),
            axis=1
        ),
        'activity_duration_minutes'
    ]
    # compute stats when without cores
    arr_wo = log.loc[
        log.apply(
            lambda row: (
                row['resource'] == res and
                not is_resource_in(cores, row[col_copresence])
            ),
            axis=1
        ),
        'activity_duration_minutes'
    ]
    if len(arr_w) > 1 and len(arr_wo) > 1:
        charts.append(
            alt.vconcat(
                alt.Chart(arr_w.to_frame(), title=f'{res} w/ {cores}').mark_boxplot().encode(
                    x='activity_duration_minutes:Q'
                ),
                alt.Chart(arr_wo.to_frame(), title=f'{res} wo/ {cores}').mark_boxplot().encode(
                    x='activity_duration_minutes:Q'
                ),
            ).resolve_scale(x='shared')
        )
        test_result = mannwhitneyu(x=arr_w, y=arr_wo, alternative='greater')
        if test_result.pvalue < 0.05:
            # reject H0 in favor of alternative: arr_x is greater than arr_wo
            print(res)
            print(cores)
            print(test_result)
alt.vconcat(*charts)

010-24-13
010-24-26
MannwhitneyuResult(statistic=619.5, pvalue=0.020893262301245108)
010-24-26
010-24-02
MannwhitneyuResult(statistic=18449.0, pvalue=0.03279618407787761)


In [18]:
# plot the identified pairs
pairs = [
    ('010-24-13', '010-24-26'),
    ('010-24-26', '010-24-02'),
]

charts = []
for pair in pairs:
    w = log.loc[
        log.apply(
            lambda row: (
                row['resource'] == pair[0] and
                is_resource_in(pair[1], row[col_copresence])
            ),
            axis=1
        ),
        'activity_duration_minutes'
    ]
    w = w.to_frame()
    w[f'with {pair[1]}'] = True
    wo = log.loc[
        log.apply(
            lambda row: (
                row['resource'] == pair[0] and
                not is_resource_in(pair[1], row[col_copresence])
            ),
            axis=1
        ),
        'activity_duration_minutes'
    ]
    wo = wo.to_frame()
    wo[f'with {pair[1]}'] = False
    data = pd.concat([w, wo])
    charts.append(
        alt.Chart(data, title='resource ID {}'.format(pair[0])).mark_boxplot().encode(
            x=alt.X('activity_duration_minutes:Q').title('activity duration (minutes)'),
            y=f'with {pair[1]}:N',
            color=alt.Color(f'with {pair[1]}:N').legend(None)
        )
    )
alt.vconcat(*charts)

In [19]:
# perform stats test (based on the change of productivity measured as mean
# activity duration time)
from scipy.stats import wilcoxon
for cores in sorted(sel_resources):
    print(cores)
    # exclude irrational values
    res_data = res_cores_stats[res_cores_stats['copresent_resource'] == cores].dropna()
    res_data = res_data[
        (res_cores_stats['productivity_diff_pct'] != np.inf) |
        (res_cores_stats['productivity_diff_pct'] != -np.inf)
    ]
    if len(res_data) > 1:
        # print(sorted(res_data['resource'].unique()))
        test_result = wilcoxon(
            x=res_data['productivity_w'].to_numpy(), 
            y=res_data['productivity_wo'].to_numpy(), 
            alternative='less'
        )
        if test_result.pvalue < 0.05:
            # reject H0 in favor of alternative: productivity_w is lower
            print(test_result)

010-24-01
010-24-02
010-24-03
010-24-04
010-24-11
010-24-12
010-24-13
010-24-16
010-24-17
010-24-23
010-24-26


  res_data = res_data[
  res_data = res_data[
  res_data = res_data[
  res_data = res_data[
  res_data = res_data[
  res_data = res_data[
  res_data = res_data[
  res_data = res_data[
  res_data = res_data[
  res_data = res_data[
  res_data = res_data[
