# Social Loafing

## Description
Group member hopes for a free (or cheap) ride.

In [977]:
import pandas as pd
import numpy as np
import altair as alt
alt.data_transformers.enable('vegafusion')

DataTransformerRegistry.enable('vegafusion')

In [978]:
log_name = 'propr'

fn_logs = {
    'bpic15': '../data/preproc/bpic15.csv',
    'bpic17': '../data/preproc/bpic17.csv',
    'propr': '../data/preproc/proprietary.csv'
}

In [979]:
log = pd.read_csv(fn_logs[log_name])
log

Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,"School holidays; 0 = no, 1 = yes",Approval; 1 = low sum to 4 = high sum,Type of loan; 0 = normal; 1 = special,Cost
0,Approval_Branch,005141EEB1240B31C12577DF004F6A77,18/11/2010 15:27,18/11/2010 15:58,004-9-1,Thursday,0.0,2.0,0.0,"0,87 € per minute"
1,Precheck,005141EEB1240B31C12577DF004F6A77,19/11/2010 12:45,19/11/2010 12:46,000-3-01,Friday,0.0,2.0,0.0,"0,87 € per minute"
2,Precheck,005141EEB1240B31C12577DF004F6A77,24/11/2010 8:18,24/11/2010 8:26,000-2-01,Wednesday,0.0,2.0,0.0,"0,87 € per minute"
3,Check_of_Processing_Applications,005141EEB1240B31C12577DF004F6A77,24/11/2010 10:35,24/11/2010 10:35,000-2-01,Wednesday,0.0,2.0,0.0,"0,87 € per minute"
4,Processing_of_Applications,005141EEB1240B31C12577DF004F6A77,2/12/2010 10:46,2/12/2010 10:46,010-23-11,Thursday,0.0,2.0,0.0,"1,02 € per minute"
...,...,...,...,...,...,...,...,...,...,...
18440,Precheck,FFFF329EF772D73CC12577EA00534A1C,13/12/2010 11:38,13/12/2010 11:40,000-2-01,Monday,0.0,2.0,0.0,"0,87 € per minute"
18441,Check_of_Processing_Applications,FFFF329EF772D73CC12577EA00534A1C,13/12/2010 12:06,13/12/2010 12:14,000-2-01,Monday,0.0,2.0,0.0,"0,87 € per minute"
18442,Processing_of_Applications,FFFF329EF772D73CC12577EA00534A1C,22/12/2010 11:48,22/12/2010 11:48,010-23-07,Wednesday,1.0,2.0,0.0,"1,02 € per minute"
18443,Archieving,FFFF329EF772D73CC12577EA00534A1C,22/12/2010 11:49,22/12/2010 13:19,010-23-07,Wednesday,1.0,2.0,0.0,"1,02 € per minute"


In [980]:
# only consider special type of cases
# proprietary: special loans
# log = log[log['Type of loan; 0 = normal; 1 = special'] == 1]

## Social-SL-1

In [981]:
# convert to timestamps to calculate case/activity duration

log = log.sort_values(by='start timestamp')
log['start timestamp'] = pd.to_datetime(log['start timestamp'], format='mixed', dayfirst=True)
log['complete timestamp'] = pd.to_datetime(log['complete timestamp'], format='mixed', dayfirst=True)
log['activity_duration_minutes'] = (
    log['complete timestamp'] - log['start timestamp']
).dt.total_seconds() / 60

In [982]:
# encode activity labels as short names for presentation purpose

print('There exist {} unique activity labels'.format(
    log['activity'].nunique()
))

# use initial letters for encoding
act_labels_short = dict()
for label in log['activity'].unique():
    label_short = ''.join([w[0] for w in label.split('_')])
    act_labels_short[label] = label_short
print('There are {} short labels after encoding'.format(
    len(act_labels_short.values())
))
for k in sorted(act_labels_short.keys()):
    print('{:>50} -> {:>30}'.format(k, act_labels_short[k]))

log['activity_short'] = log['activity'].apply(
    lambda x: act_labels_short[x]
)
log

There exist 25 unique activity labels
There are 25 short labels after encoding
                              Administrative_Tasks ->                             AT
                   Application_Processing_Branches ->                            APB
                                          Approval ->                              A
                                   Approval_Branch ->                             AB
                          Approval_Executive_Board ->                            AEB
                                        Archieving ->                              A
                                 Check_of_Approval ->                            CoA
                                Check_of_Documents ->                            CoD
                  Check_of_Processing_Applications ->                           CoPA
                                               End ->                              E
                                   Further_inquiry ->                  

Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,"School holidays; 0 = no, 1 = yes",Approval; 1 = low sum to 4 = high sum,Type of loan; 0 = normal; 1 = special,Cost,activity_duration_minutes,activity_short
8231,Preparation_Draft_Executive_Board,6C9B95FFEBE57A0DC125782A002A3229,2011-02-01 10:01:00,2011-02-01 10:02:00,001-17-02,Tuesday,0.0,4.0,0.0,"2,04 € per minute",1.0,PDEB
649,Administrative_Tasks,08BC7C266A535237C1257826002ADD6E,2011-02-01 10:02:00,2011-02-01 10:51:00,010-24-26,Tuesday,0.0,1.0,1.0,"1,13 € per minute",49.0,AT
648,Processing_of_Applications_Sonderkredit,08BC7C266A535237C1257826002ADD6E,2011-02-01 10:02:00,2011-02-01 10:02:00,010-24-26,Tuesday,0.0,1.0,1.0,"1,13 € per minute",0.0,PoAS
1527,End,14050B07C84527C5C1257824002E58A8,2011-02-01 10:03:00,2011-02-01 10:03:00,010-23-11,Tuesday,0.0,3.0,0.0,"1,02 € per minute",0.0,E
4528,Approval_Branch,3C3A3DDEF08D2225C1257829004E319B,2011-02-01 10:03:00,2011-02-01 10:09:00,050-8-1,Tuesday,0.0,2.0,0.0,"0,87 € per minute",6.0,AB
...,...,...,...,...,...,...,...,...,...,...,...,...
12587,Precheck,AB750FD25DEABED1C12577F300518367,2010-12-09 09:26:00,2010-12-09 10:13:00,070-7-06,Thursday,0.0,1.0,0.0,"1,02 € per minute",47.0,P
322,Precheck,059B8D0F8B58DFDEC12577F4002FAB94,2010-12-09 09:40:00,2010-12-09 14:18:00,093-18-1,Thursday,0.0,1.0,0.0,"0,87 € per minute",278.0,P
4849,Precheck,401C68F8E2854EA7C12577F400309E68,2010-12-09 09:51:00,2010-12-09 09:54:00,020-9-1,Thursday,0.0,1.0,0.0,"0,87 € per minute",3.0,P
15872,Second_vote_KfW,DB060AF533AF5D13C1257735002E3378,2010-12-09 09:55:00,2010-12-09 09:55:00,010-24-01,Thursday,0.0,1.0,0.0,"1,13 € per minute",0.0,SvK


### Determine groups/teams

In [983]:
# proprietary
# use the first two parts of the resource ids to determine resource groups
log['resource_group'] = log['resource'].apply(
    lambda x: '-'.join(x.split('-')[:2])
)
print(log['resource_group'].nunique())

# use only the top-10 most active, non-singleton groups (by case numbers)
rg_case_num = [
    (case_id, events['case_id'].nunique())
    for case_id, events in log.groupby('resource_group')
    if events['resource'].nunique() > 1
]
sel_rgs = sorted(rg_case_num, key=lambda x: x[1], reverse=True)[:10]
sel_rgs

129


[('010-23', 857),
 ('010-24', 839),
 ('070-6', 214),
 ('070-7', 152),
 ('010-25', 146),
 ('001-6', 97),
 ('001-7', 80),
 ('070-8', 67),
 ('000-4', 64),
 ('001-5', 53)]

In [984]:
# discover resource grouping based on activity types (labels)
from sklearn.cluster import AgglomerativeClustering
def extract_res_act_matrix(log):
    res_act_matrix = []
    for res, events in log.groupby('resource'):
        v = events.groupby('activity_short').size().reset_index().rename(columns={0: 'count'})
        v['count'] /= v['count'].sum()
        v['resource'] = res
        res_act_matrix.append(v)
    res_act_matrix = pd.concat(res_act_matrix)
    ra_mat_wide = res_act_matrix.pivot(
        index='resource', 
        columns='activity_short', values='count'
    ).fillna(0)
    # ra_mat_wide.values
    clustering = AgglomerativeClustering(
        n_clusters=len(ra_mat_wide),
        metric='euclidean',
    ).fit(ra_mat_wide.values).labels_
    # print(clustering)   
    ra_mat_wide['label'] = clustering
    # ra_mat_wide
    labels = ra_mat_wide['label'].reset_index()
    res_act_matrix = res_act_matrix.merge(labels, on='resource')
    return res_act_matrix

In [985]:
charts = []
for rg, case_number in sel_rgs:
    res_act_matrix = extract_res_act_matrix(log[log['resource_group'] == rg])
    charts.append(alt.Chart(res_act_matrix, title=f'Group {rg}: {case_number} cases').mark_rect().encode(
        y=alt.Y('resource:N').sort(alt.EncodingSortField('label', order='ascending')),
        x='activity_short:N',
        color='count',
        tooltip=['count', 'resource', 'activity_short']
    ))
# alt.vconcat(*charts)

# NOTE: proprietary
# groups to be used for the following steps:
# ['070-7', '001-6', '001-7', '001-5']

In [986]:
# sel_rgs = ['070-7', '001-6', '001-7', '001-5']
sel_rgs = [x[0] for x in sel_rgs]

### Profile group performance

In [987]:
# average activity duration
# group_perf = log[log['resource_group'].isin(sel_rgs)].groupby([
#     'resource_group', 'activity'
# ]).agg(
#     mean_activity_duration=pd.NamedAgg(
#         'activity_duration_minutes', aggfunc='mean'
#     ),
#     num_events=pd.NamedAgg(
#         'activity_duration_minutes', aggfunc=len
#     )
# ).reset_index()
# group_perf

In [988]:
# data = log[log['resource_group'].isin(sel_rgs)]
# alt.Chart(data).mark_boxplot().encode(
#     y='activity',
#     x='activity_duration_minutes',
#     row='resource_group'
# ).resolve_scale(x='independent', y='independent')

### Profile group member performance
- productivity (activity duration)
- member assignment (number of events)

In [989]:
# group_member_perf = log[log['resource_group'].isin(sel_rgs)].groupby([
#     'resource_group', 'resource', 'activity'
# ]).agg(
#     mean_activity_duration=pd.NamedAgg(
#         'activity_duration_minutes', aggfunc='mean'
#     ),
#     num_events=pd.NamedAgg(
#         'activity_duration_minutes', aggfunc=len
#     )
# ).reset_index()
# group_member_perf

In [990]:
charts = []
for rg in sel_rgs:
    data = log[log['resource_group'] == rg]
    group_perf_chart = alt.Chart(data).mark_boxplot().encode(
        y='activity:N',
        x='activity_duration_minutes:Q',
        opacity=alt.value(0.2)
    )
    member_perf_chart = alt.Chart(data).mark_circle(size=60).encode(
        y='activity:N',
        x='mean(activity_duration_minutes):Q',
        color='resource:N',
        tooltip=['resource', 'activity', 'mean(activity_duration_minutes)']
    )
    data_assign = data.groupby(['resource', 'activity']).agg(
        member_assignment=pd.NamedAgg('activity', aggfunc=len)
    ).reset_index()
    activity_gb = data.groupby('activity').agg(
        group_assignment=pd.NamedAgg('activity', aggfunc=len)
    ).reset_index()
    data_assign = data_assign.merge(activity_gb, on='activity')
    data_assign['normalized_member_assignment'] = (
        data_assign['member_assignment'] /
        data_assign['group_assignment']
    )
    member_assign_chart = alt.Chart(data_assign).mark_rect().encode(
        x='resource:N',
        y='activity:N',
        color='normalized_member_assignment:Q'
    )

    charts.append(
        alt.hconcat(
            alt.layer(group_perf_chart, member_perf_chart),
            member_assign_chart
        ).resolve_scale(color='independent')
    )
alt.vconcat(*charts).resolve_scale(x='independent', y='independent', color='independent')


### Determine typical group performance

This can be established by the distribution of activity duration time
(visualized as boxplots above)

### Discover specific group members who had significantly lower performance