# Social Loafing

## Description
A group member contributes less effort than the other members, hoping for a free (or cheap) ride on the group's work.

In [262]:
import pandas as pd
import numpy as np
import altair as alt
from sklearn.cluster import AgglomerativeClustering
alt.data_transformers.enable('vegafusion')

DataTransformerRegistry.enable('vegafusion')

In [263]:
# Registry of the preprocessed event logs, keyed by a short log name.
# The `_ref` / `_test` entries point to the 70 % / 30 % split files of a log.
fn_logs = {
    # BPIC 2015
    'bpic15': '../data/preproc/bpic15.csv',
    'bpic15_ref': '../data/preproc/bpic15_train-70.csv',
    'bpic15_test': '../data/preproc/bpic15_test-30.csv',
    # BPIC 2017
    'bpic17': '../data/preproc/bpic2017.csv',
    # proprietary log
    'propr': '../data/preproc/proprietary.csv',
    'propr_ref': '../data/preproc/proprietary_train-70.csv',
    'propr_test': '../data/preproc/proprietary_test-30.csv',
}

# Key (into `fn_logs`) of the log analysed in this notebook.
log_name = 'propr_test'

In [264]:
# Resolve the CSV path of the selected log, read it, and display the frame.
log_path = fn_logs[log_name]
log = pd.read_csv(log_path)
log

Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,Cost,"School holidays; 0 = no, 1 = yes",Approval; 1 = low risk to 4 = high risk,Type of loan; 0 = normal; 1 = special,_start timestamp,_complete timestamp
0,Application_Processing_Branches,ID-799,25/02/2011 16:01,2011-02-25 16:10:00,070-6-05,Friday,"1,02 € per minute",0.0,1.0,0.0,2011-02-25 16:01:00,2011-02-25 16:10:00
1,Precheck,ID-940,25/02/2011 16:06,2011-02-25 16:19:00,070-9-2,Friday,"1,02 € per minute",0.0,1.0,0.0,2011-02-25 16:06:00,2011-02-25 16:19:00
2,Precheck,ID-1078,28/02/2011 8:28,2011-02-28 09:02:00,001-6-02,Monday,"0,87 € per minute",0.0,1.0,0.0,2011-02-28 08:28:00,2011-02-28 09:02:00
3,Precheck,ID-546,28/02/2011 10:40,2011-02-28 10:45:00,080-10-03,Monday,"0,87 € per minute",0.0,1.0,0.0,2011-02-28 10:40:00,2011-02-28 10:45:00
4,Application_Processing_Branches,ID-1457,28/02/2011 10:10,2011-02-28 10:55:00,070-7-06,Monday,"1,02 € per minute",0.0,1.0,0.0,2011-02-28 10:10:00,2011-02-28 10:55:00
...,...,...,...,...,...,...,...,...,...,...,...,...
5285,Approval_Branch,ID-496,31/05/2011 14:41,2011-05-31 16:11:00,001-7-03,Tuesday,"0,87 € per minute",0.0,2.0,0.0,2011-05-31 14:41:00,2011-05-31 16:11:00
5286,Precheck,ID-1384,31/05/2011 16:20,2011-05-31 16:21:00,010-21-01,Tuesday,"0,87 € per minute",0.0,2.0,0.0,2011-05-31 16:20:00,2011-05-31 16:21:00
5287,Approval_Branch,ID-496,31/05/2011 16:29,2011-05-31 16:30:00,010-21-01,Tuesday,"0,87 € per minute",0.0,2.0,0.0,2011-05-31 16:29:00,2011-05-31 16:30:00
5288,Check_of_Documents,ID-99,31/05/2011 15:42,2011-05-31 16:30:00,010-23-06,Tuesday,"1,02 € per minute",0.0,2.0,0.0,2011-05-31 15:42:00,2011-05-31 16:30:00


In [265]:
# Optional filter: only consider the special type of cases.
# For the proprietary log these are the special loans; uncomment the line below to enable it.
# log = log[log['Type of loan; 0 = normal; 1 = special'] == 1]

In [266]:
# proprietary
# Resource ids are dash-separated (e.g. '070-6-05'): the first part is stored
# as `department`, the first two parts joined by '-' as `department_role`.
# Split once with the vectorised `.str` accessor instead of a row-wise
# `apply`; a missing resource value then yields NaN instead of raising.
resource_parts = log['resource'].str.split('-')
log['department'] = resource_parts.str[0]
log['department_role'] = resource_parts.str[0] + '-' + resource_parts.str[1]
log

Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,Cost,"School holidays; 0 = no, 1 = yes",Approval; 1 = low risk to 4 = high risk,Type of loan; 0 = normal; 1 = special,_start timestamp,_complete timestamp,department,department_role
0,Application_Processing_Branches,ID-799,25/02/2011 16:01,2011-02-25 16:10:00,070-6-05,Friday,"1,02 € per minute",0.0,1.0,0.0,2011-02-25 16:01:00,2011-02-25 16:10:00,070,070-6
1,Precheck,ID-940,25/02/2011 16:06,2011-02-25 16:19:00,070-9-2,Friday,"1,02 € per minute",0.0,1.0,0.0,2011-02-25 16:06:00,2011-02-25 16:19:00,070,070-9
2,Precheck,ID-1078,28/02/2011 8:28,2011-02-28 09:02:00,001-6-02,Monday,"0,87 € per minute",0.0,1.0,0.0,2011-02-28 08:28:00,2011-02-28 09:02:00,001,001-6
3,Precheck,ID-546,28/02/2011 10:40,2011-02-28 10:45:00,080-10-03,Monday,"0,87 € per minute",0.0,1.0,0.0,2011-02-28 10:40:00,2011-02-28 10:45:00,080,080-10
4,Application_Processing_Branches,ID-1457,28/02/2011 10:10,2011-02-28 10:55:00,070-7-06,Monday,"1,02 € per minute",0.0,1.0,0.0,2011-02-28 10:10:00,2011-02-28 10:55:00,070,070-7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5285,Approval_Branch,ID-496,31/05/2011 14:41,2011-05-31 16:11:00,001-7-03,Tuesday,"0,87 € per minute",0.0,2.0,0.0,2011-05-31 14:41:00,2011-05-31 16:11:00,001,001-7
5286,Precheck,ID-1384,31/05/2011 16:20,2011-05-31 16:21:00,010-21-01,Tuesday,"0,87 € per minute",0.0,2.0,0.0,2011-05-31 16:20:00,2011-05-31 16:21:00,010,010-21
5287,Approval_Branch,ID-496,31/05/2011 16:29,2011-05-31 16:30:00,010-21-01,Tuesday,"0,87 € per minute",0.0,2.0,0.0,2011-05-31 16:29:00,2011-05-31 16:30:00,010,010-21
5288,Check_of_Documents,ID-99,31/05/2011 15:42,2011-05-31 16:30:00,010-23-06,Tuesday,"1,02 € per minute",0.0,2.0,0.0,2011-05-31 15:42:00,2011-05-31 16:30:00,010,010-23


## Social-SL-1

In [267]:
# convert to timestamps to calculate case/activity duration

# Parse BEFORE sorting: the raw 'start timestamp' column holds day-first
# strings (e.g. '25/02/2011 16:01'), so sorting it as text would order the
# events lexicographically rather than chronologically.
for ts_col in ('start timestamp', 'complete timestamp'):
    log[ts_col] = pd.to_datetime(log[ts_col], format='mixed', dayfirst=True)

log = log.sort_values(by='start timestamp')

# duration of each event in minutes (complete - start)
log['activity_duration_minutes'] = (
    log['complete timestamp'] - log['start timestamp']
).dt.total_seconds() / 60

### Determine groups/teams

In [268]:
# proprietary
# A resource group is identified by the first two parts of the resource id,
# i.e. the `department_role` column.
log['resource_group'] = log['department_role']

# Number of distinct cases per resource group; groups that consist of a
# single resource are left out.
rg_case_num = []
for group_key, group_events in log.groupby('resource_group'):
    if group_events['resource'].nunique() <= 1:
        continue
    rg_case_num.append((group_key, group_events['case_id'].nunique()))

# most active groups first
ranked_groups = sorted(rg_case_num, key=lambda pair: pair[1], reverse=True)
print(ranked_groups)

# To analyse the ten most active groups instead of a fixed selection:
# sel_rgs = [group_key for group_key, _ in ranked_groups[:10]]
sel_rgs = ['010-23']

[('010-23', 269), ('010-24', 223), ('010-25', 63), ('070-6', 55), ('070-7', 43), ('001-6', 31), ('001-7', 26), ('001-15', 20), ('001-8', 20), ('000-4', 17), ('001-5', 17), ('010-7', 16), ('010-6', 15), ('010-8', 15), ('050-8', 15), ('070-8', 15), ('100-8', 13), ('001-17', 8), ('020-10', 8), ('100-10', 8), ('010-5', 7), ('040-10', 7), ('110-8', 7), ('080-10', 6), ('070-10', 5), ('001-10', 4), ('070-15', 4), ('080-8', 4), ('090-8', 3), ('010-10', 2), ('090-10', 2)]


### Profile group performance

### Profile group member performance
- productivity (activity duration)
- member assignment (number of events)

In [269]:
# Tunable parameters of the member profiling below.
MIN_CASES = 10   # a resource needs at least this many cases to be profiled
N_CLUSTERS = 3   # upper bound on the number of member clusters per group

# One scatter chart per selected resource group: each point is a group member,
# x = number of events, y = inverse of the mean activity duration,
# point shape = cluster found on (mean duration, number of events).
charts = []
for rg in sel_rgs:
    sublog = log[log['resource_group'] == rg]

    # normalize activity duration by activity (min-max within the group)
    act_dur_min_max = (
        sublog.groupby('activity', sort=False)['activity_duration_minutes']
        .agg(duration_min='min', duration_max='max')
        .reset_index()
    )
    act_dur_min_max['duration_range'] = (
        act_dur_min_max['duration_max'] - act_dur_min_max['duration_min']
    )
    # NOTE: avoid division by 0
    act_dur_min_max.loc[act_dur_min_max['duration_range'] == 0, 'duration_range'] = 1 / 60
    sublog = sublog.merge(act_dur_min_max, how='left', on='activity')
    # NOTE(review): this normalized column is not used by the aggregation
    # below, which averages the raw durations -- confirm that is intended.
    sublog['normalized_activity_duration_minutes'] = (
        (sublog['activity_duration_minutes'] - sublog['duration_min'])
        / sublog['duration_range']
    )

    # per-member mean duration, number of events and number of distinct cases
    group_member_perf = sublog.groupby([
        'resource_group', 'resource'
    ]).agg(
        mean_activity_duration=pd.NamedAgg(
            'activity_duration_minutes', aggfunc='mean'
        ),
        num_events=pd.NamedAgg(
            'activity_duration_minutes', aggfunc='size'
        ),
        num_cases=pd.NamedAgg(
            'case_id', aggfunc='nunique'
        )
    ).reset_index()

    # consider only resources with at least MIN_CASES cases;
    # .copy() so the columns added below do not write into a slice
    group_member_perf = group_member_perf[
        group_member_perf['num_cases'] >= MIN_CASES
    ].copy()
    if group_member_perf.empty:
        # nothing to cluster or plot for this group
        print(f'{rg}: no resource with at least {MIN_CASES} cases, skipped')
        continue

    group_member_perf['inv_mean_activity_duration'] = (
        1 / group_member_perf['mean_activity_duration']
    )

    # Cluster the members on (mean duration, number of events). X keeps the
    # row order of group_member_perf, so the labels can be assigned directly.
    X = group_member_perf.set_index('resource')[['mean_activity_duration', 'num_events']]
    if len(X) >= 2:
        # never ask for more clusters than there are members
        clustering = AgglomerativeClustering(
            n_clusters=min(N_CLUSTERS, len(X))
        ).fit(X)
        group_member_perf['cluster'] = clustering.labels_
    else:
        # a single member cannot be clustered; put it in cluster 0
        group_member_perf['cluster'] = 0

    charts.append(alt.Chart(group_member_perf).mark_point(size=60).encode(
        x=alt.X('num_events:Q').title(['resource utilization', '(number of events completed)']),
        y=alt.Y('inv_mean_activity_duration:Q').title(['resource productivity', '(inverse of mean activity duration time)']),
        color=alt.Color('resource:N').title('resource ID'),
        shape=alt.Shape('cluster:N'),
        tooltip=['resource', 'num_events', 'num_cases', 'mean_activity_duration']
    ).properties(width=200, height=200))
alt.vconcat(*charts).resolve_scale(x='independent', y='independent', color='independent')
        

### Identify resources with low utilization and low productivity

In [270]:
# One row of charts per selected resource group:
#   left  - a bar chart per activity with each resource's mean duration
#   right - a heatmap of the number of events per (resource, activity)
charts = []
for rg in sel_rgs:
    # events executed by the members of this resource group
    sublog = log[log['resource_group'] == rg]

    # mean duration and number of events per (resource, activity) pair
    data = (
        sublog
        .groupby(['resource', 'activity'])
        .agg(
            mean_activity_duration=pd.NamedAgg(
                'activity_duration_minutes', aggfunc='mean'
            ),
            num_events=pd.NamedAgg(
                'activity_duration_minutes', aggfunc=len
            ),
        )
        .reset_index()
    )

    # productivity chart (based on duration, interpreted inversely):
    # one bar chart per activity, stacked vertically
    activity_bars = [
        alt.Chart(data[data['activity'] == act], title=act)
        .mark_bar()
        .encode(x='resource:N', y='mean_activity_duration:Q')
        for act in data['activity'].unique()
    ]
    chart_productivity = alt.vconcat(*activity_bars)

    # utilization chart: event counts as a resource x activity heatmap
    event_count_color = alt.Color('num_events:Q').scale(scheme='lightgreyred')
    chart_utilization = alt.Chart(data).mark_rect().encode(
        x='resource:N',
        y='activity:N',
        color=event_count_color
    )

    charts.append(alt.hconcat(chart_productivity, chart_utilization))
alt.vconcat(*charts)