# Social Loafing

## Description
A group member reduces their own effort, hoping for a free (or cheap) ride on the work of the other group members.

In [495]:
import pandas as pd
import numpy as np
import altair as alt
# Route the data of every Altair chart in this notebook through the
# 'vegafusion' data transformer.
# NOTE(review): presumably enabled so the full event log can be charted without
# hitting Altair's default row limit — confirm against the Altair docs.
alt.data_transformers.enable('vegafusion')

DataTransformerRegistry.enable('vegafusion')

In [496]:
# Key of the event log analysed by this notebook; must be a key of `fn_logs`.
log_name = 'propr'

# Path of the preprocessed CSV file for every supported event log.
fn_logs = dict(
    bpic15='../data/preproc/bpic15.csv',
    bpic17='../data/preproc/bpic17.csv',
    propr='../data/preproc/proprietary.csv',
)

In [497]:
# Read the preprocessed event log selected by `log_name` and display it.
log_path = fn_logs[log_name]
log = pd.read_csv(log_path)
log

Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,"School holidays; 0 = no, 1 = yes",Approval; 1 = low sum to 4 = high sum,Type of loan; 0 = normal; 1 = special,Cost
0,Approval_Branch,005141EEB1240B31C12577DF004F6A77,18/11/2010 15:27,18/11/2010 15:58,004-9-1,Thursday,0.0,2.0,0.0,"0,87 € per minute"
1,Precheck,005141EEB1240B31C12577DF004F6A77,19/11/2010 12:45,19/11/2010 12:46,000-3-01,Friday,0.0,2.0,0.0,"0,87 € per minute"
2,Precheck,005141EEB1240B31C12577DF004F6A77,24/11/2010 8:18,24/11/2010 8:26,000-2-01,Wednesday,0.0,2.0,0.0,"0,87 € per minute"
3,Check_of_Processing_Applications,005141EEB1240B31C12577DF004F6A77,24/11/2010 10:35,24/11/2010 10:35,000-2-01,Wednesday,0.0,2.0,0.0,"0,87 € per minute"
4,Processing_of_Applications,005141EEB1240B31C12577DF004F6A77,2/12/2010 10:46,2/12/2010 10:46,010-23-11,Thursday,0.0,2.0,0.0,"1,02 € per minute"
...,...,...,...,...,...,...,...,...,...,...
18440,Precheck,FFFF329EF772D73CC12577EA00534A1C,13/12/2010 11:38,13/12/2010 11:40,000-2-01,Monday,0.0,2.0,0.0,"0,87 € per minute"
18441,Check_of_Processing_Applications,FFFF329EF772D73CC12577EA00534A1C,13/12/2010 12:06,13/12/2010 12:14,000-2-01,Monday,0.0,2.0,0.0,"0,87 € per minute"
18442,Processing_of_Applications,FFFF329EF772D73CC12577EA00534A1C,22/12/2010 11:48,22/12/2010 11:48,010-23-07,Wednesday,1.0,2.0,0.0,"1,02 € per minute"
18443,Archieving,FFFF329EF772D73CC12577EA00534A1C,22/12/2010 11:49,22/12/2010 13:19,010-23-07,Wednesday,1.0,2.0,0.0,"1,02 € per minute"


In [498]:
# Optional filter (currently disabled): restrict the analysis to special cases only.
# proprietary log: special cases are the special loans
# log = log[log['Type of loan; 0 = normal; 1 = special'] == 1]

In [499]:
# proprietary
# Resource ids are '-'-separated (e.g. '010-23-11'): the first part becomes the
# department, the first two parts together the department role.
# Split once with the vectorized string accessor instead of a row-wise apply:
# missing resources and ids without a second part then yield NaN instead of
# raising an exception.
resource_parts = log['resource'].str.split('-')
log['department'] = resource_parts.str[0]
log['department_role'] = resource_parts.str[0] + '-' + resource_parts.str[1]
log

Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,"School holidays; 0 = no, 1 = yes",Approval; 1 = low sum to 4 = high sum,Type of loan; 0 = normal; 1 = special,Cost,department,department_role
0,Approval_Branch,005141EEB1240B31C12577DF004F6A77,18/11/2010 15:27,18/11/2010 15:58,004-9-1,Thursday,0.0,2.0,0.0,"0,87 € per minute",004,004-9
1,Precheck,005141EEB1240B31C12577DF004F6A77,19/11/2010 12:45,19/11/2010 12:46,000-3-01,Friday,0.0,2.0,0.0,"0,87 € per minute",000,000-3
2,Precheck,005141EEB1240B31C12577DF004F6A77,24/11/2010 8:18,24/11/2010 8:26,000-2-01,Wednesday,0.0,2.0,0.0,"0,87 € per minute",000,000-2
3,Check_of_Processing_Applications,005141EEB1240B31C12577DF004F6A77,24/11/2010 10:35,24/11/2010 10:35,000-2-01,Wednesday,0.0,2.0,0.0,"0,87 € per minute",000,000-2
4,Processing_of_Applications,005141EEB1240B31C12577DF004F6A77,2/12/2010 10:46,2/12/2010 10:46,010-23-11,Thursday,0.0,2.0,0.0,"1,02 € per minute",010,010-23
...,...,...,...,...,...,...,...,...,...,...,...,...
18440,Precheck,FFFF329EF772D73CC12577EA00534A1C,13/12/2010 11:38,13/12/2010 11:40,000-2-01,Monday,0.0,2.0,0.0,"0,87 € per minute",000,000-2
18441,Check_of_Processing_Applications,FFFF329EF772D73CC12577EA00534A1C,13/12/2010 12:06,13/12/2010 12:14,000-2-01,Monday,0.0,2.0,0.0,"0,87 € per minute",000,000-2
18442,Processing_of_Applications,FFFF329EF772D73CC12577EA00534A1C,22/12/2010 11:48,22/12/2010 11:48,010-23-07,Wednesday,1.0,2.0,0.0,"1,02 € per minute",010,010-23
18443,Archieving,FFFF329EF772D73CC12577EA00534A1C,22/12/2010 11:49,22/12/2010 13:19,010-23-07,Wednesday,1.0,2.0,0.0,"1,02 € per minute",010,010-23


## Social-SL-1

In [500]:
# Convert to timestamps to calculate case/activity duration.
# The raw columns hold day-first strings (e.g. '18/11/2010 15:27'), so they must
# be parsed BEFORE sorting: sorting the raw strings orders events
# lexicographically (by day of month first), not chronologically.
log['start timestamp'] = pd.to_datetime(log['start timestamp'], format='mixed', dayfirst=True)
log['complete timestamp'] = pd.to_datetime(log['complete timestamp'], format='mixed', dayfirst=True)

# Chronological order; the stable sort keeps the existing row order for events
# that share the same start timestamp.
log = log.sort_values(by='start timestamp', kind='stable')

# Duration of every event in minutes (complete - start).
log['activity_duration_minutes'] = (
    log['complete timestamp'] - log['start timestamp']
).dt.total_seconds() / 60

### Determine groups/teams

In [501]:
# proprietary
# A resource group is identified by the first two parts of the resource id,
# i.e. the department role.
log['resource_group'] = log['department_role']

# For every non-singleton group (at least two distinct resources), count the
# distinct cases the group worked on, then print the groups, most active first.
rg_case_num = [
    (rg_name, rg_events['case_id'].nunique())
    for rg_name, rg_events in log.groupby('resource_group')
    if rg_events['resource'].nunique() >= 2
]
print(sorted(rg_case_num, key=lambda pair: -pair[1]))

# The selection is currently pinned to a single group. To analyse the ten most
# active groups instead, use:
# sel_rgs = [name for name, _ in sorted(rg_case_num, key=lambda pair: -pair[1])[:10]]
sel_rgs = ['010-23']

[('010-23', 857), ('010-24', 839), ('070-6', 214), ('070-7', 152), ('010-25', 146), ('001-6', 97), ('001-7', 80), ('070-8', 67), ('000-4', 64), ('001-5', 53), ('010-6', 52), ('001-8', 48), ('010-7', 46), ('001-15', 44), ('010-8', 44), ('070-9', 36), ('110-8', 30), ('050-8', 29), ('001-17', 27), ('100-8', 26), ('040-10', 23), ('100-10', 22), ('020-10', 21), ('080-8', 21), ('070-15', 18), ('080-10', 17), ('090-10', 16), ('010-5', 15), ('070-10', 13), ('001-10', 11), ('090-8', 10), ('070-5', 7), ('010-10', 5), ('001-12', 4), ('001-28', 3), ('050-19', 3), ('101-19', 3), ('010-15', 2)]


### Profile group performance

### Profile group member performance
- productivity (activity duration)
- member assignment (number of events)

In [502]:
# One scatter chart per selected resource group. Every point is a group member,
# positioned by assignment (number of events, x) and productivity (mean activity
# duration in minutes, y). The charts are stacked vertically.
charts = []
for rg in sel_rgs:
    rg_events = log[log['resource_group'] == rg]

    # Min-max normalize the activity duration within each activity, so that
    # durations of different activities become comparable. The vectorized
    # groupby/transform replaces a per-activity loop and a row-wise apply, and
    # also works when the group has no events at all.
    per_activity = rg_events.groupby('activity')['activity_duration_minutes']
    duration_min = per_activity.transform('min')
    duration_range = per_activity.transform('max') - duration_min
    # NOTE: avoid division by 0 for activities whose durations are all equal;
    # 1 / 60 of a minute (one second) is used as the range instead.
    duration_range = duration_range.mask(duration_range == 0, 1 / 60)
    sublog = rg_events.assign(
        normalized_activity_duration_minutes=(
            (rg_events['activity_duration_minutes'] - duration_min) / duration_range
        )
    )

    # Per-member profile: mean raw and mean normalized duration, number of
    # events and number of distinct cases.
    group_member_perf = sublog.groupby([
        'resource_group', 'resource'
    ]).agg(
        mean_activity_duration=pd.NamedAgg(
            'activity_duration_minutes', aggfunc='mean'
        ),
        mean_normalized_activity_duration=pd.NamedAgg(
            'normalized_activity_duration_minutes', aggfunc='mean'
        ),
        num_events=pd.NamedAgg(
            'activity_duration_minutes', aggfunc='size'
        ),
        num_cases=pd.NamedAgg(
            'case_id', aggfunc='nunique'
        )
    ).reset_index()

    # The axes still show the raw mean duration; the normalized mean (which was
    # previously computed but never used) is exposed through the tooltip.
    # NOTE(review): consider plotting the normalized mean on the y axis if
    # members of a group work on different mixes of activities.
    charts.append(
        alt.Chart(group_member_perf, title=str(rg)).mark_circle(size=60).encode(
            x='num_events:Q',
            y='mean_activity_duration:Q',
            color='resource:N',
            tooltip=[
                'resource', 'num_events', 'num_cases',
                'mean_activity_duration', 'mean_normalized_activity_duration'
            ]
        )
    )
alt.vconcat(*charts).resolve_scale(x='independent', y='independent', color='independent')
        

### Identify resources with low utilization and low productivity

In [503]:
# For every selected resource group, place two views side by side:
#   - productivity: one bar chart per activity with the mean activity duration
#     of each resource (interpreted inversely: longer means less productive)
#   - utilization: a resource x activity heatmap of the number of events
# The per-group rows are stacked vertically.
charts = []
for rg in sel_rgs:
    sublog = log[log['resource_group'] == rg]

    # mean duration and number of events per (resource, activity) pair
    perf_by_activity = (
        sublog
        .groupby(['resource', 'activity'])
        .agg(
            mean_activity_duration=pd.NamedAgg(
                column='activity_duration_minutes', aggfunc='mean'
            ),
            num_events=pd.NamedAgg(
                column='activity_duration_minutes', aggfunc=len
            ),
        )
        .reset_index()
    )

    # productivity chart (based on duration, interpreted inversely)
    duration_bars = [
        alt.Chart(
            perf_by_activity[perf_by_activity['activity'] == act], title=act
        ).mark_bar().encode(
            x='resource:N',
            y='mean_activity_duration:Q'
        )
        for act in perf_by_activity['activity'].unique()
    ]
    chart_productivity = alt.vconcat(*duration_bars)

    # utilization chart
    chart_utilization = alt.Chart(perf_by_activity).mark_rect().encode(
        x='resource:N',
        y='activity:N',
        color=alt.Color('num_events:Q').scale(scheme='lightgreyred')
    )

    charts.append(alt.hconcat(chart_productivity, chart_utilization))
alt.vconcat(*charts)