# Social Loafing

## Description
A group member exerts less effort than their peers, hoping for a free (or cheap) ride on the work of the rest of the group.

In [1]:
import pandas as pd
import numpy as np
import altair as alt
from sklearn.cluster import AgglomerativeClustering
# Switch Altair to the 'vegafusion' data transformer for every chart built below.
# NOTE(review): presumably enabled so charts can be built from frames larger than
# Altair's default row limit — confirm against the Altair documentation.
alt.data_transformers.enable('vegafusion')

DataTransformerRegistry.enable('vegafusion')

In [2]:
# Available event logs, keyed by a short log name; values are CSV paths
# relative to this notebook.
fn_logs = dict(
    propr='../data/preproc/proprietary.csv',
    propr_ref='../data/preproc/propr_train-70.csv',
    propr_test='../data/preproc/propr_test-30.csv',
)

# Key into `fn_logs` that selects which log the notebook loads.
log_name = 'propr_test'

In [3]:
# Read the selected event log from its CSV file and display it.
log_path = fn_logs[log_name]
log = pd.read_csv(log_path)
log

Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,Cost,"School holidays; 0 = no, 1 = yes",Approval; 1 = low risk to 4 = high risk,Type of loan; 0 = normal; 1 = special,_start timestamp,_complete timestamp
0,Precheck,ID-2,6/05/2011 16:09,6/05/2011 16:27,090-10-02,Friday,"0,87 € per minute",0.0,1.0,0.0,2011-05-06 16:09:00,2011-05-06 16:27:00
1,Application_Processing_Branches,ID-2,9/05/2011 11:14,9/05/2011 11:16,010-23-13,Monday,"1,02 € per minute",0.0,1.0,0.0,2011-05-09 11:14:00,2011-05-09 11:16:00
2,Precheck,ID-2,9/05/2011 15:44,9/05/2011 15:58,090-10-02,Monday,"0,87 € per minute",0.0,1.0,0.0,2011-05-09 15:44:00,2011-05-09 15:58:00
3,Processing_Incomplete_Orders,ID-2,9/05/2011 16:10,9/05/2011 16:10,010-23-13,Monday,"1,02 € per minute",0.0,1.0,0.0,2011-05-09 16:10:00,2011-05-09 16:10:00
4,Application_Processing_Branches,ID-2,10/05/2011 8:52,10/05/2011 9:32,010-23-13,Tuesday,"1,02 € per minute",0.0,1.0,0.0,2011-05-10 08:52:00,2011-05-10 09:32:00
...,...,...,...,...,...,...,...,...,...,...,...,...
5285,Precheck,ID-1720,20/05/2011 17:24,20/05/2011 17:26,000-3-01,Friday,"0,87 € per minute",0.0,2.0,0.0,2011-05-20 17:24:00,2011-05-20 17:26:00
5286,Check_of_Processing_Applications,ID-1720,23/05/2011 15:53,23/05/2011 16:06,000-2-01,Monday,"0,87 € per minute",0.0,2.0,0.0,2011-05-23 15:53:00,2011-05-23 16:06:00
5287,Processing_of_Applications,ID-1720,26/05/2011 9:25,26/05/2011 9:25,010-23-07,Thursday,"1,02 € per minute",0.0,2.0,0.0,2011-05-26 09:25:00,2011-05-26 09:25:00
5288,Archieving,ID-1720,26/05/2011 10:38,26/05/2011 10:47,010-23-07,Thursday,"1,02 € per minute",0.0,2.0,0.0,2011-05-26 10:38:00,2011-05-26 10:47:00


In [4]:
# Optional filter (currently disabled): restrict the analysis to special cases only.
# For the proprietary log, these are the special loans.
# log = log[log['Type of loan; 0 = normal; 1 = special'] == 1]

In [5]:
# proprietary
# Resource ids are dash-separated strings such as "090-10-02". The first part is
# stored as `department`, the first two parts (re-joined with "-") as
# `department_role`.
# The ids are split once with the vectorized string accessor instead of running
# a Python lambda per row that splits each id repeatedly. For well-formed ids the
# result is identical; a missing id or one without a second part yields NaN
# instead of raising.
resource_parts = log['resource'].str.split('-')
log['department'] = resource_parts.str[0]
log['department_role'] = resource_parts.str[0] + '-' + resource_parts.str[1]
log

Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,Cost,"School holidays; 0 = no, 1 = yes",Approval; 1 = low risk to 4 = high risk,Type of loan; 0 = normal; 1 = special,_start timestamp,_complete timestamp,department,department_role
0,Precheck,ID-2,6/05/2011 16:09,6/05/2011 16:27,090-10-02,Friday,"0,87 € per minute",0.0,1.0,0.0,2011-05-06 16:09:00,2011-05-06 16:27:00,090,090-10
1,Application_Processing_Branches,ID-2,9/05/2011 11:14,9/05/2011 11:16,010-23-13,Monday,"1,02 € per minute",0.0,1.0,0.0,2011-05-09 11:14:00,2011-05-09 11:16:00,010,010-23
2,Precheck,ID-2,9/05/2011 15:44,9/05/2011 15:58,090-10-02,Monday,"0,87 € per minute",0.0,1.0,0.0,2011-05-09 15:44:00,2011-05-09 15:58:00,090,090-10
3,Processing_Incomplete_Orders,ID-2,9/05/2011 16:10,9/05/2011 16:10,010-23-13,Monday,"1,02 € per minute",0.0,1.0,0.0,2011-05-09 16:10:00,2011-05-09 16:10:00,010,010-23
4,Application_Processing_Branches,ID-2,10/05/2011 8:52,10/05/2011 9:32,010-23-13,Tuesday,"1,02 € per minute",0.0,1.0,0.0,2011-05-10 08:52:00,2011-05-10 09:32:00,010,010-23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5285,Precheck,ID-1720,20/05/2011 17:24,20/05/2011 17:26,000-3-01,Friday,"0,87 € per minute",0.0,2.0,0.0,2011-05-20 17:24:00,2011-05-20 17:26:00,000,000-3
5286,Check_of_Processing_Applications,ID-1720,23/05/2011 15:53,23/05/2011 16:06,000-2-01,Monday,"0,87 € per minute",0.0,2.0,0.0,2011-05-23 15:53:00,2011-05-23 16:06:00,000,000-2
5287,Processing_of_Applications,ID-1720,26/05/2011 9:25,26/05/2011 9:25,010-23-07,Thursday,"1,02 € per minute",0.0,2.0,0.0,2011-05-26 09:25:00,2011-05-26 09:25:00,010,010-23
5288,Archieving,ID-1720,26/05/2011 10:38,26/05/2011 10:47,010-23-07,Thursday,"1,02 € per minute",0.0,2.0,0.0,2011-05-26 10:38:00,2011-05-26 10:47:00,010,010-23


## Social-SL-1

In [6]:
# Convert the timestamp columns to datetimes to calculate activity durations.
#
# Parse BEFORE sorting: the raw columns hold day-first strings such as
# "6/05/2011 16:09", and sorting those as text orders the events
# lexicographically ("10/05/2011 ..." would come before "6/05/2011 ...")
# rather than chronologically.
log['start timestamp'] = pd.to_datetime(log['start timestamp'], format='mixed', dayfirst=True)
log['complete timestamp'] = pd.to_datetime(log['complete timestamp'], format='mixed', dayfirst=True)

# Stable sort so events sharing a start timestamp keep their original relative order.
log = log.sort_values(by='start timestamp', kind='stable')

# Duration of each event in minutes (complete - start).
log['activity_duration_minutes'] = (
    log['complete timestamp'] - log['start timestamp']
).dt.total_seconds() / 60

### Determine groups/teams

In [7]:
# proprietary
# A resource group is identified by the first two parts of the resource id.
log['resource_group'] = log['department_role']

# For every group with more than one distinct resource (singleton groups are
# left out), count the distinct cases the group worked on.
rg_case_num = [
    (group_id, group_events['case_id'].nunique())
    for group_id, group_events in log.groupby('resource_group')
    if group_events['resource'].nunique() > 1
]

# Show the groups ranked from most to fewest cases.
ranked_groups = sorted(rg_case_num, key=lambda pair: pair[1], reverse=True)
print(ranked_groups)

# The group to analyse is fixed by hand; an automatic top-10 selection is kept
# below (disabled) for reference.
# sel_rgs = [group_id for group_id, _ in ranked_groups[:10]]
sel_rgs = ['010-23']

[('010-23', 269), ('010-24', 223), ('010-25', 63), ('070-6', 55), ('070-7', 43), ('001-6', 31), ('001-7', 26), ('001-15', 20), ('001-8', 20), ('000-4', 17), ('001-5', 17), ('010-7', 16), ('010-6', 15), ('010-8', 15), ('050-8', 15), ('070-8', 15), ('100-8', 13), ('001-17', 8), ('020-10', 8), ('100-10', 8), ('010-5', 7), ('040-10', 7), ('110-8', 7), ('080-10', 6), ('070-10', 5), ('001-10', 4), ('070-15', 4), ('080-8', 4), ('090-8', 3), ('010-10', 2), ('090-10', 2)]


### Profile group performance

### Profile group member performance
- productivity (activity duration)
- member assignment (number of events)

In [8]:
# Profile the members of every selected resource group. One scatter chart per group:
#   x     = utilization  (number of events the member completed)
#   y     = productivity (inverse of the member's mean activity duration)
#   shape = cluster found by agglomerative clustering on
#           (mean activity duration, number of events)
charts = []
for rg in sel_rgs:
    sublog = log[log['resource_group'] == rg]

    # Per-activity min/max duration, used to min-max normalize durations
    # within each activity.
    act_dur_min_max = (
        sublog.groupby('activity')['activity_duration_minutes']
        .agg(duration_min='min', duration_max='max')
        .reset_index()
    )
    act_dur_min_max['duration_range'] = (
        act_dur_min_max['duration_max'] - act_dur_min_max['duration_min']
    )
    # NOTE: avoid division by 0 (activities whose min and max duration coincide)
    act_dur_min_max.loc[act_dur_min_max['duration_range'] == 0, 'duration_range'] = 1 / 60
    sublog = sublog.merge(act_dur_min_max, how='left', on='activity')
    sublog['normalized_activity_duration_minutes'] = (
        (sublog['activity_duration_minutes'] - sublog['duration_min'])
        / sublog['duration_range']
    )
    # NOTE(review): the normalized duration computed above is not used by the
    # aggregation below, which averages the raw durations — confirm which one
    # is intended.

    group_member_perf = sublog.groupby([
        'resource_group', 'resource'
    ]).agg(
        mean_activity_duration=pd.NamedAgg(
            'activity_duration_minutes', aggfunc='mean'
        ),
        num_events=pd.NamedAgg(
            'activity_duration_minutes', aggfunc=len
        ),
        num_cases=pd.NamedAgg(
            'case_id', aggfunc='nunique'
        )
    ).reset_index()

    # consider only active resources with at least 10 cases
    # (.copy() so the column assignments below write to an independent frame
    # instead of a slice of the aggregate)
    group_member_perf = group_member_perf[
        group_member_perf['num_cases'] >= 10
    ].copy()
    if group_member_perf.empty:
        # nothing to cluster or plot for this group
        continue

    # A mean duration of 0 produces an infinite value here.
    group_member_perf['inv_mean_activity_duration'] = 1 / group_member_perf['mean_activity_duration']

    X = group_member_perf[['resource', 'mean_activity_duration', 'num_events']].set_index('resource')
    if len(X) >= 2:
        # Never ask for more clusters than there are members to cluster.
        clustering = AgglomerativeClustering(n_clusters=min(3, len(X))).fit(X)
        # labels_ follows the row order of X, which is the row order of
        # group_member_perf; clusters are numbered from 1.
        group_member_perf['cluster'] = clustering.labels_ + 1
    else:
        # A single member cannot be clustered; put it in cluster 1.
        group_member_perf['cluster'] = 1

    charts.append(alt.Chart(group_member_perf).mark_point(size=60).encode(
        x=alt.X('num_events:Q').title(['resource utilization', '(number of events completed)']),
        y=alt.Y('inv_mean_activity_duration:Q').title(['resource productivity', '(inverse of mean activity duration time)']),
        color=alt.Color('resource:N').title('resource ID'),
        shape=alt.Shape('cluster:N'),
        tooltip=['resource', 'num_events', 'num_cases', 'mean_activity_duration', 'inv_mean_activity_duration']
    ).properties(width=300, height=200))
alt.vconcat(*charts).resolve_scale(x='independent', y='independent', color='independent')
        

### Identify resources with low utilization and low productivity

In [9]:
# For every selected resource group, place two views side by side:
#   left  - one bar chart per activity showing each resource's mean activity
#           duration (productivity, interpreted inversely)
#   right - a resource x activity heatmap of the number of events (utilization)
charts = []
for rg in sel_rgs:
    group_events = log[log['resource_group'] == rg]

    # Mean duration and event count for every (resource, activity) pair.
    per_resource_activity = (
        group_events
        .groupby(['resource', 'activity'])
        .agg(
            mean_activity_duration=('activity_duration_minutes', 'mean'),
            num_events=('activity_duration_minutes', len),
        )
        .reset_index()
    )

    # productivity: one bar chart per activity, stacked vertically
    duration_bars = [
        alt.Chart(
            per_resource_activity[per_resource_activity['activity'] == act],
            title=act,
        ).mark_bar().encode(
            x='resource:N',
            y='mean_activity_duration:Q',
        )
        for act in per_resource_activity['activity'].unique()
    ]
    chart_productivity = alt.vconcat(*duration_bars)

    # utilization: heatmap of event counts
    chart_utilization = alt.Chart(per_resource_activity).mark_rect().encode(
        x='resource:N',
        y='activity:N',
        color=alt.Color('num_events:Q').scale(scheme='lightgreyred'),
    )

    charts.append(alt.hconcat(chart_productivity, chart_utilization))
alt.vconcat(*charts)