# Idling

## Description
When agents do not perform work during work time, but rather focus on non-work related activities or spend time socializing with their colleagues.

In [833]:
import pandas as pd
import numpy as np
import altair as alt
alt.data_transformers.enable('vegafusion')

DataTransformerRegistry.enable('vegafusion')

In [834]:
log_name = 'propr_test'

fn_logs = {
    'bpic15': '../data/preproc/bpic15.csv',
    'bpic15_ref': '../data/preproc/bpic15_train-70.csv',
    'bpic15_test': '../data/preproc/bpic15_test-30.csv',

    'bpic17': '../data/preproc/bpic2017.csv',

    'propr': '../data/preproc/proprietary.csv',
    'propr_ref': '../data/preproc/proprietary_train-70.csv',
    'propr_test': '../data/preproc/proprietary_test-30.csv'
}

In [835]:
log = pd.read_csv(fn_logs[log_name])
log

Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,"School holidays; 0 = no, 1 = yes",Approval; 1 = low sum to 4 = high sum,Type of loan; 0 = normal; 1 = special,Cost,_start timestamp,_complete timestamp
0,Precheck,0051574BDF30D681C1257888004DC40D,6/05/2011 16:09,6/05/2011 16:27,090-10-02,Friday,0.0,1.0,0.0,"0,87 € per minute",2011-05-06 16:09:00,2011-05-06 16:27:00
1,Application_Processing_Branches,0051574BDF30D681C1257888004DC40D,9/05/2011 11:14,9/05/2011 11:16,010-23-13,Monday,0.0,1.0,0.0,"1,02 € per minute",2011-05-09 11:14:00,2011-05-09 11:16:00
2,Precheck,0051574BDF30D681C1257888004DC40D,9/05/2011 15:44,9/05/2011 15:58,090-10-02,Monday,0.0,1.0,0.0,"0,87 € per minute",2011-05-09 15:44:00,2011-05-09 15:58:00
3,Processing_Incomplete_Orders,0051574BDF30D681C1257888004DC40D,9/05/2011 16:10,9/05/2011 16:10,010-23-13,Monday,0.0,1.0,0.0,"1,02 € per minute",2011-05-09 16:10:00,2011-05-09 16:10:00
4,Application_Processing_Branches,0051574BDF30D681C1257888004DC40D,10/05/2011 8:52,10/05/2011 9:32,010-23-13,Tuesday,0.0,1.0,0.0,"1,02 € per minute",2011-05-10 08:52:00,2011-05-10 09:32:00
...,...,...,...,...,...,...,...,...,...,...,...,...
5285,Precheck,FE3E1E0D3928202EC1257896004B8F00,20/05/2011 17:24,20/05/2011 17:26,000-3-01,Friday,0.0,2.0,0.0,"0,87 € per minute",2011-05-20 17:24:00,2011-05-20 17:26:00
5286,Check_of_Processing_Applications,FE3E1E0D3928202EC1257896004B8F00,23/05/2011 15:53,23/05/2011 16:06,000-2-01,Monday,0.0,2.0,0.0,"0,87 € per minute",2011-05-23 15:53:00,2011-05-23 16:06:00
5287,Processing_of_Applications,FE3E1E0D3928202EC1257896004B8F00,26/05/2011 9:25,26/05/2011 9:25,010-23-07,Thursday,0.0,2.0,0.0,"1,02 € per minute",2011-05-26 09:25:00,2011-05-26 09:25:00
5288,Archieving,FE3E1E0D3928202EC1257896004B8F00,26/05/2011 10:38,26/05/2011 10:47,010-23-07,Thursday,0.0,2.0,0.0,"1,02 € per minute",2011-05-26 10:38:00,2011-05-26 10:47:00


In [836]:
# encode activity labels as short names for presentation purpose

print('There exist {} unique activity labels'.format(
    log['activity'].nunique()
))

# use initial letters for encoding
act_labels_short = dict()
for label in log['activity'].unique():
    label_short = ''.join([w[0] for w in label.split('_')])
    act_labels_short[label] = label_short
print('There are {} short labels after encoding'.format(
    len(act_labels_short.values())
))
for k in sorted(act_labels_short.keys()):
    print('{:>50} -> {:>30}'.format(k, act_labels_short[k]))

log['activity_short'] = log['activity'].apply(
    lambda x: act_labels_short[x]
)

# add activity duration
log['start timestamp'] = pd.to_datetime(log['start timestamp'], format='mixed', dayfirst=True)
log['complete timestamp'] = pd.to_datetime(log['complete timestamp'], 
format='mixed', dayfirst=True)
# duration measured in minutes
log['activity duration'] = (
    log['complete timestamp'] - log['start timestamp']
).dt.total_seconds() / 60

log['activity'] = log['activity_short']

log

There exist 25 unique activity labels
There are 25 short labels after encoding
                              Administrative_Tasks ->                             AT
                   Application_Processing_Branches ->                            APB
                                          Approval ->                              A
                                   Approval_Branch ->                             AB
                          Approval_Executive_Board ->                            AEB
                                        Archieving ->                              A
                                 Check_of_Approval ->                            CoA
                                Check_of_Documents ->                            CoD
                  Check_of_Processing_Applications ->                           CoPA
                                               End ->                              E
                                   Further_inquiry ->                  

Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,"School holidays; 0 = no, 1 = yes",Approval; 1 = low sum to 4 = high sum,Type of loan; 0 = normal; 1 = special,Cost,_start timestamp,_complete timestamp,activity_short,activity duration
0,P,0051574BDF30D681C1257888004DC40D,2011-05-06 16:09:00,2011-05-06 16:27:00,090-10-02,Friday,0.0,1.0,0.0,"0,87 € per minute",2011-05-06 16:09:00,2011-05-06 16:27:00,P,18.0
1,APB,0051574BDF30D681C1257888004DC40D,2011-05-09 11:14:00,2011-05-09 11:16:00,010-23-13,Monday,0.0,1.0,0.0,"1,02 € per minute",2011-05-09 11:14:00,2011-05-09 11:16:00,APB,2.0
2,P,0051574BDF30D681C1257888004DC40D,2011-05-09 15:44:00,2011-05-09 15:58:00,090-10-02,Monday,0.0,1.0,0.0,"0,87 € per minute",2011-05-09 15:44:00,2011-05-09 15:58:00,P,14.0
3,PIO,0051574BDF30D681C1257888004DC40D,2011-05-09 16:10:00,2011-05-09 16:10:00,010-23-13,Monday,0.0,1.0,0.0,"1,02 € per minute",2011-05-09 16:10:00,2011-05-09 16:10:00,PIO,0.0
4,APB,0051574BDF30D681C1257888004DC40D,2011-05-10 08:52:00,2011-05-10 09:32:00,010-23-13,Tuesday,0.0,1.0,0.0,"1,02 € per minute",2011-05-10 08:52:00,2011-05-10 09:32:00,APB,40.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5285,P,FE3E1E0D3928202EC1257896004B8F00,2011-05-20 17:24:00,2011-05-20 17:26:00,000-3-01,Friday,0.0,2.0,0.0,"0,87 € per minute",2011-05-20 17:24:00,2011-05-20 17:26:00,P,2.0
5286,CoPA,FE3E1E0D3928202EC1257896004B8F00,2011-05-23 15:53:00,2011-05-23 16:06:00,000-2-01,Monday,0.0,2.0,0.0,"0,87 € per minute",2011-05-23 15:53:00,2011-05-23 16:06:00,CoPA,13.0
5287,PoA,FE3E1E0D3928202EC1257896004B8F00,2011-05-26 09:25:00,2011-05-26 09:25:00,010-23-07,Thursday,0.0,2.0,0.0,"1,02 € per minute",2011-05-26 09:25:00,2011-05-26 09:25:00,PoA,0.0
5288,A,FE3E1E0D3928202EC1257896004B8F00,2011-05-26 10:38:00,2011-05-26 10:47:00,010-23-07,Thursday,0.0,2.0,0.0,"1,02 € per minute",2011-05-26 10:38:00,2011-05-26 10:47:00,A,9.0


In [837]:
# proprietary
log['department'] = log['resource'].apply(
    lambda x: x.split('-')[0]
)
log['department_role'] = log['resource'].apply(
    lambda x: x.split('-')[0] + '-' + x.split('-')[1]
)
log

Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,"School holidays; 0 = no, 1 = yes",Approval; 1 = low sum to 4 = high sum,Type of loan; 0 = normal; 1 = special,Cost,_start timestamp,_complete timestamp,activity_short,activity duration,department,department_role
0,P,0051574BDF30D681C1257888004DC40D,2011-05-06 16:09:00,2011-05-06 16:27:00,090-10-02,Friday,0.0,1.0,0.0,"0,87 € per minute",2011-05-06 16:09:00,2011-05-06 16:27:00,P,18.0,090,090-10
1,APB,0051574BDF30D681C1257888004DC40D,2011-05-09 11:14:00,2011-05-09 11:16:00,010-23-13,Monday,0.0,1.0,0.0,"1,02 € per minute",2011-05-09 11:14:00,2011-05-09 11:16:00,APB,2.0,010,010-23
2,P,0051574BDF30D681C1257888004DC40D,2011-05-09 15:44:00,2011-05-09 15:58:00,090-10-02,Monday,0.0,1.0,0.0,"0,87 € per minute",2011-05-09 15:44:00,2011-05-09 15:58:00,P,14.0,090,090-10
3,PIO,0051574BDF30D681C1257888004DC40D,2011-05-09 16:10:00,2011-05-09 16:10:00,010-23-13,Monday,0.0,1.0,0.0,"1,02 € per minute",2011-05-09 16:10:00,2011-05-09 16:10:00,PIO,0.0,010,010-23
4,APB,0051574BDF30D681C1257888004DC40D,2011-05-10 08:52:00,2011-05-10 09:32:00,010-23-13,Tuesday,0.0,1.0,0.0,"1,02 € per minute",2011-05-10 08:52:00,2011-05-10 09:32:00,APB,40.0,010,010-23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5285,P,FE3E1E0D3928202EC1257896004B8F00,2011-05-20 17:24:00,2011-05-20 17:26:00,000-3-01,Friday,0.0,2.0,0.0,"0,87 € per minute",2011-05-20 17:24:00,2011-05-20 17:26:00,P,2.0,000,000-3
5286,CoPA,FE3E1E0D3928202EC1257896004B8F00,2011-05-23 15:53:00,2011-05-23 16:06:00,000-2-01,Monday,0.0,2.0,0.0,"0,87 € per minute",2011-05-23 15:53:00,2011-05-23 16:06:00,CoPA,13.0,000,000-2
5287,PoA,FE3E1E0D3928202EC1257896004B8F00,2011-05-26 09:25:00,2011-05-26 09:25:00,010-23-07,Thursday,0.0,2.0,0.0,"1,02 € per minute",2011-05-26 09:25:00,2011-05-26 09:25:00,PoA,0.0,010,010-23
5288,A,FE3E1E0D3928202EC1257896004B8F00,2011-05-26 10:38:00,2011-05-26 10:47:00,010-23-07,Thursday,0.0,2.0,0.0,"1,02 € per minute",2011-05-26 10:38:00,2011-05-26 10:47:00,A,9.0,010,010-23


In [838]:
sel_resources = log.loc[log['department_role'] == '010-24', 'resource'].unique()
print(len(sel_resources))

11


## Social-Idling-1

In [839]:
log = log.sort_values(by='start timestamp')
log['start timestamp'] = pd.to_datetime(log['start timestamp'], format='mixed', dayfirst=True)
log['complete timestamp'] = pd.to_datetime(log['complete timestamp'], format='mixed', dayfirst=True)


### Calculate activity duration for each resource

In [840]:
df_res_actdur = log[[
    'activity', 'case_id', 'start timestamp', 'complete timestamp',
    'resource'
]]
df_res_actdur['activity duration'] = (
    df_res_actdur['complete timestamp'] -
    df_res_actdur['start timestamp']
).dt.total_seconds() / 60
df_res_actdur

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_res_actdur['activity duration'] = (


Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,activity duration
2709,APB,7389AEB4BBE73512C125784200527D84,2011-02-25 16:01:00,2011-02-25 16:10:00,070-6-05,9.0
3110,P,89031090D8B79148C12578420052F4F6,2011-02-25 16:06:00,2011-02-25 16:19:00,070-9-2,13.0
3414,P,9C7A67B58C33BC60C125784500290BD1,2011-02-28 08:28:00,2011-02-28 09:02:00,001-6-02,34.0
4978,AB,EEBDCCE3EDF9A9BAC1257845002E8C51,2011-02-28 09:28:00,2011-02-28 11:54:00,020-11-01,146.0
1828,APB,4E9BA47E42469777C1257845002F7B0F,2011-02-28 09:38:00,2011-02-28 11:34:00,010-6-05,116.0
...,...,...,...,...,...,...
1914,PA,512AA7EECCD407EFC125789B0023D06C,2011-05-31 15:46:00,2011-05-31 15:51:00,010-25-01,5.0
1915,RB,512AA7EECCD407EFC125789B0023D06C,2011-05-31 15:51:00,2011-05-31 15:53:00,010-25-01,2.0
4358,P,CB9C3E3343BFDA3FC12578A00041CC6E,2011-05-31 16:20:00,2011-05-31 16:21:00,010-21-01,1.0
1659,AB,4785DF959AF563EEC12578A10045BD94,2011-05-31 16:29:00,2011-05-31 16:30:00,010-21-01,1.0


### Calculate waiting time between activity instances for each resource

In [841]:
df_res_waiting = list()
for res, events in log.groupby('resource'):
    for i in range(len(events) - 1):
        pre_event = events.iloc[i]
        next_event = events.iloc[i+1]
        waiting_time = (
            next_event['start timestamp'] - 
            pre_event['complete timestamp']
        )
        df_res_waiting.append({
            'resource': res,
            'preceding case_id': pre_event['case_id'],
            'preceding activity': pre_event['activity'],
            'preceding activity completion': pre_event['complete timestamp'],
            'next case_id': next_event['case_id'],
            'next activity': next_event['activity'],
            'next activity start': next_event['start timestamp'],
            'activity_pair': pre_event['activity'] + '->' + next_event['activity'],
            'waiting time': waiting_time
        })
df_res_waiting = pd.DataFrame(df_res_waiting)
df_res_waiting['waiting time'] =  df_res_waiting['waiting time'].dt.total_seconds() / 60
# NOTE: only non-negative waiting time makes sense
df_res_waiting = df_res_waiting[df_res_waiting['waiting time'] >= 0]
# NOTE: consider only waiting time within the same case
df_res_waiting = df_res_waiting.loc[df_res_waiting['preceding case_id'] == df_res_waiting['next case_id'], :]

df_res_waiting = df_res_waiting[df_res_waiting['resource'].isin(sel_resources)]
df_res_waiting

Unnamed: 0,resource,preceding case_id,preceding activity,preceding activity completion,next case_id,next activity,next activity start,activity_pair,waiting time
2101,010-24-01,514AC430CC96C92EC125784E004E07E3,PoA,2011-03-14 12:29:00,514AC430CC96C92EC125784E004E07E3,PoA,2011-03-14 12:29:00,PoA->PoA,0.0
2102,010-24-01,514AC430CC96C92EC125784E004E07E3,PoA,2011-03-14 13:06:00,514AC430CC96C92EC125784E004E07E3,GSoC,2011-03-14 14:00:00,PoA->GSoC,54.0
2104,010-24-01,5786300DD9C876E5C125784D002A33A1,PoA,2011-03-17 14:52:00,5786300DD9C876E5C125784D002A33A1,GSoC,2011-03-17 14:52:00,PoA->GSoC,0.0
2106,010-24-01,0BFBC318631D1240C125784D002B64FD,PoA,2011-03-17 16:07:00,0BFBC318631D1240C125784D002B64FD,GSoC,2011-03-17 16:07:00,PoA->GSoC,0.0
2109,010-24-01,294185B20DDA8523C1257846002E877F,GSoC,2011-03-22 09:34:00,294185B20DDA8523C1257846002E877F,GSoC,2011-03-22 09:49:00,GSoC->GSoC,15.0
...,...,...,...,...,...,...,...,...,...
3689,010-24-26,C9C2D6BFBC69F3A1C125789C002B03B0,P,2011-05-26 17:01:00,C9C2D6BFBC69F3A1C125789C002B03B0,RB,2011-05-26 17:08:00,P->RB,7.0
3699,010-24-26,04BA004371DFBC4FC125786F00296277,GSoC,2011-05-30 09:48:00,04BA004371DFBC4FC125786F00296277,GSoC,2011-05-30 09:53:00,GSoC->GSoC,5.0
3701,010-24-26,18E50FC00A0C41B9C12578A000285091,RB,2011-05-30 10:16:00,18E50FC00A0C41B9C12578A000285091,CoPA,2011-05-30 11:48:00,RB->CoPA,92.0
3705,010-24-26,D8B099506955A137C125789C002C9C6C,PSP,2011-05-30 15:24:00,D8B099506955A137C125789C002C9C6C,PoAS,2011-05-31 07:34:00,PSP->PoAS,970.0


### Identify resources with higher activity duration and waiting time

In [842]:
# activity duration
df_res_mean_actdur = df_res_actdur.groupby(['resource', 'activity']).agg(
    mean_duration=pd.NamedAgg('activity duration', aggfunc='mean')
).reset_index()
df_range_actdur = df_res_actdur.groupby(['activity']).agg(
    min_act_duration=pd.NamedAgg('activity duration', aggfunc='min'),
    max_act_duration=pd.NamedAgg('activity duration', aggfunc='max'),
).reset_index()
df_res_mean_actdur = df_res_mean_actdur.merge(
    df_range_actdur, how='left', on='activity'
)
# normalize
df_res_mean_actdur['normalized_mean_duration'] = (
    (df_res_mean_actdur['mean_duration'] - df_res_mean_actdur['min_act_duration']) / 
    (df_res_mean_actdur['max_act_duration'] - df_res_mean_actdur['min_act_duration'])
)
df_res_mean_actdur = df_res_mean_actdur[df_res_mean_actdur['resource'].isin(df_res_waiting['resource'].unique())]

# waiting time
df_res_waiting = df_res_waiting[df_res_waiting['waiting time'] <= 60 * 8]

alt.vconcat(
    alt.Chart(
        df_res_waiting,
    ).mark_boxplot().encode(
        y=alt.Y('resource:O'),
        x=alt.X('waiting time:Q'),
    ),
    # alt.Chart(
    #     df_res_waiting,
    #     title='Mean of average waiting time by resource'
    # ).mark_bar().encode(
    #     y=alt.Y('resource:O'),
    #     x=alt.X('mean(waiting time):Q').title('Mean waiting time (minutes)'),
    #     tooltip=['mean(waiting time)']
    # ),
    alt.layer(
        alt.Chart(
            df_res_waiting,
        ).mark_rect().encode(
            y=alt.Y('resource:O'),
            x=alt.X('next activity:N'),
            # x=alt.X('activity_pair:N'),
            color=alt.Color('mean(waiting time):Q').scale(scheme='lightgreyred')
        ),
        alt.Chart(
            df_res_waiting
        ).mark_text(baseline='middle').encode(
            y=alt.Y('resource:O'),
            x=alt.X('next activity:N'),
            # x=alt.X('activity_pair:N'),
            text=alt.Text('mean(waiting time):Q', format='.0f')
        )
    ),
    # alt.layer(
    #     alt.Chart(
    #         df_res_mean_actdur,
    #         title='Mean activity duration by resource per activity'
    #     ).mark_rect().encode(
    #         y=alt.Y('resource:O'),
    #         x=alt.X('activity:N'),
    #         color=alt.Color('normalized_mean_duration:Q').scale(scheme='lightgreyred'),
    #         tooltip=['resource', 'activity', 'mean_duration', 'normalized_mean_duration']
    #     ),
    #     alt.Chart(
    #         df_res_mean_actdur
    #     ).mark_text(baseline='middle').encode(
    #         y=alt.Y('resource:O'),
    #         x=alt.X('activity:N'),
    #         text=alt.Text('mean_duration:Q', format='.0f')
    #     )
    # )
).resolve_scale(color='independent')