# Gold Plating

## Description
Introducing higher levels of regulation or conducting more work, or [...] the addition of features or services that are not required

In [1]:
import pandas as pd
import numpy as np
import altair as alt

In [2]:
ref_log_name = 'propr_ref'
test_log_name = 'propr_test'

fn_logs = {
    'propr': '../data/preproc/proprietary.csv',
    'propr_ref': '../data/preproc/propr_train-70.csv',
    'propr_test': '../data/preproc/propr_test-30.csv'
}

In [3]:
log = pd.read_csv(fn_logs[test_log_name])
print(log['case_id'].nunique())
log

519


Unnamed: 0,activity,case_id,start timestamp,complete timestamp,resource,Weekday,Cost,"School holidays; 0 = no, 1 = yes",Approval; 1 = low risk to 4 = high risk,Type of loan; 0 = normal; 1 = special,_start timestamp,_complete timestamp
0,Precheck,ID-2,6/05/2011 16:09,6/05/2011 16:27,090-10-02,Friday,"0,87 € per minute",0.0,1.0,0.0,2011-05-06 16:09:00,2011-05-06 16:27:00
1,Application_Processing_Branches,ID-2,9/05/2011 11:14,9/05/2011 11:16,010-23-13,Monday,"1,02 € per minute",0.0,1.0,0.0,2011-05-09 11:14:00,2011-05-09 11:16:00
2,Precheck,ID-2,9/05/2011 15:44,9/05/2011 15:58,090-10-02,Monday,"0,87 € per minute",0.0,1.0,0.0,2011-05-09 15:44:00,2011-05-09 15:58:00
3,Processing_Incomplete_Orders,ID-2,9/05/2011 16:10,9/05/2011 16:10,010-23-13,Monday,"1,02 € per minute",0.0,1.0,0.0,2011-05-09 16:10:00,2011-05-09 16:10:00
4,Application_Processing_Branches,ID-2,10/05/2011 8:52,10/05/2011 9:32,010-23-13,Tuesday,"1,02 € per minute",0.0,1.0,0.0,2011-05-10 08:52:00,2011-05-10 09:32:00
...,...,...,...,...,...,...,...,...,...,...,...,...
5285,Precheck,ID-1720,20/05/2011 17:24,20/05/2011 17:26,000-3-01,Friday,"0,87 € per minute",0.0,2.0,0.0,2011-05-20 17:24:00,2011-05-20 17:26:00
5286,Check_of_Processing_Applications,ID-1720,23/05/2011 15:53,23/05/2011 16:06,000-2-01,Monday,"0,87 € per minute",0.0,2.0,0.0,2011-05-23 15:53:00,2011-05-23 16:06:00
5287,Processing_of_Applications,ID-1720,26/05/2011 9:25,26/05/2011 9:25,010-23-07,Thursday,"1,02 € per minute",0.0,2.0,0.0,2011-05-26 09:25:00,2011-05-26 09:25:00
5288,Archieving,ID-1720,26/05/2011 10:38,26/05/2011 10:47,010-23-07,Thursday,"1,02 € per minute",0.0,2.0,0.0,2011-05-26 10:38:00,2011-05-26 10:47:00


In [4]:
# encode activity labels as short names for presentation purpose

print('There exist {} unique activity labels'.format(
    log['activity'].nunique()
))

# use initial letters for encoding
act_labels_short = dict()
for label in log['activity'].unique():
    label_short = ''.join([w[0] for w in label.split('_')])
    act_labels_short[label] = label_short
print('There are {} short labels after encoding'.format(
    len(act_labels_short.values())
))
for k in sorted(act_labels_short.keys()):
    print('{:>50} -> {:>30}'.format(k, act_labels_short[k]))

log['activity_short'] = log['activity'].apply(
    lambda x: act_labels_short[x]
)

log['activity'] = log['activity_short']

There exist 25 unique activity labels
There are 25 short labels after encoding
                              Administrative_Tasks ->                             AT
                   Application_Processing_Branches ->                            APB
                                          Approval ->                              A
                                   Approval_Branch ->                             AB
                          Approval_Executive_Board ->                            AEB
                                        Archieving ->                              A
                                 Check_of_Approval ->                            CoA
                                Check_of_Documents ->                            CoD
                  Check_of_Processing_Applications ->                           CoPA
                                               End ->                              E
                                   Further_inquiry ->                  

## Performance-GP-2

### Apply the given criteria to determine cases of interest (COI)

In [5]:
# proprietary: 
# case types as types of loan

log['case_type'] = log.apply(
    lambda row: (
        'high_risk_loan' if (
            row['Approval; 1 = low risk to 4 = high risk'] == 4
        ) else 
        'other_loan'
    ),
    axis=1
)

print(log['case_type'].unique())

ct_coi = 'high_risk_loan'

print(log.loc[log['case_type'] == ct_coi, 'case_id'].nunique())

# investigate only COI
log = log[log['case_type'] == ct_coi]

ct_coi

['other_loan' 'high_risk_loan']
8


'high_risk_loan'

### Check within COI for cases with rare activities and long duration time

In [6]:
# check rare activities within the COI cohort
case_dist = []
for case_id in log['case_id'].unique():
    act = set(log.loc[log['case_id'] == case_id, 'activity'].unique())
    act_others = set(log.loc[log['case_id'] != case_id, 'activity'].unique())

    case_dist.append({
        'case_id': case_id,
        'uniq_activities': ','.join(a for a in act.difference(act_others)),
        'num_uniq_activities': len(act.difference(act_others))
    })
case_dist = pd.DataFrame(case_dist)

for case_id, events in log.groupby('case_id'):
    activities = events['activity'].unique()
    print(case_id)
    print(events.iloc[0])
    print(sorted(activities))
case_dist

ID-11
activity                                                    AB
case_id                                                  ID-11
start timestamp                                9/03/2011 10:28
complete timestamp                             9/03/2011 10:55
resource                                             040-10-01
Weekday                                              Wednesday
Cost                                         0,87 € per minute
School holidays; 0 = no, 1 = yes                           0.0
Approval; 1 = low risk to 4 = high risk                    4.0
Type of loan; 0 = normal; 1 = special                      0.0
_start timestamp                           2011-03-09 10:28:00
_complete timestamp                        2011-03-09 10:55:00
activity_short                                              AB
case_type                                       high_risk_loan
Name: 29, dtype: object
['A', 'AB', 'AEB', 'APB', 'CoPA', 'E', 'P', 'PDEB', 'PoA']
ID-1655
activity             

Unnamed: 0,case_id,uniq_activities,num_uniq_activities
0,ID-11,,0
1,ID-33,"PA,CoD",2
2,ID-40,,0
3,ID-97,,0
4,ID-243,,0
5,ID-431,,0
6,ID-636,,0
7,ID-1655,,0


In [7]:
# check case duration
df_case_time = []
for case_id, events in log.groupby('case_id'):
    df_case_time.append({
        'case_id': case_id,
        'case_duration_minutes': (
            pd.to_datetime(events['_complete timestamp']).max() -
            pd.to_datetime(events['_start timestamp']).min()
        ).total_seconds() / 60,
        'case_type': events.iloc[0]['case_type']
    })
df_case_time = pd.DataFrame(df_case_time)
df_case_time

Unnamed: 0,case_id,case_duration_minutes,case_type
0,ID-11,40490.0,high_risk_loan
1,ID-1655,17372.0,high_risk_loan
2,ID-243,21932.0,high_risk_loan
3,ID-33,63351.0,high_risk_loan
4,ID-40,66302.0,high_risk_loan
5,ID-431,28713.0,high_risk_loan
6,ID-636,14466.0,high_risk_loan
7,ID-97,29009.0,high_risk_loan


In [8]:
alt.vconcat(
    alt.Chart(df_case_time).mark_boxplot().encode(
        x='case_duration_minutes:Q'
    ),
    alt.Chart(df_case_time).mark_bar().encode(
        x='case_id:O',
        y='case_duration_minutes:Q'
    )
)
# alt.Chart(df_case_time).mark_bar().encode(
#     x=alt.X('case_duration_minutes:Q').bin(),
#     y='count():Q'
# )