In [264]:
import pandas as pd

In [315]:
# Load the event log selected by `log` into `df`.
#   preprocess = True  -> read the raw export from data/raw/, keep and rename the
#                         columns of interest, derive the TT-related (tt:*)
#                         attributes and cache the result in data/processed/.
#   preprocess = False -> read the previously cached frame from data/processed/.
# Column prefixes follow the grouping used in the rename maps below:
# r: resource-related, ct: CT-related, at: AT-related, tt: TT-related.
log = 'bpic17'

preprocess = False

if preprocess:
    fn = f'data/raw/{log}.csv'
else:
    fn = f'data/processed/{log}.csv'

if preprocess:
    # Exactly one branch must build `df`; anything else is rejected explicitly so
    # that a stale `df` left in the kernel is never re-processed and saved under
    # the wrong log name.
    if log == 'wabo':
        df = pd.read_csv(fn)[[
            'Case ID', 'Resource', 'Complete Timestamp',
            'org:group', 'group',
            'concept:name', 'responsible', 'department', 'channel'
        ]]

        df = df.rename(columns={
            # Resource-related
            'department': 'r:department',
            'org:group': 'r:org:group',
            'group': 'r:group',
            # CT-related
            'channel': 'ct:channel',
            # AT-related
            'concept:name': 'Activity',
        })

        # filter meaningless values
        df = df[~df['r:org:group'].isin(['EMPTY'])]
        # .copy(): columns are added below, so work on an independent frame
        df = df[~df['r:group'].isin([''])].copy()

    elif log == 'bpic17':
        df = pd.read_csv(fn)[[
            'Case ID', 'Activity', 'Resource', 'Complete Timestamp',
            'EventOrigin', 'LoanGoal', 'ApplicationType', 'RequestedAmount'
        ]]

        df = df.rename(columns={
            # Resource-related
            # CT-related
            'LoanGoal': 'ct:loan_goal',
            'ApplicationType': 'ct:application_type',
            'RequestedAmount': 'ct:requested_amount',
            # AT-related
            'EventOrigin': 'at:event_origin'
        })

        # filter meaningless values
        df = df[~df['ct:loan_goal'].isin(['Unknown'])].copy()

    elif log == 'bpic15':
        df = pd.read_csv(fn)[[
            'Case ID', 'Activity', 'Resource', 'Complete Timestamp',
            '(case) last_phase', '(case) parts', 'action_code', 'municipality'
        ]]
        df = df.rename(columns={
            # Resource-related
            'municipality': 'r:municipality',
            # CT-related
            '(case) last_phase': 'ct:last_phase',
            # AT-related
        })
        df = df.rename(columns={
            '(case) parts': 'case_parts'
        })

        # Derive 'ct:permit_type': 'Bouw' when it is one of the comma-separated
        # case parts, otherwise 'Non Bouw'. Rows without case parts are dropped.
        df = df[df['case_parts'].notna()].copy()
        df['ct:permit_type'] = df['case_parts'].apply(
            lambda parts: 'Bouw' if 'Bouw' in str(parts).split(',') else 'Non Bouw'
        )

        # only look at the main subprocess: "01_HOOFD"
        df = df[df['action_code'].notna()]
        df = df[df['action_code'].str.startswith('01_HOOFD')].copy()
        # Derive 'at:phase' from the first 10 characters of the action code.
        df['at:phase'] = df['action_code'].str[:10]

        # filter meaningless values

    else:
        # Includes 'bpic18', for which no preprocessing rules exist yet.
        raise NotImplementedError(f'No preprocessing rules defined for log {log!r}')

    # Universal (on Disco outputs)
    # derive and append TT related candidate attributes
    df['Complete Timestamp'] = pd.to_datetime(df['Complete Timestamp'], format='%Y-%m-%d %H:%M:%S.%f')
    completed = df['Complete Timestamp']
    MONTHS = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    WEEKDAYS = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    # Column-wise .dt accessors instead of a Python lambda per Timestamp.
    df['tt:month'] = completed.dt.month.map(dict(enumerate(MONTHS, start=1)))  # month is 1-based
    df['tt:day'] = completed.dt.day.map('Day_{}'.format)
    df['tt:weekday'] = completed.dt.dayofweek.map(dict(enumerate(WEEKDAYS)))   # Monday == 0
    df['tt:ampm'] = completed.dt.hour.lt(12).map({True: 'AM', False: 'PM'})

    print(df)
    df.to_csv(f'data/processed/{log}.csv')
else:
    # The cache was written with the index as its first column.
    # Note: no date parsing is requested here, so 'Complete Timestamp' is not
    # converted to datetime in this branch.
    df = pd.read_csv(fn, index_col=0)
    print(df)

                       Case ID                  Activity Resource  \
0        Application_652823628      A_Create Application   User_1   
1        Application_652823628               A_Submitted   User_1   
2        Application_652823628                 A_Concept   User_1   
3        Application_652823628                A_Accepted  User_52   
4        Application_652823628            O_Create Offer  User_52   
...                        ...                       ...      ...   
475301  Application_1350494635                 O_Created  User_96   
475302  Application_1350494635  O_Sent (mail and online)  User_96   
475303  Application_1350494635                A_Complete  User_96   
475304  Application_1350494635               A_Cancelled  User_28   
475305  Application_1350494635               O_Cancelled  User_28   

             Complete Timestamp at:event_origin            ct:loan_goal  \
0       2016-01-01 19:51:15.304     Application  Existing loan takeover   
1       2016-01-01 19

In [330]:
# Choose the candidate attribute (`attr`) to profile for the selected log.
# The commented-out assignments list the other candidates available per log;
# enable exactly one per branch.
if log == 'wabo':
#     attr = 'Activity'
#     attr = 'r:org:group'
#     attr = 'r:group'
#     attr = 'r:department'
#     attr = 'ct:channel'
#     attr = 'tt:month'
#     attr = 'tt:day'
#     attr = 'tt:weekday'
    attr = 'tt:ampm'

elif log == 'bpic17':
#     attr = 'Activity'
#     attr = 'ct:loan_goal'
#     attr = 'ct:application_type'
#     #attr = 'ct:requested_amount'
#     attr = 'at:event_origin'
#     attr = 'tt:month'
#     attr = 'tt:day'
#     attr = 'tt:weekday'
    attr = 'tt:ampm'

elif log == 'bpic15':
#     attr = 'Activity'
#     attr = 'r:municipality'
#     attr = 'ct:last_phase'
#     attr = 'ct:permit_type'
#     attr = 'at:phase'
#     attr = 'tt:month'
#     attr = 'tt:day'
#     attr = 'tt:weekday'
    attr = 'tt:ampm'

else:
    # Without this, `attr` would be undefined or silently reuse a stale value
    # left in the kernel by an earlier run.
    raise ValueError(f'No candidate attribute configured for log {log!r}')

# Number of events per (Resource, attribute value) pair.
counts = df.groupby(['Resource', attr]).size()

# Number of distinct attribute values observed for each resource.
l = counts.groupby(level=0).size().to_numpy()
print(l)
avg_val_per_resource = l.mean()

# Per-resource percentage of events for each attribute value, as a
# Resource x attribute-value table (NaN where a resource never shows a value).
# `transform('sum')` keeps the (Resource, attr) index unchanged. The previous
# `groupby(level=0).apply(...)` followed by `reset_index()` fails on pandas >= 2.0,
# where group keys are prepended for transform-like `apply` and duplicate the
# 'Resource' level.
share = 100 * counts / counts.groupby(level=0).transform('sum')
df_grouped = share.unstack(level=-1)
#print(df_grouped)

print(f'Shape: {len(df_grouped)} x {len(df_grouped.columns)}')

import matplotlib.pyplot as plt
import seaborn as sns
#f, ax = plt.subplots(figsize=(20, 20))
#ax = sns.heatmap(df_grouped.T, square=True, cbar=False, ax=ax)

[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2
 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
Shape: 144 x 2


In [331]:
# Clustering-tendency diagnostics on the per-resource profiles in `df_grouped`.
#
# Author's note on `pyclustertend.hopkins`: it is defined like the statistic at
# https://en.wikipedia.org/wiki/Hopkins_statistic, except that the complement is
# returned, i.e. Hopkins = 1 - H with H computed as on Wikipedia. The author reads
# a value closer to 1 as a strong clustering tendency.
# NOTE(review): if the package really returns 1 - H, it is H (not the returned
# value) that approaches 1 for clustered data -- confirm against the
# pyclustertend documentation before interpreting the number.
from pyclustertend import vat, ivat, hopkins
from sklearn.preprocessing import scale
from scipy.spatial.distance import pdist
from numpy import mean

# Missing (resource, value) combinations count as 0 %, then every column is standardized.
profile_matrix = df_grouped.fillna(0).to_numpy()
X = scale(profile_matrix)
# 20 % of the resources; only used by the (currently disabled) Hopkins runs.
sample_size = int(len(X) * 0.2)

# Boolean view of the standardized matrix so Hamming distance can be applied:
# True where the standardized value is positive.
B = X > 0

'''
# X-related
# avg pdist
avg_pdist = pdist(X).mean()
print(f'Avg. Pairwise distance (Euclidean): \n{avg_pdist}')
# hopkins stat
hopkins_stat = mean([hopkins(X, sampling_size=sample_size) for i in range(1000)])
print(f'Hopkins statistic averaged over 1k runs, sampling {sample_size} / {len(X)} (20%) points: \n{hopkins_stat}')
'''

# Binary matrix: mean Hamming distance over all pairs of resources.
hamming_dists = pdist(B, metric='hamming')
avg_pdist_bin = hamming_dists.mean()
print(f'Avg. Pairwise distance: \n{avg_pdist_bin}')
# Hopkins statistic on the binary matrix (disabled):
#hopkins_stat_bin = mean([hopkins(B, sampling_size=sample_size) for i in range(1000)])
#print(f'Hopkins statistic averaged over 1k runs, sampling {sample_size} / {len(B)} (20%) points: \n{hopkins_stat_bin}')


# One CSV-style summary line; the wider variants are kept for when the
# Euclidean / Hopkins figures are enabled again.
#print('{:.3f},{:.3f},{:.3f},{:.3f},{:.3f}'.format(avg_val_per_resource, avg_pdist, hopkins_stat, avg_pdist_bin, hopkins_stat_bin))
#print('{:.3f},{:.3f},{:.3f}'.format(avg_val_per_resource, avg_pdist_bin, hopkins_stat_bin))
print('{:.3f},{:.3f}'.format(avg_val_per_resource, avg_pdist_bin))

#ivat(X)

Avg. Pairwise distance: 
0.5019425019425019
1.986,0.502
