In [1]:
import pandas as pd
import seaborn as sns
import sklearn

In [2]:
# Set to True to rebuild 'data/input.csv' from the raw event log 'data/bpic17.csv'.
NEW_DATA = False

# A resource is kept as a class only if it accounts for at least this share of
# the events that remain after the "System" resource has been removed.
MIN_RESOURCE_SHARE = 0.02

MONTHS = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
WEEKDAYS = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']


def derive_time_attributes(timestamps):
    """Derive the TT-related candidate attributes from event timestamps.

    Uses the vectorized ``.dt`` accessors plus small lookup tables instead of
    a Python-level ``apply`` per row, which matters for large event logs.

    Parameters
    ----------
    timestamps : pandas.Series
        Series of datetime values (e.g. the result of ``pd.to_datetime``).

    Returns
    -------
    pandas.DataFrame
        Frame sharing the index of ``timestamps`` with the string columns
        'tt:month' ('Jan'..'Dec'), 'tt:day' ('Day_1'..'Day_31'),
        'tt:weekday' ('Mon'..'Sun') and 'tt:ampm' ('AM' for hours before 12,
        otherwise 'PM'). Missing timestamps (NaT) match no lookup key and
        therefore yield missing labels.
    """
    month_labels = dict(enumerate(MONTHS, start=1))   # .dt.month is 1-based
    weekday_labels = dict(enumerate(WEEKDAYS))        # .dt.dayofweek: Monday == 0
    day_labels = {day: 'Day_{}'.format(day) for day in range(1, 32)}
    ampm_labels = {hour: 'AM' if hour < 12 else 'PM' for hour in range(24)}

    return pd.DataFrame({
        'tt:month': timestamps.dt.month.map(month_labels),
        'tt:day': timestamps.dt.day.map(day_labels),
        'tt:weekday': timestamps.dt.dayofweek.map(weekday_labels),
        'tt:ampm': timestamps.dt.hour.map(ampm_labels),
    })


if NEW_DATA:
    # load and preprocess event log data
    raw_columns = [
        'Resource',
        'Case ID', 'Activity', 'Complete Timestamp',
        'LoanGoal', 'ApplicationType', 'RequestedAmount', 'OfferedAmount',
        'EventOrigin', 'Action'
    ]
    # usecols skips parsing of unused columns; indexing by the list afterwards
    # fixes the column order regardless of the order in the file.
    df = pd.read_csv('data/bpic17.csv', usecols=raw_columns)[raw_columns]

    # keep records of frequent human resources (classes) only
    # remove 'User_1' - the "System"
    df = df[df['Resource'] != 'User_1']

    # relative frequency of each remaining resource
    resource_occurrences = df['Resource'].value_counts(normalize=True)
    frequent_resources = set(
        resource_occurrences[resource_occurrences >= MIN_RESOURCE_SHARE].index
    )
    print(f'{len(frequent_resources)} resources: {frequent_resources}')
    df = df[df['Resource'].isin(frequent_resources)]

    # select and rename CT and AT related candidate attributes
    df = df.rename(columns={
        # CT-related
        'LoanGoal': 'ct:loan_goal',
        'ApplicationType': 'ct:application_type',
        'RequestedAmount': 'ct:requested_amount',
        'OfferedAmount': 'ct:offered_amount',
        # AT-related
        'EventOrigin': 'at:event_origin',
        'Action': 'at:action'
    })

    # derive and append TT related candidate attributes
    df['Complete Timestamp'] = pd.to_datetime(df['Complete Timestamp'], format='%Y-%m-%d %H:%M:%S.%f')
    df = df.join(derive_time_attributes(df['Complete Timestamp']))

    # The row index is written as well (first, unnamed column of the CSV).
    df.to_csv('data/input.csv')

In [3]:
# Attributes used as predictors; all of them are categorical and get one-hot
# encoded below. Other candidate attributes stored in 'data/input.csv':
# 'ct:loan_goal', 'ct:requested_amount', 'ct:offered_amount', 'at:action',
# 'tt:month', 'tt:day', 'tt:weekday'.
cat_features = [
    'ct:application_type',
    'at:event_origin',
    'tt:ampm'
]

# Target column plus the selected attributes. Deriving the selection from
# cat_features keeps the loaded columns and the encoded columns in sync.
selected_columns = ['Resource'] + cat_features

# usecols avoids parsing the columns that are not used; indexing by the list
# afterwards fixes the column order regardless of the order in the file.
df = pd.read_csv('data/input.csv', usecols=selected_columns)[selected_columns]

# One-hot encode the categorical attributes; the resulting indicator columns
# are named '<attribute>::<value>'. 'Resource' is left untouched.
df = pd.get_dummies(data=df, columns=cat_features, prefix_sep='::')

In [4]:
from sklearn import tree
# Imported explicitly: a bare `import sklearn` does not guarantee that the
# `sklearn.metrics` submodule has been loaded and is reachable as an attribute.
from sklearn.metrics import accuracy_score

# Every column except the target 'Resource' is used as a predictor.
features = [col for col in df.columns if col != 'Resource']
X = df[features]
y = df['Resource']

# Entropy-based splits; the tree depth is not limited (no max_depth is set).
# random_state is fixed so that repeated runs build the same tree.
clf = tree.DecisionTreeClassifier(criterion='entropy',
                                  random_state=0)
clf.fit(X, y)

# NOTE: the score is computed on the same (X, y) the tree was fitted on, so the
# displayed value is the training accuracy, not an estimate on unseen data.
scorer = accuracy_score
scorer(y, clf.predict(X))


0.31687396113945093

In [5]:
# plot the fitted tree

import graphviz

# class_names must follow the classifier's own class order (clf.classes_, which
# is sorted). pd.unique(y) would list the classes in order of first appearance
# and thereby attach wrong resource names to the nodes of the rendered tree.
dot_data = tree.export_graphviz(clf, out_file=None,
                                feature_names=features,
                                class_names=[str(label) for label in clf.classes_],
                                filled=True, rounded=True,
                                special_characters=True)

graph = graphviz.Source(dot_data)
# Writes the DOT source to 'tree' and renders it; the returned path of the
# rendered file is the cell output.
graph.render('tree')

'tree.pdf'