# Load CSV/Parquet, change some necessary headers etc and transform to pm4py log object

## Imports etc

In [1]:
import pandas as pd
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.log.util import dataframe_utils

import utils

READ_PARQUET = True  # True: load the cached parquet file (faster, but it must have been created first by running once with False)

## Read data as DF

In [2]:
# Either rebuild the parquet cache from the CSV or read the existing cache,
# depending on the READ_PARQUET switch.
if not READ_PARQUET:
    print("Reading csv and savings as parquet.")
    df = utils.load_csv()
    utils.save_parquet(df)  # write the cache so later runs can take the parquet path
else:
    print("Reading parquet.")
    df = utils.load_parquet()

Reading parquet.


In [3]:
# Peek at a single row to check the available columns and raw value formats.
df.head(1)

Unnamed: 0,Case ID,Activity,Resource,Complete Timestamp,Variant,Variant index,(case) amount_applied0,(case) amount_applied1,(case) amount_applied2,(case) amount_applied3,...,(case) young farmer,activity,concept:name,docid,doctype,eventid,lifecycle:transition,note,subprocess,success
0,8b99873a6136cfa6,Payment application-Application-mail income,0;n/a,2015/05/08 00:00:00.000,Variant 1832,1832,960.35,,,,...,False,mail income,mail income,-18008611495569447,Payment application,,complete,none,Application,True


In [4]:
# List the column labels in sorted order to make the many "(case) ..." attributes easier to scan.
df.columns.sort_values()

Index(['(case) amount_applied0', '(case) amount_applied1',
       '(case) amount_applied2', '(case) amount_applied3', '(case) applicant',
       '(case) application', '(case) area', '(case) basic payment',
       '(case) cross_compliance', '(case) department', '(case) greening',
       '(case) number_parcels', '(case) payment_actual0',
       '(case) payment_actual1', '(case) payment_actual2',
       '(case) payment_actual3', '(case) penalty_ABP', '(case) penalty_AGP',
       '(case) penalty_AJLP', '(case) penalty_AUVP', '(case) penalty_AVBP',
       '(case) penalty_AVGP', '(case) penalty_AVJLP', '(case) penalty_AVUVP',
       '(case) penalty_B16', '(case) penalty_B2', '(case) penalty_B3',
       '(case) penalty_B4', '(case) penalty_B5', '(case) penalty_B5F',
       '(case) penalty_B6', '(case) penalty_BGK', '(case) penalty_BGKV',
       '(case) penalty_BGP', '(case) penalty_C16', '(case) penalty_C4',
       '(case) penalty_C9', '(case) penalty_CC', '(case) penalty_GP1',
       '(case)

## Change some column names

In [5]:
# Rename the resource and timestamp columns to the `org:resource` / `time:timestamp`
# keys used in the rest of the notebook.
# Reassign the result instead of using inplace=True so the cell does not mutate
# the loaded frame object as a hidden side effect.
df = df.rename(columns={
    'Resource': 'org:resource',
    'Complete Timestamp': 'time:timestamp',
})

## Fix timestamp

In [6]:
# Let pm4py convert timestamp-like columns. In the dataframe preview further down,
# `time:timestamp` is shown as a timezone-aware datetime instead of the raw string.
# NOTE(review): which columns the helper touches is decided inside pm4py — confirm if other columns matter.
df = dataframe_utils.convert_timestamp_columns_in_df(df)

## Convert to pm4py log object

In [7]:
# The converter's defaults can be overridden through a parameters dict.
# Here we tell it which column identifies a case and which column-name
# prefix marks case-level attributes.
to_log_variant = log_converter.Variants.TO_EVENT_LOG
log_params = to_log_variant.value.Parameters
parameters = {
    log_params.CASE_ID_KEY: 'Case ID',
    log_params.CASE_ATTRIBUTE_PREFIX: '(case) ',
}
log = log_converter.apply(df, parameters=parameters, variant=to_log_variant)

## Pickle log object

In [8]:
# Persist the converted event log through the project helper (pickled, per the section heading).
utils.save_log(log)

# Clean dataframe from outliers to be able to run conformance checking

In [8]:
# Work on a copy so the outlier filtering below leaves `df` untouched.
filtered_df = df.copy()

In [9]:
# Show the working copy; the displayed output is a truncated preview (first and last rows).
filtered_df

Unnamed: 0,Case ID,Activity,org:resource,time:timestamp,Variant,Variant index,(case) amount_applied0,(case) amount_applied1,(case) amount_applied2,(case) amount_applied3,...,(case) young farmer,activity,concept:name,docid,doctype,eventid,lifecycle:transition,note,subprocess,success
0,8b99873a6136cfa6,Payment application-Application-mail income,0;n/a,2015-05-08 00:00:00+00:00,Variant 1832,1832,960.35,,,,...,False,mail income,mail income,-18008611495569447,Payment application,,complete,none,Application,True
1,8b99873a6136cfa6,Payment application-Application-mail valid,0;n/a,2015-05-08 00:00:00+00:00,Variant 1832,1832,960.35,,,,...,False,mail valid,mail valid,-18008611495569447,Payment application,,complete,none,Application,True
2,8b99873a6136cfa6,Entitlement application-Main-mail valid,0;n/a,2015-05-08 00:00:00+00:00,Variant 1832,1832,960.35,,,,...,False,mail valid,mail valid,-18008615298673397,Entitlement application,,complete,none,Main,True
3,8b99873a6136cfa6,Entitlement application-Main-mail valid,0;n/a,2015-05-08 00:00:00+00:00,Variant 1832,1832,960.35,,,,...,False,mail valid,mail valid,-18008615298673397,Entitlement application,,complete,none,Main,True
4,8b99873a6136cfa6,Parcel document-Main-initialize,fb5fa8,2015-06-10 11:16:28+00:00,Variant 1832,1832,960.35,,,,...,False,initialize,initialize,-72051858488795160,Parcel document,-7.205186e+16,complete,none,Main,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2514261,ad5dfa0b929735be,Payment application-Application-decide,155add,2017-12-11 10:26:39.692000+00:00,Variant 28923,28923,5117.97,,,,...,False,decide,decide,-54037160734808716,Payment application,-9.006594e+16,complete,automatic,Application,True
2514262,ad5dfa0b929735be,Payment application-Application-begin payment,DP-Z,2017-12-15 16:00:12.252000+00:00,Variant 28923,28923,5117.97,,,,...,False,begin payment,begin payment,-54037160734808716,Payment application,-9.006594e+16,complete,automatic during payment,Application,True
2514263,ad5dfa0b929735be,Payment application-Application-insert document,Notification automaton,2017-12-15 19:19:04.499000+00:00,Variant 28923,28923,5117.97,,,,...,False,insert document,insert document,-54037160734808716,Payment application,-9.006594e+16,complete,notification for applicant,Application,True
2514264,ad5dfa0b929735be,Payment application-Application-insert document,Notification automaton,2017-12-20 09:02:30.380000+00:00,Variant 28923,28923,5117.97,,,,...,False,insert document,insert document,-54037160734808716,Payment application,-1.260947e+17,complete,notification for applicant,Application,False


## Remove traces that have large number of events

In [52]:
# Maximum number of events a trace may contain; traces with more events are
# treated as outliers and dropped in the cells below.
# NOTE(review): the rationale for choosing 155 is not documented here.
threshold = 155

In [53]:
# Number of events per trace (one row per event, so counting 'Case ID' values gives the trace length).
# Count on the unfiltered `df` rather than on `filtered_df`: `filtered_df` is overwritten by the
# filter cell below, so re-running this cell afterwards would otherwise shrink the baseline and
# make the "removed" statistics read zero. On a fresh top-to-bottom run both frames are identical here.
nr_events_per_trace = df['Case ID'].value_counts()
nr_events_per_trace

93deb55be9aed2be    2973
22b697e259a77e16    2490
00101d737b58ff45     907
6dcac1562d4ca766     864
f8ad3caa0d29795c     788
                    ... 
29b1f2e2f459e623      31
06501822341c30a8      29
2202b247728435e3      29
a8118d96af27df2b      25
e464e72a43e59a55      24
Name: Case ID, Length: 43809, dtype: int64

In [54]:
# Traces that stay: those with at most `threshold` events.
filtered_nr_events_per_trace = nr_events_per_trace.loc[nr_events_per_trace.le(threshold)]
filtered_nr_events_per_trace

5ee73ede1d82a111    155
426748ffd7d02d87    155
d598a1fd5fdeed9d    155
2af4bcd51c5a2a72    155
14d4d258673f2251    155
                   ... 
29b1f2e2f459e623     31
06501822341c30a8     29
2202b247728435e3     29
a8118d96af27df2b     25
e464e72a43e59a55     24
Name: Case ID, Length: 43242, dtype: int64

In [55]:
# Traces that get dropped: those with more than `threshold` events.
removed_nr_events_per_trace = nr_events_per_trace.loc[nr_events_per_trace.gt(threshold)]
removed_nr_events_per_trace

93deb55be9aed2be    2973
22b697e259a77e16    2490
00101d737b58ff45     907
6dcac1562d4ca766     864
f8ad3caa0d29795c     788
                    ... 
1c45e283eedbf1ce     156
c67e1022125070c0     156
551721af52308ef9     156
03e70a9871fa78f8     156
4f4cf79936d9781f     156
Name: Case ID, Length: 567, dtype: int64

In [57]:
# Keep only the events whose case id belongs to a trace within the threshold.
filtered_df = filtered_df.loc[
    filtered_df['Case ID'].isin(filtered_nr_events_per_trace.index)
]

### Check how many traces removed

In [56]:
# Report how many traces exceed the threshold versus the number of traces counted in nr_events_per_trace.
print(f"{len(removed_nr_events_per_trace)} traces filtered from {len(nr_events_per_trace)} total nr of traces")

567 traces filtered from 43809 total nr of traces


In [46]:
def calc_traces_removed(new: pd.Series, old: pd.Series) -> float:
    """Return the percentage change in the number of traces from ``old`` to ``new``.

    Only the lengths of the two series are used (one entry per trace).
    The result is negative when ``new`` holds fewer traces than ``old``,
    e.g. -1.29 means 1.29% of the traces were removed.

    Args:
        new: Per-trace series after filtering.
        old: Per-trace series before filtering.

    Returns:
        Percentage change relative to ``old``, rounded to two decimals.
    """
    n_before = len(old)
    n_after = len(new)
    return round((n_after - n_before) / n_before * 100, 2)

In [47]:
# Relative change in the number of traces caused by the threshold filter (negative = traces removed).
calc_traces_removed(filtered_nr_events_per_trace, nr_events_per_trace)

-1.29

The value above is the percentage change in the number of traces; the negative sign means that about 1.29% of all traces were removed.

### Check how many events removed

In [48]:
# Report how many events belong to the dropped traces versus the total number of events.
# Series.sum() is vectorized; Python's builtin sum() would iterate the Series element by element.
print(f"{removed_nr_events_per_trace.sum()} events filtered from {nr_events_per_trace.sum()} total nr of events")

138945 events filtered from 2514266 total nr of events


In [58]:
def calc_events_removed(new: pd.Series, old: pd.Series) -> float:
    """Return the percentage change in the total event count from ``old`` to ``new``.

    Both arguments hold one event count per trace. The result is negative
    when ``new`` contains fewer events than ``old``, e.g. -5.53 means 5.53%
    of the events were removed.

    Args:
        new: Events-per-trace counts after filtering.
        old: Events-per-trace counts before filtering.

    Returns:
        Percentage change relative to ``old``, rounded to two decimals.

    Raises:
        ZeroDivisionError: If ``old`` sums to zero, so no percentage is defined.
    """
    # Series.sum() is vectorized; the builtin sum() iterates the Series in Python.
    total_old = old.sum()
    if total_old == 0:
        # Fail explicitly instead of letting numpy return nan/inf with only a warning.
        raise ZeroDivisionError("`old` contains no events; percentage change is undefined")
    perc_removed = (new.sum() - total_old) / total_old * 100
    # Cast so the caller gets a plain Python float, as the annotation promises.
    return round(float(perc_removed), 2)

In [59]:
# Relative change in the total number of events caused by the threshold filter (negative = events removed).
calc_events_removed(filtered_nr_events_per_trace, nr_events_per_trace)

-5.53

The value above is the percentage change in the number of events; the negative sign means that about 5.53% of all events were removed.

## Save filtered log in parquet and save filtered log object