# Load CSV/Parquet, rename the necessary column headers, and convert to a pm4py log object

## Imports etc

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.log.util import dataframe_utils

import utils

# Run configuration flags read by the cells below.
READ_PARQUET = True  # True: load the parquet file (faster, but it has to be created first); False: read the CSV and save it as parquet
CONVERT_REGULAR_DF_TO_LOG = False  # if True, transforms the unfiltered dataframe to a log object and saves it (~45 min)
# NOTE(review): no visible cell reads CONVERT_FILTERED_DF_TO_LOG; the filtered
# dataframe is converted unconditionally at the end of the notebook. The runtime
# estimate was left as a placeholder ("~X min") and still needs to be measured.
CONVERT_FILTERED_DF_TO_LOG = True  # intended: if True, transforms filtered dataframe to log object

SAMPLE = False  # if True, keep only the first 100 rows to test whether everything runs

In [2]:
def convert_df_to_log(frame):
    """Convert an event dataframe into a pm4py event log.

    The conversion uses the TO_EVENT_LOG variant of pm4py's log converter,
    with 'Case ID' as the case identifier column and '(case) ' as the
    case-attribute prefix.

    :param frame: dataframe holding one event per row
    :return: whatever ``log_converter.apply`` returns for the TO_EVENT_LOG variant
    """
    to_event_log = log_converter.Variants.TO_EVENT_LOG
    param_keys = to_event_log.value.Parameters
    return log_converter.apply(
        frame,
        parameters={
            param_keys.CASE_ID_KEY: 'Case ID',
            param_keys.CASE_ATTRIBUTE_PREFIX: '(case) ',
        },
        variant=to_event_log,
    )


def calc_traces_removed(new: pd.Series, old: pd.Series) -> float:
    """Return the percentage change in the number of traces from ``old`` to ``new``.

    Only the lengths of the two inputs are used. The result is signed and
    rounded to two decimals, so a negative value means that ``new`` holds
    fewer traces than ``old`` (e.g. -25.0 when a quarter was removed).

    :param new: per-trace values after filtering (one entry per trace)
    :param old: per-trace values before filtering (one entry per trace)
    :return: signed percentage change, rounded to 2 decimals
    :raises ZeroDivisionError: if ``old`` is empty
    """
    n_old = len(old)
    n_new = len(new)
    pct_change = (n_new - n_old) / n_old * 100
    return round(pct_change, 2)


def calc_events_removed(new: pd.Series, old: pd.Series) -> float:
    """Return the percentage change in the total number of events from ``old`` to ``new``.

    Each input holds the event count per trace; the totals are the sums of
    those counts. The result is signed and rounded to two decimals, so a
    negative value means that events were removed.

    :param new: event counts per trace after filtering
    :param old: event counts per trace before filtering
    :return: signed percentage change, rounded to 2 decimals
    :raises ZeroDivisionError: if the values in ``old`` sum to a plain Python zero
    """
    total_old = sum(old)
    total_new = sum(new)
    pct_change = (total_new - total_old) / total_old * 100
    return round(pct_change, 2)

## Read data as DF

In [3]:
# Load the event data through the project's utils helpers: either directly from
# the parquet file, or from the CSV (which is then saved as parquet for later runs).
if READ_PARQUET:
    print("Reading parquet.")
    df = utils.load_parquet_original()
else:
    print("Reading csv and savings as parquet.")
    df = utils.load_csv()
    utils.save_parquet_original(df)

# Optional smoke-test mode.
# NOTE: this keeps the first 100 rows (events), not 100 cases, so the last
# trace in the sample may be cut off.
if SAMPLE:
    print("Taking small sample.")
    df = df.iloc[:100]
else:
    print("Taking full dataset. ")

Reading parquet.
Taking full dataset. 


In [4]:
# Show the first row to inspect the available columns and their raw values.
df.head(1)

Unnamed: 0,Case ID,Activity,Resource,Complete Timestamp,Variant,Variant index,(case) amount_applied0,(case) amount_applied1,(case) amount_applied2,(case) amount_applied3,(case) applicant,(case) application,(case) area,(case) basic payment,(case) cross_compliance,(case) department,(case) greening,(case) number_parcels,(case) payment_actual0,(case) payment_actual1,(case) payment_actual2,(case) payment_actual3,(case) penalty_ABP,(case) penalty_AGP,(case) penalty_AJLP,(case) penalty_AUVP,(case) penalty_AVBP,(case) penalty_AVGP,(case) penalty_AVJLP,(case) penalty_AVUVP,(case) penalty_B16,(case) penalty_B2,(case) penalty_B3,(case) penalty_B4,(case) penalty_B5,(case) penalty_B5F,(case) penalty_B6,(case) penalty_BGK,(case) penalty_BGKV,(case) penalty_BGP,(case) penalty_C16,(case) penalty_C4,(case) penalty_C9,(case) penalty_CC,(case) penalty_GP1,(case) penalty_JLP1,(case) penalty_JLP2,(case) penalty_JLP3,(case) penalty_JLP5,(case) penalty_JLP6,(case) penalty_JLP7,(case) penalty_V5,(case) penalty_amount0,(case) penalty_amount1,(case) penalty_amount2,(case) penalty_amount3,(case) program-id,(case) redistribution,(case) rejected,(case) risk_factor,(case) selected_manually,(case) selected_random,(case) selected_risk,(case) small farmer,(case) year,(case) young farmer,activity,concept:name,docid,doctype,eventid,lifecycle:transition,note,subprocess,success
0,8b99873a6136cfa6,Payment application-Application-mail income,0;n/a,2015/05/08 00:00:00.000,Variant 1832,1832,960.35,,,,b3b1bafcf8a5c359,8b99873a6136cfa6,2.6994,True,0.0,e7,True,3,960.35,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.0,,,,215,True,False,1.0,False,False,False,True,2015,False,mail income,mail income,-18008611495569447,Payment application,,complete,none,Application,True


In [5]:
# List all column names in sorted order.
df.columns.sort_values()

Index(['(case) amount_applied0', '(case) amount_applied1',
       '(case) amount_applied2', '(case) amount_applied3', '(case) applicant',
       '(case) application', '(case) area', '(case) basic payment',
       '(case) cross_compliance', '(case) department', '(case) greening',
       '(case) number_parcels', '(case) payment_actual0',
       '(case) payment_actual1', '(case) payment_actual2',
       '(case) payment_actual3', '(case) penalty_ABP', '(case) penalty_AGP',
       '(case) penalty_AJLP', '(case) penalty_AUVP', '(case) penalty_AVBP',
       '(case) penalty_AVGP', '(case) penalty_AVJLP', '(case) penalty_AVUVP',
       '(case) penalty_B16', '(case) penalty_B2', '(case) penalty_B3',
       '(case) penalty_B4', '(case) penalty_B5', '(case) penalty_B5F',
       '(case) penalty_B6', '(case) penalty_BGK', '(case) penalty_BGKV',
       '(case) penalty_BGP', '(case) penalty_C16', '(case) penalty_C4',
       '(case) penalty_C9', '(case) penalty_CC', '(case) penalty_GP1',
       '(case)

## Change some column names

In [6]:
# Rename the resource and timestamp columns to XES-style attribute keys.
# The rename happens in place, so `df` itself is modified.
xes_column_names = {
    'Complete Timestamp': 'time:timestamp',
    'Resource': 'org:resource',
}
df.rename(columns=xes_column_names, inplace=True)

## Fix timestamp

In [7]:
# Let pm4py's dataframe helper convert the timestamp column(s); afterwards
# 'time:timestamp' is datetime64[ns, UTC] (see the dtype check further down).
# NOTE(review): which columns the helper touches is not visible here -- confirm
# that no other text column gets converted as a side effect.
df = dataframe_utils.convert_timestamp_columns_in_df(df)

## Convert to pm4py log object and save as pickle

In [8]:
# Optionally convert the full (unfiltered) dataframe to a log object and save
# it via utils; skipped by default because the conversion is slow (~45 min).
if CONVERT_REGULAR_DF_TO_LOG:
    print("Transforming df into log and saving it. ")
    log = convert_df_to_log(df)
    utils.save_log_original(log)
else:
    print("Not transforming and saving.")

Not transforming and saving.


# Clean dataframe from outliers to be able to run conformance checking

In [9]:
# Work on a copy so the unfiltered `df` stays available.
filtered_df = df.copy()

In [10]:
# Show the first row of the working copy (columns renamed, timestamp converted).
filtered_df.head(1)

Unnamed: 0,Case ID,Activity,org:resource,time:timestamp,Variant,Variant index,(case) amount_applied0,(case) amount_applied1,(case) amount_applied2,(case) amount_applied3,(case) applicant,(case) application,(case) area,(case) basic payment,(case) cross_compliance,(case) department,(case) greening,(case) number_parcels,(case) payment_actual0,(case) payment_actual1,(case) payment_actual2,(case) payment_actual3,(case) penalty_ABP,(case) penalty_AGP,(case) penalty_AJLP,(case) penalty_AUVP,(case) penalty_AVBP,(case) penalty_AVGP,(case) penalty_AVJLP,(case) penalty_AVUVP,(case) penalty_B16,(case) penalty_B2,(case) penalty_B3,(case) penalty_B4,(case) penalty_B5,(case) penalty_B5F,(case) penalty_B6,(case) penalty_BGK,(case) penalty_BGKV,(case) penalty_BGP,(case) penalty_C16,(case) penalty_C4,(case) penalty_C9,(case) penalty_CC,(case) penalty_GP1,(case) penalty_JLP1,(case) penalty_JLP2,(case) penalty_JLP3,(case) penalty_JLP5,(case) penalty_JLP6,(case) penalty_JLP7,(case) penalty_V5,(case) penalty_amount0,(case) penalty_amount1,(case) penalty_amount2,(case) penalty_amount3,(case) program-id,(case) redistribution,(case) rejected,(case) risk_factor,(case) selected_manually,(case) selected_random,(case) selected_risk,(case) small farmer,(case) year,(case) young farmer,activity,concept:name,docid,doctype,eventid,lifecycle:transition,note,subprocess,success
0,8b99873a6136cfa6,Payment application-Application-mail income,0;n/a,2015-05-08 00:00:00+00:00,Variant 1832,1832,960.35,,,,b3b1bafcf8a5c359,8b99873a6136cfa6,2.6994,True,0.0,e7,True,3,960.35,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.0,,,,215,True,False,1.0,False,False,False,True,2015,False,mail income,mail income,-18008611495569447,Payment application,,complete,none,Application,True


## Remove traces that have large number of events

In [11]:
# Maximum number of events a trace may contain to be kept; traces with more
# events than this are removed below.
EVENTS_THRESHOLD = 155

In [12]:
# Count the rows (events) per 'Case ID'; value_counts orders the result from
# the largest to the smallest trace.
nr_events_per_trace = filtered_df['Case ID'].value_counts()
nr_events_per_trace

93deb55be9aed2be    2973
22b697e259a77e16    2490
00101d737b58ff45     907
6dcac1562d4ca766     864
f8ad3caa0d29795c     788
                    ... 
27c444298162c9c5      31
2202b247728435e3      29
06501822341c30a8      29
a8118d96af27df2b      25
e464e72a43e59a55      24
Name: Case ID, Length: 43809, dtype: int64

In [13]:
# Traces to keep: those with at most EVENTS_THRESHOLD events.
filtered_nr_events_per_trace = nr_events_per_trace[nr_events_per_trace <= EVENTS_THRESHOLD]
filtered_nr_events_per_trace

c4f9537e2403f7b6    155
d598a1fd5fdeed9d    155
dde5780dabe727a7    155
2af4bcd51c5a2a72    155
14d4d258673f2251    155
                   ... 
27c444298162c9c5     31
2202b247728435e3     29
06501822341c30a8     29
a8118d96af27df2b     25
e464e72a43e59a55     24
Name: Case ID, Length: 43242, dtype: int64

In [14]:
# Traces to drop: those with more than EVENTS_THRESHOLD events.
removed_nr_events_per_trace = nr_events_per_trace[nr_events_per_trace > EVENTS_THRESHOLD]
removed_nr_events_per_trace

93deb55be9aed2be    2973
22b697e259a77e16    2490
00101d737b58ff45     907
6dcac1562d4ca766     864
f8ad3caa0d29795c     788
                    ... 
d074099deeaa0119     156
ed0b469c38cf0949     156
03e70a9871fa78f8     156
1c45e283eedbf1ce     156
a22c3943c849f374     156
Name: Case ID, Length: 567, dtype: int64

### Execute removal

In [15]:
# Keep only the events whose case passed the events-threshold filter.
filtered_df = filtered_df[filtered_df['Case ID'].isin(filtered_nr_events_per_trace.index)]

### Check how many traces removed

In [16]:
# Report the number of dropped traces against the total number of traces.
print(f"{len(removed_nr_events_per_trace)} traces filtered from {len(nr_events_per_trace)} total nr of traces")

567 traces filtered from 43809 total nr of traces


In [17]:
# Signed percentage change in the number of traces (negative = removed),
# relative to all traces in the original data.
calc_traces_removed(filtered_nr_events_per_trace, nr_events_per_trace)

-1.29

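The value above is the percentage change in the total number of traces; the negative sign means traces were removed (about 1.29% of all traces).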

### Check how many events removed

In [18]:
# Report the number of events in the dropped traces against the total number of events.
print(f"{sum(removed_nr_events_per_trace)} events filtered from {sum(nr_events_per_trace)} total nr of events")

138945 events filtered from 2514266 total nr of events


In [19]:
# Signed percentage change in the number of events (negative = removed),
# relative to all events in the original data.
calc_events_removed(filtered_nr_events_per_trace, nr_events_per_trace)

-5.53

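The value above is the percentage change in the total number of events; the negative sign means events were removed (about 5.53% of all events).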

## Remove traces with long case duration

In [20]:
# Traces lasting this long or longer (first to last event) are removed below.
DURATION_THRESHOLD = pd.Timedelta(days=365)
DURATION_THRESHOLD

Timedelta('365 days 00:00:00')

In [21]:
# Show the first row of the dataframe after the events-threshold filter.
filtered_df.head(1)

Unnamed: 0,Case ID,Activity,org:resource,time:timestamp,Variant,Variant index,(case) amount_applied0,(case) amount_applied1,(case) amount_applied2,(case) amount_applied3,(case) applicant,(case) application,(case) area,(case) basic payment,(case) cross_compliance,(case) department,(case) greening,(case) number_parcels,(case) payment_actual0,(case) payment_actual1,(case) payment_actual2,(case) payment_actual3,(case) penalty_ABP,(case) penalty_AGP,(case) penalty_AJLP,(case) penalty_AUVP,(case) penalty_AVBP,(case) penalty_AVGP,(case) penalty_AVJLP,(case) penalty_AVUVP,(case) penalty_B16,(case) penalty_B2,(case) penalty_B3,(case) penalty_B4,(case) penalty_B5,(case) penalty_B5F,(case) penalty_B6,(case) penalty_BGK,(case) penalty_BGKV,(case) penalty_BGP,(case) penalty_C16,(case) penalty_C4,(case) penalty_C9,(case) penalty_CC,(case) penalty_GP1,(case) penalty_JLP1,(case) penalty_JLP2,(case) penalty_JLP3,(case) penalty_JLP5,(case) penalty_JLP6,(case) penalty_JLP7,(case) penalty_V5,(case) penalty_amount0,(case) penalty_amount1,(case) penalty_amount2,(case) penalty_amount3,(case) program-id,(case) redistribution,(case) rejected,(case) risk_factor,(case) selected_manually,(case) selected_random,(case) selected_risk,(case) small farmer,(case) year,(case) young farmer,activity,concept:name,docid,doctype,eventid,lifecycle:transition,note,subprocess,success
0,8b99873a6136cfa6,Payment application-Application-mail income,0;n/a,2015-05-08 00:00:00+00:00,Variant 1832,1832,960.35,,,,b3b1bafcf8a5c359,8b99873a6136cfa6,2.6994,True,0.0,e7,True,3,960.35,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.0,,,,215,True,False,1.0,False,False,False,True,2015,False,mail income,mail income,-18008611495569447,Payment application,,complete,none,Application,True


In [22]:
# Confirm the timestamp column has a datetime dtype, so that per-case
# max - min below yields a timedelta.
filtered_df['time:timestamp'].dtype

datetime64[ns, UTC]

In [23]:
# Latest event timestamp per case.
max_time = filtered_df.groupby('Case ID')['time:timestamp'].max()
max_time

Case ID
0002505cb62792e4   2017-08-03 15:54:21.311000+00:00
0002a55a6130cec8   2016-02-18 07:41:46.343000+00:00
0004ff62053a60ce          2016-08-19 11:56:47+00:00
000612b48d30de74   2016-09-14 11:47:48.425000+00:00
0006cc909ce508b0          2017-06-15 09:24:55+00:00
                                 ...               
fffa80507910b904   2018-01-05 15:25:41.309000+00:00
fffa93db29ee0841   2017-01-06 15:11:32.944000+00:00
fffab10d958a3e4d   2017-01-06 14:12:05.156000+00:00
fffabee101da7ecb   2018-01-05 15:05:15.215000+00:00
fffadf8d083df3d5   2017-04-25 14:07:21.285000+00:00
Name: time:timestamp, Length: 43242, dtype: datetime64[ns, UTC]

In [24]:
# Earliest event timestamp per case.
min_time = filtered_df.groupby('Case ID')['time:timestamp'].min()
min_time

Case ID
0002505cb62792e4   2015-04-28 00:00:00+00:00
0002a55a6130cec8   2015-04-17 00:00:00+00:00
0004ff62053a60ce   2015-05-11 00:00:00+00:00
000612b48d30de74   2015-05-06 00:00:00+00:00
0006cc909ce508b0   2016-04-28 00:00:00+00:00
                              ...           
fffa80507910b904   2017-05-10 00:00:00+00:00
fffa93db29ee0841   2016-05-12 00:00:00+00:00
fffab10d958a3e4d   2016-05-11 00:00:00+00:00
fffabee101da7ecb   2017-05-11 00:00:00+00:00
fffadf8d083df3d5   2015-04-28 00:00:00+00:00
Name: time:timestamp, Length: 43242, dtype: datetime64[ns, UTC]

In [25]:
# Case duration = last event timestamp minus first event timestamp; the two
# Series are both indexed by 'Case ID', so the subtraction aligns per case.
duration_per_trace = max_time - min_time
duration_per_trace

Case ID
0002505cb62792e4   828 days 15:54:21.311000
0002a55a6130cec8   307 days 07:41:46.343000
0004ff62053a60ce          466 days 11:56:47
000612b48d30de74   497 days 11:47:48.425000
0006cc909ce508b0          413 days 09:24:55
                             ...           
fffa80507910b904   240 days 15:25:41.309000
fffa93db29ee0841   239 days 15:11:32.944000
fffab10d958a3e4d   240 days 14:12:05.156000
fffabee101da7ecb   239 days 15:05:15.215000
fffadf8d083df3d5   728 days 14:07:21.285000
Name: time:timestamp, Length: 43242, dtype: timedelta64[ns]

In [26]:
# Sanity check: the elements are pandas Timedelta objects, i.e. the same type
# as DURATION_THRESHOLD, so the comparisons below are well defined.
type(duration_per_trace.iloc[0])

pandas._libs.tslibs.timedeltas.Timedelta

In [27]:
# Traces to keep: duration strictly below DURATION_THRESHOLD.
# NOTE: the comparison is strict ('<') here, whereas the events filter keeps
# traces that are exactly at its threshold ('<=').
filtered_duration_per_trace = duration_per_trace[duration_per_trace < DURATION_THRESHOLD]
filtered_duration_per_trace

Case ID
0002a55a6130cec8   307 days 07:41:46.343000
000b9e1ec3f77825   254 days 13:14:10.394000
000c48f27157572d   236 days 15:16:18.907000
000de8047a5a55db   358 days 10:40:58.142000
0011ec2b7b948cdb   252 days 15:22:58.213000
                             ...           
fffa3cf2546632a2   247 days 14:53:17.235000
fffa80507910b904   240 days 15:25:41.309000
fffa93db29ee0841   239 days 15:11:32.944000
fffab10d958a3e4d   240 days 14:12:05.156000
fffabee101da7ecb   239 days 15:05:15.215000
Name: time:timestamp, Length: 34415, dtype: timedelta64[ns]

In [28]:
# Traces to drop: duration equal to or above DURATION_THRESHOLD.
removed_duration_per_trace = duration_per_trace[duration_per_trace >= DURATION_THRESHOLD]
removed_duration_per_trace

Case ID
0002505cb62792e4   828 days 15:54:21.311000
0004ff62053a60ce          466 days 11:56:47
000612b48d30de74   497 days 11:47:48.425000
0006cc909ce508b0          413 days 09:24:55
000843b028f083e2   619 days 11:11:42.951000
                             ...           
ffe6c2b90896445e          881 days 07:15:30
ffebb3a36fc7caee   839 days 15:22:11.992000
fff1a78a2add360e          476 days 10:22:53
fff750353a66eba6   609 days 19:50:07.759000
fffadf8d083df3d5   728 days 14:07:21.285000
Name: time:timestamp, Length: 8827, dtype: timedelta64[ns]

### Execute removal

In [29]:
# Keep only the events whose case passed the duration filter.
filtered_df = filtered_df[filtered_df['Case ID'].isin(filtered_duration_per_trace.index)]

### Check how many traces removed

In [30]:
# Signed percentage change in trace count caused by the duration filter alone
# (baseline: the traces left after the events filter).
calc_traces_removed(filtered_duration_per_trace, duration_per_trace)

-20.41

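The value above is the percentage change in the number of traces caused by the duration filter alone, relative to the traces that remained after the events filter (negative = removed).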

In [31]:
# Signed percentage change in trace count after both filters
# (baseline: all traces in the original data).
calc_traces_removed(filtered_duration_per_trace, nr_events_per_trace)

-21.44

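The value above is the total percentage change in the number of traces relative to the original data, i.e. the effect of the events filter and the duration filter combined (negative = removed).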

### Check how many events removed

In [32]:
# Re-display the per-trace event counts after the events filter; they serve as
# the baseline for the duration filter's event statistics below.
filtered_nr_events_per_trace

c4f9537e2403f7b6    155
d598a1fd5fdeed9d    155
dde5780dabe727a7    155
2af4bcd51c5a2a72    155
14d4d258673f2251    155
                   ... 
27c444298162c9c5     31
2202b247728435e3     29
06501822341c30a8     29
a8118d96af27df2b     25
e464e72a43e59a55     24
Name: Case ID, Length: 43242, dtype: int64

In [33]:
# Number of events (rows) per case after both filters.
# size() counts every row of a group, whereas count() on the timestamp column
# would skip rows with a missing timestamp and under-report the events. This
# keeps the tally consistent with the row-based value_counts() baseline.
second_filtered_nr_events_per_trace = filtered_df.groupby('Case ID').size()
second_filtered_nr_events_per_trace

Case ID
0002a55a6130cec8     40
000b9e1ec3f77825     77
000c48f27157572d    105
000de8047a5a55db     62
0011ec2b7b948cdb     40
                   ... 
fffa3cf2546632a2     47
fffa80507910b904     49
fffa93db29ee0841     49
fffab10d958a3e4d     39
fffabee101da7ecb     47
Name: time:timestamp, Length: 34415, dtype: int64

In [34]:
# Signed percentage change in event count caused by the duration filter alone
# (baseline: the events left after the events filter).
calc_events_removed(second_filtered_nr_events_per_trace, filtered_nr_events_per_trace)

-24.86

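The value above is the percentage change in the number of events caused by the duration filter alone, relative to the events that remained after the events filter (negative = removed).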

In [35]:
# Signed percentage change in event count after both filters
# (baseline: all events in the original data).
calc_events_removed(second_filtered_nr_events_per_trace, nr_events_per_trace)

-29.01

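The value above is the total percentage change in the number of events relative to the original data, i.e. the effect of the events filter and the duration filter combined (negative = removed).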

## Conclusion

    Using events threshold of 155 and duration threshold of one year
    21% of total number of traces is removed
    29% of total number of events is removed

## Filtered dataframe to filtered log object

In [None]:
# Convert the filtered dataframe into a pm4py log object.
# NOTE(review): this runs unconditionally; the CONVERT_FILTERED_DF_TO_LOG flag
# defined in the first cell is not checked here.
filtered_log = convert_df_to_log(filtered_df)

In [None]:
# Display the converted log object.
filtered_log

In [None]:
# Length of the log object, i.e. the number of traces it contains.
len(filtered_log)

The value above is the number of traces remaining after both filters.

## Save filtered dataframe and filtered log object

In [None]:
# Persist both results through the project's utils helpers: the filtered
# dataframe as parquet and the filtered log object.
utils.save_parquet_filtered(filtered_df)
utils.save_log_filtered(filtered_log)