In [1]:
import pandas as pd
import utils

READ_PARQUET = True

In [2]:
if READ_PARQUET:
    print("Reading parquet.")
    df = utils.load_parquet()
else:
    print("Reading excel and savings as parquet.")
    df = utils.load_excel()
    utils.save_parquet(df)

Reading excel and savings as parquet.


In [69]:
from copy import deepcopy
# drop uncomplete cases (2017 mostly) and thus do not have a complete label

data = deepcopy(df.loc[df['Timestamp'].dt.year < 2017])

## Undesired Outcome 1

Undesired outcome 1: The payment is late. A payment can be considered
timely, if there has been a begin payment activity by the end of the year that was
not eventually followed by abort payment.

Late Payments There is no easy way of filtering the cases to find cases with
late payments. So in order to look at the differences, we added a case attribute
ourselves with the use of Python. With the use of the ProM forum, we identified
3 situations in which a case is considered late:
1. There is no ‘begin payment’ event in the case.
2. The last ‘begin payment’ event is followed by an ‘abort payment’ event.
3. The last ‘begin payment’ event occurs in a later year than what the case was
started in.

### 1. There is no 'begin payment' event in the case 

#### First determine the latest Begin Payment activity for every case

In [29]:
# group by per caseid --> to find latest payment

df = data.loc[data['Activity'] == 'Payment application-Application-begin payment']
df = df.rename(columns = {'Complete Timestamp': 'Timestamp'})
latest_payments = df.loc[df.groupby('Case ID').Timestamp.idxmax()]


for i in latest_payments.index:
    # Add column to inital dataframe
    data.loc[i,'Latest_Begin_Payment'] = 1


In [30]:
data.tail()

Unnamed: 0,Case ID,Activity,Resource,Complete Timestamp,Variant,Variant index,(case) amount_applied0,(case) amount_applied1,(case) amount_applied2,(case) amount_applied3,...,docid,doctype,eventid,lifecycle:transition,note,subprocess,success,Latest_Begin_Payment,UndesiredOutcome1,Latest_Abort_Payment
193677,4c1dce27d1fa14a5,Parcel document-Main-save,478c4f,2016-06-16,Variant 9000,9000,16309.96,,,,...,-18008662613106736,Parcel document,-5.403733e+16,complete,none,Main,True,,,
193678,4c1dce27d1fa14a5,Parcel document-Main-check,478c4f,2016-06-16,Variant 9000,9000,16309.96,,,,...,-18008662613106736,Parcel document,-5.403733e+16,complete,none,Main,True,,,
193679,4c1dce27d1fa14a5,Parcel document-Main-finish editing,478c4f,2016-06-16,Variant 9000,9000,16309.96,,,,...,-18008662613106736,Parcel document,-5.403733e+16,complete,none,Main,True,,,
193680,4c1dce27d1fa14a5,Control summary-Main-save,478c4f,2016-06-16,Variant 9000,9000,16309.96,,,,...,-18008662613106034,Control summary,-5.403733e+16,complete,none,Main,True,,,
193681,4c1dce27d1fa14a5,Control summary-Main-save,6ad4a6,2016-08-10,Variant 9000,9000,16309.96,,,,...,-18008662613106034,Control summary,-3.602291e+16,complete,none,Main,True,,,


In [31]:
# Check number of cases with a Begin Payment activity
len(data.loc[data['Latest_Begin_Payment'] == 1])

3001

#### Determine all cases without Begin Payment event

In [32]:
# find all case id's for which the sum of Latest begin payment = 0. --> indicates cases without Begin Payment event
payments = data[['Case ID', 'Latest_Begin_Payment']].groupby(by=['Case ID']).sum()

no_begin_payment = payments.loc[payments['Latest_Begin_Payment'] == 0]

# in current subset of data there is no sensor without begin payment
no_begin_payment

Unnamed: 0_level_0,Latest_Begin_Payment
Case ID,Unnamed: 1_level_1
06501822341c30a8,0.0


#### Label all cases without Begin Payment event as Undesired Outcome 1

In [33]:

for i in no_begin_payment.index:
    # for all cases without begin payment --> label them as undesired outcome rows.
    data.loc[data['Case ID'] == i, 'UndesiredOutcome1'] = 1
    

In [34]:
data.loc[data['UndesiredOutcome1'] == 1].head()

Unnamed: 0,Case ID,Activity,Resource,Complete Timestamp,Variant,Variant index,(case) amount_applied0,(case) amount_applied1,(case) amount_applied2,(case) amount_applied3,...,docid,doctype,eventid,lifecycle:transition,note,subprocess,success,Latest_Begin_Payment,UndesiredOutcome1,Latest_Abort_Payment
166560,06501822341c30a8,Payment application-Application-mail income,0;n/a,2015-05-10,Variant 8575,8575,0.0,,,,...,-18008611495764646,Payment application,,complete,none,Application,True,,1.0,
166561,06501822341c30a8,Entitlement application-Main-mail valid,0;n/a,2015-05-11,Variant 8575,8575,0.0,,,,...,-18008615298820872,Entitlement application,,complete,none,Main,True,,1.0,
166562,06501822341c30a8,Payment application-Application-mail valid,0;n/a,2015-05-11,Variant 8575,8575,0.0,,,,...,-18008611495764646,Payment application,,complete,none,Application,True,,1.0,
166563,06501822341c30a8,Entitlement application-Main-mail valid,0;n/a,2015-05-11,Variant 8575,8575,0.0,,,,...,-18008615298820872,Entitlement application,,complete,none,Main,True,,1.0,
166564,06501822341c30a8,Parcel document-Main-initialize,Document processing automaton,2015-06-12,Variant 8575,8575,0.0,,,,...,-18008662110812216,Parcel document,-1.800866e+16,complete,none,Main,True,,1.0,


### 2. The last ‘begin payment’ event is followed by an ‘abort payment’ event. 

#### Determine all 'latest abort payment' events

In [35]:
# Add column that indicates the latest abort payment 

abort_df = data.loc[data['Activity'] == 'Payment application-Application-abort payment']
abort_df = abort_df.rename(columns = {'Complete Timestamp': 'Timestamp'})
latest_aborts = abort_df.loc[abort_df.groupby('Case ID').Timestamp.idxmax()]

for i in latest_aborts.index:
    
    data.loc[i,'Latest_Abort_Payment'] = 1

In [36]:
# insert artificial row that complies with the criteria

data.loc[data['Case ID'] == '5A']

Unnamed: 0,Case ID,Activity,Resource,Complete Timestamp,Variant,Variant index,(case) amount_applied0,(case) amount_applied1,(case) amount_applied2,(case) amount_applied3,...,docid,doctype,eventid,lifecycle:transition,note,subprocess,success,Latest_Begin_Payment,UndesiredOutcome1,Latest_Abort_Payment
0,5A,Payment application-Application-begin payment,0;n/a,2015-04-16,Variant 6000,6000,401.63,,,,...,-18008611495635268,Payment application,,complete,none,Application,True,1.0,,
1,5A,Payment application-Application-abort payment,0;n/a,2015-04-18,Variant 6000,6000,401.63,,,,...,-18008611495635268,Payment application,,complete,none,Application,True,,,1.0


In [37]:
# filter on criteria: cases with a begin payment and an abort payment
df = data.loc[(data['Latest_Begin_Payment'] == 1) | (data['Latest_Abort_Payment'] ==1)]

#### Find cases with abort after begin payment 
- What to do when abort payment and begin payment have the same timestamp? --> timestamp is actually a date rather than a timestamp
- Did we lose timestamp info when exporting from disco or rapidminer??
- Do things differently for automatic (batch) processing operations??

In [50]:
# filter cases with date of abort payment activity > date of latest_begin_payment

# Each case id has at max 2 rows. This because we only consider the LATEST begin payment and LATEST abort payment

df = df.groupby('Case ID')['Complete Timestamp'].agg(['min','max']).rename(columns={'min':'first','max':'last'})

merged_df = pd.merge(test, test2, left_on='Case ID', right_on='Case ID')


# check if timestamp of latest abort payment row  == max --> then there is an abort payment after the latest begin payment
for i in merged_df.index:
    
    row = merged_df.loc[i, :]
    
    if row['Latest_Abort_Payment'] == 1 and row['Complete Timestamp'] == row['last'] and row['first'] != row['last']: # check timestamps
        # when first and last date are exactly the same, the order cannot be concluded from the data and hence these cases cannot be used for predictions
        merged_df.loc[i, 'abort_after_begin_payment'] = 1
        
        abort_after_begin_payment = merged_df.loc[merged_df['abort_after_begin_payment'] == 1].index
        
# label case as undesired outcome 1 in 'data' dataframe
try:
    for i in merged_df.index:
            data.loc[i,'abort_after_begin_payment'] = 1

            data.loc[i,'UndesiredOutcome1'] = 1
except:
    print("There is no such case with Abort Payment event after Last Begin Payment event")



In [51]:
len(merged_df['Case ID'].loc[merged_df['abort_after_begin_payment'] == 1].unique())

1

In [54]:
# check if artificially created rows are indeed labeled as abort_after_begin_payment = 1
merged_df.loc[merged_df['abort_after_begin_payment'] == 1]



Unnamed: 0,Case ID,Activity,Resource,Complete Timestamp,Variant,Variant index,(case) amount_applied0,(case) amount_applied1,(case) amount_applied2,(case) amount_applied3,...,lifecycle:transition,note,subprocess,success,Latest_Begin_Payment,UndesiredOutcome1,Latest_Abort_Payment,first,last,abort_after_begin_payment
1,5A,Payment application-Application-abort payment,0;n/a,2015-04-18,Variant 6000,6000,401.63,,,,...,complete,none,Application,True,,,1.0,2015-04-16,2015-04-18,1.0


#### Conclusion
- Timestamps are not accurate as they only indicate the date and not the timestamp.
- A lot of cases have 'Complete Timestamp' value the same for latest 'Begin Payment' as latest 'Abort Payment'! --> seems to be nothing we can do about this.

### 3. The last ‘begin payment’ event occurs in a later year than what the case was started in.

In [58]:
# determine case start year
start_years = data.groupby(by=['Case ID'])['Complete Timestamp'].agg(['min']).rename(columns={'min':'StartDate'})
start_years['StartYear'] = start_years['StartDate'].dt.year
start_years = start_years.reset_index()
start_years


Unnamed: 0,Case ID,StartDate,StartYear
0,000612b48d30de74,2015-05-06,2015
1,0032b59241ce3589,2015-04-30,2015
2,0042f6f2a63d2cb6,2015-04-27,2015
3,004d4aadd6785efd,2015-05-06,2015
4,005e4fdaf08da672,2015-05-07,2015
...,...,...,...
2997,ff965028d78043eb,2015-04-21,2015
2998,ffe107efb55aa4c4,2015-05-12,2015
2999,ffe298a47381c8b7,2015-04-18,2015
3000,ffebb3a36fc7caee,2015-04-29,2015


In [61]:
# determine latest begin payment year
latest_begin_payment_year = data.loc[data['Activity'] == 'Payment application-Application-abort payment'].groupby(by=['Case ID'])['Complete Timestamp'].agg(['min']).rename(columns={'min':'StartDate'})
latest_begin_payment_year['Latest_Begin_Payment_Year'] =  latest_begin_payment_year['StartDate'].dt.year
latest_begin_payment_year = latest_begin_payment_year.reset_index()
latest_begin_payment_year

Unnamed: 0,Case ID,StartDate,Latest_Begin_Payment_Year
0,000612b48d30de74,2015-12-18,2015
1,0032b59241ce3589,2015-12-18,2015
2,0042f6f2a63d2cb6,2015-12-16,2015
3,005e4fdaf08da672,2015-12-18,2015
4,0063e4bb2015f0ba,2015-12-18,2015
...,...,...,...
2979,ff965028d78043eb,2015-12-17,2015
2980,ffe107efb55aa4c4,2015-12-18,2015
2981,ffe298a47381c8b7,2015-12-16,2015
2982,ffebb3a36fc7caee,2015-12-17,2015


In [62]:
# merge two dataframes
merged_df = pd.merge(latest_begin_payment_year, start_years, on='Case ID')
merged_df

Unnamed: 0,Case ID,StartDate_x,Latest_Begin_Payment_Year,StartDate_y,StartYear
0,000612b48d30de74,2015-12-18,2015,2015-05-06,2015
1,0032b59241ce3589,2015-12-18,2015,2015-04-30,2015
2,0042f6f2a63d2cb6,2015-12-16,2015,2015-04-27,2015
3,005e4fdaf08da672,2015-12-18,2015,2015-05-07,2015
4,0063e4bb2015f0ba,2015-12-18,2015,2015-05-12,2015
...,...,...,...,...,...
2979,ff965028d78043eb,2015-12-17,2015,2015-04-21,2015
2980,ffe107efb55aa4c4,2015-12-18,2015,2015-05-12,2015
2981,ffe298a47381c8b7,2015-12-16,2015,2015-04-18,2015
2982,ffebb3a36fc7caee,2015-12-17,2015,2015-04-29,2015


In [63]:
import numpy as np

In [64]:
# Case ID is the unique identifier in merged_df and thus no grouping by is necessary


merged_df['LatePayment'] = np.where(merged_df['Latest_Begin_Payment_Year'] > merged_df['StartYear'], 1 , 0)

In [66]:
# cases with Latest Begin Payment event in a later year than the case has started
merged_df.loc[merged_df['LatePayment'] == 1]

Unnamed: 0,Case ID,StartDate_x,Latest_Begin_Payment_Year,StartDate_y,StartYear,LatePayment
247,13404f41b918553d,2016-02-22,2016,2015-05-13,2015,1
455,260155273269fe3e,2016-02-22,2016,2015-05-13,2015,1
2373,cc845befea39d489,2016-02-22,2016,2015-04-26,2015,1
2405,cedcde09b55a5da7,2015-12-16,2015,2014-05-06,2014,1
2628,e19f821d86fc80eb,2016-02-22,2016,2015-06-08,2015,1
2769,ecbbb4234da32502,2016-02-22,2016,2015-05-15,2015,1


## Undesired Outcome 2