In [1]:
import pandas as pd

In [2]:
data = pd.read_excel('bpi_challenge_2018_50mb.xlsx')

In [3]:
adf = data.loc[data['Activity'] == 'Payment application-Application-abort payment']
len(adf)

4291

In [20]:
adf = data.loc[data['Activity'] == 'Payment application-Application-begin payment']
len(adf)

7291

In [4]:
# drop uncomplete cases (2017 mostly) and thus do not have a complete label

## Undesired Outcome 1

Undesired outcome 1: The payment is late. A payment can be considered
timely, if there has been a begin payment activity by the end of the year that was
not eventually followed by abort payment.

Late Payments There is no easy way of filtering the cases to find cases with
late payments. So in order to look at the differences, we added a case attribute
ourselves with the use of Python. With the use of the ProM forum, we identified
3 situations in which a case is considered late:
1. There is no ‘begin payment’ event in the case.
2. The last ‘begin payment’ event is followed by an ‘abort payment’ event.
3. The last ‘begin payment’ event occurs in a later year than what the case was
started in.

### 1. There is no 'begin payment' event in the case 

In [5]:
# group by per caseid --> to find latest payment

df = data.loc[data['Activity'] == 'Payment application-Application-begin payment']
df = df.rename(columns = {'Complete Timestamp': 'Timestamp'})
latest_payments = df.loc[df.groupby('Case ID').Timestamp.idxmax()]
#latest_payments.loc[latest_payments['Case ID'] == '788e3f355afbed07']
for i in latest_payments.index:
    # choose which dataframe to update
    # for now pick dataframe: data
    
    data.loc[i,'Latest_Begin_Payment'] = 1


In [27]:
data.tail()

Unnamed: 0,Case ID,Activity,Resource,Complete Timestamp,Variant,Variant index,(case) amount_applied0,(case) amount_applied1,(case) amount_applied2,(case) amount_applied3,...,concept:name,docid,doctype,eventid,lifecycle:transition,note,subprocess,success,Latest_Begin_Payment,Latest_Abort_Payment
193675,4c1dce27d1fa14a5,Parcel document-Main-save,478c4f,2016-06-16,Variant 9000,9000,16309.96,,,,...,save,-18008662613106736,Parcel document,-5.403733e+16,complete,none,Main,True,,
193676,4c1dce27d1fa14a5,Parcel document-Main-check,478c4f,2016-06-16,Variant 9000,9000,16309.96,,,,...,check,-18008662613106736,Parcel document,-5.403733e+16,complete,none,Main,True,,
193677,4c1dce27d1fa14a5,Parcel document-Main-finish editing,478c4f,2016-06-16,Variant 9000,9000,16309.96,,,,...,finish editing,-18008662613106736,Parcel document,-5.403733e+16,complete,none,Main,True,,
193678,4c1dce27d1fa14a5,Control summary-Main-save,478c4f,2016-06-16,Variant 9000,9000,16309.96,,,,...,save,-18008662613106034,Control summary,-5.403733e+16,complete,none,Main,True,,
193679,4c1dce27d1fa14a5,Control summary-Main-save,6ad4a6,2016-08-10,Variant 9000,9000,16309.96,,,,...,save,-18008662613106034,Control summary,-3.602291e+16,complete,none,Main,True,,


In [21]:
len(data.loc[data['Latest_Begin_Payment'] == 1])

3000

In [26]:
# find all case id's for which the sum of Latest begin payment = 0.
payments = data[['Case ID', 'Latest_Begin_Payment']].groupby(by=['Case ID']).sum()
no_begin_payment = payments.loc[payments['Latest_Begin_Payment'] == 0]

# in current subset of data there is no sensor without begin payment
no_begin_payment

Unnamed: 0_level_0,Latest_Begin_Payment,index
Case ID,Unnamed: 1_level_1,Unnamed: 2_level_1
06501822341c30a8,0.0,4830588


In [29]:
for i in no_begin_payment.index:
    # for all cases without begin payment --> label them as undesired outcome rows.
    data.loc[data['Case ID'] == i, 'UndesiredOutcome1'] = 1
    

In [40]:
data.loc[data['UndesiredOutcome1'] == 1].head()

Unnamed: 0,Case ID,Activity,Resource,Complete Timestamp,Variant,Variant index,(case) amount_applied0,(case) amount_applied1,(case) amount_applied2,(case) amount_applied3,...,docid,doctype,eventid,lifecycle:transition,note,subprocess,success,Latest_Begin_Payment,Latest_Abort_Payment,UndesiredOutcome1
166558,06501822341c30a8,Payment application-Application-mail income,0;n/a,2015-05-10,Variant 8575,8575,0.0,,,,...,-18008611495764646,Payment application,,complete,none,Application,True,,,1.0
166559,06501822341c30a8,Entitlement application-Main-mail valid,0;n/a,2015-05-11,Variant 8575,8575,0.0,,,,...,-18008615298820872,Entitlement application,,complete,none,Main,True,,,1.0
166560,06501822341c30a8,Payment application-Application-mail valid,0;n/a,2015-05-11,Variant 8575,8575,0.0,,,,...,-18008611495764646,Payment application,,complete,none,Application,True,,,1.0
166561,06501822341c30a8,Entitlement application-Main-mail valid,0;n/a,2015-05-11,Variant 8575,8575,0.0,,,,...,-18008615298820872,Entitlement application,,complete,none,Main,True,,,1.0
166562,06501822341c30a8,Parcel document-Main-initialize,Document processing automaton,2015-06-12,Variant 8575,8575,0.0,,,,...,-18008662110812216,Parcel document,-1.800866e+16,complete,none,Main,True,,,1.0


### 2. The last ‘begin payment’ event is followed by an ‘abort payment’ event. 

In [10]:
# Add column that indicates the latest abort payment 

adf = data.loc[data['Activity'] == 'Payment application-Application-abort payment']
adf = adf.rename(columns = {'Complete Timestamp': 'Timestamp'})

# Does not make sense to only take the latest abort statement? --> it does the job but
latest_aborts = adf.loc[adf.groupby('Case ID').Timestamp.idxmax()]
#latest_payments.loc[latest_payments['Case ID'] == '788e3f355afbed07']
for i in latest_aborts.index:
    
    data.loc[i,'Latest_Abort_Payment'] = 1

In [11]:
# insert artificial row that complies with the criteria



In [12]:
# filter on criteria: cases with a begin payment and an abort payment
test = data.loc[(data['Latest_Begin_Payment'] == 1) & (data['Latest_Abort_Payment'] ==1)]

In [13]:
# filter cases with date of abort payment activity > date of latest_begin_payment

# Each case id has at max 2 rows. This because we only consider the LATEST begin payment and LATEST abort payment

test2 = test.groupby('Case ID')['Complete Timestamp'].agg(['min','max']).rename(columns={'min':'first','max':'last'})

test3 = pd.merge(test, test2, left_on='Case ID', right_on='Case ID')


# check if timestamp of latest abort payment row  == max --> then there is an abort payment after the latest begin payment
for i in test3.index:
    
    row = test3.loc[i, :]
    
    if row['Latest_Abort_Payment'] ==1 and row['Complete Timestamp'] > row['last']: # check timestamps
        test3.loc[i, 'abort_after_begin_payment'] = 1
        
        
        
# label case as undesired outcome 1 in 'data' dataframe



In [14]:
test3

Unnamed: 0,Case ID,Activity,Resource,Complete Timestamp,Variant,Variant index,(case) amount_applied0,(case) amount_applied1,(case) amount_applied2,(case) amount_applied3,...,doctype,eventid,lifecycle:transition,note,subprocess,success,Latest_Begin_Payment,Latest_Abort_Payment,first,last


In [15]:
# 

### 3. The last ‘begin payment’ event occurs in a later year than what the case was started in.

In [38]:
# determine case start year
start_years = data.groupby(by=['Case ID'])['Complete Timestamp'].agg(['min']).rename(columns={'min':'StartDate'})
start_years = start_years.reset_index()
start_years


Unnamed: 0,Case ID,StartDate
0,000612b48d30de74,2015-05-06
1,0032b59241ce3589,2015-04-30
2,0042f6f2a63d2cb6,2015-04-27
3,004d4aadd6785efd,2015-05-06
4,005e4fdaf08da672,2015-05-07
...,...,...
2996,ff965028d78043eb,2015-04-21
2997,ffe107efb55aa4c4,2015-05-12
2998,ffe298a47381c8b7,2015-04-18
2999,ffebb3a36fc7caee,2015-04-29


In [41]:
# determine latest begin payment year
latest_begin_payment_year = data.loc[data['Activity'] == 'Payment application-Application-abort payment'].groupby(by=['Case ID'])['Complete Timestamp'].agg(['min']).rename(columns={'min':'StartDate'})
latest_begin_payment_year = latest_begin_payment_year.reset_index()
latest_begin_payment_year

Unnamed: 0,Case ID,StartDate
0,000612b48d30de74,2015-12-18
1,0032b59241ce3589,2015-12-18
2,0042f6f2a63d2cb6,2015-12-16
3,005e4fdaf08da672,2015-12-18
4,0063e4bb2015f0ba,2015-12-18
...,...,...
2978,ff965028d78043eb,2015-12-17
2979,ffe107efb55aa4c4,2015-12-18
2980,ffe298a47381c8b7,2015-12-16
2981,ffebb3a36fc7caee,2015-12-17


In [18]:
# check if year of latest begin payment > case start year

# if true --> undesired outcome 1

In [None]:
# merge two dataframes



## Undesired Outcome 2