In [1]:
import pandas as pd
import numpy as np
import utils

READ_PARQUET = True

In [2]:
if READ_PARQUET:
    print("Reading parquet.")
    df = utils.load_parquet()
else:
    print("Reading excel and savings as parquet.")
    df = utils.load_excel()
    utils.save_parquet(df)

Reading parquet.


FileNotFoundError: dataset/BPI_Challenge_df.parquet

In [None]:
from copy import deepcopy
# drop uncomplete cases (2017 mostly) and thus do not have a complete label

cases_df = deepcopy(df.loc[df['Complete Timestamp'].dt.year < 2017])

## Undesired Outcome 1

Undesired outcome 1: The payment is late. A payment can be considered
timely, if there has been a begin payment activity by the end of the year that was
not eventually followed by abort payment.

Late Payments There is no easy way of filtering the cases to find cases with
late payments. So in order to look at the differences, we added a case attribute
ourselves with the use of Python. With the use of the ProM forum, we identified
3 situations in which a case is considered late:
1. There is no ‘begin payment’ event in the case.
2. The last ‘begin payment’ event is followed by an ‘abort payment’ event.
3. The last ‘begin payment’ event occurs in a later year than what the case was
started in.

### 1. There is no 'begin payment' event in the case 

#### First determine the latest Begin Payment activity for every case
__TO DO__
- check if latest begin payment idxmax goes well for cases with only 1 Begin Payment event


In [None]:
# group by per caseid --> to find latest payment

df = cases_df.loc[cases_df['Activity'] == 'Payment application-Application-begin payment']
df = df.rename(columns = {'Complete Timestamp': 'Timestamp'})
latest_payments = df.loc[df.groupby('Case ID').Timestamp.idxmax()]


for i in latest_payments.index:
    # Add column to inital dataframe
    cases_df.loc[i,'_Latest_Begin_Payment'] = 1


#### Determine all cases without Begin Payment event

In [None]:
# find all case id's for which the sum of Latest begin payment = 0. --> indicates cases without Begin Payment event
payments = cases_df[['Case ID', '_Latest_Begin_Payment']].groupby(by=['Case ID']).sum()

no_begin_payment = payments.loc[payments['_Latest_Begin_Payment'] == 0]

# in current subset of data there is no sensor without begin payment
no_begin_payment

#### Label all cases without Begin Payment event as Undesired Outcome 1

In [None]:

for i in no_begin_payment.index:
    # for all cases without begin payment --> label them as undesired outcome rows.
    cases_df.loc[cases_df['Case ID'] == i, 'UndesiredOutcome1'] = 1
    

In [None]:
cases_df.loc[cases_df['UndesiredOutcome1'] == 1].head()

### 2. The last ‘begin payment’ event is followed by an ‘abort payment’ event. 

#### Determine all 'latest abort payment' events

In [None]:
# Add column that indicates the latest abort payment 

abort_df = cases_df.loc[cases_df['Activity'] == 'Payment application-Application-abort payment']
abort_df = abort_df.rename(columns = {'Complete Timestamp': 'Timestamp'})
latest_aborts = abort_df.loc[abort_df.groupby('Case ID').Timestamp.idxmax()]

for i in latest_aborts.index:
    
    cases_df.loc[i,'_Latest_Abort_Payment'] = 1

In [None]:
# insert artificial row that complies with the criteria

cases_df.loc[cases_df['Case ID'] == '5A']

In [None]:
# filter on criteria: cases with a begin payment and an abort payment
latest_df = cases_df.loc[(cases_df['_Latest_Begin_Payment'] == 1) | (cases_df['_Latest_Abort_Payment'] ==1)]

#### Find cases with abort after begin payment 
- What to do when abort payment and begin payment have the same timestamp? --> timestamp is actually a date rather than a timestamp
- Did we lose timestamp info when exporting from disco or rapidminer??
- In code a mistake
- Do things differently for automatic (batch) processing operations??

In [None]:
# filter cases with date of abort payment activity > date of latest_begin_payment

# Each case id has at max 2 rows. This because we only consider the LATEST begin payment and LATEST abort payment

grouped_df = latest_df.groupby('Case ID')['Complete Timestamp'].agg(['min','max']).rename(columns={'min':'first','max':'last'})

merged_df = pd.merge(latest_df, grouped_df, left_on='Case ID', right_on='Case ID')


# check if timestamp of latest abort payment row  == max --> then there is an abort payment after the latest begin payment
for i in merged_df.index:
    
    row = merged_df.loc[i, :]
    
    if row['_Latest_Abort_Payment'] == 1 and row['Complete Timestamp'] == row['last'] and row['first'] != row['last']: # check timestamps
        # when first and last date are exactly the same, the order cannot be concluded from the data and hence these cases cannot be used for predictions
        merged_df.loc[i, '_Abort_After_Begin_Payment'] = 1

        
# label case as undesired outcome 1 in 'data' dataframe
try:
    aborted_cases =  list(merged_df['Case ID'].loc[merged_df['_Abort_After_Begin_Payment'] == 1].unique())
    for i in aborted_cases:
            cases_df.loc[cases_df['Case ID'] == i,'_Abort_After_Begin_Payment'] = 1

            cases_df.loc[cases_df['Case ID'] == i,'UndesiredOutcome1'] = 1
except:
    print("There is no such case with Abort Payment event after Last Begin Payment event")



In [None]:
# check if artificially created rows are indeed labeled as abort_after_begin_payment = 1
merged_df.loc[merged_df['_Abort_After_Begin_Payment'] == 1]


#### Conclusion
- Timestamps are not accurate as they only indicate the date and not the timestamp.
- A lot of cases have 'Complete Timestamp' value the same for latest 'Begin Payment' as latest 'Abort Payment'! --> seems to be nothing we can do about this.

### 3. The last ‘begin payment’ event occurs in a later year than what the case was started in.

In [None]:
# determine case start year
start_year_case = cases_df.groupby(by=['Case ID'])['Complete Timestamp'].agg(['min']).rename(columns={'min':'StartDate'})
start_year_case['StartYear'] = start_year_case['StartDate'].dt.year
start_year_case = start_year_case.reset_index()
start_year_case.head()


In [None]:
# determine latest begin payment year
latest_begin_payment_year = cases_df.loc[cases_df['Activity'] == 'Payment application-Application-abort payment'].groupby(by=['Case ID'])['Complete Timestamp'].agg(['min']).rename(columns={'min':'StartDate'})
latest_begin_payment_year['_Latest_Begin_Payment_Year'] =  latest_begin_payment_year['StartDate'].dt.year
latest_begin_payment_year = latest_begin_payment_year.reset_index()
latest_begin_payment_year.head()

In [None]:
# merge two dataframes
year_df = pd.merge(latest_begin_payment_year, start_year_case, on='Case ID')
del year_df['StartDate_x'] # remove
del year_df[ 'StartDate_y'] # remove
# Case ID is the unique identifier in merged_df and thus no grouping by is necessary
year_df['LatePayment'] = np.where(year_df['_Latest_Begin_Payment_Year'] > year_df['StartYear'], 1 , 0)

In [None]:
# cases with Latest Begin Payment event in a later year than the case has started
year_df.loc[year_df['LatePayment'] == 1]

#### Label cases with payment in a later year than in which case started as undesired outcome 1

In [None]:

late_payments = year_df['Case ID'].loc[year_df['LatePayment'] == 1].unique()

# loop over all Case ID's that have a late payment
for i in late_payments:
    row = cases_df.loc[cases_df['Case ID'] == i, :]
    cases_df.loc[cases_df['Case ID'] == i,'_Begin_Payment_Next_Year'] = 1

    cases_df.loc[cases_df['Case ID'] == i,'UndesiredOutcome1'] = 1


In [None]:
#cases_df.loc[cases_df['Case ID'] == 'cc845befea39d489'].head()

In [None]:
cases_df.head()

In [None]:
undesired_1 = cases_df[['Case ID', 'UndesiredOutcome1']]

In [None]:
undesired_rows = cases_df.loc[cases_df['UndesiredOutcome1'] == 1]
len(undesired_outcomes['Case ID'].unique())

### Save Undesired Outcomes as dataframe in parquet file

In [None]:
utils.save_parquet(cases_df, undesired_outcomes=True)

## Undesired Outcome 2

In [None]:
utils.generate_one_hot_encoding()

In [None]:
utils.generate_outcome_two()

### Undesired Outcomes DataFrame