In [2]:
import pandas as pd
import numpy as np
import utils
import pyarrow

READ_PARQUET = True

In [2]:
import importlib
utils = importlib.reload(utils)  # reloads the utils module without restarting kernel

In [3]:
if READ_PARQUET:
    print("Reading parquet.")
    df = utils.load_parquet()
else:
    print("Reading csv and savings as parquet.")
    df = utils.load_csv()
    utils.save_parquet(df)

Reading parquet.


In [4]:
from copy import deepcopy
# drop uncomplete cases (2017 mostly) and thus do not have a complete label

#cases_df = deepcopy(df.loc[df['Complete Timestamp'].dt.year < 2017])
cases_df = deepcopy(df)

In [5]:
cases_df = cases_df.rename(columns = {'Complete Timestamp': 'Timestamp'})
cases_df.Timestamp = pd.to_datetime(cases_df.Timestamp)

## Undesired Outcome 1

Undesired outcome 1: The payment is late. A payment can be considered
timely, if there has been a begin payment activity by the end of the year that was
not eventually followed by abort payment.

Late Payments There is no easy way of filtering the cases to find cases with
late payments. So in order to look at the differences, we added a case attribute
ourselves with the use of Python. With the use of the ProM forum, we identified
3 situations in which a case is considered late:
1. There is no ‘begin payment’ event in the case.
2. The last ‘begin payment’ event is followed by an ‘abort payment’ event.
3. The last ‘begin payment’ event occurs in a later year than what the case was
started in.

### 1. There is no 'begin payment' event in the case 

#### First determine the latest Begin Payment activity for every case
__TO DO__
- check if latest begin payment idxmax goes well for cases with only 1 Begin Payment event


In [6]:
# group by per caseid --> to find latest payment

df = cases_df.loc[cases_df['Activity'] == 'Payment application-Application-begin payment']
df = df.rename(columns = {'Complete Timestamp': 'Timestamp'})
df.Timestamp = pd.to_datetime(df.Timestamp)
latest_payments = df.loc[df.groupby('Case ID').Timestamp.idxmax()]


for i in latest_payments.index:
    # Add column to inital dataframe
    cases_df.loc[i,'_Latest_Begin_Payment'] = 1


#### Determine all cases without Begin Payment event

In [7]:
# find all case id's for which the sum of Latest begin payment = 0. --> indicates cases without Begin Payment event
payments = cases_df[['Case ID', '_Latest_Begin_Payment']].groupby(by=['Case ID']).sum()

no_begin_payment = payments.loc[payments['_Latest_Begin_Payment'] == 0]

# in current subset of data there is no sensor without begin payment
no_begin_payment

Unnamed: 0_level_0,_Latest_Begin_Payment
Case ID,Unnamed: 1_level_1
007cb8b23b9f5546,0.0
009be1f246ce84a1,0.0
00d49f179074ae53,0.0
06501822341c30a8,0.0
07e35b8f82b537df,0.0
...,...
f716ce59be56e810,0.0
f897f2f5fe58063a,0.0
fb609a453f4b7aed,0.0
fec6ae945d1e5d35,0.0


#### Label all cases without Begin Payment event as Undesired Outcome 1

In [8]:

for i in no_begin_payment.index:
    # for all cases without begin payment --> label them as undesired outcome rows.
    cases_df.loc[cases_df['Case ID'] == i, 'UndesiredOutcome1'] = 1
    

In [9]:
cases_df.loc[cases_df['UndesiredOutcome1'] == 1].head()

Unnamed: 0,Case ID,Activity,Resource,Timestamp,Variant,Variant index,(case) amount_applied0,(case) amount_applied1,(case) amount_applied2,(case) amount_applied3,...,concept:name,docid,doctype,eventid,lifecycle:transition,note,subprocess,success,_Latest_Begin_Payment,UndesiredOutcome1
183867,ff67b399effb3fa8,Payment application-Application-mail income,0;n/a,2015-05-13 00:00:00,Variant 4243,4243,0.0,,,,...,mail income,-18008611495802873,Payment application,,complete,none,Application,True,,1.0
183868,ff67b399effb3fa8,Entitlement application-Main-mail valid,0;n/a,2015-05-15 00:00:00,Variant 4243,4243,0.0,,,,...,mail valid,-18008615298849558,Entitlement application,,complete,none,Main,True,,1.0
183869,ff67b399effb3fa8,Payment application-Application-mail valid,0;n/a,2015-05-15 00:00:00,Variant 4243,4243,0.0,,,,...,mail valid,-18008611495802873,Payment application,,complete,none,Application,True,,1.0
183870,ff67b399effb3fa8,Entitlement application-Main-mail valid,0;n/a,2015-05-15 00:00:00,Variant 4243,4243,0.0,,,,...,mail valid,-18008615298849558,Entitlement application,,complete,none,Main,True,,1.0
183871,ff67b399effb3fa8,Parcel document-Main-initialize,Document processing automaton,2015-06-12 13:32:10,Variant 4243,4243,0.0,,,,...,initialize,-18008662111499908,Parcel document,-1.800866e+16,complete,none,Main,True,,1.0


### 2. The last ‘begin payment’ event is followed by an ‘abort payment’ event. 

#### Determine all 'latest abort payment' events

In [10]:
# Add column that indicates the latest abort payment 

abort_df = cases_df.loc[cases_df['Activity'] == 'Payment application-Application-abort payment']
abort_df = abort_df.rename(columns = {'Complete Timestamp': 'Timestamp'})
abort_df.Timestamp = pd.to_datetime(abort_df.Timestamp)
latest_aborts = abort_df.loc[abort_df.groupby('Case ID').Timestamp.idxmax()]

for i in latest_aborts.index:
    
    cases_df.loc[i,'_Latest_Abort_Payment'] = 1

In [11]:
# insert artificial row that complies with the criteria

cases_df.loc[cases_df['Case ID'] == '5A']

Unnamed: 0,Case ID,Activity,Resource,Timestamp,Variant,Variant index,(case) amount_applied0,(case) amount_applied1,(case) amount_applied2,(case) amount_applied3,...,docid,doctype,eventid,lifecycle:transition,note,subprocess,success,_Latest_Begin_Payment,UndesiredOutcome1,_Latest_Abort_Payment


In [12]:
# filter on criteria: cases with a begin payment and an abort payment
latest_df = cases_df.loc[(cases_df['_Latest_Begin_Payment'] == 1) | (cases_df['_Latest_Abort_Payment'] ==1)]

#### Find cases with abort after begin payment 
- What to do when abort payment and begin payment have the same timestamp? --> timestamp is actually a date rather than a timestamp
- Did we lose timestamp info when exporting from disco or rapidminer??
- In code a mistake
- Do things differently for automatic (batch) processing operations??

In [13]:
# filter cases with date of abort payment activity > date of latest_begin_payment

# Each case id has at max 2 rows. This because we only consider the LATEST begin payment and LATEST abort payment

grouped_df = latest_df.groupby('Case ID')['Timestamp'].agg(['min','max']).rename(columns={'min':'first','max':'last'})

merged_df = pd.merge(latest_df, grouped_df, left_on='Case ID', right_on='Case ID')


# check if timestamp of latest abort payment row  == max --> then there is an abort payment after the latest begin payment
for i in merged_df.index:
    
    row = merged_df.loc[i, :]
    
    if row['_Latest_Abort_Payment'] == 1 and row['Timestamp'] == row['last'] and row['first'] != row['last']: # check timestamps
        # when first and last date are exactly the same, the order cannot be concluded from the data and hence these cases cannot be used for predictions
        merged_df.loc[i, '_Abort_After_Begin_Payment'] = 1

        
# label case as undesired outcome 1 in 'data' dataframe
try:
    aborted_cases =  list(merged_df['Case ID'].loc[merged_df['_Abort_After_Begin_Payment'] == 1].unique())
    for i in aborted_cases:
            cases_df.loc[cases_df['Case ID'] == i,'_Abort_After_Begin_Payment'] = 1

            cases_df.loc[cases_df['Case ID'] == i,'UndesiredOutcome1'] = 1
except:
    print("There is no such case with Abort Payment event after Last Begin Payment event")



In [14]:
# check if artificially created rows are indeed labeled as abort_after_begin_payment = 1
#merged_df.loc[merged_df['_Abort_After_Begin_Payment'] == 1]


#### Conclusion
- Timestamps are not accurate as they only indicate the date and not the timestamp.
- A lot of cases have 'Complete Timestamp' value the same for latest 'Begin Payment' as latest 'Abort Payment'! --> seems to be nothing we can do about this.

### 3. The last ‘begin payment’ event occurs in a later year than what the case was started in.

In [15]:
# determine case start year
start_year_case = cases_df.groupby(by=['Case ID'])['Timestamp'].agg(['min']).rename(columns={'min':'StartDate'})
start_year_case['StartYear'] = start_year_case['StartDate'].dt.year
start_year_case = start_year_case.reset_index()
start_year_case.head()


Unnamed: 0,Case ID,StartDate,StartYear
0,0002505cb62792e4,2015-04-28,2015
1,0002a55a6130cec8,2015-04-17,2015
2,0004ff62053a60ce,2015-05-11,2015
3,000612b48d30de74,2015-05-06,2015
4,0006cc909ce508b0,2016-04-28,2016


In [16]:
# determine latest begin payment year
latest_begin_payment_year = cases_df.loc[cases_df['Activity'] == 'Payment application-Application-abort payment'].groupby(by=['Case ID'])['Timestamp'].agg(['min']).rename(columns={'min':'StartDate'})
latest_begin_payment_year['_Latest_Begin_Payment_Year'] =  latest_begin_payment_year['StartDate'].dt.year
latest_begin_payment_year = latest_begin_payment_year.reset_index()
latest_begin_payment_year.head()

Unnamed: 0,Case ID,StartDate,_Latest_Begin_Payment_Year
0,0002505cb62792e4,2015-12-17 14:58:57.997,2015
1,0002a55a6130cec8,2015-12-18 12:40:48.866,2015
2,0004ff62053a60ce,2015-12-16 16:43:14.024,2015
3,000612b48d30de74,2015-12-18 13:09:36.753,2015
4,000843b028f083e2,2015-12-17 14:57:53.610,2015


In [17]:
# merge two dataframes
year_df = pd.merge(latest_begin_payment_year, start_year_case, on='Case ID')
del year_df['StartDate_x'] # remove
del year_df[ 'StartDate_y'] # remove
# Case ID is the unique identifier in merged_df and thus no grouping by is necessary
year_df['LatePayment'] = np.where(year_df['_Latest_Begin_Payment_Year'] > year_df['StartYear'], 1 , 0)

In [18]:
# cases with Latest Begin Payment event in a later year than the case has started
year_df.loc[year_df['LatePayment'] == 1]

Unnamed: 0,Case ID,_Latest_Begin_Payment_Year,StartYear,LatePayment
898,095ffc8dd79aae27,2016,2015,1
1800,1247d4083d7d565d,2016,2015,1
1913,13404f41b918553d,2016,2015,1
1977,13bd492abca27e38,2015,2014,1
1985,13d5a04a7ccf8108,2016,2015,1
3828,260155273269fe3e,2016,2015,1
4710,2e0f33f7ae3fadd9,2015,2014,1
7828,4c2f30ce48103ded,2016,2015,1
8286,50b780b4500b950f,2016,2015,1
8303,50f57bfbcac17b65,2015,2014,1


#### Label cases with payment in a later year than in which case started as undesired outcome 1

In [19]:

late_payments = year_df['Case ID'].loc[year_df['LatePayment'] == 1].unique()

# loop over all Case ID's that have a late payment
for i in late_payments:
    row = cases_df.loc[cases_df['Case ID'] == i, :]
    cases_df.loc[cases_df['Case ID'] == i,'_Begin_Payment_Next_Year'] = 1

    cases_df.loc[cases_df['Case ID'] == i,'UndesiredOutcome1'] = 1


#### Create df: single out Case ID and UndesiredOutcome column

In [20]:
undesired_1 = cases_df[['Case ID', 'UndesiredOutcome1']]

### Save Undesired Outcomes as dataframe in parquet file

In [21]:
utils.save_parquet(cases_df, undesired_outcomes=True)

## Undesired Outcome 2

In [22]:
data = deepcopy(cases_df)

In [23]:
utils.generate_one_hot_encoding()

Unnamed: 0,Case ID,Activity,Sub Process,Application,Change,Declared,Main,Objection,On,Remote,Reported
0,8b99873a6136cfa6,Payment application-Application-mail income,Application,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,8b99873a6136cfa6,Payment application-Application-mail valid,Application,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,8b99873a6136cfa6,Entitlement application-Main-mail valid,Main,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,8b99873a6136cfa6,Entitlement application-Main-mail valid,Main,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,8b99873a6136cfa6,Parcel document-Main-initialize,Main,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
2514261,ad5dfa0b929735be,Payment application-Application-decide,Application,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2514262,ad5dfa0b929735be,Payment application-Application-begin payment,Application,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2514263,ad5dfa0b929735be,Payment application-Application-insert document,Application,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2514264,ad5dfa0b929735be,Payment application-Application-insert document,Application,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
undesired2 = utils.generate_outcome_two()

### Undesired Outcomes DataFrame

In [25]:
undesired_outcomes_df = pd.merge(undesired_1, undesired2, how='left', left_index=True, right_index=True)

In [26]:
undesired_outcomes_df['Undesired Outcome 1'] = np.where(undesired_outcomes_df['UndesiredOutcome1'] == 1, True, False)
undesired_outcomes_df = undesired_outcomes_df.rename(columns={'Case ID_y': 'Case ID'})
try:
    del undesired_outcomes_df['UndesiredOutcome1']
    del undesired_outcomes_df['Case ID_x']
except:
    print('Column already deleted')

In [27]:
undesired_outcomes_df

Unnamed: 0,Case ID,Undesired Outcome 2,Undesired Outcome 1
0,8b99873a6136cfa6,False,False
1,8b99873a6136cfa6,False,False
2,8b99873a6136cfa6,False,False
3,8b99873a6136cfa6,False,False
4,8b99873a6136cfa6,False,False
...,...,...,...
2514261,ad5dfa0b929735be,False,False
2514262,ad5dfa0b929735be,False,False
2514263,ad5dfa0b929735be,False,False
2514264,ad5dfa0b929735be,False,False


In [28]:
utils._save_pickle(undesired_outcomes_df, 'dataset/undesired_outcomes_df')