In [1]:
import pandas as pd
import numpy as np
import utils
import pyarrow
from datetime import datetime, timedelta

# Switch read by the load cell: True reads the event log via utils.load_parquet(),
# False reads it via utils.load_csv() and saves it with utils.save_parquet().
READ_PARQUET = True
# None removes pandas' limit on displayed columns, so wide frames are shown in full.
pd.set_option('display.max_columns', None)

In [2]:
import pickle
import calendar

In [3]:
import importlib
# Re-execute the utils module so edits to it are picked up without restarting the kernel.
utils = importlib.reload(utils)  # reload() returns the module object; rebind the name to it

In [4]:
# Load the raw event log into `log`; the next cell copies it into `df`.
# Both branches bind `log`, so the downstream `deepcopy(log)` works
# regardless of how READ_PARQUET is set.
if READ_PARQUET:
    print("Reading parquet.")
    log = utils.load_parquet()
else:
    print("Reading csv and savings as parquet.")
    log = utils.load_csv()
    # Persist the CSV contents through the project helper so that later runs
    # can take the READ_PARQUET branch.
    utils.save_parquet(log)

Reading parquet.


In [5]:
from copy import deepcopy

# Work on an independent copy so the loaded `log` stays untouched by the
# feature-engineering cells below.
# NOTE: dropping incomplete cases (mostly 2017, which lack a complete label)
# is currently disabled; the full log is used. The disabled filter was:
#cases_df = deepcopy(df.loc[df['Complete Timestamp'].dt.year < 2017])
df = log.copy(deep=True)

### Rename changed process and doctype names

In [6]:
# Two older labels are mapped onto the current 'Geo parcel document' label.
# With regex=True the keys are regular expressions applied to string cells,
# so the labels are also rewritten when they occur inside longer strings.
# The two replacements are applied one after the other, in this order.
GEO_PARCEL_LABEL = 'Geo parcel document'
df = (
    df
    .replace({'Department control parcels': GEO_PARCEL_LABEL}, regex=True)
    .replace({'Parcel document': GEO_PARCEL_LABEL}, regex=True)
)

### Adding  Sub Process

In [7]:
def column_sub_process(data):
    """Add a 'Sub_Process' column built from the first two parts of 'Activity'.

    Each 'Activity' string is split on '-' (at most two splits) and the first
    two pieces are joined with ', '. A value without '-' is kept unchanged.

    The frame is modified in place and also returned.
    """
    labels = []
    for activity in data['Activity']:
        leading_parts = activity.split('-', 2)[:2]
        labels.append(', '.join(str(part) for part in leading_parts))
    data['Sub_Process'] = labels
    return data

In [8]:
# Adds the 'Sub_Process' column; the helper mutates df in place and returns the same frame.
df = column_sub_process(df)

### Adding start date and end date

In [9]:
# Per-case earliest and latest 'Complete Timestamp', broadcast back onto every
# event row of that case.
# The string aliases 'min'/'max' select pandas' own group reductions; passing
# the Python builtins instead is deprecated in recent pandas versions.
# Assigning straight to the final column names also keeps the cell re-runnable:
# an existing StartDate/EndDate is overwritten instead of duplicated.
dfc = df.groupby('Case ID')['Complete Timestamp']
df = df.assign(
    StartDate=dfc.transform('min'),
    EndDate=dfc.transform('max'),
)

In [10]:
# NOTE(review): this looks redundant. The previous cell already produces
# 'StartDate'/'EndDate', and rename() ignores labels that are absent by default,
# so this is a no-op unless the frame still has 'min'/'max' columns.
df = df.rename(columns= {'max': 'EndDate', 'min': 'StartDate'})

### Adding total duration

In [11]:
# Make sure both case boundaries are datetimes, then store the case duration
# (EndDate - StartDate) as a whole number of days.
for boundary_col in ('EndDate', 'StartDate'):
    df[boundary_col] = pd.to_datetime(df[boundary_col])
df['Total_Duration'] = (df['EndDate'] - df['StartDate']).dt.days

### Adding time passed till then

In [12]:
# Whole days elapsed between the case start and each event's completion time.
df['Complete Timestamp'] = pd.to_datetime(df['Complete Timestamp'])
df['TimePassed'] = (df['Complete Timestamp'] - df['StartDate']).dt.days

### Adding eventcounter

In [13]:
# Helper column of ones; the per-group running sums below turn it into counters.
df['event'] = 1
# 1-based position of each event within its case, in the frame's current row
# order (presumably chronological per case; verify the sort order upstream).
# The built-in grouped cumsum() is vectorised and keeps the frame's index.
# groupby.apply(lambda x: x.cumsum()) instead runs per group in Python and, in
# pandas >= 2.0, prepends the group key to the index, which breaks assignment.
df['eventcounter'] = df.groupby('Case ID')['event'].cumsum()

### Adding cumulative success

In [14]:
# Running number of successful events per case and the share of successful
# events so far (column name 'succes_rate' kept as-is for downstream users).
df['success'] = df['success'].astype(int)
# The grouped cumsum() is vectorised and index-aligned with df, unlike
# groupby.apply(lambda x: x.cumsum()), whose result index gains the group key
# in pandas >= 2.0.
df['cum_success'] = df.groupby('Case ID')['success'].cumsum()
df['succes_rate'] = df['cum_success'] / df['eventcounter']

### Adding activity counter

In [16]:
# How many times this activity has occurred so far within the case (1-based).
# Uses the vectorised grouped cumsum(): a Python-level apply over every
# (case, activity) group is very slow on a large log, and in pandas >= 2.0 its
# result carries the group keys in the index and cannot be assigned back.
df['cum_activity'] = df.groupby(['Case ID', 'activity'])['event'].cumsum()

KeyboardInterrupt: 

### Adding Sub process counter

In [None]:
# How many events of this sub-process have occurred so far within the case (1-based).
# The grouped cumsum() replaces a per-group Python apply: it is vectorised and
# keeps the frame's original index, so the assignment aligns row by row.
df['cum_subprocess'] = df.groupby(['Case ID', 'Sub_Process'])['event'].cumsum()

In [None]:
# Spot-check: show all events of one case to eyeball the counters added above.
is_sample_case = df['Case ID'].eq('8b99873a6136cfa6')
df.loc[is_sample_case]

### Add time between consecutive activities

In [None]:
# Number of distinct 'Sub_Process' labels (nunique() ignores missing values by default).
df['Sub_Process'].nunique()

In [None]:
# Whole days between an event and the previous event of the same case.
# The first event of each case has no predecessor and gets a lag of 0.
gap_to_previous = df.groupby('Case ID')['Complete Timestamp'].diff()
df['Timelag'] = gap_to_previous.dt.days.fillna(0)

### Add max and mean time between activities for a Case ID up to that point
#### The min time lag is not included because it is always 0

In [None]:
# Largest lag seen so far within the case, including the current event.
# The grouped cummax() is the vectorised equivalent of applying x.cummax() per
# group, and its result keeps df's index so the assignment aligns row by row.
df['Max_lag'] = df.groupby('Case ID')['Timelag'].cummax()

In [None]:
# Mean lag over the *previous* events of the case: shift() moves each lag one
# row down, so the current event's own lag is excluded from its average.
# transform() is used instead of apply() because it always returns a result
# indexed like df. In pandas >= 2.0, apply() prepends the group key to the
# index, which makes the column assignment fail.
df['Avg_lag'] = df.groupby('Case ID')['Timelag'].transform(
    lambda lags: lags.shift().expanding().mean()
)
# The first event of a case has no previous lag, so its mean is NaN; use 0 instead.
df['Avg_lag'] = df['Avg_lag'].fillna(0)

# Drop the remaining uninformative columns

In [None]:
# Display the frame (truncated notebook repr) to review the columns before dropping helpers.
df

In [None]:
# Remove the constant helper column of ones; it was only needed to build the counters.
# Not re-runnable: a second execution raises KeyError because the column is gone.
del df['event']

In [None]:
# Display the frame again to confirm the 'event' helper column is gone.
df

### T0 features

In [17]:
# Replace the in-memory frame with a previously saved feature table.
# NOTE(review): this is a machine-specific absolute path; the notebook will not
# run elsewhere until it is made relative to a configurable data directory.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles produced by this project.
COUNTER_FEATURES_PICKLE = 'C:/Users/nlsvee/Documents/JADS Y2/PROM/Assignment 2/process-mining-assignment-2/dataset/final_input_for_counters'
# The context manager closes the file handle, which a bare open() call
# inside pickle.load() never does.
with open(COUNTER_FEATURES_PICKLE, 'rb') as pickle_file:
    df = pickle.load(pickle_file)

In [18]:
# Display the frame loaded from the pickle to check its columns and row count.
df

Unnamed: 0,Case ID,Activity,Resource,Complete Timestamp,Variant,Variant index,(case) amount_applied0,(case) amount_applied1,(case) amount_applied2,(case) amount_applied3,(case) applicant,(case) application,(case) area,(case) basic payment,(case) cross_compliance,(case) department,(case) greening,(case) number_parcels,(case) payment_actual0,(case) payment_actual1,(case) payment_actual2,(case) payment_actual3,(case) penalty_ABP,(case) penalty_AGP,(case) penalty_AJLP,(case) penalty_AUVP,(case) penalty_AVBP,(case) penalty_AVGP,(case) penalty_AVJLP,(case) penalty_AVUVP,(case) penalty_B16,(case) penalty_B2,(case) penalty_B3,(case) penalty_B4,(case) penalty_B5,(case) penalty_B5F,(case) penalty_B6,(case) penalty_BGK,(case) penalty_BGKV,(case) penalty_BGP,(case) penalty_C16,(case) penalty_C4,(case) penalty_C9,(case) penalty_CC,(case) penalty_GP1,(case) penalty_JLP1,(case) penalty_JLP2,(case) penalty_JLP3,(case) penalty_JLP5,(case) penalty_JLP6,(case) penalty_JLP7,(case) penalty_V5,(case) penalty_amount0,(case) penalty_amount1,(case) penalty_amount2,(case) penalty_amount3,(case) program-id,(case) redistribution,(case) rejected,(case) risk_factor,(case) selected_manually,(case) selected_random,(case) selected_risk,(case) small farmer,(case) year,(case) young farmer,activity,concept:name,docid,doctype,eventid,lifecycle:transition,note,subprocess,success,Sub_Process,StartDate,EndDate,Total_Duration,TimePassed,eventcounter,cum_success,succes_rate,cum_activity,cum_subprocess,Timelag,Max_lag,Avg_lag
0,8b99873a6136cfa6,Payment application-Application-mail income,0;n/a,2015-05-08 00:00:00.000,Variant 1832,1832,960.35,,,,b3b1bafcf8a5c359,8b99873a6136cfa6,2.6994,True,0.0,e7,True,3,960.35,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.0,,,,215,True,False,1.0,False,False,False,True,2015,False,mail income,mail income,-18008611495569447,Payment application,,complete,none,Application,1,"Payment application, Application",2015-05-08,2016-02-18 07:56:22.774,286,0,1,1,1.000000,1,1,0.0,0.0,0.000000
1,8b99873a6136cfa6,Payment application-Application-mail valid,0;n/a,2015-05-08 00:00:00.000,Variant 1832,1832,960.35,,,,b3b1bafcf8a5c359,8b99873a6136cfa6,2.6994,True,0.0,e7,True,3,960.35,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.0,,,,215,True,False,1.0,False,False,False,True,2015,False,mail valid,mail valid,-18008611495569447,Payment application,,complete,none,Application,1,"Payment application, Application",2015-05-08,2016-02-18 07:56:22.774,286,0,2,2,1.000000,1,2,0.0,0.0,0.000000
2,8b99873a6136cfa6,Entitlement application-Main-mail valid,0;n/a,2015-05-08 00:00:00.000,Variant 1832,1832,960.35,,,,b3b1bafcf8a5c359,8b99873a6136cfa6,2.6994,True,0.0,e7,True,3,960.35,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.0,,,,215,True,False,1.0,False,False,False,True,2015,False,mail valid,mail valid,-18008615298673397,Entitlement application,,complete,none,Main,1,"Entitlement application, Main",2015-05-08,2016-02-18 07:56:22.774,286,0,3,3,1.000000,2,1,0.0,0.0,0.000000
3,8b99873a6136cfa6,Entitlement application-Main-mail valid,0;n/a,2015-05-08 00:00:00.000,Variant 1832,1832,960.35,,,,b3b1bafcf8a5c359,8b99873a6136cfa6,2.6994,True,0.0,e7,True,3,960.35,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.0,,,,215,True,False,1.0,False,False,False,True,2015,False,mail valid,mail valid,-18008615298673397,Entitlement application,,complete,none,Main,1,"Entitlement application, Main",2015-05-08,2016-02-18 07:56:22.774,286,0,4,4,1.000000,3,2,0.0,0.0,0.000000
4,8b99873a6136cfa6,Geo parcel document-Main-initialize,fb5fa8,2015-06-10 11:16:28.000,Variant 1832,1832,960.35,,,,b3b1bafcf8a5c359,8b99873a6136cfa6,2.6994,True,0.0,e7,True,3,960.35,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.0,,,,215,True,False,1.0,False,False,False,True,2015,False,initialize,initialize,-72051858488795160,Geo parcel document,-7.205186e+16,complete,none,Main,1,"Geo parcel document, Main",2015-05-08,2016-02-18 07:56:22.774,286,33,5,5,1.000000,1,1,33.0,33.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2514261,ad5dfa0b929735be,Payment application-Application-decide,155add,2017-12-11 10:26:39.692,Variant 28923,28923,5117.97,,,,3a96cda3a6b0f35f,ad5dfa0b929735be,15.2806,True,0.0,6b,True,6,5116.99,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.0,,,,215,True,False,1.0,False,False,False,False,2017,False,decide,decide,-54037160734808716,Payment application,-9.006594e+16,complete,automatic,Application,1,"Payment application, Application",2017-04-13,2018-01-05 14:45:52.100,267,242,59,59,1.000000,1,16,2.0,75.0,4.017241
2514262,ad5dfa0b929735be,Payment application-Application-begin payment,DP-Z,2017-12-15 16:00:12.252,Variant 28923,28923,5117.97,,,,3a96cda3a6b0f35f,ad5dfa0b929735be,15.2806,True,0.0,6b,True,6,5116.99,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.0,,,,215,True,False,1.0,False,False,False,False,2017,False,begin payment,begin payment,-54037160734808716,Payment application,-9.006594e+16,complete,automatic during payment,Application,1,"Payment application, Application",2017-04-13,2018-01-05 14:45:52.100,267,246,60,60,1.000000,1,17,4.0,75.0,3.983051
2514263,ad5dfa0b929735be,Payment application-Application-insert document,Notification automaton,2017-12-15 19:19:04.499,Variant 28923,28923,5117.97,,,,3a96cda3a6b0f35f,ad5dfa0b929735be,15.2806,True,0.0,6b,True,6,5116.99,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.0,,,,215,True,False,1.0,False,False,False,False,2017,False,insert document,insert document,-54037160734808716,Payment application,-9.006594e+16,complete,notification for applicant,Application,1,"Payment application, Application",2017-04-13,2018-01-05 14:45:52.100,267,246,61,61,1.000000,2,18,0.0,75.0,3.983333
2514264,ad5dfa0b929735be,Payment application-Application-insert document,Notification automaton,2017-12-20 09:02:30.380,Variant 28923,28923,5117.97,,,,3a96cda3a6b0f35f,ad5dfa0b929735be,15.2806,True,0.0,6b,True,6,5116.99,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.0,,,,215,True,False,1.0,False,False,False,False,2017,False,insert document,insert document,-54037160734808716,Payment application,-1.260947e+17,complete,notification for applicant,Application,0,"Payment application, Application",2017-04-13,2018-01-05 14:45:52.100,267,251,62,61,0.983871,3,19,4.0,75.0,3.918033


In [19]:
# Applied amount (year 0) divided by the number of parcels of the case.
# NOTE(review): a parcel count of 0 yields inf/NaN rather than an error.
df['avg_amount_applied0_per_parcel'] = df['(case) amount_applied0'].div(df['(case) number_parcels'])

In [20]:
# Applied amount (year 0) divided by the case's area.
# NOTE(review): an area of 0 yields inf/NaN rather than an error.
df['avg_amount_applied0_per_area'] = df['(case) amount_applied0'].div(df['(case) area'])

In [21]:
# Case area divided by the number of parcels of the case.
# NOTE(review): a parcel count of 0 yields inf/NaN rather than an error.
df['avg_area_per_parcel'] = df['(case) area'].div(df['(case) number_parcels'])

In [22]:
# Full English month name ('May', 'April', ...) of the case start date.
# .dt.month_name() is vectorised, so there is no Python loop over every row and
# no intermediate lists. pd.to_datetime() keeps the cell working when
# 'StartDate' is stored as strings rather than datetimes.
# Missing start dates become NaN instead of raising an error.
df['StartMonth'] = pd.to_datetime(df['StartDate']).dt.month_name()

In [23]:
# Display the frame to check the newly added ratio and StartMonth columns.
df

Unnamed: 0,Case ID,Activity,Resource,Complete Timestamp,Variant,Variant index,(case) amount_applied0,(case) amount_applied1,(case) amount_applied2,(case) amount_applied3,(case) applicant,(case) application,(case) area,(case) basic payment,(case) cross_compliance,(case) department,(case) greening,(case) number_parcels,(case) payment_actual0,(case) payment_actual1,(case) payment_actual2,(case) payment_actual3,(case) penalty_ABP,(case) penalty_AGP,(case) penalty_AJLP,(case) penalty_AUVP,(case) penalty_AVBP,(case) penalty_AVGP,(case) penalty_AVJLP,(case) penalty_AVUVP,(case) penalty_B16,(case) penalty_B2,(case) penalty_B3,(case) penalty_B4,(case) penalty_B5,(case) penalty_B5F,(case) penalty_B6,(case) penalty_BGK,(case) penalty_BGKV,(case) penalty_BGP,(case) penalty_C16,(case) penalty_C4,(case) penalty_C9,(case) penalty_CC,(case) penalty_GP1,(case) penalty_JLP1,(case) penalty_JLP2,(case) penalty_JLP3,(case) penalty_JLP5,(case) penalty_JLP6,(case) penalty_JLP7,(case) penalty_V5,(case) penalty_amount0,(case) penalty_amount1,(case) penalty_amount2,(case) penalty_amount3,(case) program-id,(case) redistribution,(case) rejected,(case) risk_factor,(case) selected_manually,(case) selected_random,(case) selected_risk,(case) small farmer,(case) year,(case) young farmer,activity,concept:name,docid,doctype,eventid,lifecycle:transition,note,subprocess,success,Sub_Process,StartDate,EndDate,Total_Duration,TimePassed,eventcounter,cum_success,succes_rate,cum_activity,cum_subprocess,Timelag,Max_lag,Avg_lag,avg_amount_applied0_per_parcel,avg_amount_applied0_per_area,avg_area_per_parcel,StartMonth
0,8b99873a6136cfa6,Payment application-Application-mail income,0;n/a,2015-05-08 00:00:00.000,Variant 1832,1832,960.35,,,,b3b1bafcf8a5c359,8b99873a6136cfa6,2.6994,True,0.0,e7,True,3,960.35,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.0,,,,215,True,False,1.0,False,False,False,True,2015,False,mail income,mail income,-18008611495569447,Payment application,,complete,none,Application,1,"Payment application, Application",2015-05-08,2016-02-18 07:56:22.774,286,0,1,1,1.000000,1,1,0.0,0.0,0.000000,320.116667,355.764244,0.899800,May
1,8b99873a6136cfa6,Payment application-Application-mail valid,0;n/a,2015-05-08 00:00:00.000,Variant 1832,1832,960.35,,,,b3b1bafcf8a5c359,8b99873a6136cfa6,2.6994,True,0.0,e7,True,3,960.35,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.0,,,,215,True,False,1.0,False,False,False,True,2015,False,mail valid,mail valid,-18008611495569447,Payment application,,complete,none,Application,1,"Payment application, Application",2015-05-08,2016-02-18 07:56:22.774,286,0,2,2,1.000000,1,2,0.0,0.0,0.000000,320.116667,355.764244,0.899800,May
2,8b99873a6136cfa6,Entitlement application-Main-mail valid,0;n/a,2015-05-08 00:00:00.000,Variant 1832,1832,960.35,,,,b3b1bafcf8a5c359,8b99873a6136cfa6,2.6994,True,0.0,e7,True,3,960.35,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.0,,,,215,True,False,1.0,False,False,False,True,2015,False,mail valid,mail valid,-18008615298673397,Entitlement application,,complete,none,Main,1,"Entitlement application, Main",2015-05-08,2016-02-18 07:56:22.774,286,0,3,3,1.000000,2,1,0.0,0.0,0.000000,320.116667,355.764244,0.899800,May
3,8b99873a6136cfa6,Entitlement application-Main-mail valid,0;n/a,2015-05-08 00:00:00.000,Variant 1832,1832,960.35,,,,b3b1bafcf8a5c359,8b99873a6136cfa6,2.6994,True,0.0,e7,True,3,960.35,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.0,,,,215,True,False,1.0,False,False,False,True,2015,False,mail valid,mail valid,-18008615298673397,Entitlement application,,complete,none,Main,1,"Entitlement application, Main",2015-05-08,2016-02-18 07:56:22.774,286,0,4,4,1.000000,3,2,0.0,0.0,0.000000,320.116667,355.764244,0.899800,May
4,8b99873a6136cfa6,Geo parcel document-Main-initialize,fb5fa8,2015-06-10 11:16:28.000,Variant 1832,1832,960.35,,,,b3b1bafcf8a5c359,8b99873a6136cfa6,2.6994,True,0.0,e7,True,3,960.35,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.0,,,,215,True,False,1.0,False,False,False,True,2015,False,initialize,initialize,-72051858488795160,Geo parcel document,-7.205186e+16,complete,none,Main,1,"Geo parcel document, Main",2015-05-08,2016-02-18 07:56:22.774,286,33,5,5,1.000000,1,1,33.0,33.0,0.000000,320.116667,355.764244,0.899800,May
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2514261,ad5dfa0b929735be,Payment application-Application-decide,155add,2017-12-11 10:26:39.692,Variant 28923,28923,5117.97,,,,3a96cda3a6b0f35f,ad5dfa0b929735be,15.2806,True,0.0,6b,True,6,5116.99,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.0,,,,215,True,False,1.0,False,False,False,False,2017,False,decide,decide,-54037160734808716,Payment application,-9.006594e+16,complete,automatic,Application,1,"Payment application, Application",2017-04-13,2018-01-05 14:45:52.100,267,242,59,59,1.000000,1,16,2.0,75.0,4.017241,852.995000,334.932529,2.546767,April
2514262,ad5dfa0b929735be,Payment application-Application-begin payment,DP-Z,2017-12-15 16:00:12.252,Variant 28923,28923,5117.97,,,,3a96cda3a6b0f35f,ad5dfa0b929735be,15.2806,True,0.0,6b,True,6,5116.99,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.0,,,,215,True,False,1.0,False,False,False,False,2017,False,begin payment,begin payment,-54037160734808716,Payment application,-9.006594e+16,complete,automatic during payment,Application,1,"Payment application, Application",2017-04-13,2018-01-05 14:45:52.100,267,246,60,60,1.000000,1,17,4.0,75.0,3.983051,852.995000,334.932529,2.546767,April
2514263,ad5dfa0b929735be,Payment application-Application-insert document,Notification automaton,2017-12-15 19:19:04.499,Variant 28923,28923,5117.97,,,,3a96cda3a6b0f35f,ad5dfa0b929735be,15.2806,True,0.0,6b,True,6,5116.99,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.0,,,,215,True,False,1.0,False,False,False,False,2017,False,insert document,insert document,-54037160734808716,Payment application,-9.006594e+16,complete,notification for applicant,Application,1,"Payment application, Application",2017-04-13,2018-01-05 14:45:52.100,267,246,61,61,1.000000,2,18,0.0,75.0,3.983333,852.995000,334.932529,2.546767,April
2514264,ad5dfa0b929735be,Payment application-Application-insert document,Notification automaton,2017-12-20 09:02:30.380,Variant 28923,28923,5117.97,,,,3a96cda3a6b0f35f,ad5dfa0b929735be,15.2806,True,0.0,6b,True,6,5116.99,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.0,,,,215,True,False,1.0,False,False,False,False,2017,False,insert document,insert document,-54037160734808716,Payment application,-1.260947e+17,complete,notification for applicant,Application,0,"Payment application, Application",2017-04-13,2018-01-05 14:45:52.100,267,251,62,61,0.983871,3,19,4.0,75.0,3.918033,852.995000,334.932529,2.546767,April


In [24]:
# Persist the enriched frame through the project's (private) pickle helper.
# NOTE(review): the target name matches the file loaded in the "T0 features" cell,
# so this presumably overwrites the notebook's own input; confirm this is intended.
utils._save_pickle(df, 'dataset/final_input_for_counters')