## Imports etc

In [1]:
import pandas as pd
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.log.util import dataframe_utils

import utils

READ_PARQUET = True  # faster, but have to create first

## Read data as DF and optionally make sample

In [2]:
if READ_PARQUET:
    print("Reading parquet.")
    df = utils.load_parquet()
else:
    print("Reading csv and savings as parquet.")
    df = utils.load_csv()
    utils.save_parquet(df)

Reading parquet.


In [3]:
df.head(1)

Unnamed: 0,Case ID,Activity,Resource,Complete Timestamp,Variant,Variant index,(case) amount_applied0,(case) amount_applied1,(case) amount_applied2,(case) amount_applied3,...,(case) young farmer,activity,concept:name,docid,doctype,eventid,lifecycle:transition,note,subprocess,success
0,8b99873a6136cfa6,Payment application-Application-mail income,0;n/a,2015/05/08 00:00:00.000,Variant 1832,1832,960.35,,,,...,False,mail income,mail income,-18008611495569447,Payment application,,complete,none,Application,True


In [4]:
df.columns.sort_values()

Index(['(case) amount_applied0', '(case) amount_applied1',
       '(case) amount_applied2', '(case) amount_applied3', '(case) applicant',
       '(case) application', '(case) area', '(case) basic payment',
       '(case) cross_compliance', '(case) department', '(case) greening',
       '(case) number_parcels', '(case) payment_actual0',
       '(case) payment_actual1', '(case) payment_actual2',
       '(case) payment_actual3', '(case) penalty_ABP', '(case) penalty_AGP',
       '(case) penalty_AJLP', '(case) penalty_AUVP', '(case) penalty_AVBP',
       '(case) penalty_AVGP', '(case) penalty_AVJLP', '(case) penalty_AVUVP',
       '(case) penalty_B16', '(case) penalty_B2', '(case) penalty_B3',
       '(case) penalty_B4', '(case) penalty_B5', '(case) penalty_B5F',
       '(case) penalty_B6', '(case) penalty_BGK', '(case) penalty_BGKV',
       '(case) penalty_BGP', '(case) penalty_C16', '(case) penalty_C4',
       '(case) penalty_C9', '(case) penalty_CC', '(case) penalty_GP1',
       '(case)

In [5]:
first_id = df.iloc[0]['Case ID']
first_id

'8b99873a6136cfa6'

In [6]:
sample_df = df[df['Case ID'] == first_id]
print(len(sample_df))
sample_df.head(1)

52


Unnamed: 0,Case ID,Activity,Resource,Complete Timestamp,Variant,Variant index,(case) amount_applied0,(case) amount_applied1,(case) amount_applied2,(case) amount_applied3,...,(case) young farmer,activity,concept:name,docid,doctype,eventid,lifecycle:transition,note,subprocess,success
0,8b99873a6136cfa6,Payment application-Application-mail income,0;n/a,2015/05/08 00:00:00.000,Variant 1832,1832,960.35,,,,...,False,mail income,mail income,-18008611495569447,Payment application,,complete,none,Application,True


## Change some column names

In [7]:
sample_df.rename(columns = {
    'Resource': 'org:resource',
    'Complete Timestamp': 'time:timestamp'
}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


## Fix timestamp

In [8]:
sample_df = dataframe_utils.convert_timestamp_columns_in_df(sample_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = pd.to_datetime(df[col], utc=True)


## Convert to pm4py log object

In [9]:
# Can change some parameters about default columns etc and use these as input for the log converter
parameters = {
    log_converter.Variants.TO_EVENT_LOG.value.Parameters.CASE_ID_KEY: 'Case ID',
    log_converter.Variants.TO_EVENT_LOG.value.Parameters.CASE_ATTRIBUTE_PREFIX: '(case) ',
}
log = log_converter.apply(sample_df, parameters=parameters, variant=log_converter.Variants.TO_EVENT_LOG)

## Pickle log object

In [10]:
utils.save_log(log)

## Checkout log object

### Traces

In [11]:
# first trace
log[0].attributes

{'amount_applied0': 960.35,
 'amount_applied1': nan,
 'amount_applied2': nan,
 'amount_applied3': nan,
 'applicant': 'b3b1bafcf8a5c359',
 'application': '8b99873a6136cfa6',
 'area': 2.6994,
 'basic payment': True,
 'cross_compliance': 0.0,
 'department': 'e7',
 'greening': True,
 'number_parcels': 3,
 'payment_actual0': 960.35,
 'payment_actual1': nan,
 'payment_actual2': nan,
 'payment_actual3': nan,
 'penalty_ABP': False,
 'penalty_AGP': False,
 'penalty_AJLP': False,
 'penalty_AUVP': False,
 'penalty_AVBP': False,
 'penalty_AVGP': False,
 'penalty_AVJLP': False,
 'penalty_AVUVP': False,
 'penalty_B16': False,
 'penalty_B2': False,
 'penalty_B3': False,
 'penalty_B4': False,
 'penalty_B5': False,
 'penalty_B5F': False,
 'penalty_B6': False,
 'penalty_BGK': False,
 'penalty_BGKV': False,
 'penalty_BGP': False,
 'penalty_C16': False,
 'penalty_C4': False,
 'penalty_C9': False,
 'penalty_CC': False,
 'penalty_GP1': False,
 'penalty_JLP1': False,
 'penalty_JLP2': False,
 'penalty_JLP3': 

In [12]:
# first trace from Rowan
trace = {'young farmer': False,
 'selected_random': False,
 'penalty_AJLP': False,
 'application': '8b99873a6136cfa6',
 'penalty_amount0': 0.0,
 'program-id': '215',
 'penalty_BGKV': False,
 'penalty_AUVP': False,
 'applicant': 'b3b1bafcf8a5c359',
 'risk_factor': 1.0,
 'small farmer': True,
 'penalty_BGP': False,
 'department': 'e7',
 'penalty_C16': False,
 'penalty_BGK': False,
 'penalty_AVUVP': False,
 'penalty_CC': False,
 'penalty_AVJLP': False,
 'penalty_C9': False,
 'cross_compliance': 0.0,
 'rejected': False,
 'greening': True,
 'penalty_C4': False,
 'penalty_AVGP': False,
 'penalty_ABP': False,
 'penalty_B6': False,
 'penalty_B4': False,
 'penalty_B5': False,
 'penalty_AVBP': False,
 'penalty_B2': False,
 'selected_risk': False,
 'penalty_B3': False,
 'area': 2.6994,
 'selected_manually': False,
 'penalty_AGP': False,
 'penalty_B16': False,
 'penalty_GP1': False,
 'basic payment': True,
 'penalty_B5F': False,
 'penalty_V5': False,
 'payment_actual0': 960.35,
 'identity:id': 'B15FB890-574E-4117-B66A-BDFBC4FCAA6E',
 'amount_applied0': 960.35,
 'redistribution': True,
 'penalty_JLP6': False,
 'penalty_JLP7': False,
 'year': '2015',
 'penalty_JLP5': False,
 'penalty_JLP2': False,
 'penalty_JLP3': False,
 'number_parcels': 3,
 'penalty_JLP1': False,
 'concept:name': '8b99873a6136cfa6'}

In [13]:
# check what is present in my log but not in Rowans
for key in log[0].attributes.keys():
    if key not in trace.keys():
        print(key)

amount_applied1
amount_applied2
amount_applied3
payment_actual1
payment_actual2
payment_actual3
penalty_amount1
penalty_amount2
penalty_amount3


Present while shouldn't be there. But they are all nan but are probably just removed by pm4py

In [14]:
# check what is present in Rowan's logs but not in mine
for key in trace.keys():
    if key not in log[0].attributes.keys():
        print(key)

identity:id


Should be in the data, not sure why it is not there.

## Events

In [15]:
# check out first event
first_event = dict(log[0][0])
first_event

{'Case ID': '8b99873a6136cfa6',
 'Activity': 'Payment application-Application-mail income',
 'org:resource': '0;n/a',
 'time:timestamp': Timestamp('2015-05-08 00:00:00+0000', tz='UTC'),
 'Variant': 'Variant 1832',
 'Variant index': 1832,
 'activity': 'mail income',
 'concept:name': 'mail income',
 'docid': -18008611495569447,
 'doctype': 'Payment application',
 'eventid': nan,
 'lifecycle:transition': 'complete',
 'note': 'none',
 'subprocess': 'Application',
 'success': True}

In [16]:
# check event by Rowan
import datetime
event = {'success': True, 'org:resource': '0;n/a', 'docid_uuid': 'CD3DC291-76C6-420A-B3F1-7C808970915B',
 'doctype': 'Payment application', 'subprocess': 'Application', 'docid': '-18008611495569447',
 'activity': 'mail income', 'note': 'none', 'eventid': 'null', 'identity:id': '510B5333-731A-40FD-B7D6-FC149E50E961',
 'concept:name': 'mail income', 'lifecycle:transition': 'complete',
 'time:timestamp': datetime.datetime(2015, 5, 8, 0, 0, tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)))}

In [17]:
# check what is in my event and not in Rowans
for key in first_event.keys():
    if key not in event.keys():
        print(key)

Case ID
Activity
Variant
Variant index


In [18]:
# check what is in my event and not in Rowans
for key in event.keys():
    if key not in first_event.keys():
        print(key)

docid_uuid
identity:id
