In [3]:
import pm4py
import pandas

Any process mining algorithm implemented in PM4Py that takes an event log as input
can work directly with a pandas DataFrame!

In [4]:
def import_csv(file_path):
    """Print how many events and how many distinct cases a CSV event log holds.

    The file is read with ``;`` as the field separator and must expose a
    ``case_id`` column. Nothing is returned; the summary goes to stdout.

    Args:
        file_path: Location of the CSV file, passed straight to ``pandas.read_csv``.
    """
    frame = pandas.read_csv(file_path, sep=';')
    # One row per event; one distinct case_id value per case.
    distinct_case_ids = frame.case_id.unique()
    summary = "Number of events: {}\nNumber of cases: {}".format(
        len(frame),
        len(distinct_case_ids),
    )
    print(summary)


# Print the event and case counts of the running-example CSV log.
import_csv("../data/running-example.csv")

Number of events: 42
Number of cases: 6


In [5]:
def import_csv2(filename):
    """Load a CSV event log, format it for pm4py, and print its start/end activities.

    The file is read with ``;`` as the separator, then handed to
    ``pm4py.format_dataframe`` with the ``case_id``, ``activity`` and
    ``timestamp`` columns named as the case, activity and timestamp keys.
    The start-activity and end-activity results are printed on one line,
    followed by the formatted frame itself. Nothing is returned.

    Args:
        filename: Location of the CSV file, passed straight to ``pandas.read_csv``.
    """
    raw_frame = pandas.read_csv(filename, sep=';')

    # Column roles and timestamp layout expected by pm4py.format_dataframe.
    formatting_options = dict(
        case_id='case_id',
        activity_key='activity',
        timestamp_key='timestamp',
        timest_format='%Y-%m-%d %H:%M:%S%z',
    )
    formatted = pm4py.format_dataframe(raw_frame, **formatting_options)

    first_activities = pm4py.get_start_activities(formatted)
    last_activities = pm4py.get_end_activities(formatted)

    print("start_activities: {}, end_activities: {}".format(first_activities, last_activities))
    print("log:\n{}".format(formatted))


# Format the running-example CSV log for pm4py and print its start/end activities and rows.
import_csv2("../data/running-example.csv")

start_activities: {'register request': 6}, end_activities: {'reject request': 3, 'pay compensation': 3}
log:
    case_id            activity                 timestamp  costs resource  \
0         1    register request 2010-12-30 10:02:00+00:00     50     Pete   
1         1  examine thoroughly 2010-12-31 09:06:00+00:00    400      Sue   
2         1        check ticket 2011-01-05 14:12:00+00:00    100     Mike   
3         1              decide 2011-01-06 10:18:00+00:00    200     Sara   
4         1      reject request 2011-01-07 13:24:00+00:00    200     Pete   
5         2    register request 2010-12-30 10:32:00+00:00     50     Mike   
6         2        check ticket 2010-12-30 11:12:00+00:00    100     Mike   
7         2    examine casually 2010-12-30 13:16:00+00:00    400     Sean   
8         2              decide 2011-01-05 10:22:00+00:00    200     Sara   
9         2    pay compensation 2011-01-08 11:05:00+00:00    200    Ellen   
10        3    register request 2010-12-30 1

  log = pm4py.format_dataframe(log, case_id='case_id', activity_key='activity', timestamp_key='timestamp', timest_format='%Y-%m-%d %H:%M:%S%z')


In [6]:
def import_xes(filename):
    """Parse an XES event log with pm4py and print its start and end activities.

    Nothing is returned; the two results are written to stdout on one line.

    Args:
        filename: Location of the XES file, passed straight to ``pm4py.read_xes``.
    """
    event_log = pm4py.read_xes(filename)

    # Both lookups run on the same parsed log, start activities first.
    start_activities = pm4py.get_start_activities(event_log)
    end_activities = pm4py.get_end_activities(event_log)

    print(f'start_activities: {start_activities}, end_activities: {end_activities}')


# Print the start and end activities of the running-example XES log.
import_xes("../data/running-example.xes")



parsing log, completed traces ::   0%|          | 0/6 [00:00<?, ?it/s]

start_activities: {'register request': 6}, end_activities: {'reject request': 3, 'pay compensation': 3}


  df[col] = pd.to_datetime(df[col], utc=True)
  df[col] = pd.to_datetime(df[col], utc=True)
  df[col] = pd.to_datetime(df[col], utc=True)


In [9]:
def import_xes_2(filename) -> None:
    """Load an XES log with pm4py and print its schema, its rows, and the 'register request' rows.

    Output, in order: the ``DataFrame.info()`` report, a separator line, every
    row of the log as a plain list, a second separator line, and finally the
    list of rows whose ``concept:name`` value is ``'register request'``.

    Args:
        filename: Location of the XES file, passed straight to ``pm4py.read_xes``.

    Raises:
        KeyError: If the loaded frame has no ``concept:name`` column.
    """
    log = pm4py.read_xes(filename)  # DataFrame
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would append a stray "None" line to the output.
    log.info()
    print('*' * 100)
    # convert dataframe to list and do filtering
    log_list: list = log.values.tolist()  # log.values is a ndarray
    print("log list:")
    for row in log_list:
        print(row)
    print('*' * 100)
    # Find the activity column by name instead of assuming it is the first
    # one; a different column order would otherwise silently match nothing.
    activity_pos = log.columns.get_loc("concept:name")
    trace_list = [row for row in log_list if row[activity_pos] == 'register request']
    print("trace list:")
    print(trace_list)


# Dump the running-example XES log: schema, every row, then the rows matching 'register request'.
import_xes_2("../data/running-example.xes")

parsing log, completed traces ::   0%|          | 0/6 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42 entries, 0 to 41
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype              
---  ------             --------------  -----              
 0   concept:name       42 non-null     object             
 1   time:timestamp     42 non-null     datetime64[ns, UTC]
 2   cost:total         42 non-null     int64              
 3   org:resource       42 non-null     object             
 4   @@index            42 non-null     int64              
 5   case:concept:name  42 non-null     object             
dtypes: datetime64[ns, UTC](1), int64(2), object(3)
memory usage: 2.1+ KB
None
****************************************************************************************************
log list:
['register request', Timestamp('2010-12-30 10:02:00+0000', tz='UTC'), 50, 'Pete', 14, '1']
['examine thoroughly', Timestamp('2010-12-31 09:06:00+0000', tz='UTC'), 400, 'Sue', 15, '1']
['check ticket', Timestamp('2011-01-05 14:12

  df[col] = pd.to_datetime(df[col], utc=True)
  df[col] = pd.to_datetime(df[col], utc=True)
  df[col] = pd.to_datetime(df[col], utc=True)
