In [None]:
import pandas as pd
from pathlib import Path
from pprint import pprint

In [None]:
# Dataset descriptors. Each one pairs the "dataset" file name (a .jsonocel log)
# with the "leading_type" string recorded for that log.
# NOTE(review): the "xxx" leading types look like placeholders — TODO confirm.

# Logs taken from https://ocel-standard.org
DATASET_GITHUB = dict(dataset="github_pm4py.jsonocel", leading_type="case:concept:name")
DATASET_O2C = dict(dataset="o2c.jsonocel", leading_type="BELNR")  # SAP
DATASET_P2P = dict(dataset="p2p.jsonocel", leading_type="BELNR")  # SAP
DATASET_TRANSFER = dict(dataset="transfer_order.jsonocel", leading_type="MATNR")  # SAP
DATASET_RECRUITING = dict(dataset="recruiting.jsonocel", leading_type="applications")
DATASET_ORDER = dict(dataset="running-example.jsonocel", leading_type="xxx")
DATASET_WINDOWS = dict(dataset="windows_events.jsonocel", leading_type="eventIdentifier")

# Example dataset from Celonis
DATASET_CELONIS = dict(dataset="celonis.jsonocel", leading_type="xxx")

In [None]:
import pm4py

In [None]:
# Pick the dataset descriptor to work with; swap the constant to load another log.
dataset = DATASET_RECRUITING

# Location of the log file, relative to the notebook's working directory.
filename = Path("../data/datasets", dataset["dataset"])

# The path is handed to pm4py as a plain string.
ocel = pm4py.read_ocel(str(filename))

## Testing

In [None]:
from datetime import datetime
import numpy as np

In [None]:
# Parse a fixed timestamp string into a datetime object.
# `datetime` has no `fromstr` constructor (the original call raised
# AttributeError); `strptime` with an explicit format is the supported way.
datetime.strptime('2020-01-01 00:00:00', '%Y-%m-%d %H:%M:%S')

In [None]:
def reduce_event_log(num_events):
    """Export a truncated copy of the notebook's loaded log.

    Keeps the first ``num_events`` rows of ``ocel.events`` (positional order),
    the relations whose ``ocel:eid`` belongs to those events, and the objects
    whose ``ocel:oid`` appears in those relations. Prints the before/after
    sizes, writes the reduced log to
    ``../data/datasets/<dataset name>_<num_events>.jsonocel`` and prints that
    path.

    Reads the notebook-level globals ``ocel`` and ``dataset``.

    Args:
        num_events: Number of leading event rows to keep.

    Returns:
        None.
    """
    # Leading slice of the event table.
    events = ocel.events.iloc[:num_events].copy()

    # Relations that point at one of the kept events.
    eid_mask = ocel.relations['ocel:eid'].isin(events['ocel:eid'])
    relations = ocel.relations[eid_mask]

    # Objects referenced by at least one kept relation.
    oid_mask = ocel.objects['ocel:oid'].isin(relations['ocel:oid'])
    objects = ocel.objects[oid_mask]

    print(f"Events: {len(ocel.events)} -> {len(events)} ({len(events)/len(ocel.events):.1%})")
    print(f"Objects: {len(ocel.objects)} -> {len(objects)} ({len(objects)/len(ocel.objects):.1%})")

    reduced_log = pm4py.ocel.OCEL(events=events, objects=objects, relations=relations)

    # Dataset file name up to its first dot, e.g. "recruiting.jsonocel" -> "recruiting".
    dataset_name = dataset["dataset"].partition(".")[0]
    filename = f"../data/datasets/{dataset_name}_{num_events}.jsonocel"

    export_jsonocel = pm4py.objects.ocel.exporter.jsonocel.exporter.apply
    export_jsonocel(reduced_log, filename)

    print(f"Saved to file {filename}")
    
# reduce_event_log(num_events=500)

In [None]:
# Earliest and latest event timestamps in the loaded log.
start, end = ocel.events['ocel:timestamp'].min(), ocel.events['ocel:timestamp'].max()

# ocel.events["ocel:timestamp"] = pd.to_datetime(ocel.events["ocel:timestamp"]).dt.datetime
# Filter window: first as strings, then as a two-element datetime64[ns] array.
bounds_str = ('2020-01-01 00:00:00', '2020-08-31 00:00:00')
ybounds = ("2020", "2020")  # not used in this cell
bounds = np.array(bounds_str, dtype='datetime64[ns]')
# bounds = (datetime.strptime(x, '%Y-%m-%d %H:%M:%S') for x in bounds_str)

# NOTE(review): pm4py documents these bounds as "YYYY-mm-dd HH:MM:SS" strings —
# confirm that numpy datetime64 values are accepted by the installed version.
ocel2 = pm4py.filter_ocel_events_timestamp(ocel, bounds[0], bounds[1])


In [None]:
# Display the earliest and latest event timestamps of the loaded log.
start, end

In [None]:
# Show the events whose timestamp compares as earlier than the string "2020".
ocel.events.loc[ocel.events["ocel:timestamp"] < "2020"]

# Overview of `pm4py` methods

In [None]:
# pm4py.ocel_get_attribute_names(ocel)
# pm4py.ocel_get_object_types(ocel) # list[ot]
# pm4py.ocel_object_type_activities(ocel) # dict[ot -> set[activities]]
# pm4py.ocel_objects_ot_count(ocel) # dict[eid -> Multiset[ot]]
# pm4py.ocel_temporal_summary(ocel)

In [None]:
# pm4py.ocel_get_attribute_names(ocel)
# pm4py.ocel_get_object_types(ocel) # List[ot]
# pm4py.ocel_object_type_activities(ocel) # Dict[ot -> set[activities]]
# pm4py.ocel_objects_ot_count(ocel) # Dict[eid -> Dict[ot -> int]]
# pm4py.ocel_temporal_summary(ocel) # DataFrame (multiple events aggregated when same timestamp)
# pm4py.ocel_flattening(ocel, "applications") # flattened DataFrame
# pm4py.ocel_objects_summary(ocel) # DataFrame (lifecycle of objects, with interacting objects)

In [None]:
# ocdfg = pm4py.discover_ocdfg(ocel) # Object-centric DFG
# ocpn = pm4py.discover_oc_petri_net(ocel) # Object-centric Petri Net

In [None]:
# Discover pm4py's objects graph for the loaded log and keep the result as G.
G = pm4py.discover_objects_graph(ocel)

In [None]:
# Display the objects table of the loaded log.
ocel.objects

In [None]:
# Size of the discovered objects graph as reported by len().
# NOTE(review): presumably the number of related object pairs — confirm G's type.
len(G)

In [None]:
import networkx as nx

In [None]:
# Start an empty directed graph and add one node per object id of the log.
# No edges are added in this cell.
og = nx.DiGraph()
og.add_nodes_from(ocel.objects["ocel:oid"])

In [None]:
# Draw the object-id graph with networkx's default layout and styling.
nx.draw(og)

# Object-centric Petri Net

In [None]:
# Discover an object-centric Petri net from the loaded log.
model = pm4py.discover_oc_petri_net(ocel)

In [None]:
# Render the discovered object-centric Petri net as a PNG image.
pm4py.view_ocpn(model, format="png")

In [None]:
# Display the log's extended table as returned by the OCEL object.
# NOTE(review): presumably one row per event with per-object-type columns — confirm.
ocel.get_extended_table()

In [None]:
# Discover the object-centric directly-follows graph (OC-DFG) of the loaded log.
ocdfg = pm4py.discover_ocdfg(ocel)

In [None]:
# Render the discovered OC-DFG as a PNG image.
pm4py.view_ocdfg(ocdfg, format="png")

## Object types per activity

In [None]:
# act_ot_counts = {act: {f"num_{ot}": sum([len([oid for oid in ocel.obj.eve_objects(eid) if ocel.obj.raw.objects[oid].type == ot]) for eid in ocel.obj.act_events(act)]) for ot in ocel.object_types} for act in ocel.obj.activities}
# act_stats = pd.DataFrame([{"activity": act,
#                            "frequency": len(ocel.obj.act_events(act)),
#                            **act_ot_counts[act]} for act in ocel.obj.activities])
# act_stats

## Object type statistics (per event)

In [None]:
# numobjs_ev = ocel.log.log[ocel.object_types].applymap(len)
# stats = numobjs_ev.describe().transpose().drop(columns=["count"])
# stats["count"] = [len(ocel.obj.ot_objects(ot)) for ot in stats.index]
# stats["caseID"] = (stats["min"] == 1) & (stats["max"] == 1)
# stats

In [None]:
# print(f"{len(ocel.process_executions)} Process executions")

In [None]:
# pex_event_counts = pd.Series([len(ex) for ex in ocel.process_executions])
# pex_event_counts.describe()

In [None]:
# import matplotlib.pyplot as plt

# plt.hist(pex_event_counts, bins=100)
# plt.title("Number of events per process execution")
# plt.show()