In [1]:
import pm4py
from pm4py.objects.ocel.obj import OCEL
from typing import Optional
import pandas as pd

In [2]:
# Load the ContainerLogistics OCEL 2.0 event log from its SQLite file; every later cell reads from `ocel`.
ocel = pm4py.read_ocel2_sqlite("../data/event_logs/ContainerLogistics-v1.sqlite")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = pandas_utils.dataframe_column_string_to_datetime(df[col], format=timest_format, utc=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = strpfromiso.fix_dataframe_column(df[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  object_changes[internal_index] = object_changes.index

# Filter

In [19]:
from typing import Literal, Optional
from pydantic import BaseModel

class BaseFilterConfig(BaseModel):
    """Fields shared by the filter configuration models in this notebook.

    The concrete configs below subclass this model, narrow ``type`` to a
    fixed literal and add their filter-specific settings.
    """
    # Identifies which kind of filter a configuration describes.
    type: str
    # Defaults to "include".
    # NOTE(review): presumably maps onto the `include` flag of the filter
    # functions (keep vs. drop matching rows) — that wiring is not shown here, confirm.
    mode: Optional[Literal["include", "exclude"]] = "include"

## Filter Entities by type

In [20]:
def filter_by_entity_types(entity_table:pd.DataFrame,entity_type_field:str,entity_type_list:list[str],include:bool=True):
    """Return a boolean row mask marking rows whose type is in a given list.

    Args:
        entity_table: Table holding one row per entity.
        entity_type_field: Name of the column that is compared against the list.
        entity_type_list: Values to look for in that column.
        include: When True the mask is True for listed values; when False the
            mask is inverted, so it is True for everything *not* listed.

    Returns:
        A boolean Series aligned with ``entity_table``'s index.
    """
    type_values = entity_table[entity_type_field]
    is_listed = type_values.isin(entity_type_list)
    if include:
        return is_listed
    return ~is_listed

### Filter by activity name

In [21]:
class EventTypeFilterConfig(BaseFilterConfig):
    """Configuration for a filter on event types."""
    # Fixed discriminator value for this config.
    type: Literal["event_type"]
    # Event type names the filter refers to.
    # NOTE(review): presumably the values passed to `filter_by_event_types` — confirm.
    event_types: list[str]

In [22]:
def filter_by_event_types(ocel:OCEL,event_types:list[str],include:bool=True):
    """Return a boolean mask over ``ocel.events`` selecting events by activity.

    Args:
        ocel: Event log whose events table is filtered.
        event_types: Activity names to match in the ``ocel.event_activity`` column.
        include: True marks matching events, False marks the non-matching ones.

    Returns:
        The mask produced by ``filter_by_entity_types`` for the events table.
    """
    events_table = ocel.events
    activity_field = ocel.event_activity
    return filter_by_entity_types(events_table, activity_field, event_types, include)

### Filter by object type

In [23]:
class ObjectTypeFilterConfig(BaseFilterConfig):
    """Configuration for a filter on object types."""
    # Fixed discriminator value for this config.
    type:Literal["object_type"]
    # Object type names the filter refers to.
    # NOTE(review): presumably the values passed to `filter_by_object_types` — confirm.
    object_types:list[str]

In [24]:
def filter_by_object_types(ocel:OCEL,object_types:list[str],include:bool=True):
    """Return a boolean mask over ``ocel.objects`` selecting objects by type.

    Args:
        ocel: Event log whose objects table is filtered.
        object_types: Object type names to match.
        include: True marks matching objects, False marks the non-matching ones.

    Returns:
        The mask produced by ``filter_by_entity_types`` for the objects table.
    """
    # Compare against the object *type* column. Using the object id column here
    # would test type names against object ids, which only matches if an id
    # happens to equal a type name.
    return filter_by_entity_types(ocel.objects, ocel.object_type_column, object_types, include)

## Filter Entities by frequencies

In [25]:
def filter_by_entity_frequency(entity_table:pd.DataFrame, entity_type_field:str, threshold:float):
    """Return a mask keeping the most frequent entity types up to a cumulative share.

    Types are ranked by relative frequency (most frequent first) and their
    shares are accumulated; a type is kept while the running total stays at or
    below ``threshold``.

    Args:
        entity_table: Table holding one row per entity.
        entity_type_field: Name of the column whose values are counted.
        threshold: Upper bound on the cumulative relative frequency. The shares
            are normalized, so meaningful values lie between 0 and 1.

    Returns:
        A boolean Series aligned with ``entity_table``'s index, True for rows
        whose type is among the kept ones.
    """
    # normalize=True yields shares that sum to 1, sorted from most to least frequent.
    cumulative_share = entity_table.value_counts(entity_type_field, normalize=True).cumsum()
    kept_types = cumulative_share[cumulative_share <= threshold].index
    return filter_by_entity_types(entity_table, entity_type_field, kept_types)

### Filter by Activity type frequency 

In [26]:
def filter_by_activity_frequency(ocel:OCEL, threshold:float):
    """Return a mask over ``ocel.events`` keeping the most frequent activities.

    Args:
        ocel: Event log whose events table is filtered.
        threshold: Upper bound on the cumulative relative frequency of the kept
            activities; it is compared against normalized shares, so it is a
            fraction between 0 and 1 rather than an absolute count.

    Returns:
        The mask produced by ``filter_by_entity_frequency`` for the events table.
    """
    return filter_by_entity_frequency(ocel.events, ocel.event_activity, threshold)

### Filter by Object type frequency

In [27]:
def filter_by_object_frequency(ocel:OCEL, threshold:float):
    """Return a mask over ``ocel.objects`` keeping the most frequent object types.

    Args:
        ocel: Event log whose objects table is filtered.
        threshold: Upper bound on the cumulative relative frequency of the kept
            object types; it is compared against normalized shares, so it is a
            fraction between 0 and 1 rather than an absolute count.

    Returns:
        The mask produced by ``filter_by_entity_frequency`` for the objects table.
    """
    # Frequencies are counted per object *type*. Counting on the object id column
    # would rank individual ids instead of types.
    return filter_by_entity_frequency(ocel.objects, ocel.object_type_column, threshold)

## Filter by time range

In [28]:
def filter_by_time_range(ocel:OCEL,start_time:Optional[str]=None, end_time:Optional[str]=None , include:bool=True):
    """Return a boolean mask over ``ocel.events`` selecting events in a time window.

    Both bounds are inclusive and optional; a missing bound leaves that side of
    the window open. Bounds are parsed with ``pd.Timestamp(..., tz="UTC")``.

    Args:
        ocel: Event log whose events table is filtered.
        start_time: Earliest timestamp to keep, or None for no lower bound.
        end_time: Latest timestamp to keep, or None for no upper bound.
        include: True marks events inside the window, False marks those outside.

    Returns:
        A boolean Series aligned with ``ocel.events``' index.
    """
    events_df = ocel.events
    # Read the column name from the log, like the other filters do, instead of
    # hard-coding the default "ocel:timestamp".
    # NOTE(review): the comparison assumes this column is timezone-aware — confirm.
    timestamps = events_df[ocel.event_timestamp]

    # Start with every event selected, then narrow by whichever bounds are given.
    mask = pd.Series(True, index=events_df.index)
    if start_time is not None:
        mask &= timestamps >= pd.Timestamp(start_time, tz="UTC")
    if end_time is not None:
        mask &= timestamps <= pd.Timestamp(end_time, tz="UTC")

    return mask if include else ~mask

## Filter by Relation Count

### E2O (event-to-object)

In [29]:
# Show the event-to-object relations table that the relation-count filter below works on.
ocel.relations

Unnamed: 0,ocel:eid,ocel:oid,ocel:qualifier,ocel:activity,ocel:timestamp,ocel:type
0,reg_co1,co1,registered CO,Register Customer Order,2023-05-22 13:54:42+00:00,Customer Order
1,reg_co2,co2,registered CO,Register Customer Order,2023-05-22 20:33:30+00:00,Customer Order
2,create_td1,co2,TD created for CO,Create Transport Document,2023-05-23 10:22:17+00:00,Customer Order
3,create_td1,td1,created TD,Create Transport Document,2023-05-23 10:22:17+00:00,Transport Document
4,book_vehs_td1,td1,VHs booked for TD,Book Vehicles,2023-05-23 12:05:16+00:00,Transport Document
...,...,...,...,...,...,...
74284,drive_term_cr2000,cr2000,CR moved,Drive to Terminal,2024-08-21 16:07:12+00:00,Container
74285,drive_term_cr2000,tr2,TR moved,Drive to Terminal,2024-08-21 16:07:12+00:00,Truck
74286,pick_cr2011,cr2011,CR picked,Pick Up Empty Container,2024-08-21 16:07:12+00:00,Container
74287,load_veh_cr1936,cr1936,CR loaded,Load to Vehicle,2024-08-21 16:10:38+00:00,Container


In [137]:
def filter_by_e2o_counts(
    ocel: OCEL,
    event_type: str,
    object_type: str,
    min_relations: int,
    entity_field: Optional[Literal["object", "event"]] = "object",
    max_relations: Optional[int] = None,
    qualifier: Optional[str] = None
):
    """Return a mask over events or objects based on their event-to-object relation count.

    Relations are restricted to those between ``event_type`` events and
    ``object_type`` objects (optionally with a given qualifier) and counted per
    entity. Entities of the targeted type pass when their count lies within
    ``[min_relations, max_relations]``; entities of any other type always pass.

    Args:
        ocel: Event log providing the events, objects and relations tables.
        event_type: Activity name the counted relations must have.
        object_type: Object type the counted relations must have.
        min_relations: Smallest number of matching relations an entity may have.
        entity_field: ``"event"`` to build the mask over ``ocel.events`` (counting
            per event id); any other value, including None, builds it over
            ``ocel.objects`` (counting per object id).
        max_relations: Largest allowed number of matching relations, or None for
            no upper bound.
        qualifier: If given, only relations with this qualifier are counted. The
            restriction is skipped when the relations table has no qualifier column.

    Returns:
        A boolean Series aligned with the index of the chosen table.
    """
    if entity_field == "event":
        target_dataframe = ocel.events
        target_id = ocel.event_id_column
        type_column = ocel.event_activity
        type_value = event_type
    else:
        target_dataframe = ocel.objects
        target_id = ocel.object_id_column
        type_column = ocel.object_type_column
        type_value = object_type

    relations = ocel.relations
    relation_mask = (
        (relations[ocel.event_activity] == event_type)
        & (relations[ocel.object_type_column] == object_type)
    )

    # Use the log's own qualifier column name; a literal 'qualifier' does not
    # match the actual column, which silently disabled this restriction.
    if qualifier is not None and ocel.qualifier in relations.columns:
        relation_mask &= relations[ocel.qualifier] == qualifier

    relation_counts = relations[relation_mask].groupby(target_id).size()

    # Entities without any matching relation are absent from relation_counts;
    # give them an explicit count of 0 so that min_relations <= 0 keeps them.
    per_entity_count = target_dataframe[target_id].map(relation_counts).fillna(0)

    in_range = per_entity_count >= min_relations
    if max_relations is not None:
        in_range &= per_entity_count <= max_relations

    # Only entities of the targeted type are constrained; all others pass.
    is_other_type = target_dataframe[type_column] != type_value
    return is_other_type | in_range

In [138]:
# Show the object-to-object relations table of the log.
ocel.o2o

Unnamed: 0,ocel:oid,ocel:oid_2,ocel:qualifier
0,co2,td1,TD for CO
1,td1,vh1,High-Prio VH for TD
2,td1,vh4,Regular VH for TD
3,cr2,td1,CR for TD
4,cr3,td1,CR for TD
...,...,...,...
15961,cr1987,hu10552,CR contains HU
15967,cr1987,hu10549,CR contains HU
15968,tr2,cr2011,TR loads CR
15970,cr2018,hu10557,CR contains HU


In [140]:
    # Column names are taken from the log object rather than hard-coded.
    event_id_col = ocel.event_id_column
    object_type_col = ocel.object_type_column
    activity_col = ocel.event_activity
    qualifier_col = ocel.qualifier

    # Number of relation rows per event for each (qualifier, activity, object type).
    grouped_relations = (
        ocel.relations
        .groupby([event_id_col, qualifier_col, activity_col, object_type_col])
        .size()
        .reset_index(name="count")
    )

    # Smallest and largest per-event count for each (qualifier, activity, object type).
    summary: pd.DataFrame = (
        grouped_relations
        .groupby([qualifier_col, activity_col, object_type_col])["count"]
        .agg(["min", "max"])
        .reset_index()
        .rename(columns={"min": "min_count", "max": "max_count"})
    )

 

In [141]:
# Show the per-event min/max relation counts computed in the previous cell.
summary

Unnamed: 0,ocel:qualifier,ocel:activity,ocel:type,min_count,max_count
0,CR brought to bay,Bring to Loading Bay,Container,1,1
1,CR departed,Depart,Container,2,39
2,CR laded,Load Truck,Container,1,1
3,CR loaded,Load to Vehicle,Container,1,1
4,CR moved,Drive to Terminal,Container,1,1
5,CR picked,Pick Up Empty Container,Container,1,1
6,CR rescheduled,Reschedule Container,Container,1,1
7,CR stored,Place in Stock,Container,1,1
8,CR weighted,Weigh,Container,1,1
9,CRs ordered,Order Empty Containers,Container,1,5


In [152]:
def get_e2o_summary(ocel:OCEL, direction:Optional[Literal["event" ,"object"]]="event"):
    """Summarize how many event-to-object relations each event or object has.

    Relation rows are first counted per entity and (qualifier, activity,
    object type) combination; those counts are then reduced to their minimum,
    maximum and sum for every combination.

    Args:
        ocel: Event log providing the relations table and its column names.
        direction: ``"event"`` counts relations per event id; any other value,
            including None, counts them per object id.

    Returns:
        A DataFrame with the qualifier, activity and object type columns plus
        ``min_count``, ``max_count`` and ``sum``.
    """
    if direction == "event":
        counted_id_col = ocel.event_id_column
    else:
        counted_id_col = ocel.object_id_column
    combination_cols = [ocel.qualifier, ocel.event_activity, ocel.object_type_column]

    # One row per (entity, qualifier, activity, object type) with its relation count.
    per_entity_counts = (
        ocel.relations
        .groupby([counted_id_col, *combination_cols])
        .size()
        .reset_index(name="count")
    )

    stats = per_entity_counts.groupby(combination_cols)["count"].agg(["min", "max", "sum"])
    # "sum" keeps its name; only min and max are renamed.
    return stats.reset_index().rename(columns={"min": "min_count", "max": "max_count"})


In [157]:
# Relation-count summary with the default direction ("event"): counts are taken per event.
get_e2o_summary(ocel )

Unnamed: 0,ocel:qualifier,ocel:activity,ocel:type,min_count,max_count,sum
0,CR brought to bay,Bring to Loading Bay,Container,1,1,1960
1,CR departed,Depart,Container,2,39,1956
2,CR laded,Load Truck,Container,1,1,10551
3,CR loaded,Load to Vehicle,Container,1,1,1959
4,CR moved,Drive to Terminal,Container,1,1,1989
5,CR picked,Pick Up Empty Container,Container,1,1,1995
6,CR rescheduled,Reschedule Container,Container,1,1,35
7,CR stored,Place in Stock,Container,1,1,1814
8,CR weighted,Weigh,Container,1,1,1988
9,CRs ordered,Order Empty Containers,Container,1,5,1996


In [158]:
# Same summary with counts taken per object instead of per event.
get_e2o_summary(ocel, "object")

Unnamed: 0,ocel:qualifier,ocel:activity,ocel:type,min_count,max_count,sum
0,CR brought to bay,Bring to Loading Bay,Container,1,1,1960
1,CR departed,Depart,Container,1,1,1956
2,CR laded,Load Truck,Container,1,6,10551
3,CR loaded,Load to Vehicle,Container,1,1,1959
4,CR moved,Drive to Terminal,Container,1,1,1989
5,CR picked,Pick Up Empty Container,Container,1,1,1995
6,CR rescheduled,Reschedule Container,Container,1,1,35
7,CR stored,Place in Stock,Container,1,1,1814
8,CR weighted,Weigh,Container,1,1,1988
9,CRs ordered,Order Empty Containers,Container,1,1,1996
