In [20]:
import pm4py
# Load the object-centric event log that every later cell operates on.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory so the notebook can be re-run elsewhere.
ocel = pm4py.read_ocel2_xml("/home/grkmr/data/ocels/angular_github_commits_ocel.xml")

In [23]:
from typing import List, Literal, Union
import pandas as pd
import pm4py
from pm4py.objects.ocel.obj import OCEL
from pydantic.main import BaseModel


# --- Attribute Type Models ---

class IntegerAttribute(BaseModel):
    """Summary of an attribute whose observed values are all whole numbers."""
    attribute: str  # name of the summarized attribute (column)
    type: Literal["integer"]  # fixed tag identifying this summary variant
    min: int  # smallest observed value
    max: int  # largest observed value


class FloatAttribute(BaseModel):
    """Summary of a numeric attribute with at least one non-integral value."""
    attribute: str  # name of the summarized attribute (column)
    type: Literal["float"]  # fixed tag identifying this summary variant
    min: float  # smallest observed value
    max: float  # largest observed value


class BooleanAttribute(BaseModel):
    """Summary of an attribute whose values look like boolean flags."""
    attribute: str  # name of the summarized attribute (column)
    type: Literal["boolean"]  # fixed tag identifying this summary variant
    true_count: int  # number of values equal to "true", "yes" or "1" (case-insensitive)
    false_count: int  # number of remaining (non-missing) values


class DateAttribute(BaseModel):
    """Summary of an attribute whose values all parse as dates/timestamps."""
    attribute: str  # name of the summarized attribute (column)
    type: Literal["date"]  # fixed tag identifying this summary variant
    min: str  # earliest parsed timestamp, rendered with str()
    max: str  # latest parsed timestamp, rendered with str()


class NominalAttribute(BaseModel):
    """Fallback summary for attributes that are neither boolean, numeric nor date."""
    attribute: str  # name of the summarized attribute (column)
    type: Literal["nominal"]  # fixed tag identifying this summary variant
    num_unique: int  # number of distinct non-missing values


# Any of the per-attribute summary models; each variant carries its own
# literal `type` tag so consumers can tell them apart.
AttributeSummary = Union[
    IntegerAttribute,
    FloatAttribute,
    BooleanAttribute,
    DateAttribute,
    NominalAttribute,
]

# --- Utility Functions ---

def melt_df(df: pd.DataFrame, type_col: str, cols: list[str]) -> pd.DataFrame:
    """Reshape the given attribute columns of ``df`` into long format.

    Args:
        df: Wide frame containing ``type_col`` and every column in ``cols``.
        type_col: Column kept as the identifier of each row.
        cols: Attribute columns to unpivot.

    Returns:
        A frame with the columns ``type_col``, ``"attribute"`` and ``"value"``,
        holding one row per (original row, attribute) pair whose value is not
        missing.
    """
    selected_columns = [type_col, *cols]
    long_frame = pd.melt(
        df[selected_columns],
        id_vars=type_col,
        var_name="attribute",
        value_name="value",
    )
    # Rows without a value carry no information about the attribute.
    return long_frame.dropna(subset=["value"])


def is_boolean_series_fast(series: pd.Series, lower_vals: pd.Series) -> bool:
    """Tell whether the lower-cased string values look like a boolean flag.

    Args:
        series: The original values (not inspected; kept for the call signature).
        lower_vals: The same values as lower-cased strings.

    Returns:
        True when every distinct value is one of the recognised boolean tokens
        and there are at most two distinct values.
    """
    boolean_tokens = {"true", "false", "yes", "no", "0", "1"}
    distinct_values = set(lower_vals.unique())
    return distinct_values <= boolean_tokens and lower_vals.nunique() <= 2


# --- Main Attribute Summary Logic ---

def summarize_attributes(df: pd.DataFrame, type_column: str) -> dict[str, List[AttributeSummary]]:
    summary_by_type: dict[str, List[AttributeSummary]] = {}

    grouped = df.groupby([type_column, "attribute"])

    for (type_name, attr), group in grouped:
        values = group["value"].dropna()
        str_vals = values.astype(str)
        lower_vals = str_vals.str.lower()

        attribute_type = "unknown"
        numeric_values = None
        date_values = None

        # Try boolean
        if is_boolean_series_fast(values, lower_vals):
            attribute_type = "boolean"

        # Try numeric
        if attribute_type == "unknown":
            try:
                numeric_values = pd.to_numeric(values, errors="raise")
                if (numeric_values % 1 == 0).all():
                    attribute_type = "integer"
                    numeric_values = numeric_values.astype(int)
                else:
                    attribute_type = "float"
            except Exception:
                pass

        # Try date
        if attribute_type == "unknown":
            try:
                    
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", UserWarning)
                    date_values = pd.to_datetime(values, errors="coerce")
                if date_values.notna().all():
                    attribute_type = "date"
            except Exception:
                pass

        # Fallback to nominal
        if attribute_type == "unknown":
            attribute_type = "nominal"

        # Create summary
        match attribute_type:
            case "integer":
                summary = IntegerAttribute(
                    attribute=attr,
                    type="integer",
                    min=int(numeric_values.min()),
                    max=int(numeric_values.max()),
                )
            case "float":
                summary = FloatAttribute(
                    attribute=attr,
                    type="float",
                    min=float(numeric_values.min()),
                    max=float(numeric_values.max()),
                )
            case "boolean":
                true_count = lower_vals.isin(["true", "yes", "1"]).sum()
                false_count = len(values) - true_count
                summary = BooleanAttribute(
                    attribute=attr,
                    type="boolean",
                    true_count=true_count,
                    false_count=false_count,
                )
            case "date":
                summary = DateAttribute(
                    attribute=attr,
                    type="date",
                    min=str(date_values.min()),
                    max=str(date_values.max()),
                )
            case "nominal":
                summary = NominalAttribute(
                    attribute=attr,
                    type="nominal",
                    num_unique=values.nunique(),
                )

        summary_by_type.setdefault(type_name, []).append(summary)

    return summary_by_type


# --- OCEL Integration Functions ---

def summarize_event_attributes(ocel: OCEL) -> dict[str, list[AttributeSummary]]:
    """Summarize the event attributes of ``ocel``, grouped by activity.

    Args:
        ocel: The object-centric event log to inspect.

    Returns:
        A mapping from each activity name to the summaries of the attributes
        observed on events of that activity.
    """
    activity_col = ocel.event_activity
    events = ocel.events

    # Keep only the attribute names that actually exist as event columns.
    attribute_cols = [
        name
        for name in pm4py.ocel_get_attribute_names(ocel)
        if name in events.columns
    ]

    long_events = melt_df(events, activity_col, attribute_cols)
    return summarize_attributes(long_events, activity_col)


def summarize_object_attributes(ocel: OCEL) -> dict[str, list[AttributeSummary]]:
    """Summarize the object attributes of ``ocel``, grouped by object type.

    Values are collected from both the objects table and the object-changes
    table; the literal string "null" is treated as a missing value.

    Args:
        ocel: The object-centric event log to inspect.

    Returns:
        A mapping from each object type to the summaries of the attributes
        observed on objects of that type.
    """
    type_col = ocel.object_type_column
    known_attributes = pm4py.ocel_get_attribute_names(ocel)

    long_frames = []
    for table in (ocel.objects, ocel.object_changes):
        # Restrict to the attribute names present as columns of this table.
        present = [name for name in known_attributes if name in table.columns]
        cleaned = table.replace("null", pd.NA)
        long_frames.append(melt_df(cleaned, type_col, present))

    combined = pd.concat(long_frames, ignore_index=True)
    return summarize_attributes(combined, type_col)


In [24]:
# Per-activity summary of the event attributes in the loaded log.
summarize_event_attributes(ocel)

{'add': [IntegerAttribute(attribute='Unnamed: 0', type='integer', min=313, max=5212),
  NominalAttribute(attribute='author_name', type='nominal', num_unique=10),
  NominalAttribute(attribute='commit_message', type='nominal', num_unique=11),
  BooleanAttribute(attribute='merge', type='boolean', true_count=0, false_count=11)],
 'address': [IntegerAttribute(attribute='Unnamed: 0', type='integer', min=988, max=988),
  NominalAttribute(attribute='author_name', type='nominal', num_unique=1),
  NominalAttribute(attribute='commit_message', type='nominal', num_unique=1),
  BooleanAttribute(attribute='merge', type='boolean', true_count=0, false_count=1)],
 'aio': [IntegerAttribute(attribute='Unnamed: 0', type='integer', min=7636, max=9204),
  NominalAttribute(attribute='author_name', type='nominal', num_unique=2),
  NominalAttribute(attribute='commit_message', type='nominal', num_unique=3),
  BooleanAttribute(attribute='merge', type='boolean', true_count=0, false_count=3)],
 'benchmark': [Intege

In [18]:
from pandas.api.types import is_numeric_dtype, is_datetime64_any_dtype
import pandas as pd
from typing import Union,Optional
from dataclasses import dataclass

class BaseFilterConfig(BaseModel):
    """Settings shared by all filter configurations."""
    # "include" keeps the matching rows; "exclude" inverts the resulting mask.
    mode: Optional[Literal["include", "exclude"]] = "include"


class AttributeFilterConfig(BaseFilterConfig):
    """Configuration for filtering events or objects by one attribute column.

    All criteria that are set are combined with a logical AND.
    """
    type: Literal["attribute"]  # fixed tag identifying this filter kind
    target: Literal["event", "object"]  # which table the attribute is read from
    attribute: str  # column name of the attribute to filter on

    # Range filters (inclusive bounds; either side may be omitted)
    min: Optional[Union[int, float, str]] = None
    max: Optional[Union[int, float, str]] = None

    # Nominal filters
    values: Optional[list[Union[str, int, float]]] = None  # allowed values (membership test)
    regex: Optional[str] = None  # pattern searched for in the value's string form

@dataclass
class FilterResult:
    """Boolean masks produced by a filter; an entry is None when not computed."""
    events: Optional[pd.Series] = None  # mask aligned with the events table index
    objects: Optional[pd.Series] = None  # mask aligned with the objects table index
    # NOTE(review): presumably masks over the event-to-object and
    # object-to-object relation tables; nothing visible here sets them -- confirm.
    e2o: Optional[pd.Series] = None
    o2o: Optional[pd.Series] = None
    


def _as_float_or_none(bound):
    """Return ``bound`` converted to float, or None if it is not number-like."""
    try:
        return float(bound)
    except (TypeError, ValueError):
        return None


def filter_by_attribute(ocel: OCEL, config: AttributeFilterConfig) -> FilterResult:
    """Build a boolean mask selecting events or objects by one attribute.

    All criteria set on ``config`` (range, allowed values, regex) are combined
    with a logical AND; ``mode="exclude"`` inverts the final mask.

    Range bounds are interpreted as follows:

    * If any bound is a string that is not a number (e.g. ``"2020-01-01"``),
      the column and the bounds are parsed as timestamps and compared in UTC
      (timezone-naive timestamps are taken to be UTC).
    * Otherwise the bounds are converted to float and compared numerically;
      non-numeric column values never match.

    Args:
        ocel: The object-centric event log to filter.
        config: The attribute filter configuration.

    Returns:
        A ``FilterResult`` whose ``events`` or ``objects`` field (depending on
        ``config.target``) holds the mask, indexed like the source table.

    Raises:
        ValueError: If the attribute is not a column of the target table, or
            if a non-numeric string bound cannot be parsed as a timestamp.
    """
    df = ocel.events if config.target == "event" else ocel.objects
    col = config.attribute

    if col not in df.columns:
        raise ValueError(f"Attribute '{col}' not found in {config.target} data")

    series = df[col]
    mask = pd.Series(True, index=series.index)

    # Handle range filtering. The numeric and date cases must be told apart
    # up front: previously the numeric branch caught every non-None bound, so
    # the date branch was unreachable and date strings crashed in float().
    bounds = [bound for bound in (config.min, config.max) if bound is not None]
    if bounds:
        use_dates = any(
            isinstance(bound, str) and _as_float_or_none(bound) is None
            for bound in bounds
        )

        if use_dates:
            # Normalise both sides to UTC so tz-aware columns and tz-naive
            # bounds (or vice versa) can be compared without a TypeError.
            comparable = pd.to_datetime(series, errors="coerce", utc=True)
            lower = None if config.min is None else pd.to_datetime(config.min, utc=True)
            upper = None if config.max is None else pd.to_datetime(config.max, utc=True)
        else:
            if is_numeric_dtype(series):
                comparable = series
            else:
                comparable = pd.to_numeric(series, errors="coerce")
            lower = None if config.min is None else float(config.min)
            upper = None if config.max is None else float(config.max)

        # Unparseable values became NaN/NaT above and compare as False.
        if lower is not None:
            mask &= comparable >= lower
        if upper is not None:
            mask &= comparable <= upper

    # Handle nominal filtering
    if config.values is not None:
        mask &= series.isin(config.values)

    if config.regex is not None:
        # astype(str) renders missing values as text such as "nan", which a
        # pattern could match by accident; exclude missing values explicitly.
        matches = series.astype(str).str.contains(config.regex, regex=True, na=False)
        mask &= series.notna() & matches

    if config.mode == "exclude":
        mask = ~mask

    return FilterResult(
        events=mask if config.target == "event" else None,
        objects=mask if config.target == "object" else None,
    )


In [19]:
# Per-activity summary of the event attributes in the loaded log.
summarize_event_attributes(ocel)

{'add': [IntegerAttribute(attribute='Unnamed: 0', type='integer', min=313, max=5212),
  NominalAttribute(attribute='author_name', type='nominal', num_unique=10),
  NominalAttribute(attribute='commit_message', type='nominal', num_unique=11),
  BooleanAttribute(attribute='merge', type='boolean', true_count=0, false_count=11)],
 'address': [IntegerAttribute(attribute='Unnamed: 0', type='integer', min=988, max=988),
  NominalAttribute(attribute='author_name', type='nominal', num_unique=1),
  NominalAttribute(attribute='commit_message', type='nominal', num_unique=1),
  BooleanAttribute(attribute='merge', type='boolean', true_count=0, false_count=1)],
 'aio': [IntegerAttribute(attribute='Unnamed: 0', type='integer', min=7636, max=9204),
  NominalAttribute(attribute='author_name', type='nominal', num_unique=2),
  NominalAttribute(attribute='commit_message', type='nominal', num_unique=3),
  BooleanAttribute(attribute='merge', type='boolean', true_count=0, false_count=3)],
 'benchmark': [Intege

In [13]:
# Display the events table to see which attribute columns are available.
ocel.events

Unnamed: 0.1,ocel:eid,ocel:activity,ocel:timestamp,author_name,merge,Unnamed: 0,commit_message,Unnamed: 10,Unnamed: 11
0,6a3abf2366e2c32ce3460155903262fee01736c8,initial,2014-09-18 09:12:01+00:00,Miško Hevery,False,0,Initial commit,,
1,afa761646472120edef1f9b01f219f125f20128e,build,2014-09-18 14:56:38+00:00,Tobias Bosch,False,1,build - refactor,,
2,45f8a5119488d28bf90311b2dd7fc55ee6f7d92a,build,2014-09-19 14:29:20+00:00,Tobias Bosch,False,6,build: prevent infinite loop during install,,
6,e45ecd6d58986c45955081c14295cd28d984846e,chore,2014-09-19 16:37:56+00:00,Misko Hevery,False,5,chare: changed gulp to support test folder,,
7,8afa421d75c95562060e35dbe8f576bd1e81c294,first,2014-09-19 16:38:37+00:00,Misko Hevery,False,12,first chunk of interfaces that are valid via d...,,
...,...,...,...,...,...,...,...,...,...
27842,8cd9663a9255e4f8dbddb0a888ea21da82082358,release,2023-09-06 18:23:35+00:00,Jessica Janiuk,False,27829,release: cut the v17.0.0-next.3 release,,
27843,eb0137de8ad498cf37ac238bdd472c8eab542a4d,build,2023-09-07 22:02:21+00:00,JoostK,False,27840,build: migrate `project_id` option to use `bes...,,
27844,b3edcda9e600297441b5a55406f6b8a217826921,build,2023-09-08 09:13:15+00:00,Kristiyan Kostadinov,False,27841,build: attempt to deflake windows tests (#5170...,,
27845,e866d85e95a5974252e5a5bd79281cc33b3b1489,docs,2023-09-08 11:23:07+00:00,Alan Agius,False,27843,docs: typo in what is angular (#51703)\n\nFix ...,,


In [15]:
# Example: mask the events whose author_name contains "Misko" exactly as
# spelled (the differently spelled "Miško" in the first row does not match).
filter_by_attribute(ocel,AttributeFilterConfig(
    type="attribute",
    target="event",
    attribute="author_name",
    regex="Misko",
    mode="include"
)
)



FilterResult(events=0        False
1        False
2        False
6         True
7         True
         ...  
27842    False
27843    False
27844    False
27845    False
27846    False
Length: 27842, dtype: bool, objects=None, e2o=None, o2o=None)