In [45]:
from typing import List, Literal, Union
import pandas as pd
import pm4py
from pm4py.objects.ocel.obj import OCEL
from pydantic.main import BaseModel


# --- Attribute Type Models ---

class IntegerAttribute(BaseModel):
    attribute: str
    type: Literal["integer"]
    min: int
    max: int


class FloatAttribute(BaseModel):
    attribute: str
    type: Literal["float"]
    min: float
    max: float


class BooleanAttribute(BaseModel):
    attribute: str
    type: Literal["boolean"]
    true_count: int
    false_count: int


class DateAttribute(BaseModel):
    attribute: str
    type: Literal["date"]
    min: str
    max: str


class NominalAttribute(BaseModel):
    attribute: str
    type: Literal["nominal"]
    num_unique: int


AttributeSummary = Union[
    IntegerAttribute,
    FloatAttribute,
    BooleanAttribute,
    DateAttribute,
    NominalAttribute,
]

# --- Utility Functions ---

def melt_df(df: pd.DataFrame, type_col: str, cols: list[str]) -> pd.DataFrame:
    return (
        df[[type_col] + cols]
        .melt(id_vars=type_col, var_name="attribute", value_name="value")
        .dropna(subset=["value"])
    )


def is_boolean_series_fast(series: pd.Series, lower_vals: pd.Series) -> bool:
    valid = {"true", "false", "yes", "no", "0", "1"}
    return set(lower_vals.unique()).issubset(valid) and lower_vals.nunique() <= 2


# --- Main Attribute Summary Logic ---

def summarize_attributes(df: pd.DataFrame, type_column: str) -> dict[str, List[AttributeSummary]]:
    summary_by_type: dict[str, List[AttributeSummary]] = {}

    grouped = df.groupby([type_column, "attribute"])

    for (type_name, attr), group in grouped:
        values = group["value"].dropna()
        str_vals = values.astype(str)
        lower_vals = str_vals.str.lower()

        attribute_type = "unknown"
        numeric_values = None
        date_values = None

        # Try boolean
        if is_boolean_series_fast(values, lower_vals):
            attribute_type = "boolean"

        # Try numeric
        if attribute_type == "unknown":
            try:
                numeric_values = pd.to_numeric(values, errors="raise")
                if (numeric_values % 1 == 0).all():
                    attribute_type = "integer"
                    numeric_values = numeric_values.astype(int)
                else:
                    attribute_type = "float"
            except Exception:
                pass

        # Try date
        if attribute_type == "unknown":
            try:
                date_values = pd.to_datetime(values, errors="coerce")
                if date_values.notna().all():
                    attribute_type = "date"
            except Exception:
                pass

        # Fallback to nominal
        if attribute_type == "unknown":
            attribute_type = "nominal"

        # Create summary
        match attribute_type:
            case "integer":
                summary = IntegerAttribute(
                    attribute=attr,
                    type="integer",
                    min=int(numeric_values.min()),
                    max=int(numeric_values.max()),
                )
            case "float":
                summary = FloatAttribute(
                    attribute=attr,
                    type="float",
                    min=float(numeric_values.min()),
                    max=float(numeric_values.max()),
                )
            case "boolean":
                true_count = lower_vals.isin(["true", "yes", "1"]).sum()
                false_count = len(values) - true_count
                summary = BooleanAttribute(
                    attribute=attr,
                    type="boolean",
                    true_count=true_count,
                    false_count=false_count,
                )
            case "date":
                summary = DateAttribute(
                    attribute=attr,
                    type="date",
                    min=str(date_values.min()),
                    max=str(date_values.max()),
                )
            case "nominal":
                summary = NominalAttribute(
                    attribute=attr,
                    type="nominal",
                    num_unique=values.nunique(),
                )

        summary_by_type.setdefault(type_name, []).append(summary)

    return summary_by_type


# --- OCEL Integration Functions ---

def summarize_event_attributes(ocel: OCEL) -> dict[str, list[AttributeSummary]]:
    event_attribute_names = [
        col for col in pm4py.ocel_get_attribute_names(ocel) if col in ocel.events.columns
    ]

    melted_event_attributes = melt_df(
        ocel.events, ocel.event_activity, event_attribute_names
    )

    return summarize_attributes(melted_event_attributes, ocel.event_activity)


def summarize_object_attributes(ocel: OCEL) -> dict[str, list[AttributeSummary]]:
    obj_type_col = ocel.object_type_column

    attribute_names = pm4py.ocel_get_attribute_names(ocel)
    object_cols = [col for col in attribute_names if col in ocel.objects.columns]
    object_changes_cols = [
        col for col in attribute_names if col in ocel.object_changes.columns
    ]

    melted_objects = melt_df(
        ocel.objects.replace("null", pd.NA), obj_type_col, object_cols
    )
    melted_changes = melt_df(
        ocel.object_changes.replace("null", pd.NA), obj_type_col, object_changes_cols
    )

    metadata = pd.concat([melted_objects, melted_changes], ignore_index=True)

    return summarize_attributes(metadata, obj_type_col)


In [46]:
ocel = pm4py.read_ocel2_sqlite("/home/grkmr/Projects/ocelescope/src/backend/data/event_logs/socel_hinge_v1.1_num.sqlite")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = pandas_utils.dataframe_column_string_to_datetime(df[col], format=timest_format, utc=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = strpfromiso.fix_dataframe_column(df[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  object_changes[internal_index] = object_changes.index

In [47]:
%timeit summarize_object_attributes(ocel)

  date_values = pd.to_datetime(values, errors="coerce")
  date_values = pd.to_datetime(values, errors="coerce")
  date_values = pd.to_datetime(values, errors="coerce")
  date_values = pd.to_datetime(values, errors="coerce")
  date_values = pd.to_datetime(values, errors="coerce")
  date_values = pd.to_datetime(values, errors="coerce")
  date_values = pd.to_datetime(values, errors="coerce")
  date_values = pd.to_datetime(values, errors="coerce")
  date_values = pd.to_datetime(values, errors="coerce")
  date_values = pd.to_datetime(values, errors="coerce")
  date_values = pd.to_datetime(values, errors="coerce")
  date_values = pd.to_datetime(values, errors="coerce")
  date_values = pd.to_datetime(values, errors="coerce")
  date_values = pd.to_datetime(values, errors="coerce")
  date_values = pd.to_datetime(values, errors="coerce")
  date_values = pd.to_datetime(values, errors="coerce")
  date_values = pd.to_datetime(values, errors="coerce")
  date_values = pd.to_datetime(values, errors="c

94.1 ms ± 3.35 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


  date_values = pd.to_datetime(values, errors="coerce")
  date_values = pd.to_datetime(values, errors="coerce")
  date_values = pd.to_datetime(values, errors="coerce")
  date_values = pd.to_datetime(values, errors="coerce")
  date_values = pd.to_datetime(values, errors="coerce")
  date_values = pd.to_datetime(values, errors="coerce")
  date_values = pd.to_datetime(values, errors="coerce")
  date_values = pd.to_datetime(values, errors="coerce")
  date_values = pd.to_datetime(values, errors="coerce")
  date_values = pd.to_datetime(values, errors="coerce")
  date_values = pd.to_datetime(values, errors="coerce")
  date_values = pd.to_datetime(values, errors="coerce")


In [53]:
from pandas.api.types import is_numeric_dtype, is_datetime64_any_dtype
import pandas as pd
from typing import Union
from dataclasses import dataclass

class BaseFilterConfig(BaseModel):
    mode: Optional[Literal["include", "exclude"]] = "include"


class AttributeFilterConfig(BaseFilterConfig):
    type: Literal["attribute"]
    target: Literal["event", "object"]
    attribute: str

    # Range filters
    min: Optional[Union[int, float, str]] = None
    max: Optional[Union[int, float, str]] = None

    # Nominal filters
    values: Optional[list[Union[str, int, float]]] = None
    regex: Optional[str] = None

@dataclass
class FilterResult:
    events: Optional[pd.Series] = None
    objects: Optional[pd.Series] = None
    e2o: Optional[pd.Series] = None
    o2o: Optional[pd.Series] = None
    


def filter_by_attribute(ocel: OCEL, config: AttributeFilterConfig) -> FilterResult:
    df = ocel.events if config.target == "event" else ocel.objects
    col = config.attribute

    if col not in df.columns:
        raise ValueError(f"Attribute '{col}' not found in {config.target} data")

    series = df[col]
    mask = pd.Series(True, index=series.index)

    # Handle numeric filtering
    if config.min is not None or config.max is not None:
        if is_numeric_dtype(series):
            numeric_series = series
        else:
            numeric_series = pd.to_numeric(series, errors="coerce")

        if config.min is not None:
            mask &= numeric_series >= float(config.min)
        if config.max is not None:
            mask &= numeric_series <= float(config.max)

    # Handle date filtering
    elif isinstance(config.min, str) or isinstance(config.max, str):
        if is_datetime64_any_dtype(series):
            date_series = series
        else:
            date_series = pd.to_datetime(series, errors="coerce")

        if config.min is not None:
            mask &= date_series >= pd.to_datetime(config.min)
        if config.max is not None:
            mask &= date_series <= pd.to_datetime(config.max)

    # Handle nominal filtering
    if config.values is not None:
        mask &= series.isin(config.values)

    if config.regex is not None:
        mask &= series.astype(str).str.contains(config.regex, regex=True, na=False)

    if config.mode == "exclude":
        mask = ~mask

    return FilterResult(
        events=mask if config.target == "event" else None,
        objects=mask if config.target == "object" else None
    )


In [55]:
summarize_event_attributes(ocel)

  date_values = pd.to_datetime(values, errors='raise')
  date_values = pd.to_datetime(values, errors='raise')
  date_values = pd.to_datetime(values, errors='raise')
  date_values = pd.to_datetime(values, errors='raise')
  date_values = pd.to_datetime(values, errors='raise')
  date_values = pd.to_datetime(values, errors='raise')
  date_values = pd.to_datetime(values, errors='raise')
  date_values = pd.to_datetime(values, errors='raise')
  date_values = pd.to_datetime(values, errors='raise')
  date_values = pd.to_datetime(values, errors='raise')
  date_values = pd.to_datetime(values, errors='raise')
  date_values = pd.to_datetime(values, errors='raise')
  date_values = pd.to_datetime(values, errors='raise')
  date_values = pd.to_datetime(values, errors='raise')
  date_values = pd.to_datetime(values, errors='raise')
  date_values = pd.to_datetime(values, errors='raise')
  date_values = pd.to_datetime(values, errors='raise')
  date_values = pd.to_datetime(values, errors='raise')
  date_val

{'add': [NumericalAttribute(attribute='Unnamed: 0', type='numerical', min=313.0, max=5212.0),
  NominalAttribute(attribute='author_name', type='nominal', sample_values=['Essam Al Joubori', 'Kevin Moore', 'Bertrand Laporte', 'Jeremy Elbourn', 'John Jelinek IV', 'Matan Lurey', 'Eric Mendes Dantas', 'Kathy Walrath', 'Joost de Vries', 'Jesús Rodríguez'], num_unique=10),
  NominalAttribute(attribute='commit_message', type='nominal', sample_values=['Add semicolon to generateNavigationDoc.js', 'add types', 'add compilation error test for dart\n\nCloses #812', 'add return types for indexOf and lastIndexOf\n\ncloses #1277', 'Add full example to FormBuilder docs', 'Add correct type to TestBed.createView\n\nCloses #1727', 'Adding lifecycle breaking change to the changelog.md', 'Add ES5 examples - before and after\n\nCloses #4130', 'Add BREAKING CHANGE to CHANGELOG for b.15\n\nDart apps that import angular2/bootstrap.dart ran in beta.14 but fail in beta.15.\n\nCloses #8071', "adds 'repository' m

In [66]:
filter_by_attribute(ocel,AttributeFilterConfig(
    type="attribute",
    target="event",
    attribute="commit_message",
    regex="fix",
    mode="include"
)
)



FilterResult(events=0        False
1        False
2        False
6        False
7        False
         ...  
27842    False
27843    False
27844    False
27845    False
27846    False
Length: 27842, dtype: bool, objects=None, e2o=None, o2o=None)