# Feature Extraction

#### Dependencies

In [None]:
import pm4py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

#### Event Log Import

In [None]:
# Load the two pre-split event logs from pickle files in the working directory.
# NOTE(review): "filtered-non-conforminlog.pkl" looks like a typo for
# "...non-conforming-log.pkl" — confirm against the file actually on disk
# before renaming anything.
# NOTE(security): unpickling executes arbitrary code; only load files that
# were produced by a trusted step of this pipeline.
non_conforming_log_df = pd.read_pickle("filtered-non-conforminlog.pkl")
conforming_log_df  = pd.read_pickle("conforming-log.pkl")

#### Normative Model Import

In [None]:
# Location of the PNML file holding the normative model.
# NOTE(review): the path is an empty placeholder and must be filled in before
# this cell can load anything.
file_path = r""
# The loaded object is indexed as [0], [1], [2] when it is handed to the
# token-based replay further down.
normative_petri_net = pm4py.read_pnml(file_path)

## Feature Extraction from Log

In [None]:
def extract_initial_features(event_log):
    """Build the base feature table for an event log with pm4py.

    Args:
        event_log: Event log handed unchanged to
            ``pm4py.extract_features_dataframe``.

    Returns:
        The feature DataFrame produced by pm4py. The case identifier column
        is kept (``include_case_id=True``) so later steps can merge on it.
    """
    # Numeric trace attributes requested from the extractor.
    numeric_trace_attributes = [
        "Amount",
        "RequestedAmount",
        "OriginalAmount",
        "Permit RequestedBudget",
        "AdjustedAmount",
    ]

    extraction_options = {
        "activity_key": 'concept:name',
        "case_id_key": 'case:concept:name',
        "timestamp_key": 'time:timestamp',
        "str_tr_attr": [],  # no string trace attributes are requested here
        "num_tr_attr": numeric_trace_attributes,
        "str_ev_attr": ['org:role'],
        "include_case_id": True,
    }

    return pm4py.extract_features_dataframe(event_log, **extraction_options)

In [None]:
# Run the base feature extraction once per log.
non_conf_init_features_df = extract_initial_features(non_conforming_log_df)
conf_init_features_df = extract_initial_features(conforming_log_df)

In [None]:
# Preview the first rows of the non-conforming feature table.
non_conf_init_features_df.head()

In [None]:
def add_categorical_features(event_log, df):
    """Attach four case-level categorical attributes to a feature table.

    For every case in ``event_log`` the "first" aggregate of each attribute
    column is taken, and the result is left-merged onto ``df`` by case id.

    Args:
        event_log: Event DataFrame containing ``case:concept:name`` and the
            four ``case:...`` attribute columns listed below.
        df: Feature DataFrame with a ``case:concept:name`` column.

    Returns:
        A new DataFrame: ``df`` plus the four attribute columns. Cases of
        ``df`` that do not occur in ``event_log`` get missing values.
    """
    case_key = "case:concept:name"
    attribute_columns = [
        "case:Permit BudgetNumber",
        "case:Permit OrganizationalEntity",
        "case:Permit ProjectNumber",
        "case:BudgetNumber",
    ]

    # One row per case, each attribute reduced with the "first" aggregate.
    first_value_per_case = (
        event_log.groupby(case_key)
        .agg(dict.fromkeys(attribute_columns, "first"))
        .reset_index()
    )

    return df.merge(first_value_per_case, on=case_key, how="left")

In [None]:
# Add the case-level categorical attributes to each base feature table.
non_conf_cat_features_df = add_categorical_features(non_conforming_log_df, non_conf_init_features_df)
conf_cat_features_df = add_categorical_features(conforming_log_df, conf_init_features_df)

In [None]:
def add_event_frequency_features(event_log, df):
    """Add one count column per activity to a feature table.

    Events are counted per (case, activity) pair and pivoted so that every
    distinct ``concept:name`` value becomes a column; pairs that never occur
    are filled with 0. The table is then left-merged onto ``df`` by case id.

    Args:
        event_log: Event DataFrame with ``case:concept:name`` and
            ``concept:name`` columns.
        df: Feature DataFrame with a ``case:concept:name`` column.

    Returns:
        A new DataFrame: ``df`` plus the per-activity count columns. Cases of
        ``df`` that are absent from ``event_log`` get missing values.
    """
    case_key = "case:concept:name"

    # Rows are cases, columns are activities, cells are event counts.
    activity_counts = (
        event_log.groupby([case_key, "concept:name"])
        .size()
        .unstack(fill_value=0)
        .reset_index()
    )

    return df.merge(activity_counts, on=case_key, how="left")

In [None]:
# Add the per-activity event counts to each feature table.
non_conf_ev_freq_features_df = add_event_frequency_features(non_conforming_log_df, non_conf_cat_features_df)
conf_ev_freq_features_df = add_event_frequency_features(conforming_log_df, conf_cat_features_df)

## Performance Metrics

### Fitness

In [None]:
def add_trace_fitness_metric(event_log, df, petri_net=None):
    """Append a per-case token-based replay fitness column to ``df``.

    Every case listed in ``df`` is replayed on its own against the Petri net
    and the ``average_trace_fitness`` entry of the replay result is stored in
    a ``token_fitness`` column.

    Args:
        event_log: Event DataFrame with ``case:concept:name``,
            ``concept:name`` and ``time:timestamp`` columns.
        df: Feature DataFrame with a ``case:concept:name`` column.
        petri_net: Optional model, indexable as ``[0]`` net, ``[1]`` initial
            marking, ``[2]`` final marking. When omitted, the notebook-level
            ``normative_petri_net`` is used.

    Returns:
        A new DataFrame: ``df`` left-merged with ``token_fitness``. Cases of
        ``df`` that have no events in ``event_log`` are not replayed and get
        a missing value.
    """
    from pm4py.objects.conversion.log import converter as log_converter

    case_key = "case:concept:name"
    model = normative_petri_net if petri_net is None else petri_net

    # Restrict the log to the requested cases once. De-duplicating the ids
    # ensures each case is replayed a single time, so the merge below cannot
    # multiply rows.
    requested_cases = df[case_key].unique()
    relevant_events = event_log[event_log[case_key].isin(requested_cases)]

    fitness_scores = []

    # groupby hands over each case's events in one pass over the log instead
    # of re-filtering the whole log with a boolean mask for every case.
    for case_id, trace_df in relevant_events.groupby(case_key, sort=False):
        sublog = log_converter.apply(trace_df, variant=log_converter.Variants.TO_EVENT_LOG)

        fitness = pm4py.fitness_token_based_replay(
            sublog,
            model[0],
            model[1],
            model[2],
            activity_key='concept:name',
            case_id_key='case:concept:name',
            timestamp_key='time:timestamp'
        )

        # The sub-log holds a single case, so the "average" is that case's
        # own fitness.
        fitness_scores.append((case_id, fitness.get("average_trace_fitness")))

    fitness_df = pd.DataFrame(fitness_scores, columns=[case_key, "token_fitness"])

    return df.merge(fitness_df, on=case_key, how="left")

In [None]:
# Add the token-based fitness column to each feature table.
# NOTE(review): the result is assigned back to the same variables that are
# passed in, so re-running this cell merges a second "token_fitness" column
# into the already-augmented frames (pandas then suffixes the clashing names).
non_conf_ev_freq_features_df = add_trace_fitness_metric(non_conforming_log_df, non_conf_ev_freq_features_df)
conf_ev_freq_features_df = add_trace_fitness_metric(conforming_log_df, conf_ev_freq_features_df)

In [None]:
# Histogram with a KDE overlay of the token fitness values in the
# non-conforming feature table.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(
    non_conf_ev_freq_features_df["token_fitness"],
    bins=20,
    kde=True,
    color="skyblue",
    ax=ax,
)

# Labels are set after plotting so they replace the ones seaborn derives.
ax.set_title("Distribution of Token-Based Fitness Scores per Trace")
ax.set_xlabel("Token Fitness Score")
ax.set_ylabel("Number of Traces")
fig.tight_layout()
plt.show()

### Trace Duration & Event Count

In [None]:
def add_trace_duration_n_event_count(event_log, df):
    """Add per-case duration (in days) and event count to a feature table.

    Args:
        event_log: Event DataFrame with ``case:concept:name`` and a datetime
            ``time:timestamp`` column.
        df: Feature DataFrame with a ``case:concept:name`` column.

    Returns:
        A new DataFrame: ``df`` plus ``trace_duration_days`` (latest minus
        earliest timestamp of the case, as fractional days) and
        ``num_events`` (number of rows of the case). Both are left-merged,
        so cases missing from ``event_log`` get missing values.
    """
    case_key = "case:concept:name"
    events_by_case = event_log.groupby(case_key)

    # Span between the earliest and latest timestamp of each case.
    case_timestamps = events_by_case["time:timestamp"]
    elapsed = case_timestamps.max() - case_timestamps.min()
    duration_days = (
        (elapsed.dt.total_seconds() / (60 * 60 * 24))
        .rename("trace_duration_days")
        .reset_index()
    )

    # Number of event rows per case.
    event_counts = events_by_case.size().reset_index(name="num_events")

    with_duration = df.merge(duration_days, on=case_key, how="left")
    return with_duration.merge(event_counts, on=case_key, how="left")

In [None]:
# Add trace duration and event count to each feature table.
non_conf_duration_n_count_features_df = add_trace_duration_n_event_count(non_conforming_log_df, non_conf_ev_freq_features_df)
conf_duration_n_count_features_df = add_trace_duration_n_event_count(conforming_log_df, conf_ev_freq_features_df)

In [None]:
def encode_categorical_to_label(df):
    """Replace the four categorical case attributes with integer label codes.

    Each listed column is label-encoded into a new ``<column>_le`` column and
    the original column is removed. The encoder is fitted on the values
    present in ``df`` only, so codes produced by separate calls on different
    frames are not comparable with each other.

    Args:
        df: Feature DataFrame containing the four ``case:...`` columns
            listed in ``label_cols``.

    Returns:
        A new DataFrame with the ``_le`` columns appended and the original
        categorical columns dropped. ``df`` itself is left untouched, so the
        function can safely be called again on the same input.

    Raises:
        KeyError: If one of the expected columns is missing from ``df``.
    """
    from sklearn.preprocessing import LabelEncoder

    label_cols = ['case:Permit BudgetNumber',
                  'case:Permit OrganizationalEntity',
                  'case:Permit ProjectNumber',
                  'case:BudgetNumber']

    # Work on a copy: mutating the caller's frame in place made a second call
    # on the same frame fail, because the source columns were already gone.
    encoded_df = df.copy()

    for col in label_cols:
        encoded_df[col + '_le'] = LabelEncoder().fit_transform(encoded_df[col])

    return encoded_df.drop(columns=label_cols)

In [None]:
# Label-encode the categorical attributes of each feature table.
# NOTE(review): each call fits its own encoders, so the same category value
# can map to different integers in the two frames — confirm this is
# acceptable for whatever consumes both sets together.
non_conf_complete_features_df = encode_categorical_to_label(non_conf_duration_n_count_features_df)
conf_complete_features_df = encode_categorical_to_label(conf_duration_n_count_features_df)

In [None]:
# Get all possible columns from both sets as one deterministic, duplicate-free
# list: the non-conforming columns in their existing order, followed by the
# columns only the conforming set has. A plain set has no defined order, so
# the column layout of the saved frames could differ from run to run.
all_columns = list(dict.fromkeys([
    *non_conf_complete_features_df.columns,
    *conf_complete_features_df.columns,
]))

# Reindex both with the full column set; columns a frame lacks are created
# and filled with 0.
non_conf_final_features_df = non_conf_complete_features_df.reindex(columns=all_columns, fill_value=0)
conf_final_features_df = conf_complete_features_df.reindex(columns=all_columns, fill_value=0)


In [None]:
# Report the (rows, columns) shape of each final feature set.
print(f"Non-conforming feature set is of shape {non_conf_final_features_df.shape}")
print(f"Conforming feature set is of shape {conf_final_features_df.shape}")

In [None]:
# Persist both final feature sets as pickle files in the working directory.
# NOTE(review): the "case:concept:name" merge key is never dropped above, so
# it is still a column in these frames despite the "numerical" file names —
# confirm downstream code expects it.
non_conf_final_features_df.to_pickle("non_conforming_numerical_features.pkl")
conf_final_features_df.to_pickle("conforming_numerical_features.pkl")