# Feature Extraction: Categorical

#### Dependencies

In [None]:
import pm4py
import numpy as np
import pandas as pd

#### Event Log Import

In [None]:
# Load the two event logs (pandas pickles) that every later cell reads.
# pd.read_pickle can execute code embedded in the file, so only load pickles
# that come from a trusted source.
# NOTE(review): "filtered-non-conforminlog.pkl" looks like a misspelling of
# "...non-conforming-log.pkl" — confirm against the file name on disk.
non_conforming_log_df = pd.read_pickle("filtered-non-conforminlog.pkl")
conforming_log_df  = pd.read_pickle("conforming-log.pkl")

#### Normative Model Import

In [None]:
# Path of the PNML file holding the normative model. It is an empty string
# here and has to be filled in before this cell is run.
file_path = r""
# The fitness cell further down indexes this result as [0], [1] and [2].
normative_petri_net = pm4py.read_pnml(file_path)

## Feature Extraction from Log

### Built-In pm4py Extraction Function

In [None]:
def extract_initial_features(event_log):
    """Run pm4py's feature extractor with this notebook's fixed attribute selection.

    Args:
        event_log: Event log handed unchanged to
            ``pm4py.extract_features_dataframe``.

    Returns:
        The feature table produced by pm4py. The call requests no string
        trace attributes, five numeric trace attributes, the ``org:role``
        event attribute, and asks for the case id to be included.
    """
    numeric_trace_attributes = [
        "Amount",
        "RequestedAmount",
        "OriginalAmount",
        "Permit RequestedBudget",
        "AdjustedAmount",
    ]

    extraction_options = {
        'activity_key': 'concept:name',
        'case_id_key': 'case:concept:name',
        'timestamp_key': 'time:timestamp',
        'str_tr_attr': [],
        'num_tr_attr': numeric_trace_attributes,
        'str_ev_attr': ['org:role'],
        'include_case_id': True,
    }

    return pm4py.extract_features_dataframe(event_log, **extraction_options)

In [None]:
# Extract the initial feature table for each log with the same settings.
non_conf_init_features_df = extract_initial_features(non_conforming_log_df)
conf_init_features_df = extract_initial_features(conforming_log_df)

#### Cast Float Features to Integers (int64)

In [None]:
def convert_floats_to_int64(df):
    """Return a copy of ``df`` whose float32/float64 columns are cast to int64.

    Columns of any other dtype are carried over unchanged and the input frame
    is not modified. The cast is a plain ``astype('int64')``, so fractional
    parts are discarded rather than rounded.

    Args:
        df: DataFrame to convert.

    Returns:
        A new DataFrame with the same columns in the same order.
    """
    float_columns = df.select_dtypes(include=['float32', 'float64']).columns
    # One astype call with a per-column mapping; untouched columns keep their dtype.
    return df.astype({name: 'int64' for name in float_columns})

In [None]:
# Cast the float feature columns of both tables to int64 (names are rebound).
non_conf_init_features_df = convert_floats_to_int64(non_conf_init_features_df)
conf_init_features_df = convert_floats_to_int64(conf_init_features_df)

#### Maintain Uniform Dataframes (conforming vs non-conforming)

In [None]:
def add_n_fill_missing_columns(df_1, df_2):
    """Give two DataFrames the same, alphabetically sorted, set of columns.

    The target layout is the sorted union of both column sets. A column that
    a frame does not have yet is created and filled with 0.

    Args:
        df_1: First DataFrame.
        df_2: Second DataFrame.

    Returns:
        Tuple ``(df_1_aligned, df_2_aligned)`` of new DataFrames; the inputs
        are left untouched.
    """
    shared_layout = sorted(set(df_1.columns) | set(df_2.columns))

    def _conform(frame):
        # reindex both reorders existing columns and adds the missing ones.
        return frame.reindex(columns=shared_layout, fill_value=0)

    return _conform(df_1), _conform(df_2)

In [None]:
# Align both feature tables to one shared column layout; columns that exist in
# only one of them are added to the other filled with 0.
non_conf_full_init_features_df, conf_init_full_features_df = add_n_fill_missing_columns(non_conf_init_features_df, conf_init_features_df)

#### Convert Numerical Values to Bins

In [None]:
def numerical_to_bin(
    df_list,
    columns_to_bin,
    n_bins=4,
    bin_labels=None,
    strategy='quantile',
    drop_original=True,
    custom_bins=None
):
    """Discretise numeric columns of several DataFrames with shared bin edges.

    For every column the edges are determined once, from the row-wise
    concatenation of all input frames, and then applied to each frame with
    ``pd.cut`` so that all frames use identical bins. The result is stored in
    a new ``<column>_bin`` column.

    Args:
        df_list: List of DataFrames to bin together.
        columns_to_bin: Names of the columns to discretise.
        n_bins: Number of quantiles / equal-width bins to request.
        bin_labels: Labels for the bins; defaults to ``bin_1 .. bin_<n_bins>``.
            Only the first ``len(edges) - 1`` labels are used, which matters
            when duplicate quantile edges were dropped.
        strategy: ``'quantile'`` (``pd.qcut`` edges) or ``'uniform'``
            (equal-width ``pd.cut`` edges, or the column's ``custom_bins``
            entry when one is given).
        drop_original: Remove the numeric source column after binning.
        custom_bins: Optional ``{column: edges}`` overrides. They are only
            consulted when ``strategy == 'uniform'``.

    Returns:
        Tuple ``(binned_frames, bin_edges)`` where ``bin_edges`` maps each
        processed column to the edges that were applied.

    Raises:
        ValueError: If ``strategy`` is neither ``'quantile'`` nor ``'uniform'``.
    """
    labels = bin_labels if bin_labels is not None else [f'bin_{i+1}' for i in range(n_bins)]
    overrides = custom_bins if custom_bins is not None else {}

    # Edges are always derived from the frames as they were passed in.
    stacked = pd.concat(df_list, axis=0)

    def _edges_for(column):
        # Work out the bin edges of one column for the requested strategy.
        if strategy == 'quantile':
            return pd.qcut(stacked[column], q=n_bins, retbins=True, duplicates='drop')[1]
        if strategy == 'uniform':
            if column in overrides:
                return overrides[column]
            return pd.cut(stacked[column], bins=n_bins, retbins=True)[1]
        raise ValueError("strategy must be 'quantile' or 'uniform'")

    def _bin_frame(frame, column, edges):
        # Add the '<column>_bin' column to a copy of one frame.
        binned = frame.copy()
        binned[column + '_bin'] = pd.cut(
            binned[column],
            bins=edges,
            labels=labels[:len(edges)-1],
            include_lowest=True
        )
        return binned.drop(columns=column) if drop_original else binned

    bin_edges = {}
    for column in columns_to_bin:
        edges = _edges_for(column)
        bin_edges[column] = edges
        df_list = [_bin_frame(frame, column, edges) for frame in df_list]

    return df_list, bin_edges

In [None]:
# Bin the monetary case attributes of both tables together so that the
# conforming and non-conforming cases share the same quartile edges.
dfs = [non_conf_full_init_features_df, conf_init_full_features_df]

# Columns to discretise; each is replaced by a '<column>_bin' column.
numerical_cols = [
    'case:Permit RequestedBudget',
    'case:AdjustedAmount',
    'case:OriginalAmount',
    'case:Amount',
    'case:RequestedAmount'
]

# bin_info maps every binned column to the edges that were applied.
[non_conf_binned_df, conf_binned_df], bin_info = numerical_to_bin(
    df_list = dfs,
    columns_to_bin = numerical_cols,
    n_bins = 4,
    bin_labels = ['very_low', 'low', 'high', 'very_high'],
    strategy = 'quantile',
    drop_original = True
)

#### Convert Encoding (0/1) to Categorical

In [None]:
def one_hot_to_catergorical(df):
    """Replace the 0/1 indicators of the ``org:role_*`` columns with text labels.

    In every column whose name starts with ``"org:role_"`` a value equal to 1
    becomes ``'present'`` and every other value becomes ``'absent'``. All
    remaining columns are copied unchanged.

    Args:
        df: Feature DataFrame; it is not modified.

    Returns:
        A new DataFrame with the relabelled role columns.
    """
    def _flag_to_label(flag):
        # Anything that is not exactly 1 counts as absent.
        if flag == 1:
            return 'present'
        return 'absent'

    relabelled = df.copy()

    for name in df.columns:
        if not name.startswith("org:role_"):
            continue
        relabelled[name] = df[name].map(_flag_to_label)

    return relabelled

In [None]:
# Turn the 0/1 role indicator columns of both binned tables into
# 'present' / 'absent' labels.
conf_cat_df = one_hot_to_catergorical(conf_binned_df)
non_conf_cat_df = one_hot_to_catergorical(non_conf_binned_df)

### Extract Case-Level Categorical Attributes

In [None]:
def add_categorical_features(event_log, df):
    """Attach four case-level attributes from the event log to a feature table.

    For every case the first non-null value of each attribute column is taken
    from ``event_log`` and left-joined onto ``df`` via ``case:concept:name``,
    so every row of ``df`` is kept.

    Args:
        event_log: Event DataFrame containing ``case:concept:name`` and the
            four ``case:*`` attribute columns listed below.
        df: Feature DataFrame with a ``case:concept:name`` column.

    Returns:
        ``df`` extended by the four attribute columns.
    """
    case_key = "case:concept:name"
    attribute_columns = [
        "case:Permit BudgetNumber",
        "case:Permit OrganizationalEntity",
        "case:Permit ProjectNumber",
        "case:BudgetNumber",
    ]

    # One row per case, holding the first value seen for each attribute.
    per_case_attributes = event_log.groupby(case_key)[attribute_columns].first().reset_index()

    return df.merge(per_case_attributes, how="left", on=case_key)

In [None]:
# Join the case-level categorical attributes of each log onto its feature table.
non_conf_cat_features_df = add_categorical_features(non_conforming_log_df, non_conf_cat_df)
conf_cat_features_df = add_categorical_features(conforming_log_df, conf_cat_df)

### Extract Event Frequency 

In [None]:
def add_event_frequency_features(event_log, df):
    """Append one count column per activity to a feature table.

    Events are counted per (case, activity) pair and pivoted so that every
    distinct ``concept:name`` value becomes a column; activities a case never
    executed get 0. The counts are left-joined onto ``df`` via
    ``case:concept:name``.

    Args:
        event_log: Event DataFrame with ``case:concept:name`` and
            ``concept:name`` columns.
        df: Feature DataFrame with a ``case:concept:name`` column.

    Returns:
        ``df`` extended by the per-activity count columns.
    """
    case_key = "case:concept:name"

    activity_counts = (
        event_log.groupby([case_key, "concept:name"])
        .size()
        .unstack(fill_value=0)
        .reset_index()
    )

    return df.merge(activity_counts, how="left", on=case_key)

In [None]:
# Add the per-activity event counts of each log to its feature table.
non_conf_ev_freq_features_df = add_event_frequency_features(non_conforming_log_df, non_conf_cat_features_df)
conf_ev_freq_features_df = add_event_frequency_features(conforming_log_df, conf_cat_features_df)

#### Maintain Uniform Dataframes (conforming vs non-conforming)

In [None]:
# Re-align the two tables: a column that exists in only one of them (for
# example an activity seen in just one log) is added to the other filled with 0.
non_conf_full_ev_freq_features_df, conf_full_ev_freq_features_df = add_n_fill_missing_columns(non_conf_ev_freq_features_df, conf_ev_freq_features_df)

#### Convert Numerical to Categorical Values

In [None]:
def map_values_to_labels(df, columns, mappings):
    """Replace the values of selected columns with labels from a lookup.

    ``Series.map`` is applied to every requested column, so a value that has
    no entry in ``mappings`` ends up as NaN. Because that loss is otherwise
    silent, a message listing the affected values is printed for each column
    where it happens, in the same way a message is printed for a requested
    column that does not exist.

    Args:
        df: Source DataFrame; it is not modified.
        columns: Names of the columns to relabel. Names missing from ``df``
            are reported and skipped.
        mappings: Anything accepted by ``dict()`` (e.g. a list of
            ``(value, label)`` pairs) giving the value -> label lookup.

    Returns:
        A copy of ``df`` with the requested columns relabelled.
    """
    df_copy = df.copy()

    value_map = dict(mappings)
    known_values = list(value_map)

    for col in columns:
        if col not in df_copy.columns:
            print(f"Column '{col}' not found in DataFrame.")
            continue

        source = df_copy[col]

        # Non-null values without a mapping entry are about to become NaN;
        # report them instead of dropping the information silently.
        unmapped = source.notna() & ~source.isin(known_values)
        if unmapped.any():
            lost_values = source[unmapped].unique().tolist()
            print(
                f"Column '{col}': {int(unmapped.sum())} value(s) have no label "
                f"and were set to NaN: {lost_values}"
            )

        df_copy[col] = source.map(value_map)

    return df_copy

In [None]:
# Every column that is still int64 is relabelled. The column list is taken
# from the non-conforming table only and reused for the conforming table.
numeric_cols = non_conf_full_ev_freq_features_df.select_dtypes(include='int64').columns.tolist()
# Count -> label lookup.
# NOTE(review): map_values_to_labels uses Series.map, so any count without an
# entry here (7 or more) becomes NaN — confirm no case exceeds six occurrences.
value_map = [(0, 'Never'), (1, 'Once'), (2, 'Twice'), (3, 'Three times'), (4, 'Four Times'), (5, 'Five Times'), (6, 'Six Times')]

non_conf_cat_freq_features_df = map_values_to_labels(non_conf_full_ev_freq_features_df, numeric_cols, value_map)
conf_cat_freq_features_df = map_values_to_labels(conf_full_ev_freq_features_df, numeric_cols, value_map)

## Performance Metrics

### Fitness

In [None]:
def add_trace_fitness_metric(event_log, df):
    """Add a ``token_fitness`` column obtained by token-based replay per case.

    For every case id in ``df`` the matching events are selected from
    ``event_log`` (one boolean filter over the whole log per case), converted
    to a pm4py event log and replayed against the module-level
    ``normative_petri_net``, which is indexed as ``[0]``, ``[1]``, ``[2]``.
    The ``"average_trace_fitness"`` entry of the replay result is stored as
    that case's fitness.

    Args:
        event_log: Event DataFrame with ``case:concept:name``,
            ``concept:name`` and ``time:timestamp`` columns.
        df: Feature DataFrame with a ``case:concept:name`` column.

    Returns:
        ``df`` left-joined with the per-case ``token_fitness`` values.
    """
    from pm4py.objects.conversion.log import converter as log_converter

    case_key = "case:concept:name"

    def _replay_case(case_id):
        # Replay the events of a single case and return (case id, fitness).
        case_events = event_log[event_log[case_key] == case_id]
        single_case_log = log_converter.apply(
            case_events,
            variant=log_converter.Variants.TO_EVENT_LOG
        )
        replay_result = pm4py.fitness_token_based_replay(
            single_case_log,
            normative_petri_net[0],
            normative_petri_net[1],
            normative_petri_net[2],
            activity_key='concept:name',
            case_id_key='case:concept:name',
            timestamp_key='time:timestamp'
        )
        return case_id, replay_result.get("average_trace_fitness")

    fitness_rows = [_replay_case(case_id) for case_id in df[case_key]]
    fitness_table = pd.DataFrame(fitness_rows, columns=[case_key, "token_fitness"])

    return df.merge(fitness_table, on=case_key, how="left")

In [None]:
# Compute the token-replay fitness of every case in each log and append it to
# the corresponding feature table as 'token_fitness'.
non_conf_fit_features_df = add_trace_fitness_metric(non_conforming_log_df, non_conf_cat_freq_features_df)
conf_fit_features_df = add_trace_fitness_metric(conforming_log_df, conf_cat_freq_features_df)

#### Convert Numerical Values to Bins

In [None]:
# Variant 1: quartile bins for token_fitness, computed over both tables.
# NOTE(review): the *_df2 results and bin_info2 are not referenced anywhere
# else in the visible part of the notebook; the next cell bins the same input
# again with fixed edges and its output is what the pipeline continues with.
dfs = [non_conf_fit_features_df, conf_fit_features_df]

num_cols = ['token_fitness']

[non_conf_fit_binned_df2, conf_fit_binned_df2], bin_info2 = numerical_to_bin(
    df_list = dfs,
    columns_to_bin = num_cols,
    n_bins = 4,
    bin_labels = ['very_low', 'low', 'high', 'very_high'],
    strategy = 'quantile',
    drop_original = True
)

In [None]:
# Variant 2: bin token_fitness with hand-picked edges instead of quantiles.
dfs = [non_conf_fit_features_df, conf_fit_features_df]

# Five edges -> four bins, matching the four labels passed below.
# NOTE(review): pd.cut returns NaN for values outside the outer edges, so a
# fitness below 0.7 would get no bin — confirm that cannot occur.
custom_bins = {
    'token_fitness': [0.7, 0.8, 0.85, 0.95, 1.01]
}

# strategy='uniform' is required here: numerical_to_bin only consults
# custom_bins in that branch.
[non_conf_fit_binned_df, conf_fit_binned_df], bin_info = numerical_to_bin(
    df_list = dfs,
    columns_to_bin = ['token_fitness'],
    n_bins = 4,
    bin_labels = ['very_low', 'low', 'high', 'very_high'],
    strategy = 'uniform',
    custom_bins = custom_bins,
    drop_original = True
)

### Trace Duration & Event Count

In [None]:
def add_trace_duration_n_event_count(event_log, df):
    """Add per-case duration (in days) and event count to a feature table.

    ``trace_duration_days`` is the span between the earliest and the latest
    ``time:timestamp`` of a case, expressed in fractional days.
    ``num_events`` is the number of event rows of the case. Both are
    left-joined onto ``df`` via ``case:concept:name``.

    Args:
        event_log: Event DataFrame with ``case:concept:name`` and
            ``time:timestamp`` columns.
        df: Feature DataFrame with a ``case:concept:name`` column.

    Returns:
        ``df`` extended by ``trace_duration_days`` and ``num_events``.
    """
    case_key = "case:concept:name"
    seconds_per_day = 60 * 60 * 24

    events_by_case = event_log.groupby(case_key)

    # First and last timestamp of every case.
    case_span = events_by_case["time:timestamp"].agg(
        trace_start="min",
        trace_end="max"
    ).reset_index()
    elapsed = case_span["trace_end"] - case_span["trace_start"]
    case_span["trace_duration_days"] = elapsed.dt.total_seconds() / seconds_per_day

    case_sizes = events_by_case.size().reset_index(name="num_events")

    with_duration = df.merge(
        case_span[[case_key, "trace_duration_days"]],
        on=case_key,
        how="left"
    )
    return with_duration.merge(case_sizes, on=case_key, how="left")

In [None]:
# Append trace duration (days) and event count to the fitness-binned tables.
non_conf_duration_n_count_features_df = add_trace_duration_n_event_count(non_conforming_log_df, non_conf_fit_binned_df)
conf_duration_n_count_features_df = add_trace_duration_n_event_count(conforming_log_df, conf_fit_binned_df)

#### Convert Numerical Values to Bins

In [None]:
# Quartile-bin the two new numeric columns over both tables together; the
# numeric originals are dropped in favour of '<column>_bin'.
dfs = [non_conf_duration_n_count_features_df, conf_duration_n_count_features_df]

[non_conf_duration_n_count_binned_df, conf_duration_n_count_binned_df], bin_info = numerical_to_bin(
    df_list = dfs,
    columns_to_bin = ['trace_duration_days', 'num_events'],
    n_bins = 4,
    bin_labels = ['very_low', 'low', 'high', 'very_high'],
    strategy = 'quantile',
    drop_original=True
)

In [None]:
from pandas.api.types import CategoricalDtype

def apply_ordered_binning_dtype(df, suffix='_bin', categories=None):
    """Cast every column whose name ends with ``suffix`` to an ordered categorical.

    Args:
        df: DataFrame holding the binned columns; it is not modified.
        suffix: Name ending that identifies the columns to cast.
        categories: Category order from lowest to highest; defaults to
            ``['very_low', 'low', 'high', 'very_high']``.

    Returns:
        A new DataFrame in which the matching columns share one ordered
        ``CategoricalDtype``; all other columns are unchanged.
    """
    levels = ['very_low', 'low', 'high', 'very_high'] if categories is None else categories
    ordered_dtype = CategoricalDtype(categories=levels, ordered=True)

    bin_columns = [name for name in df.columns if name.endswith(suffix)]

    # A per-column dtype mapping converts only the bin columns.
    return df.astype({name: ordered_dtype for name in bin_columns})

In [None]:
# Give every '*_bin' column of both tables the same ordered categorical dtype
# (very_low < low < high < very_high).
non_conf_final_features_df = apply_ordered_binning_dtype(non_conf_duration_n_count_binned_df)
conf_final_features_df = apply_ordered_binning_dtype(conf_duration_n_count_binned_df)

## Exporting Features

In [None]:
# Report the (rows, columns) shape of both final tables before exporting.
print(f"Non-conforming feature set is of shape {non_conf_final_features_df.shape}")
print(f"Conforming feature set is of shape {conf_final_features_df.shape}")

In [None]:
# Persist both final feature tables as pickles in the working directory.
non_conf_final_features_df.to_pickle("non_conforming_categorical_features.pkl")
conf_final_features_df.to_pickle("conforming_categorical_features.pkl")