In [None]:
import os
import string
import pandas as pd
import numpy as np
import datetime
# Show every column when a DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)

In [None]:
# Load the BPI Challenge 2012 event log from CSV (path is relative to the working directory)
BPI12_dt = pd.read_csv("../data/BPI_Challenge_2012.csv")

In [None]:
# Replace NaN with the placeholder 'N' first, so that missing values in
# org:resource are preserved as their own category.
# NOTE(review): fillna is applied to every column, not only org:resource — confirm this is intended.
BPI12_dt = BPI12_dt.fillna('N')

In [None]:
# Order rows by case and then by timestamp within each case, so that the
# start of an event comes before its complete
BPI12_dt = BPI12_dt.sort_values(by=['case:concept:name', 'time:timestamp'])

In [None]:
# BPI12 only: cast case:AMOUNT_REQ to integer.
# The frame-wide fillna('N') earlier in the notebook replaces every NaN, so an
# isnull() check can no longer detect missing amounts here. Coerce to numeric
# instead and convert only when every value is a valid number; otherwise the
# column is left unchanged rather than raising in astype(int).
amount_req = pd.to_numeric(BPI12_dt['case:AMOUNT_REQ'], errors='coerce')
if amount_req.notna().all():
    BPI12_dt['case:AMOUNT_REQ'] = amount_req.astype(int)

In [None]:
# Remove fully duplicated rows and renumber the index from 0
BPI12_dt = BPI12_dt.drop_duplicates(ignore_index=True)

In [None]:
# Preview the first rows after sorting and de-duplication
BPI12_dt.head()

In [None]:
# BPI12 only: keep the original activity name as 'full_concept:name', then
# split it at the first '_' into a prefix ('pre_concept:name') and the
# remaining name ('concept:name')
BPI12_dt = BPI12_dt.rename(columns={'concept:name': 'full_concept:name'})
BPI12_dt[['pre_concept:name', 'concept:name']] = (
    BPI12_dt['full_concept:name'].str.split('_', n=1, expand=True)
)

In [None]:
def generate_labels(n):
    """
    Build the first `n` spreadsheet-style uppercase labels.

    Labels run 'A'..'Z', then 'AA'..'ZZ', then 'AAA'.. and so on, so any
    non-negative `n` is supported (the sequence is produced lazily and only
    the first `n` labels are materialised).

    Parameters:
    - n (int): Number of labels to generate.

    Returns:
    - list[str]: The first `n` labels, in generation order.

    Raises:
    - ValueError: If `n` is negative.
    """
    from itertools import count, islice, product
    import string

    # A negative n would otherwise silently yield a truncated slice.
    if n < 0:
        raise ValueError("Number of labels must be non-negative.")

    chars = string.ascii_uppercase
    # All 1-letter labels, then all 2-letter labels, then 3-letter, ...
    all_labels = (
        ''.join(combo)
        for width in count(1)
        for combo in product(chars, repeat=width)
    )
    return list(islice(all_labels, n))

# Give every distinct full activity name (taken in sorted order) a short uppercase label
activities = sorted(BPI12_dt['full_concept:name'].unique())
labels = generate_labels(len(activities))
activity_mapping = {activity: label for activity, label in zip(activities, labels)}
# Store each row's activity label in the new 'event_label' column of BPI12_dt
BPI12_dt['event_label'] = BPI12_dt['full_concept:name'].map(activity_mapping)

In [None]:
# Parse the ISO 8601 timestamps and write them back as 'YYYY-MM-DD HH:MM:SS' strings
BPI12_dt['time:timestamp'] = (
    pd.to_datetime(BPI12_dt['time:timestamp'], format='ISO8601')
    .dt.strftime('%Y-%m-%d %H:%M:%S')
)
# Use the short column names 'time' and 'sequence' from here on
BPI12_dt = BPI12_dt.rename(
    columns={"time:timestamp": "time", "case:concept:name": "sequence"}
)

In [None]:
# Map each distinct lifecycle transition (in sorted order) to a lowercase letter: a, b, c, ...
transitions = list(np.sort(BPI12_dt['lifecycle:transition'].unique()))
cycle_mapping = dict(zip(transitions, string.ascii_lowercase))

In [None]:
# Translate each row's lifecycle transition into its code from cycle_mapping
BPI12_dt['status'] = BPI12_dt['lifecycle:transition'].map(cycle_mapping)

In [None]:
# Build the event code by appending the status code to the activity label
BPI12_dt['event'] = BPI12_dt['event_label'] + BPI12_dt['status']

In [None]:
# BPI12 only: drop the lifecycle and activity-name columns (original and split parts), plus case:REG_DATE
BPI12_dt = BPI12_dt.drop(columns = ['lifecycle:transition', "concept:name", 'pre_concept:name','full_concept:name', "case:REG_DATE"])

In [None]:
# Columns that are always kept, even if they hold a single value
exclude_cols = ["status", 'event', 'sequence', 'time', 'event_label']

# Every other column whose nunique() is exactly 1 carries no information
columns_to_drop = [
    col
    for col in BPI12_dt.columns
    if col not in exclude_cols and BPI12_dt[col].nunique() == 1
]

# Remove those constant columns
BPI12_dt = BPI12_dt.drop(columns=columns_to_drop)

In [None]:
def classify_and_rename(df, sequence_col, event_col, ex_col, unique_threshold = 10, manual_override = None):
    """
    Classify attribute columns by level and type, then rename them to short codes.

    Level: an attribute is sequence-level when the largest number of distinct
    values (as counted by nunique()) within any single sequence is exactly 1;
    otherwise it is event-level.

    Type: object, string, categorical and boolean columns are categorical.
    Other numeric columns are categorical when they have at most
    `unique_threshold` distinct values, and numerical otherwise. Columns of any
    other dtype are reported and left unrenamed.

    New names are 'sc<i>' / 'sn<i>' for sequence-level categorical / numerical
    attributes and 'ec<i>' / 'en<i>' for event-level ones, numbered from 1.

    Note: `df` is renamed in place and the same object is returned.

    Parameters:
    - df (pd.DataFrame): The input DataFrame (modified in place).
    - sequence_col (str): The column representing the sequence identifier.
    - event_col (str): The column representing the event identifier.
    - ex_col (iterable of str, str or None): Further non-attribute columns to skip.
    - unique_threshold (int, optional): Max unique values for numeric attributes to be considered categorical.
    - manual_override (list or str, optional): Numerical columns to force into the categorical group.

    Returns:
    - pd.DataFrame: The same DataFrame with renamed columns.
    - dict: Mapping of old column names to new ones.
    """
    types = pd.api.types

    # Accept a list, tuple, set, single name or None for the excluded columns.
    if ex_col is None:
        ex_col = []
    elif isinstance(ex_col, str):
        ex_col = [ex_col]
    excluded = {sequence_col, event_col, *ex_col}
    attribute_cols = [col for col in df.columns if col not in excluded]

    sequence_level, event_level = [], []

    # Identify sequence-level vs. event-level with a single grouped pass
    if attribute_cols:
        max_distinct = df.groupby(sequence_col)[attribute_cols].nunique().max()
        for col in attribute_cols:
            if max_distinct[col] == 1:
                sequence_level.append(col)
            else:
                event_level.append(col)

    unclassified = []

    def is_label_dtype(dtype):
        # dtypes that are always treated as categorical
        return (
            types.is_bool_dtype(dtype)
            or types.is_object_dtype(dtype)
            or types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )

    def detect_categorical(columns):
        categorical, numerical = [], []
        for col in columns:
            dtype = df[col].dtype
            if is_label_dtype(dtype):
                categorical.append(col)
            elif types.is_numeric_dtype(dtype):
                # Low-cardinality numbers behave like categories
                if df[col].nunique() <= unique_threshold:
                    categorical.append(col)
                else:
                    numerical.append(col)
            else:
                # e.g. datetime-like columns: neither group applies
                unclassified.append(col)
        return categorical, numerical

    # Auto-detect categorical vs. numerical
    sequence_categorical, sequence_numerical = detect_categorical(sequence_level)
    event_categorical, event_numerical = detect_categorical(event_level)

    # Manual override (if provided): move listed numerical columns to categorical
    if manual_override:
        if isinstance(manual_override, str):
            manual_override = [manual_override]
        for col in manual_override:
            if col in sequence_numerical:
                sequence_categorical.append(col)
                sequence_numerical.remove(col)
            elif col in event_numerical:
                event_categorical.append(col)
                event_numerical.remove(col)

    print("Sequence-level attributes:", sequence_level)
    print("Event-level attributes:", event_level)
    if unclassified:
        print("Unclassified attributes (left unrenamed):", unclassified)

    # Build the rename mapping, numbering each group from 1
    new_column_names = {}
    groups = (
        ("sc", sequence_categorical),
        ("sn", sequence_numerical),
        ("ec", event_categorical),
        ("en", event_numerical),
    )
    for prefix, cols in groups:
        for i, col in enumerate(cols, start=1):
            new_column_names[col] = f"{prefix}{i}"

    # Apply renaming (in place, so the caller's frame is updated as well)
    df.rename(columns=new_column_names, inplace=True)

    return df, new_column_names


In [None]:
# Classify the remaining attribute columns and rename them to short codes.
# 'time', 'event_label' and 'status' are not attributes and are skipped.
BPI12, renamed_columns = classify_and_rename(
    BPI12_dt,
    sequence_col="sequence",
    event_col="event",
    ex_col=['time', 'event_label', 'status'],
    # Optional: manual_override=['case:variant-index'] would force that
    # numeric column to be treated as categorical.
)

print("Renamed Columns:", renamed_columns)

In [None]:
# Display the final renamed frame
BPI12