In [None]:
import os
import pandas as pd
import numpy as np
import pm4py

# Notebook display setting: None removes the cap on how many columns pandas
# shows when a DataFrame is displayed, so wide event logs are not truncated.
pd.set_option('display.max_columns', None)

In [None]:
def load_xes_files(raw_data_path):
    """
    Read every ``.xes`` event log found directly inside a directory.

    Only entries whose name ends with ".xes" are considered; everything else
    in the directory is ignored. Sub-directories are not searched.

    Args:
        raw_data_path (str): Directory that holds the .xes files.

    Returns:
        dict: Maps each file name (extension included) to the object returned
            by ``pm4py.read_xes`` for that file.
    """
    # Pick the candidate files first, then load them one by one.
    xes_names = [entry for entry in os.listdir(raw_data_path) if entry.endswith(".xes")]

    loaded = {}
    for xes_name in xes_names:
        print(f"Loading {xes_name}...")
        full_path = os.path.join(raw_data_path, xes_name)
        loaded[xes_name] = pm4py.read_xes(full_path)
    return loaded

def preprocess_logs(logs, preprocess_instructions):
    """
    Reduce each event log to four standard columns and clean it.

    For every log the function:
      1. selects the four source columns listed in ``preprocess_instructions``
         and renames them to ``case_id``, ``activity``, ``timestamp``,
         ``resource`` (in that order);
      2. parses ``timestamp`` and normalises it to UTC (tz-naive values are
         interpreted as UTC, tz-aware values are converted to UTC);
      3. sorts rows by ``case_id`` and then ``timestamp``;
      4. replaces missing ``activity`` / ``resource`` values with ``"<UNK>"``;
      5. drops rows that still contain a missing value;
      6. resets the index and exposes it as a leading ``event_id`` column.

    The input logs are not modified.

    Args:
        logs (dict): Maps log name to the log. Each log must support column
            selection with a list of labels (i.e. behave like a
            pd.DataFrame).
        preprocess_instructions (dict): Maps log name to a list of exactly
            four source column names, ordered as case id, activity,
            timestamp, resource.

    Returns:
        dict: Maps log name to the preprocessed pd.DataFrame with columns
            ``event_id, case_id, activity, timestamp, resource``.

    Raises:
        KeyError: If a log has no entry in ``preprocess_instructions`` or one
            of the listed source columns is missing from the log.
    """
    standard_columns = ['case_id', 'activity', 'timestamp', 'resource']

    preprocessed_logs = {}
    for log_name, log in logs.items():
        print(f"Preprocessing {log_name}...")

        if log_name not in preprocess_instructions:
            raise KeyError(f"No preprocessing instructions for log '{log_name}'")

        # Work on an explicit copy: assigning into a column selection of the
        # original triggers SettingWithCopyWarning and could touch the input.
        df = log[preprocess_instructions[log_name]].copy()
        df.columns = standard_columns

        # utc=True localises tz-naive values to UTC and converts tz-aware
        # values (including mixed offsets) to UTC, so every log ends up in the
        # same timezone.
        df["timestamp"] = pd.to_datetime(df["timestamp"], utc=True)

        # Sort values by case id and then by timestamp
        df = df.sort_values(by=['case_id', 'timestamp'])

        # Impute missing values for categorical attributes. Assign the result
        # back: `df[col].fillna(..., inplace=True)` acts on a temporary Series
        # and is a silent no-op under pandas Copy-on-Write.
        df["activity"] = df["activity"].fillna("<UNK>")
        df["resource"] = df["resource"].fillna("<UNK>")

        # Drop remaining records with missing values (case id or timestamp)
        df = df.dropna()

        # Refresh the index, then expose it as the event_id column
        df = df.reset_index(drop=True)
        df = df.reset_index().rename(columns={'index': 'event_id'})

        preprocessed_logs[log_name] = df
    return preprocessed_logs

def save_preprocessed_logs(preprocessed_logs, output_path):
    """
    Write each preprocessed log to ``<output_path>/<log name>.csv``.

    A trailing ".xes" extension is stripped from the log name before ".csv"
    is appended; names without that extension are used unchanged. The output
    directory is created if it does not exist. Files are written without the
    DataFrame index.

    Args:
        preprocessed_logs (dict): Maps log name to an object with a
            ``to_csv`` method (e.g. a pd.DataFrame).
        output_path (str): Path to the directory where preprocessed logs will
            be saved.
    """
    # to_csv does not create missing directories, so make sure the target
    # exists. An empty path means "current directory" and needs no creation.
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    xes_suffix = ".xes"
    for log_name, log in preprocessed_logs.items():
        # Strip only the trailing extension; str.replace would also remove
        # any ".xes" occurring elsewhere in the name.
        if log_name.endswith(xes_suffix):
            cleaned_log_name = log_name[:-len(xes_suffix)]
        else:
            cleaned_log_name = log_name
        log.to_csv(os.path.join(output_path, f"{cleaned_log_name}.csv"), index=False)


In [None]:
# Column selection per log. Every log uses 'case:concept:name' as the case id
# and 'time:timestamp' as the timestamp; only the activity and resource source
# columns vary, so each entry below is (file name, activity column, resource column).
preprocess_instructions = {
    log_file: ['case:concept:name', activity_col, 'time:timestamp', resource_col]
    for log_file, activity_col, resource_col in (
        ('2011_BPI_Challenge.xes', 'concept:name', 'org:group'),
        ('2012_BPI_Challenge.xes', 'concept:name', 'org:resource'),
        ('2015_BPI_Challenge_1.xes', 'activityNameEN', 'org:resource'),
        ('2015_BPI_Challenge_2.xes', 'activityNameEN', 'org:resource'),
        ('2015_BPI_Challenge_3.xes', 'activityNameEN', 'org:resource'),
        ('2015_BPI_Challenge_4.xes', 'activityNameEN', 'org:resource'),
        ('2015_BPI_Challenge_5.xes', 'activityNameEN', 'org:resource'),
        ('2019_BPI_Challenge.xes', 'concept:name', 'org:resource'),
        ('2020_DomesticDeclarations.xes', 'concept:name', 'org:role'),
        ('2020_InternationalDeclarations.xes', 'concept:name', 'org:role'),
        ('2020_PermitLog.xes', 'concept:name', 'org:role'),
        ('2020_PrepaidTravelCost.xes', 'concept:name', 'org:role'),
        ('2020_RequestForPayment.xes', 'concept:name', 'org:role'),
        ('2018_BPI_Challenge.xes', 'concept:name', 'org:resource'),
    )
}

# Pipeline: load raw XES logs -> standardise/clean -> write one CSV per log.
all_logs = load_xes_files(raw_data_path='../data/raw/')
preprocessed_logs = preprocess_logs(
    logs=all_logs,
    preprocess_instructions=preprocess_instructions,
)
save_preprocessed_logs(
    preprocessed_logs=preprocessed_logs,
    output_path='../data/processed/',
)