## This notebook must be run with Python 3.8

## 0) Initialization

### 0.1) Configure environment and import dependencies

In [None]:
# Import standard libraries for filesystem and module management
import os            
import sys 
import csv          
import shutil       
import importlib
from functools import reduce
from typing import List, Dict
import pandas as pd 
import yaml
import glob

# Treat the current working directory as the repository root and put it first
# on sys.path so that local modules can be imported with absolute imports
PROJECT_ROOT = os.path.abspath(os.getcwd())
sys.path.insert(0, PROJECT_ROOT)



### 0.2) Define paths, constants and create output directories

In [None]:
# Load configuration.
# The encoding is given explicitly so that reading the YAML file does not
# depend on the platform's default locale encoding.
config_path = os.path.join(PROJECT_ROOT, 'config', "config_feature_extraction.yaml")
with open(config_path, "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

# File and folder paths for each processing stage
EVENT_LOG           = cfg["event_log"]
EVENT_LOG_SETTINGS  = cfg["event_log_settings"]
exp_name            = cfg["experiment_name"]
RAW_LOG             = os.path.join(PROJECT_ROOT, '0_raw_log', cfg["raw_log"])
UNIQUE_DIR          = os.path.join(PROJECT_ROOT, '1_unique_log')
LABELED_DIR         = os.path.join(PROJECT_ROOT, '2_labelled_logs', EVENT_LOG)
FEATURES_DIR        = os.path.join(PROJECT_ROOT, '3_extracted_features', EVENT_LOG)

# Parameters controlling data split and sequence filtering
SPLIT_RATIO   = 0.7  # fraction of events used for training
SEQ_THRESHOLD = 5    # minimum events in a sequence to keep
MAX_SPLITS    = 1    # number of times to split the log

# Create folders for downstream outputs if they don't already exist
os.makedirs(UNIQUE_DIR,  exist_ok=True)
os.makedirs(LABELED_DIR, exist_ok=True)
os.makedirs(FEATURES_DIR, exist_ok=True)

# Print the configuration values that identify this run
print(f"Event Log: {EVENT_LOG}")
print(f"Experiment: {exp_name}")

### 0.3) Load modules for log preparation, tagging, splitting, and feature extraction

In [None]:
# === Log preparation and tagging utilities ===
from helper_functions.preparation.RetagLogWithUniqueIds         import changeLog
from helper_functions.preparation.TaggingStrategy               import TaggingStrategy
from helper_functions.preparation.ConfigurationFile             import ConfigurationFile
from helper_functions.preparation.PayloadType                   import PayloadType
from helper_functions.preparation.declaretemplates_new          import template_response, template_exist
from helper_functions.preparation.LogTaggingViaPredicates       import (
    tagLogWithOccurrence,
    tagLogWithValueEqOverEventAttn,
    tagLogWithSatAllProp,
    tagLogWithExactOccurrence,
    tagLogWithValueEqOverIthEventAttn,
    SatCases
)

# === Log splitting and feature extraction helpers ===
from helper_functions.feature_extraction.deviancecommon         import read_XES_log
from helper_functions.feature_extraction.FileNameUtils          import csv_trace_encodings
from helper_functions.feature_extraction.TraceUtils             import getTraceId
from helper_functions.feature_extraction.LogUtils               import (
    xes_to_propositional_split,
    xes_to_data_propositional_split,
    xes_to_tracelist_split
)

# Feature extraction helpers
from helper_functions.feature_extraction.baseline_runner         import baseline_embedding
from helper_functions.feature_extraction.sequence_runner         import generateSequences, run_sequences
from helper_functions.feature_extraction.declaredevmining        import declare_embedding
from helper_functions.feature_extraction.ddm_newmethod_fixed_new import declare_data_aware_embedding
from helper_functions.feature_extraction.payload_extractor       import payload_embedding

# File utilities
from helper_functions.feature_extraction.PandaExpress            import dataframe_multiway_equijoin, fast_csv_parse, ensureDataFrameQuality
from helper_functions.feature_extraction.DumpUtils               import read_single_arff_dump, read_arff_embedding_dump, dump_custom_dataframes
from helper_functions.extra_functions                            import (
    build_conf,
    _make_value_eq,
    _make_occurrence,
    _make_sat_all_prop,
    read_sequence_log_via_arff,
    internal_to_folder,
    read_all_numeric_columns_except_one,
    read_feature_csv,
    dataframe_join_with_checks,
    dataframe_multiway_equijoin,
    multijoined_dump_no_splits,
    dump_all_compositions,
)

## 1) Tagging & Labeling

### 1.1) Retag raw log with unique identifiers

In [None]:
# Retag the raw XES log with unique case/event identifiers.
# changeLog returns two values: `unique_log` (printed below as the location of
# the retagged file) and `log_obj` (the log object reused by the tagging step).
unique_log, log_obj = changeLog(
    RAW_LOG,
    output_dir=UNIQUE_DIR,
)

# Report where the retagged file ended up
print(f"✔ Retagged log written to: {unique_log}")

### 1.2) Build default configuration and define labelling strategies (run only the cell for the log being processed; every cell below redefines `build_conf` and `labeling_map`, so the last one executed wins)

#### TRAFFIC

In [None]:
# Helper: ConfigurationFile factory carrying the shared Traffic settings
def build_conf(exp_name: str) -> ConfigurationFile:
    """Return a ConfigurationFile for the Traffic log named after *exp_name*.

    Reads the notebook-level ``unique_log`` (for the log file name) and
    ``SEQ_THRESHOLD``; all remaining settings are fixed values.
    """
    # Event attributes passed to setAutoIgnore.
    # NOTE(review): "concept: name" contains a space, unlike the usual XES key
    # "concept:name" — confirm this is the spelling the consumer expects.
    ignored_attributes = [
        "time:timestamp", "concept: name", "Label",
        "Start date", "End date", "Diagnosis",
        "Diagnosis code", "Diagnosis Treatment",
        "Combination ID", "Treatment code", "Activity code",
    ]

    conf = ConfigurationFile()
    conf.setExperimentName(exp_name)
    conf.setLogName(os.path.basename(unique_log))   # file name of the retagged log
    conf.setOutputFolder("TRAFFIC")                 # base output subfolder
    conf.setMaxDepth(5)                             # decision-tree max depth
    conf.setMinLeaf(5)                              # minimum samples per leaf
    conf.setSequenceThreshold(SEQ_THRESHOLD)        # filter short sequences
    conf.setPayloadType(PayloadType.both)           # control flow & data payload
    conf.setAutoIgnore(ignored_attributes)
    conf.setPayloadSettings("traffic_settings.cfg") # custom payload rules
    return conf

# Labeling functions for the Traffic log, keyed by experiment name.
# Each entry is a one-argument callable that receives the log and tags it.
labeling_map = {
    # Payload-based label: tag on the "paymentAmount" attribute with value 36.0
    "traffic_payload_Pay36": (
        lambda log: tagLogWithValueEqOverEventAttn(log, "paymentAmount", 36.0)
    ),
    # Occurrence-based label over the "Add penalty" / "Payment" activities
    "traffic_mr_tr": (
        lambda log: tagLogWithOccurrence(log, ["Add penalty", "Payment"], 1)
    ),
    # Declare-based label: a single response template, tagged with NotVacuitySat
    "traffic_decl3": (
        lambda log: tagLogWithSatAllProp(
            log,
            [(template_response, ["Insert Date Appeal to Prefecture", "Add penalty"])],
            SatCases.NotVacuitySat,
        )
    ),
}

#### SEPSIS

In [None]:
# Helper: ConfigurationFile factory carrying the shared Sepsis settings
def build_conf(exp_name: str) -> ConfigurationFile:
    """Return a ConfigurationFile for the Sepsis log named after *exp_name*.

    Reads the notebook-level ``unique_log`` (for the log file name) and
    ``SEQ_THRESHOLD``; all remaining settings are fixed values.
    """
    # Attributes passed to setAutoIgnore.
    # NOTE(review): "concept: name" and "lifecycle: transition" contain a space,
    # unlike the usual XES keys "concept:name" / "lifecycle:transition" —
    # confirm this is the spelling the consumer expects.
    ignored_attributes = [
        "Diagnosis", "Diagnose",
        "time:timestamp", "concept: name", "Label",
        "lifecycle: transition",
    ]

    conf = ConfigurationFile()
    conf.setExperimentName(exp_name)
    conf.setLogName(os.path.basename(unique_log))   # file name of the retagged log
    conf.setOutputFolder("SEPSIS")                  # base output subfolder
    conf.setMaxDepth(5)                             # decision-tree max depth
    conf.setMinLeaf(5)                              # minimum samples per leaf
    conf.setSequenceThreshold(SEQ_THRESHOLD)        # filter short sequences
    conf.setPayloadType(PayloadType.both)           # control flow & data payload
    conf.setAutoIgnore(ignored_attributes)
    conf.setPayloadSettings("sepsis_settings.cfg")  # custom payload rules
    return conf

# Labeling functions for the Sepsis log, keyed by experiment name.
# Each entry is a one-argument callable that receives the log and tags it.
labeling_map = {
    # Payload-based label: "DisfuncOrg" compared with True, with index argument 0
    "sepsis_payload2": (
        lambda log: tagLogWithValueEqOverIthEventAttn(log, "DisfuncOrg", True, 0)
    ),
    # Exact-occurrence label over three activities
    "sepsis_mr_tr": (
        lambda log: tagLogWithExactOccurrence(
            log, ["Admission NC", "CRP", "Leucocytes"], 1
        )
    ),
    # Declare-based label: three response templates, tagged with NotVacuitySat
    "sepsis_decl": (
        lambda log: tagLogWithSatAllProp(
            log,
            [
                (template_response, ["IV Antibiotics", "Leucocytes"]),
                (template_response, ["LacticAcid", "IV Antibiotics"]),
                (template_response, ["ER Triage", "CRP"]),
            ],
            SatCases.NotVacuitySat,
        )
    ),
}

#### BPI15A

In [None]:
# # Helper: create a ConfigurationFile pre-populated with shared BPI15A settings
# def build_conf(exp_name: str) -> ConfigurationFile:
#     cf = ConfigurationFile()
#     cf.setExperimentName(exp_name)
#     cf.setLogName(os.path.basename(unique_log))        # use the retagged filename
#     cf.setOutputFolder("BPI15A")                       # base output subfolder
#     cf.setMaxDepth(5)                                  # decision-tree max depth
#     cf.setMinLeaf(5)                                   # minimum samples per leaf
#     cf.setSequenceThreshold(SEQ_THRESHOLD)             # filter short sequences
#     cf.setPayloadType(PayloadType.both)                # include control flow & data payload
#     cf.setAutoIgnore([
#         "time:timestamp", "concept: name", "Label",   # ignore these event attributes
#         "Start date", "End date", "Diagnosis",
#         "Diagnosis code", "Diagnosis Treatment",
#         "Combination ID", "Treatment code", "Activity code",
#         "dateFinished", "panned", "dueDate"
#     ])
#     cf.setPayloadSettings("bpi2015_settings.cfg")     # load custom payload rules
#     return cf

# # Dictionary containing the labeling functions for writing labels to the log
# labeling_map = {
#     "bpi15A_payload_560925": lambda log: tagLogWithValueEqOverEventAttn(
#         log, "monitoringResource", "560925"
#     ),
#     "bpi15A_mr_tr": lambda log: tagLogWithOccurrence(
#         log, ["08_AWB45_005", "01_HOOFD_200"], 1
#     ),
#     "bpi15A_decl2": lambda log: tagLogWithSatAllProp(
#         log,
#         [(template_exist, ["01_HOOFD_011"])],
#         SatCases.NotVacuitySat
#     )
# }

### 1.3) Apply each labeling strategy and write out labeled logs

In [None]:
# Apply every labeling strategy in turn and write one labeled log per strategy.
# The loop variable is deliberately not called `exp_name`: that notebook-level
# name holds the experiment name loaded from the config and must not be
# overwritten by whichever label set happens to be processed last.
for label_name, label_func in labeling_map.items():
    print(f"\n▶ Processing label set: {label_name}")

    # Build and serialize the configuration for this labeling experiment
    cf = build_conf(label_name)
    json_path = os.path.join(LABELED_DIR, f"{label_name}.json")
    cf.dump(json_path)

    # Initialize and execute the tagging strategy on the log object
    tagger = TaggingStrategy(label_name, label_func)
    tagger(LABELED_DIR, cf, log_obj)

    # Assemble and print the paths of the outputs just generated
    xes_path = os.path.join(LABELED_DIR, tagger.logname)
    print(f"   • Labeled XES → {xes_path}")
    print(f"   • Config JSON → {json_path}")

## 2) Preparing Feature Extraction & Encoding

### 2.1) Prepare labeled log and output directory for feature extraction

In [None]:
# Take the experiment name from the config again, so this cell does not depend
# on whatever value `exp_name` was left with by earlier cells
exp_name = cfg["experiment_name"]

# Labeled XES file and its JSON configuration share the experiment's base name
labeled_log, json_config = (
    os.path.join(LABELED_DIR, f'{exp_name}{extension}')
    for extension in ('.xes', '.json')
)

# Output folder for every feature set extracted for this experiment
features_folder = os.path.join(FEATURES_DIR, f'{exp_name}_features')
os.makedirs(features_folder, exist_ok=True)

# Read the labeled XES log into memory
log = read_XES_log(labeled_log)
print(f"✔ Loaded '{exp_name}' → {len(log)} total traces")

### 2.2) Generate propositional and canonical representations of the log

In [None]:
# Collect the ID of every trace in the log
trace_ids = set(map(getTraceId, log))

# Propositional view of the log; fed to the baseline and Declare encodings below
propositional_log = xes_to_propositional_split(log)
print("✔ Propositional traces ready")

# Propositional view that also carries event data attributes; fed to the
# Data-Aware Declare encoding below
data_log = xes_to_data_propositional_split(log, doForce=False)
print("✔ Data propositional traces ready")

# Trace-list view of the log; fed to the payload encoding below
trace_list = xes_to_tracelist_split(log)
print("✔ Canonical trace list ready")

## 3) Individual‐Activity Frequencies (Baseline Encoding)

In [None]:
# Output subfolder for the Individual-Activity Frequencies (baseline) features
baseline_folder = os.path.join(features_folder, 'baseline')
os.makedirs(baseline_folder, exist_ok=True)

# Compute the baseline embedding over the full propositional log; the third
# positional argument is passed as None
processed_ids = baseline_embedding(baseline_folder, propositional_log, None)

# Summary of what was generated
print(f"✔ Baseline features written to: {baseline_folder}")
print(f"  • {len(processed_ids)} total traces processed")

## 4) Sequential Encoding (MR / TR / TRA / MRA)

In [None]:
# Input log name and output locations for the sequence mining step
log_file_name    = f"{exp_name}.xes"
sequence_folder  = os.path.join(features_folder, "sequence")
error_log_folder = os.path.join(sequence_folder, "errors")

# NOTE(review): neither folder is created here (the makedirs calls are disabled),
# so generateSequences presumably creates what it needs — confirm.
# os.makedirs(sequence_folder, exist_ok=True)
# os.makedirs(error_log_folder, exist_ok=True)

# Disabled alternative entry point; it takes the same keyword arguments:
# strategies = run_sequences(**sequence_args)

sequence_args = {
    "inp_path": LABELED_DIR,
    "log_path": log_file_name,
    "results_folder": sequence_folder,
    "sequence_threshold": SEQ_THRESHOLD,
    "err_logger": error_log_folder,
}
strategies = generateSequences(**sequence_args)

print("→ Mined strategies:", strategies)
# NOTE(review): error_log_folder lives inside sequence_folder, which the
# reorganization cell (4.1) deletes — inspect the logs before running it.
print("🔍 If anything went wrong, check logs in:", error_log_folder)

### 4.1) Reorganize strategy folders and clean up intermediate files

In [None]:
# Intermediate folder holding the outputs of the first split
split_folder = os.path.join(sequence_folder, 'split1')

if not os.path.isdir(split_folder):
    # Nothing was produced for the first split: leave everything untouched
    print(f"No split directory found at {split_folder}, nothing moved.")
else:
    for strategy in os.listdir(split_folder):
        src = os.path.join(split_folder, strategy)
        dst = os.path.join(features_folder, strategy)

        # A previous run may have left a folder at the destination: replace it
        if os.path.exists(dst):
            shutil.rmtree(dst)

        # Lift the strategy folder up into the experiment's features folder
        shutil.move(src, dst)
        print(f"→ Moved {strategy} to {dst}")

    # The whole intermediate sequence directory is no longer needed
    shutil.rmtree(sequence_folder)
    print(f"→ Removed sequence folder at {sequence_folder}")

### 4.2) Write CSV for every individual sequential encoding

In [None]:
encodings = ['mr', 'mra', 'tr', 'tra']

# Export every sequence encoding as total / train / test CSV files
for enc in encodings:
    # Folder of this encoding (created if missing)
    enc_folder = os.path.join(features_folder, enc)
    os.makedirs(enc_folder, exist_ok=True)

    # Train and test DataFrames of the encoding, plus their concatenation
    tr_df, te_df = read_sequence_log_via_arff(features_folder, enc)
    total_df = pd.concat([tr_df, te_df])

    # Output file per split: <enc>.csv, <enc>_train.csv, <enc>_test.csv
    paths = {
        split_name: os.path.join(enc_folder, f'{enc}{file_suffix}.csv')
        for split_name, file_suffix in (('total', ''), ('train', '_train'), ('test', '_test'))
    }

    # Write the frames comma-separated (pandas default), without the index
    for split_name, frame in (('total', total_df), ('train', tr_df), ('test', te_df)):
        frame.to_csv(paths[split_name], index=False)

    # Confirmation of the written files
    print(f"✔ {enc}: wrote total → {paths['total']}")
    print(f"       train → {paths['train']}")
    print(f"       test  → {paths['test']}")

In [None]:
# Sequence encoding strategies to export
encodings = ['mr', 'mra', 'tr', 'tra']

# Export only the combined (train + test) CSV of every encoding
for enc in encodings:
    # Folder of this encoding (created if missing)
    enc_folder = os.path.join(features_folder, enc)
    os.makedirs(enc_folder, exist_ok=True)

    # Train and test DataFrames of the encoding, stacked into one dataset
    tr_df, te_df = read_sequence_log_via_arff(features_folder, enc)
    total_df = pd.concat([tr_df, te_df])

    # Write the total frame comma-separated (pandas default), without the index
    total_path = os.path.join(enc_folder, f'{enc}.csv')
    total_df.to_csv(total_path, index=False)

    # Confirmation of the written file
    print(f"✔ {enc}: wrote total → {total_path}")

    # Disabled cleanup that would leave only the total CSV in the folder:
    # for file_path in glob.glob(os.path.join(enc_folder, '*.csv')):
    #     if file_path != total_path:
    #         os.remove(file_path)


### 4.3) Provide proper column naming (no collisions when merging)

In [None]:
# Replace the coded column names of every sequence CSV with readable ones that
# are prefixed by the encoding name
for enc in encodings:
    enc_folder = os.path.join(features_folder, enc)

    # 1) activity code → activity name lookup, read from the encoding's mapping.txt
    mapping_df = pd.read_csv(os.path.join(enc_folder, 'mapping.txt'), sep=';', header=0)
    code_to_name = dict(zip(mapping_df['activityCode'], mapping_df['activityName']))

    # 2) rewrite the header of each split file in place
    for split, suffix in [('total', ''), ('train', '_train'), ('test', '_test')]:
        csv_path = os.path.join(enc_folder, f"{enc}{suffix}.csv")
        df = pd.read_csv(csv_path)

        new_columns = []
        for col in df.columns:
            # Cut the column name into consecutive 3-character chunks and keep
            # the activity names of those chunks that are known codes
            chunks = (col[start:start + 3] for start in range(0, len(col), 3))
            names = [code_to_name[chunk] for chunk in chunks if chunk in code_to_name]
            # A column without any known code keeps its original name
            new_columns.append(f"{enc}[{', '.join(names)}]" if names else col)

        df.columns = new_columns
        df.to_csv(csv_path, index=False)
        print(f"✔ {enc}{suffix}.csv  →  columns renamed and saved")

## 5) Declarative Encoding (using Declare templates)

In [None]:
# Output folder for the Declare features
declare_folder = os.path.join(features_folder, 'declare')
os.makedirs(declare_folder, exist_ok=True)

# Declare embedding over the full propositional log (no split). The third and
# fourth positional arguments are passed as None (legacy placeholder / no
# ExperimentRunner).
# NOTE(review): both thresholds reuse SPLIT_RATIO, which is documented as the
# train/test split fraction — confirm that this value is intended here.
processed_ids = declare_embedding(
    declare_folder, propositional_log, None, None,
    reencode=False,
    candidate_threshold=SPLIT_RATIO,
    constraint_threshold=SPLIT_RATIO,
)

# Output summary
print(f"✔ Declare features written to: {declare_folder}")
print(f"  • {len(processed_ids)} total traces")

## 6) Data-Aware Declare Encoding

In [None]:
# Output folder for the Data-Aware Declare features
dwd_folder = os.path.join(features_folder, 'dwd')
os.makedirs(dwd_folder, exist_ok=True)

# Attributes to ignore
ignored_attrs = [
    "time:timestamp", "concept: name", "Label",
    "Start date", "End date",
    "Diagnosis", "Diagnosis code", "Diagnosis Treatment",
    "Combination ID", "Treatment code", "Activity code",
]

# Data-Aware Declare embedding over the entire data_log (no split). The fourth
# positional argument is a legacy placeholder, and `self` is passed explicitly
# as None.
# NOTE(review): both thresholds reuse SPLIT_RATIO, which is documented as the
# train/test split fraction — confirm that this value is intended here.
processed_ids = declare_data_aware_embedding(
    ignored_attrs, dwd_folder, data_log, None,
    missing_literal=None,
    self=None,
    candidate_threshold=SPLIT_RATIO,
    constraint_threshold=SPLIT_RATIO,
)

# Output summary
print(f"✔ Data-Aware Declare features written to: {dwd_folder}")
print(f" • {len(processed_ids)} total traces")

## 7) Payload Encoding (Event & Trace Attribute Aggregations)

In [None]:
# Output folder for the payload features
payload_folder = os.path.join(features_folder, 'payload')
os.makedirs(payload_folder, exist_ok=True)

# Payload settings file named in the config (the original notes it is empty for
# traffic; the disabled alternative below passes an empty string instead)
payload_settings = os.path.join(PROJECT_ROOT, 'log_settings', EVENT_LOG_SETTINGS)
#payload_settings = ""

# Payload embedding over the trace list; the last two arguments are passed as None
processed_ids = payload_embedding(payload_folder, payload_settings, trace_list, None, None)

# Output summary
print(f"✔ Payload features written to: {payload_folder}")
print(f"  • {len(processed_ids)} traces")

## 8) Feature concatenations

### 8.1) Combine sequential feature encodings across all strategies into 'seq_combined'

In [None]:
# Sequence encodings whose CSV files get merged
encodings = ['mr', 'mra', 'tr', 'tra']

# Destination folder of the merged files
sequential_combined_folder = os.path.join(features_folder, 'seq_combined')
os.makedirs(sequential_combined_folder, exist_ok=True)

# Split name → file-name suffix used by the per-encoding CSV files
splits = dict(total="", train="_train", test="_test")

def load_and_merge(split_suffix, encoding_names=None, base_folder=None):
    """Merge the per-encoding CSV files of one split into a single DataFrame.

    For every encoding ``enc`` the file ``<base_folder>/<enc>/<enc><split_suffix>.csv``
    is read. The ``Label`` column is kept only from the first encoding, and the
    frames are outer-joined on ``Case_ID`` in the given order.

    Because the join is an outer join and ``Label`` comes from the first frame
    only, a case that is missing from the first encoding ends up with a missing
    ``Label`` in the result.

    Args:
        split_suffix: File-name suffix of the split ("" / "_train" / "_test").
        encoding_names: Encoding names to merge. Defaults to the notebook-level
            ``encodings`` list.
        base_folder: Folder containing one subfolder per encoding. Defaults to
            the notebook-level ``features_folder``.

    Returns:
        The merged pandas DataFrame.

    Raises:
        ValueError: If no encoding names are given.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working
    if encoding_names is None:
        encoding_names = encodings
    if base_folder is None:
        base_folder = features_folder

    encoding_names = list(encoding_names)
    if not encoding_names:
        raise ValueError("load_and_merge needs at least one encoding to merge")

    frames = []
    for position, enc in enumerate(encoding_names):
        frame = pd.read_csv(os.path.join(base_folder, enc, f"{enc}{split_suffix}.csv"))
        if position > 0:
            # Keep a single Label column: the one of the first encoding
            frame = frame.drop(columns=['Label'], errors='ignore')
        frames.append(frame)

    merged = frames[0]
    for frame in frames[1:]:
        merged = pd.merge(merged, frame, on='Case_ID', how='outer')
    return merged

# Merge and write one combined CSV per split (total, train, test)
for suffix in splits.values():
    combined_df = load_and_merge(suffix)
    out_path = os.path.join(sequential_combined_folder, f"seq_combined{suffix}.csv")
    combined_df.to_csv(out_path, index=False)
    print(f"✔ Wrote: {out_path}")


### 8.2) Make sure Case_ID is set as the index in every feature CSV

In [None]:
# Every subfolder of the features folder is treated as one strategy
strategies = [
    entry
    for entry in os.listdir(features_folder)
    if os.path.isdir(os.path.join(features_folder, entry))
]

# Rewrite <strategy>/<strategy>.csv with Case_ID as the DataFrame index
for strat in strategies:
    csv_path = os.path.join(features_folder, strat, f"{strat}.csv")

    # Load the file and promote Case_ID to the index
    df = pd.read_csv(csv_path).set_index('Case_ID')

    # Writing with the index included puts Case_ID in the first column
    df.to_csv(csv_path)

    print(f"✔ Set index for {strat} → {csv_path}")

### 8.3) Concatenating new combinations of extracted features

In [None]:
# Folder-name mapping from the config, and the mapping derived from it by
# internal_to_folder
folder_to_internal = cfg["folder_to_internal"]
internal_to_folder_map = internal_to_folder(folder_to_internal)

# Feature-set compositions to build, keyed by composition name
dataset_composition = cfg["dataset_composition"]

In [None]:
# Join a single composition only (disabled example):
# joined_df = multijoined_dump_no_splits(
#     key="baseline_data",
#     dataset_list=dataset_composition["baseline_data"],
#     base_folder=features_folder,
#     folder_to_internal_map=folder_to_internal
# )

# Dump every composition listed in the config
dump_all_compositions(dataset_composition, base_folder=features_folder, folder_to_internal_map=folder_to_internal)

## 9) Some final checks

In [None]:
# Every subfolder of the features folder is treated as one strategy; this now
# also covers folders created after the earlier index pass
strategies = [
    entry
    for entry in os.listdir(features_folder)
    if os.path.isdir(os.path.join(features_folder, entry))
]

# Rewrite <strategy>/<strategy>.csv with Case_ID as the DataFrame index
for strat in strategies:
    csv_path = os.path.join(features_folder, strat, f"{strat}.csv")

    # Load the file and promote Case_ID to the index
    df = pd.read_csv(csv_path).set_index('Case_ID')

    # Writing with the index included puts Case_ID in the first column
    df.to_csv(csv_path)

    print(f"✔ Set index for {strat} → {csv_path}")