In [3]:
import os
import pandas as pd
import glob
import random

# Input and output locations (relative to the current working directory)
PROCESSED_DIR = "../data/processed/"
TRAIN_DIR = "../data/train/"
TEST_DIR = "../data/test/"

# Create both output folders up front; existing folders are left untouched
for _output_dir in (TRAIN_DIR, TEST_DIR):
    os.makedirs(_output_dir, exist_ok=True)

def generate_random_event_id(num_events):
    """Return ``num_events`` distinct random 8-digit integers to use as event IDs.

    Values are drawn without replacement from ``range(10_000_000, 99_999_999)``,
    so no ID repeats within the returned list. ``random.sample`` raises
    ``ValueError`` if more IDs are requested than the pool contains.
    """
    id_pool = range(10_000_000, 99_999_999)  # upper bound is exclusive
    return random.sample(id_pool, k=num_events)

def _split_by_case(df, train_fraction=0.8, seed=42):
    """Split ``df`` into (train, test) frames so that no case spans both.

    The unique ``case_id`` values are shuffled with a fixed seed, the first
    ``train_fraction`` of them (rounded down) go to train and the rest to test.
    """
    unique_cases = df["case_id"].unique()
    shuffled_cases = pd.Series(unique_cases).sample(frac=1, random_state=seed).tolist()

    split_idx = int(train_fraction * len(unique_cases))
    train_cases = shuffled_cases[:split_idx]
    test_cases = shuffled_cases[split_idx:]

    return df[df["case_id"].isin(train_cases)], df[df["case_id"].isin(test_cases)]


def process_logs(processed_dir=None, train_dir=None, test_dir=None):
    """Anonymise event IDs and split every CSV log into train/test files.

    For each ``*.csv`` in ``processed_dir`` the ``event_id`` column (if present)
    is overwritten with random unique 8-digit integers, then the rows are split
    80/20 at the case level and written under the same filename to
    ``train_dir`` and ``test_dir``.

    Args:
        processed_dir: Folder containing the input CSVs. Defaults to PROCESSED_DIR.
        train_dir: Folder receiving the training subsets. Defaults to TRAIN_DIR.
        test_dir: Folder receiving the test subsets. Defaults to TEST_DIR.

    Raises:
        KeyError: If a log has no ``case_id`` column.
    """
    processed_dir = PROCESSED_DIR if processed_dir is None else processed_dir
    train_dir = TRAIN_DIR if train_dir is None else train_dir
    test_dir = TEST_DIR if test_dir is None else test_dir

    # Output folders may be caller-supplied, so make sure they exist here too
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    # glob returns files in arbitrary order; sort for a reproducible run order
    log_files = sorted(glob.glob(os.path.join(processed_dir, "*.csv")))

    for log_file in log_files:
        df = pd.read_csv(log_file)  # Load the log

        # Fail with a message that names the offending file
        if "case_id" not in df.columns:
            raise KeyError(f"'case_id' column not found in {log_file}; cannot split by case")

        if "event_id" in df.columns:
            # Overwrite event_id with random unique 8-digit integers
            df["event_id"] = generate_random_event_id(len(df))

        # Split at case level: 80% of cases to train, 20% to test
        train_df, test_df = _split_by_case(df)

        # Save to appropriate directories under the original filename
        base_filename = os.path.basename(log_file)
        train_file = os.path.join(train_dir, base_filename)
        test_file = os.path.join(test_dir, base_filename)

        train_df.to_csv(train_file, index=False)
        test_df.to_csv(test_file, index=False)

        print(f"Processed {log_file}: Train -> {train_file}, Test -> {test_file}")

# Run the processing: split every CSV found in PROCESSED_DIR and write the
# resulting subsets to TRAIN_DIR and TEST_DIR (one progress line is printed per file)
process_logs()


Processed ../data/processed/2020_RequestForPayment.csv: Train -> ../data/train/2020_RequestForPayment.csv, Test -> ../data/test/2020_RequestForPayment.csv
Processed ../data/processed/2012_BPI_Challenge.csv: Train -> ../data/train/2012_BPI_Challenge.csv, Test -> ../data/test/2012_BPI_Challenge.csv
Processed ../data/processed/2018_BPI_Challenge.csv: Train -> ../data/train/2018_BPI_Challenge.csv, Test -> ../data/test/2018_BPI_Challenge.csv
Processed ../data/processed/2020_PermitLog.csv: Train -> ../data/train/2020_PermitLog.csv, Test -> ../data/test/2020_PermitLog.csv
Processed ../data/processed/2019_BPI_Challenge.csv: Train -> ../data/train/2019_BPI_Challenge.csv, Test -> ../data/test/2019_BPI_Challenge.csv
Processed ../data/processed/2020_InternationalDeclarations.csv: Train -> ../data/train/2020_InternationalDeclarations.csv, Test -> ../data/test/2020_InternationalDeclarations.csv
Processed ../data/processed/2020_DomesticDeclarations.csv: Train -> ../data/train/2020_DomesticDeclaration