In [8]:
import json
import xml.etree.ElementTree as ET
import hashlib
from collections import OrderedDict # added for ordered dict

In [9]:
INPUT_FILENAME = "13small.xes"
OUTPUT_FILENAME = INPUT_FILENAME.replace(".xes", ".json")
# optional user-provided classifier
# keys separated by whitespace, e.g. "concept:name lifecycle:transition"
CLASSIFIER = None

In [10]:
# Read the file content and compute its hash
with open(INPUT_FILENAME) as f:
    log_string = "".join(f.readlines())
log_hash = hash(log_string)
del log_string

In [11]:
events = []
objects = []
event_object = []
object_object = []

In [12]:
existing_objects = {}
#existing_events = {}

In [13]:
tree = ET.parse(INPUT_FILENAME)
root = tree.getroot()
#TODO: check  in classifier if there is only concept name or lifecycle transition as well

In [14]:
root.findall('.//{http://www.xes-standard.org/}classifier')[0].get('keys')

'concept:name lifecycle:transition'

In [15]:
# get the first classifier
classifiers = root.findall('.//{http://www.xes-standard.org/}classifier')
if CLASSIFIER:
    # give the user the opportunity to specify classifier
    classifier = CLASSIFIER
elif classifiers:
    # or use the first classifier in the log
    classifier = classifiers[0].get('keys')   
else:
    # or fall back to concept:name and lifecycle:transition if no other information is provided
    classifier = "concept:name lifecycle:transition"
#classifier = CLASSIFIER if CLASSIFIER is not None else classifiers[0].get('keys')
classifier = tuple(classifier.split(" "))

In [16]:
event_id_counter = 1
object_id_counter = 1
case_id_counter = 1

In [17]:
for case in root.findall('.//{http://www.xes-standard.org/}trace'):
    case_attributes = [child for child in case.iter() if child.tag != '{http://www.xes-standard.org/}event']
    xes_case_id = None
    # check if case ID is present in the XES file
    for attr in case_attributes:
        if attr.get('key') == 'concept:name':
            xes_case_id = attr.get('value')
            break
    # create a case object REGARDLESS of whether the case has an ID
        # case ID depending on position, XES case ID (if present) and log hash
    case_id = f"case_{case_id_counter}_{xes_case_id}_{log_hash}" if xes_case_id else f"case_{case_id_counter}_{log_hash}"
    case_id_counter += 1
    # append "o_" to the beginning of hashed case ID to prevent hashes starting from a number
    # "o_" stands for object
    # TODO: should we replace it with "c_" for case?
    case_id_hashed = f"o_{hashlib.sha1(case_id.encode()).hexdigest()}"
    objects.append({"id": case_id_hashed, "object_type": "case", "attributes": [{"object_attribute_name": "concept:name", "object_attribute_value": case_id}]})
    
    for attr in case_attributes:
        if attr.get('key') != 'concept:name':
            # add object
            object_type = attr.get('key')
            object_value = attr.get('value')
            object_key = f"{object_type}_{object_value}_{log_hash}"
             #generate a SHA-1 hash of the object key
            if object_key not in existing_objects:
                # append "o_" to the beginning of hashed object ID to prevent hashes starting from a number
                object_hash = f"o_{hashlib.sha1(object_key.encode()).hexdigest()}"
                existing_objects[object_key] = {"id": object_hash, "count": 1}
                objects.append({"id": object_hash, "object_type": object_type, "attributes": [{"object_attribute_name": object_type, "object_attribute_value": object_value}]})
            else:
                existing_objects[object_key]["count"] += 1

            object_id_hashed = existing_objects[object_key]["id"]
            object_object.append({"from": case_id_hashed, "to": object_id_hashed, "object_relation_type": "case_object"})
       
    for event in [child for child in case.iter() if child.tag == '{http://www.xes-standard.org/}event']:
        # identify event by its position
        event_id = f"event_{event_id_counter}_{log_hash}"
        event_id_counter += 1
        # append "e_" to the beginning of hashed event ID to prevent hashes starting from a number
        event_id_hashed = f"e_{hashlib.sha1(event_id.encode()).hexdigest()}"
        event_time_element = event.find('.//{http://www.xes-standard.org/}date')
        event_time_iso8601 = None

        if event_time_element is not None:
            event_time = event_time_element.attrib.get('value')
            # event time to ISO 8601 format 
            if event_time:
                event_time_iso8601 = event_time.replace("T", " ").replace("Z", "")

        # take all attributes from classifier
        event_classifier = OrderedDict.fromkeys(classifier)

        event_attributes = [attr for attr in event.iter()]
        for attr in event_attributes:
            # fill in event_classifier with values present in the event
            if attr.get('key') in event_classifier:
                event_classifier[attr.get('key')] = attr.get('value')
            # if attr.get('key') == 'concept:name':
            #     concept_name = attr.get('value')
            # elif attr.get('key') == 'lifecycle:transition':
            #     lifecycle_transition = attr.get('value')
        
        # order of attribute values is preserved for all events
        # skip None values, i.e. attributes not present in the event
        event_type = " ".join([v for v in event_classifier.values() if v is not None])
        
        # add the event
        # only add the attributes that are present
        events.append({"id": event_id_hashed, "time": event_time_iso8601, "event_type": event_type, 
                       "attributes":[{"event_attribute_name": k, "event_attribute_value": v} for k,v in event_classifier.items() if v is not None]})
            
        # add event_object relation to the case
        event_object.append({"eventID": event_id_hashed, "objectID": case_id_hashed, "qualifier": "case_event"})

            
        # loop to Iterate through attributes of the event
        for attr in event_attributes:
            # only attributes that are not in the classifier and not timestamp are converted to objects
            # timestamp is also not an attribute
            #if attr.get('key') != 'concept:name' and attr.get('key') != 'lifecycle:transition' and attr.get('key') != 'time:timestamp':
            if attr.get('key') not in event_classifier and attr.get('key') != 'time:timestamp':
                # object already exists in objects list? if yes then take its id as an object_id
                object_type = attr.get('key')
                object_value = attr.get('value')
                object_key = f"{object_type}_{object_value}_{log_hash}"
                 #generate a SHA-1 hash of the object key
                if object_key not in existing_objects:
                    object_hash = f"o_{hashlib.sha1(object_key.encode()).hexdigest()}"
                    existing_objects[object_key] = {"id": object_hash, "count": 1}
                    objects.append({"id": object_hash, "object_type": object_type, "attributes": [{"object_attribute_name": object_type, "object_attribute_value": object_value}]})
                else:
                    existing_objects[object_key]["count"] += 1

                object_id_hashed = existing_objects[object_key]["id"]

                event_object.append({"eventID": event_id_hashed, "objectID": object_id_hashed, "qualifier": "event_object"})

In [18]:
if len(object_object) < 1:
    # if this part is empty, add an empty entry for demonstrative purposes
    object_object.append({"from": "", "to": "", "object_relation_type": ""})

In [19]:
output_data = {
    "events": events,
    "objects": objects,
    "event_object": event_object,
    "object_object": object_object
}

In [20]:
with open(OUTPUT_FILENAME, 'w') as json_file:
    json.dump(output_data, json_file, indent=4)

print(f"Output saved to {OUTPUT_FILENAME}")

Output saved to 13small.json


In [21]:
import json
import xml.etree.ElementTree as ET
import hashlib
from collections import OrderedDict

INPUT_FILES = ["BPIC15_1.xes", "BPIC15_2.xes", "BPIC15_3.xes", "BPIC15_4.xes", "BPIC15_5.xes"]  # List of input files
CLASSIFIER = None  # Optional user-provided classifier

def process_file(input_filename):
    output_filename = input_filename.replace(".xes", ".json")

    # Read the file content and compute its hash
    with open(input_filename) as f:
        log_string = "".join(f.readlines())
    log_hash = hash(log_string)
    del log_string

    events = []
    objects = []
    event_object = []
    object_object = []

    existing_objects = {}

    tree = ET.parse(input_filename)
    root = tree.getroot()

    # Get the first classifier
    classifiers = root.findall('.//{http://www.xes-standard.org/}classifier')
    if CLASSIFIER:
        classifier = CLASSIFIER
    elif classifiers:
        classifier = classifiers[0].get('keys')
    else:
        classifier = "concept:name lifecycle:transition"
    classifier = tuple(classifier.split(" "))

    event_id_counter = 1
    object_id_counter = 1
    case_id_counter = 1

    for case in root.findall('.//{http://www.xes-standard.org/}trace'):
        case_attributes = [child for child in case.iter() if child.tag != '{http://www.xes-standard.org/}event']
        xes_case_id = None

        for attr in case_attributes:
            if attr.get('key') == 'concept:name':
                xes_case_id = attr.get('value')
                break

        case_id = f"case_{case_id_counter}_{xes_case_id}_{log_hash}" if xes_case_id else f"case_{case_id_counter}_{log_hash}"
        case_id_counter += 1
        case_id_hashed = f"o_{hashlib.sha1(case_id.encode()).hexdigest()}"
        objects.append({
            "id": case_id_hashed,
            "object_type": "case",
            "attributes": [{"object_attribute_name": "concept:name", "object_attribute_value": case_id}]
        })

        for attr in case_attributes:
            if attr.get('key') != 'concept:name':
                object_type = attr.get('key')
                object_value = attr.get('value')
                object_key = f"{object_type}_{object_value}_{log_hash}"
                
                if object_key not in existing_objects:
                    object_hash = f"o_{hashlib.sha1(object_key.encode()).hexdigest()}"
                    existing_objects[object_key] = {"id": object_hash, "count": 1}
                    objects.append({
                        "id": object_hash,
                        "object_type": object_type,
                        "attributes": [{"object_attribute_name": object_type, "object_attribute_value": object_value}]
                    })
                else:
                    existing_objects[object_key]["count"] += 1

                object_id_hashed = existing_objects[object_key]["id"]
                object_object.append({"from": case_id_hashed, "to": object_id_hashed, "object_relation_type": "case_object"})

        for event in [child for child in case.iter() if child.tag == '{http://www.xes-standard.org/}event']:
            event_id = f"event_{event_id_counter}_{log_hash}"
            event_id_counter += 1
            event_id_hashed = f"e_{hashlib.sha1(event_id.encode()).hexdigest()}"
            event_time_element = event.find('.//{http://www.xes-standard.org/}date')
            event_time_iso8601 = None

            if event_time_element is not None:
                event_time = event_time_element.attrib.get('value')
                if event_time:
                    event_time_iso8601 = event_time.replace("T", " ").replace("Z", "")

            event_classifier = OrderedDict.fromkeys(classifier)

            event_attributes = [attr for attr in event.iter()]
            for attr in event_attributes:
                if attr.get('key') in event_classifier:
                    event_classifier[attr.get('key')] = attr.get('value')

            event_type = " ".join([v for v in event_classifier.values() if v is not None])

            events.append({
                "id": event_id_hashed,
                "time": event_time_iso8601,
                "event_type": event_type,
                "attributes": [{"event_attribute_name": k, "event_attribute_value": v} for k, v in event_classifier.items() if v is not None]
            })

            event_object.append({"eventID": event_id_hashed, "objectID": case_id_hashed, "qualifier": "case_event"})

            for attr in event_attributes:
                if attr.get('key') not in event_classifier and attr.get('key') != 'time:timestamp':
                    object_type = attr.get('key')
                    object_value = attr.get('value')
                    object_key = f"{object_type}_{object_value}_{log_hash}"

                    if object_key not in existing_objects:
                        object_hash = f"o_{hashlib.sha1(object_key.encode()).hexdigest()}"
                        existing_objects[object_key] = {"id": object_hash, "count": 1}
                        objects.append({
                            "id": object_hash,
                            "object_type": object_type,
                            "attributes": [{"object_attribute_name": object_type, "object_attribute_value": object_value}]
                        })
                    else:
                        existing_objects[object_key]["count"] += 1

                    object_id_hashed = existing_objects[object_key]["id"]
                    event_object.append({"eventID": event_id_hashed, "objectID": object_id_hashed, "qualifier": "event_object"})

    if len(object_object) < 1:
        object_object.append({"from": "", "to": "", "object_relation_type": ""})

    output_data = {
        "events": events,
        "objects": objects,
        "event_object": event_object,
        "object_object": object_object
    }

    with open(output_filename, 'w') as json_file:
        json.dump(output_data, json_file, indent=4)

    print(f"Output saved to {output_filename}")

for input_file in INPUT_FILES:
    process_file(input_file)


FileNotFoundError: [Errno 2] No such file or directory: 'BPIC15_1.xes'

In [None]:
import json
import os
import xml.etree.ElementTree as ET
import hashlib
from collections import OrderedDict

# Define input files
INPUT_FILES = ["BPIC15_1.xes", "BPIC15_2.xes", "BPIC15_3.xes", "BPIC15_4.xes", "BPIC15_5.xes"]
CLASSIFIER = None  # Optional user-provided classifier

# Prefixes for TTL files
PREFIXES = {
    'ex': 'http://example.org/',
}
BASE_URI = PREFIXES['ex']

def process_file(input_filename):
    """
    Process an XES file to extract events, objects, and relations, and save them in JSON format.
    """
    output_json_filename = input_filename.replace(".xes", ".json")

    # Read the file content and compute its hash
    with open(input_filename) as f:
        log_string = "".join(f.readlines())
    log_hash = hash(log_string)
    del log_string

    events = []
    objects = []
    event_object = []
    object_object = []

    existing_objects = {}

    tree = ET.parse(input_filename)
    root = tree.getroot()

    # Get the first classifier
    classifiers = root.findall('.//{http://www.xes-standard.org/}classifier')
    if CLASSIFIER:
        classifier = CLASSIFIER
    elif classifiers:
        classifier = classifiers[0].get('keys')
    else:
        classifier = "concept:name lifecycle:transition"
    classifier = tuple(classifier.split(" "))

    event_id_counter = 1
    object_id_counter = 1
    case_id_counter = 1

    for case in root.findall('.//{http://www.xes-standard.org/}trace'):
        case_attributes = [child for child in case.iter() if child.tag != '{http://www.xes-standard.org/}event']
        xes_case_id = None

        for attr in case_attributes:
            if attr.get('key') == 'concept:name':
                xes_case_id = attr.get('value')
                break

        case_id = f"case_{case_id_counter}_{xes_case_id}_{log_hash}" if xes_case_id else f"case_{case_id_counter}_{log_hash}"
        case_id_counter += 1
        case_id_hashed = f"o_{hashlib.sha1(case_id.encode()).hexdigest()}"
        objects.append({
            "id": case_id_hashed,
            "object_type": "case",
            "attributes": [{"object_attribute_name": "concept:name", "object_attribute_value": case_id}]
        })

        for attr in case_attributes:
            if attr.get('key') != 'concept:name':
                object_type = attr.get('key')
                object_value = attr.get('value')
                object_key = f"{object_type}_{object_value}_{log_hash}"
                
                if object_key not in existing_objects:
                    object_hash = f"o_{hashlib.sha1(object_key.encode()).hexdigest()}"
                    existing_objects[object_key] = {"id": object_hash, "count": 1}
                    objects.append({
                        "id": object_hash,
                        "object_type": object_type,
                        "attributes": [{"object_attribute_name": object_type, "object_attribute_value": object_value}]
                    })
                else:
                    existing_objects[object_key]["count"] += 1

                object_id_hashed = existing_objects[object_key]["id"]
                object_object.append({"from": case_id_hashed, "to": object_id_hashed, "object_relation_type": "case_object"})

        for event in [child for child in case.iter() if child.tag == '{http://www.xes-standard.org/}event']:
            event_id = f"event_{event_id_counter}_{log_hash}"
            event_id_counter += 1
            event_id_hashed = f"e_{hashlib.sha1(event_id.encode()).hexdigest()}"
            event_time_element = event.find('.//{http://www.xes-standard.org/}date')
            event_time_iso8601 = None

            if event_time_element is not None:
                event_time = event_time_element.attrib.get('value')
                if event_time:
                    event_time_iso8601 = event_time.replace("T", " ").replace("Z", "")

            event_classifier = OrderedDict.fromkeys(classifier)

            event_attributes = [attr for attr in event.iter()]
            for attr in event_attributes:
                if attr.get('key') in event_classifier:
                    event_classifier[attr.get('key')] = attr.get('value')

            event_type = " ".join([v for v in event_classifier.values() if v is not None])

            events.append({
                "id": event_id_hashed,
                "time": event_time_iso8601,
                "event_type": event_type,
                "attributes": [{"event_attribute_name": k, "event_attribute_value": v} for k, v in event_classifier.items() if v is not None]
            })

            event_object.append({"eventID": event_id_hashed, "objectID": case_id_hashed, "qualifier": "case_event"})

            for attr in event_attributes:
                if attr.get('key') not in event_classifier and attr.get('key') != 'time:timestamp':
                    object_type = attr.get('key')
                    object_value = attr.get('value')
                    object_key = f"{object_type}_{object_value}_{log_hash}"

                    if object_key not in existing_objects:
                        object_hash = f"o_{hashlib.sha1(object_key.encode()).hexdigest()}"
                        existing_objects[object_key] = {"id": object_hash, "count": 1}
                        objects.append({
                            "id": object_hash,
                            "object_type": object_type,
                            "attributes": [{"object_attribute_name": object_type, "object_attribute_value": object_value}]
                        })
                    else:
                        existing_objects[object_key]["count"] += 1

                    object_id_hashed = existing_objects[object_key]["id"]
                    event_object.append({"eventID": event_id_hashed, "objectID": object_id_hashed, "qualifier": "event_object"})

    if len(object_object) < 1:
        object_object.append({"from": "", "to": "", "object_relation_type": ""})

    output_data = {
        "events": events,
        "objects": objects,
        "event_object": event_object,
        "object_object": object_object
    }

    with open(output_json_filename, 'w') as json_file:
        json.dump(output_data, json_file, indent=4)

    print(f"JSON Output saved to {output_json_filename}")
    
    # Process TTL Relations
    output_ttl_filename = input_filename.replace(".xes", "_relation.ttl")
    create_ttl_relations(input_filename, output_ttl_filename)


def create_ttl_relations(input_filename, output_filename):
    """
    Create TTL relations from an XES file.
    """
    tree = ET.parse(input_filename)
    root = tree.getroot()
    
    triples = set()
    
    for trace in root.findall('.//{http://www.xes-standard.org/}trace'):
        case_id = trace.attrib.get('concept:name', 'unknown_case_id')

        trace_attrs = {}
        event_attrs = {}

        for attr in trace:
            key = attr.attrib.get('key')
            value = attr.attrib.get('value')
            if key and value:
                trace_attrs[key] = value

        event_counter = 0
        for event in trace.findall('./{http://www.xes-standard.org/}event'):
            event_counter += 1
            event_id = f"event_{event_counter}"
            
            event_attrs[event_id] = {}
            
            for attr in event:
                key = attr.attrib.get('key')
                value = attr.attrib.get('value')
                if key and value:
                    event_attrs[event_id][key] = value
            
            for trace_key, trace_value in trace_attrs.items():
                subject = f'{BASE_URI}{trace_key}'
                for event_key, event_value in event_attrs[event_id].items():
                    object = f'{BASE_URI}{event_key}'
                    triple = (subject, object)
                    triples.add(triple)
    
    with open(output_filename, 'w') as f:
        # Write prefix declaration
        f.write('@prefix ex: <http://example.org/> .\n\n')
        
        for subject, object in triples:
            relation = f'<{subject}> ex:subject_object <{object}> .'
            f.write(f'{relation}\n')
    
    print(f"TTL Output saved to {output_filename}")

def extract_triples_from_ttl(filename):
    """
    Extract triples from a TTL file.
    """
    triples = set()
    with open(filename, 'r') as f:
        for line in f:
            if line.startswith('<') and ' ex:subject_object ' in line:
                triples.add(line.strip())
    return triples

def find_common_triples(output_filenames, common_output_filename):
    """
    Find common triples among multiple TTL files and write them to a new file.
    """
    if not output_filenames:
        return
    
    common_triples = extract_triples_from_ttl(output_filenames[0])
    
    for filename in output_filenames[1:]:
        current_triples = extract_triples_from_ttl(filename)
        common_triples &= current_triples
    
    with open(common_output_filename, 'w') as f:
        # Write prefix declaration
        f.write('@prefix ex: <http://example.org/> .\n\n')
        
        # Sort the triples before writing
        sorted_triples = sorted(common_triples)
        
        for triple in sorted_triples:
            f.write(f'{triple}\n')
    
    print(f"Common triples saved to {common_output_filename}")

if __name__ == "__main__":
    input_filenames = ["2015-1.xes", "2015-2.xes", "2015-3.xes", "2015-4.xes", "2015-5.xes"]
    output_filenames = [f"{filename.split('.')[0]}_relation.ttl" for filename in input_filenames]
    common_output_filename = "common_relationsBPIC15.ttl"

    if len(INPUT_FILES) != len(output_filenames):
        raise ValueError("The number of input filenames must match the number of output filenames.")
    
    for input_filename, output_filename in zip(INPUT_FILES, output_filenames):
        if os.path.exists(input_filename):
            process_file(input_filename)
        else:
            print(f"File not found: {input_filename}")

    find_common_triples(output_filenames, common_output_filename)
    #Merged outputs of all

In [None]:
import json
import xml.etree.ElementTree as ET
import hashlib
from collections import OrderedDict

INPUT_FILES = ["bpic13/BPI2013_incidents.xes"]  # List of input files
CLASSIFIER = None  # Optional user-provided classifier

def process_file(input_filename):
    output_filename = input_filename.replace(".xes", ".json")

    # Read the file content and compute its hash
    with open(input_filename) as f:
        log_string = "".join(f.readlines())
    log_hash = hash(log_string)
    del log_string

    events = []
    objects = []
    event_object = []
    object_object = []

    existing_objects = {}

    tree = ET.parse(input_filename)
    root = tree.getroot()

    # Get the first classifier
    classifiers = root.findall('.//{http://www.xes-standard.org/}classifier')
    if CLASSIFIER:
        classifier = CLASSIFIER
    elif classifiers:
        classifier = classifiers[0].get('keys')
    else:
        classifier = "concept:name lifecycle:transition"
    classifier = tuple(classifier.split(" "))

    event_id_counter = 1
    object_id_counter = 1
    case_id_counter = 1

    for case in root.findall('.//{http://www.xes-standard.org/}trace'):
        case_attributes = [child for child in case.iter() if child.tag != '{http://www.xes-standard.org/}event']
        xes_case_id = None

        for attr in case_attributes:
            if attr.get('key') == 'concept:name':
                xes_case_id = attr.get('value')
                break

        case_id = f"case_{case_id_counter}_{xes_case_id}_{log_hash}" if xes_case_id else f"case_{case_id_counter}_{log_hash}"
        case_id_counter += 1
        case_id_hashed = f"o_{hashlib.sha1(case_id.encode()).hexdigest()}"
        objects.append({
            "id": case_id_hashed,
            "object_type": "case",
            "attributes": [{"object_attribute_name": "concept:name", "object_attribute_value": case_id}]
        })

        for attr in case_attributes:
            if attr.get('key') != 'concept:name':
                object_type = attr.get('key')
                object_value = attr.get('value')
                object_key = f"{object_type}_{object_value}_{log_hash}"
                
                if object_key not in existing_objects:
                    object_hash = f"o_{hashlib.sha1(object_key.encode()).hexdigest()}"
                    existing_objects[object_key] = {"id": object_hash, "count": 1}
                    objects.append({
                        "id": object_hash,
                        "object_type": object_type,
                        "attributes": [{"object_attribute_name": object_type, "object_attribute_value": object_value}]
                    })
                else:
                    existing_objects[object_key]["count"] += 1

                object_id_hashed = existing_objects[object_key]["id"]
                object_object.append({"from": case_id_hashed, "to": object_id_hashed, "object_relation_type": "case_object"})

        for event in [child for child in case.iter() if child.tag == '{http://www.xes-standard.org/}event']:
            event_id = f"event_{event_id_counter}_{log_hash}"
            event_id_counter += 1
            event_id_hashed = f"e_{hashlib.sha1(event_id.encode()).hexdigest()}"
            event_time_element = event.find('.//{http://www.xes-standard.org/}date')
            event_time_iso8601 = None

            if event_time_element is not None:
                event_time = event_time_element.attrib.get('value')
                if event_time:
                    event_time_iso8601 = event_time.replace("T", " ").replace("Z", "")

            event_classifier = OrderedDict.fromkeys(classifier)

            event_attributes = [attr for attr in event.iter()]
            for attr in event_attributes:
                if attr.get('key') in event_classifier:
                    event_classifier[attr.get('key')] = attr.get('value')

            event_type = " ".join([v for v in event_classifier.values() if v is not None])

            events.append({
                "id": event_id_hashed,
                "time": event_time_iso8601,
                "event_type": event_type,
                "attributes": [{"event_attribute_name": k, "event_attribute_value": v} for k, v in event_classifier.items() if v is not None]
            })

            event_object.append({"eventID": event_id_hashed, "objectID": case_id_hashed, "qualifier": "case_event"})

            for attr in event_attributes:
                if attr.get('key') not in event_classifier and attr.get('key') != 'time:timestamp':
                    object_type = attr.get('key')
                    object_value = attr.get('value')
                    object_key = f"{object_type}_{object_value}_{log_hash}"

                    if object_key not in existing_objects:
                        object_hash = f"o_{hashlib.sha1(object_key.encode()).hexdigest()}"
                        existing_objects[object_key] = {"id": object_hash, "count": 1}
                        objects.append({
                            "id": object_hash,
                            "object_type": object_type,
                            "attributes": [{"object_attribute_name": object_type, "object_attribute_value": object_value}]
                        })
                    else:
                        existing_objects[object_key]["count"] += 1

                    object_id_hashed = existing_objects[object_key]["id"]
                    event_object.append({"eventID": event_id_hashed, "objectID": object_id_hashed, "qualifier": "event_object"})

    if len(object_object) < 1:
        object_object.append({"from": "", "to": "", "object_relation_type": ""})

    output_data = {
        "events": events,
        "objects": objects,
        "event_object": event_object,
        "object_object": object_object
    }

    with open(output_filename, 'w') as json_file:
        json.dump(output_data, json_file, indent=4)

    print(f"Output saved to {output_filename}")

for input_file in INPUT_FILES:
    process_file(input_file)


In [None]:
import json
import xml.etree.ElementTree as ET
import hashlib
from collections import OrderedDict

INPUT_FILES = ["bpic13/13small.xes"]  # List of input files
CLASSIFIER = None  # Optional user-provided classifier

def process_file(input_filename):
    output_filename = input_filename.replace(".xes", ".json")

    # Read the file content and compute its hash
    with open(input_filename) as f:
        log_string = "".join(f.readlines())
    log_hash = hash(log_string)
    del log_string

    events = []
    objects = []
    event_object = []
    object_object = []

    existing_objects = {}

    tree = ET.parse(input_filename)
    root = tree.getroot()

    # Get the first classifier
    classifiers = root.findall('.//{http://www.xes-standard.org/}classifier')
    if CLASSIFIER:
        classifier = CLASSIFIER
    elif classifiers:
        classifier = classifiers[0].get('keys')
    else:
        classifier = "concept:name lifecycle:transition"
    classifier = tuple(classifier.split(" "))

    event_id_counter = 1
    object_id_counter = 1
    case_id_counter = 1

    for case in root.findall('.//{http://www.xes-standard.org/}trace'):
        case_attributes = [child for child in case.iter() if child.tag != '{http://www.xes-standard.org/}event']
        xes_case_id = None

        for attr in case_attributes:
            if attr.get('key') == 'concept:name':
                xes_case_id = attr.get('value')
                break

        case_id = f"case_{case_id_counter}_{xes_case_id}_{log_hash}" if xes_case_id else f"case_{case_id_counter}_{log_hash}"
        case_id_counter += 1
        case_id_hashed = f"o_{hashlib.sha1(case_id.encode()).hexdigest()}"
        objects.append({
            "id": case_id_hashed,
            "object_type": "case",
            "attributes": [{"object_attribute_name": "concept:name", "object_attribute_value": case_id}]
        })

        for attr in case_attributes:
            if attr.get('key') != 'concept:name':
                object_type = attr.get('key')
                object_value = attr.get('value')
                object_key = f"{object_type}_{object_value}_{log_hash}"
                
                if object_key not in existing_objects:
                    object_hash = f"o_{hashlib.sha1(object_key.encode()).hexdigest()}"
                    existing_objects[object_key] = {"id": object_hash, "count": 1}
                    objects.append({
                        "id": object_hash,
                        "object_type": object_type,
                        "attributes": [{"object_attribute_name": object_type, "object_attribute_value": object_value}]
                    })
                else:
                    existing_objects[object_key]["count"] += 1

                object_id_hashed = existing_objects[object_key]["id"]
                object_object.append({"from": case_id_hashed, "to": object_id_hashed, "object_relation_type": "case_object"})

        for event in [child for child in case.iter() if child.tag == '{http://www.xes-standard.org/}event']:
            event_id = f"event_{event_id_counter}_{log_hash}"
            event_id_counter += 1
            event_id_hashed = f"e_{hashlib.sha1(event_id.encode()).hexdigest()}"
            event_time_element = event.find('.//{http://www.xes-standard.org/}date')
            event_time_iso8601 = None

            if event_time_element is not None:
                event_time = event_time_element.attrib.get('value')
                if event_time:
                    event_time_iso8601 = event_time.replace("T", " ").replace("Z", "")

            event_classifier = OrderedDict.fromkeys(classifier)

            event_attributes = [attr for attr in event.iter()]
            for attr in event_attributes:
                if attr.get('key') in event_classifier:
                    event_classifier[attr.get('key')] = attr.get('value')

            event_type = " ".join([v for v in event_classifier.values() if v is not None])

            events.append({
                "id": event_id_hashed,
                "time": event_time_iso8601,
                "event_type": event_type,
                "attributes": [{"event_attribute_name": k, "event_attribute_value": v} for k, v in event_classifier.items() if v is not None]
            })

            event_object.append({"eventID": event_id_hashed, "objectID": case_id_hashed, "qualifier": "case_event"})

            for attr in event_attributes:
                if attr.get('key') not in event_classifier and attr.get('key') != 'time:timestamp':
                    object_type = attr.get('key')
                    object_value = attr.get('value')
                    object_key = f"{object_type}_{object_value}_{log_hash}"

                    if object_key not in existing_objects:
                        object_hash = f"o_{hashlib.sha1(object_key.encode()).hexdigest()}"
                        existing_objects[object_key] = {"id": object_hash, "count": 1}
                        objects.append({
                            "id": object_hash,
                            "object_type": object_type,
                            "attributes": [{"object_attribute_name": object_type, "object_attribute_value": object_value}]
                        })
                    else:
                        existing_objects[object_key]["count"] += 1

                    object_id_hashed = existing_objects[object_key]["id"]
                    event_object.append({"eventID": event_id_hashed, "objectID": object_id_hashed, "qualifier": "event_object"})

    if len(object_object) < 1:
        object_object.append({"from": "", "to": "", "object_relation_type": ""})

    output_data = {
        "events": events,
        "objects": objects,
        "event_object": event_object,
        "object_object": object_object
    }

    with open(output_filename, 'w') as json_file:
        json.dump(output_data, json_file, indent=4)

    print(f"Output saved to {output_filename}")

for input_file in INPUT_FILES:
    process_file(input_file)
