In [35]:
import os
from collections import defaultdict
from gatenlp import Document
import json
import re

In [36]:
text_doc_dict = defaultdict(str)
for file in os.listdir("../data"):
    file_path = os.path.join("../data", file)
    if os.path.isfile(file_path):        
        with open(file_path, 'r') as fileReader:
          content = fileReader.read()
          text_doc_dict[file] = content

In [37]:
# Configuration for clustering
ENABLE_CLUSTERING = True  # Set to True to enable entity clustering
print(f'CLUSTERING_ENABLED={ENABLE_CLUSTERING}')

CLUSTERING_ENABLED=True


In [38]:
text_doc_dict.keys()

dict_keys(['GDPR_Chapter_X.txt', 'DataAct_Chapter_VII.txt', 'GDPR_Chapter_VIII.txt', 'AIAct_Chapter_XI.txt', 'GDPR_Chapter_III.txt', 'DataGovernanceAct_Chapter_V.txt', 'DataAct_Chapter_VI.txt', 'AIAct_Chapter_intro.txt', 'GDPR_Chapter_VI.txt', 'GDPR_Chapter_I.txt', 'DataAct_Chapter_VIII.txt', 'DataAct_Chapter_III.txt', 'GDPR_Chapter_VII.txt', 'AIAct_Chapter_XIII.txt', 'AIAct_Chapter_X.txt', 'DataGovernanceAct_Chapter_VIII.txt', 'AIAct_Chapter_VII.txt', 'GDPR_Chapter_II.txt', 'AIAct_Chapter_IV.txt', 'DataAct_Chapter_II.txt', 'DataGovernanceAct_Chapter_IV.txt', 'DataAct_Chapter_V.txt', 'AIAct_Chapter_I.txt', 'AIAct_Chapter_III.txt', 'AIAct_Chapter_VIII.txt', 'GDPR_Chapter_IX.txt', 'DataAct-intro.txt', 'DataAct_Chapter_IX.txt', 'DataGovernanceAct_Chapter_intro.txt', 'DataAct_Chapter_X.txt', 'DataGovernanceAct_Chapter_VII.txt', 'DataGovernanceAct_Chapter_II.txt', 'GDPR_intro.txt', 'DataAct_Chapter_IV.txt', 'AIAct_Chapter_II.txt', 'GDPR_Chapter_IV.txt', 'AIAct_Chapter_V.txt', 'DataGovernanc

In [39]:
gdocs = []
entities_path = "../entities"

# Entity type mapping
entity_type_mapping = {
    "REGULATION_REF": "REGULATION",
    "ACTOR_ROLE": "ROLE", 
    "PROCESSING_OPERATION": "DATA.OP",
    "DATA_CATEGORY": "DATA.CAT"
    # AUTHORITY remains unchanged
}

for key, value in text_doc_dict.items():
    base_doc = Document(value)
    base_doc.name = key.replace(".txt", "")
    ent_set = base_doc.annset("entities_")
    ent_json_path = os.path.join(entities_path, key.replace(".txt", ".json"))
    
    # Dictionary to store clusters by entity type, then by normalized text
    clusters_by_type = {}
    cluster_id = 1  # Progressive cluster ID counter
    
    with open(ent_json_path, "r") as json_file:
        ent_obj = json.load(json_file)
        for ent_type in ent_obj.keys():
            # Apply entity type mapping
            mapped_ent_type = entity_type_mapping.get(ent_type.upper(), ent_type.upper())
            
            if ENABLE_CLUSTERING:
                clusters_by_type[mapped_ent_type] = {}
            
            for entity_mention in ent_obj[ent_type]:
                # Escape special regex characters in the entity mention
                escaped_entity = re.escape(entity_mention)
                
                # Create a more flexible pattern for entities with special characters
                # Use word boundary only at the beginning if it starts with a word character
                # Use word boundary only at the end if it ends with a word character
                start_boundary = r'\b' if entity_mention[0].isalnum() else r'(?<!\w)'
                end_boundary = r'\b' if entity_mention[-1].isalnum() else r'(?!\w)'
                pattern = start_boundary + escaped_entity + end_boundary
                
                # Find all occurrences using regex
                for match in re.finditer(pattern, value, re.IGNORECASE):
                    start = match.start()
                    end = match.end()
                    actual_text = value[start:end]  # Get the actual matched text (preserves case)
                    
                    # Add annotation with mapped entity type and get the actual annotation object with its ID
                    annotation = ent_set.add(start, end, mapped_ent_type, features={"text": actual_text})
                    actual_annotation_id = annotation.id  # Get the real GateNLP annotation ID
                    
                    if ENABLE_CLUSTERING:
                        # Use the actual matched text for clustering (normalized), only within same type
                        normalized_key = actual_text.lower().strip()
                        
                        if normalized_key not in clusters_by_type[mapped_ent_type]:
                            clusters_by_type[mapped_ent_type][normalized_key] = {
                                "id": cluster_id,  # Unique cluster ID
                                "title": actual_text,  # Use first occurrence as cluster title
                                "type": mapped_ent_type,
                                "nelements": 0,
                                "mentions": []
                            }
                            cluster_id += 1  # Increment for next cluster
                        
                        clusters_by_type[mapped_ent_type][normalized_key]["mentions"].append({
                            "id": actual_annotation_id,  # Use the real GateNLP annotation ID
                            "mention": actual_text
                        })
                        clusters_by_type[mapped_ent_type][normalized_key]["nelements"] = len(clusters_by_type[mapped_ent_type][normalized_key]["mentions"])
    
    # Add clusters to document features in the correct format (only if clustering is enabled)
    if ENABLE_CLUSTERING and clusters_by_type:
        all_clusters = []
        for ent_type, type_clusters in clusters_by_type.items():
            all_clusters.extend(list(type_clusters.values()))
        
        if all_clusters:
            base_doc.features["clusters"] = {
                "entities_": all_clusters
            }
    
    gdocs.append(base_doc)

In [40]:
for doc in gdocs:
  if not os.path.exists("./output"):
    os.mkdir("./output")
  file_path = os.path.join("./output", doc.name + '.json')
  with open(file_path, 'w') as fileWriter:
    json.dump(doc.to_dict(), fileWriter)