### Convert annotations to the format used by GLiNER for finetuning.

Define the imports

In [1]:
import json
import re

Define paths to the annotation files

In [2]:
# Input annotation files (JSON): one per training quality tier plus the dev split.
# The paths are relative, so they resolve against the notebook's working directory.
PATH_PLATINUM_TRAIN = "../Annotations/Train/platinum_quality/json_format/train_platinum.json"
PATH_GOLD_TRAIN = "../Annotations/Train/gold_quality/json_format/train_gold.json"
PATH_SILVER_TRAIN = "../Annotations/Train/silver_quality/json_format/train_silver.json"
PATH_BRONZE_TRAIN = "../Annotations/Train/bronze_quality/json_format/train_bronze.json"
PATH_DEV = "../Annotations/Dev/json_format/dev.json"

Define output paths

In [3]:
# Destination files for the converted data: one output per input split.
# The paths are relative, so they resolve against the notebook's working directory.
PATH_OUTPUT_PLATINUM_TRAIN = "../Train/NER/data/train_platinum.json"
PATH_OUTPUT_GOLD_TRAIN = "../Train/NER/data/train_gold.json"
PATH_OUTPUT_SILVER_TRAIN = "../Train/NER/data/train_silver.json"
PATH_OUTPUT_BRONZE_TRAIN = "../Train/NER/data/train_bronze.json"
PATH_OUTPUT_DEV = "../Train/NER/data/dev.json"

Load the input files into dictionary variables

In [4]:
def _read_json(path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return its content."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


# One parsed object per split, read in the same order as the path constants.
train_platinum = _read_json(PATH_PLATINUM_TRAIN)
train_gold = _read_json(PATH_GOLD_TRAIN)
train_silver = _read_json(PATH_SILVER_TRAIN)
train_bronze = _read_json(PATH_BRONZE_TRAIN)
dev = _read_json(PATH_DEV)

Define the functions that tokenize the text and convert the annotations to the GLiNER finetuning format

In [5]:
# A token is either a run of word characters, optionally chained with "-" or
# "_" (so "gut-brain" and "IL_6" stay whole), or any single non-whitespace
# character (punctuation, apostrophes, symbols). Whitespace never matches and
# is therefore skipped implicitly.
# NOTE(review): this is meant to line up with the word splitter GLiNER applies
# to raw text - confirm against the installed GLiNER version.
_TOKEN_PATTERN = re.compile(r"\w+(?:[-_]\w+)*|\S")


def tokenize_text_with_positions(text):
    """Split ``text`` into tokens and report each token's character span.

    Hyphenated and underscored words are kept as a single token, while every
    other punctuation character becomes its own token. Contractions are
    consequently split around the apostrophe ("don't" -> "don", "'", "t").

    Args:
        text: The string to tokenize. ``None`` and the empty string are
            accepted and yield no tokens.

    Returns:
        A ``(tokens, token_spans)`` tuple of two equally long lists:
        ``tokens`` holds the token strings and ``token_spans`` holds the
        matching ``(start_char, end_char)`` pairs, with ``end_char`` exclusive,
        so ``text[start_char:end_char] == token``.
    """
    if not text:
        # Missing fields (None / "") simply contribute no tokens.
        return [], []

    tokens = []
    token_spans = []  # (start_char_index, end_char_index) for each token
    for match in _TOKEN_PATTERN.finditer(text):
        tokens.append(match.group())
        token_spans.append(match.span())
    return tokens, token_spans

def _find_token_span(token_spans, start_char, end_char):
    """Return (first, last) indices of the tokens overlapping [start_char, end_char), or None."""
    first = last = None
    for index, (token_start, token_end) in enumerate(token_spans):
        if token_end <= start_char:
            continue  # Token ends before the entity starts
        if token_start >= end_char:
            break  # Spans are in text order, so no later token can overlap
        if first is None:
            first = index
        last = index  # Keep extending to the last overlapping token
    return None if first is None else (first, last)


def process_annotations(data):
    """Convert annotated documents into token-level NER examples.

    For every document the "title" and "abstract" texts are tokenized and
    concatenated (title first). Each entity is mapped from its character
    offsets inside its own field to token indices in the concatenated
    token list.

    Args:
        data: Mapping of document id to a document dict. Each document is
            read through these keys:
            ``"metadata"`` - dict that may contain ``"title"`` and
            ``"abstract"`` strings (missing or ``None`` counts as empty);
            ``"entities"`` - optional list of dicts with ``"location"``
            (the field the mention belongs to), ``"start_idx"``,
            ``"end_idx"`` (treated as inclusive) and ``"label"``.

    Returns:
        A list with one dict per document, in input order, containing
        ``"tokenized_text"`` (list of token strings) and ``"ner"`` (list of
        ``[start_token, end_token, label]`` entries, with ``end_token``
        inclusive and ``label`` lower-cased, sorted by ``start_token``).
        Entities that overlap no token are skipped with a printed warning;
        entities whose location is neither field are ignored.
    """
    output_data = []

    for doc_id, doc_data in data.items():
        metadata = doc_data["metadata"]
        # `or []` also covers an explicit JSON null, not only a missing key.
        entities = doc_data.get("entities") or []

        overall_tokenized_text = []
        overall_ner = []

        for field in ("title", "abstract"):
            # `.get(field, "")` would still hand None to the tokenizer when the
            # key exists with a null value, hence the `or ""`.
            text = metadata.get(field) or ""
            tokens, token_spans = tokenize_text_with_positions(text)

            # Tokens of this field start right after everything added so far.
            token_offset = len(overall_tokenized_text)

            for entity in entities:
                if entity.get("location", "") != field:
                    continue

                entity_start_char = entity["start_idx"]
                entity_end_char = entity["end_idx"] + 1  # inclusive -> exclusive
                entity_label = entity["label"]

                token_span = _find_token_span(token_spans, entity_start_char, entity_end_char)
                if token_span is None:
                    print(f"Warning: Could not find tokens for entity in doc {doc_id}, field {field}")
                    continue

                first_token, last_token = token_span
                overall_ner.append([
                    first_token + token_offset,
                    last_token + token_offset,
                    entity_label.lower()
                ])

            overall_tokenized_text.extend(tokens)

        # Order the entity spans by their first token index.
        overall_ner.sort(key=lambda span: span[0])

        output_data.append({
            "tokenized_text": overall_tokenized_text,
            "ner": overall_ner
        })

    return output_data

In [6]:
# Run the same conversion over every split; the generator is consumed in order,
# so the splits are processed platinum -> gold -> silver -> bronze -> dev.
(
    processed_train_platinum,
    processed_train_gold,
    processed_train_silver,
    processed_train_bronze,
    processed_dev,
) = (
    process_annotations(split)
    for split in (train_platinum, train_gold, train_silver, train_bronze, dev)
)

Write the converted datasets to JSON files

In [7]:
def dump_to_json(dict, output_file_path):
    """Serialize ``dict`` as JSON into ``output_file_path``.

    The parent directory is created when it does not exist yet, so the call
    does not fail just because the output folder is missing. An existing file
    at the destination is overwritten.

    Args:
        dict: JSON-serializable object to write. The name shadows the builtin
            ``dict``; it is kept unchanged so keyword callers keep working.
        output_file_path: Path of the file to write (UTF-8 encoded).
    """
    # Local import keeps this cell self-contained without touching the import cell.
    from pathlib import Path

    destination = Path(output_file_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    with destination.open('w', encoding='utf-8') as f:
        json.dump(dict, f)

# Pair every converted split with its destination and write them in order.
for processed_split, output_path in (
    (processed_train_platinum, PATH_OUTPUT_PLATINUM_TRAIN),
    (processed_train_gold, PATH_OUTPUT_GOLD_TRAIN),
    (processed_train_silver, PATH_OUTPUT_SILVER_TRAIN),
    (processed_train_bronze, PATH_OUTPUT_BRONZE_TRAIN),
    (processed_dev, PATH_OUTPUT_DEV),
):
    dump_to_json(processed_split, output_path)