In [2]:
import re
import json

def extract_entities(text, regex):
    """Remove inline entity markup from *text* and record where entities land.

    Every match of *regex* is replaced by the content of its second capture
    group (the entity surface text); the first capture group is upper-cased
    and used as the entity label.

    Args:
        text: The annotated input string.
        regex: A regular expression with at least two capture groups:
            group 1 is the entity type, group 2 is the entity text.

    Returns:
        A ``(clean_text, entities)`` tuple. ``clean_text`` is *text* with the
        markup removed; ``entities`` is a list of ``(start, end, label)``
        tuples whose offsets index into ``clean_text`` (``end`` exclusive).
    """
    pattern = re.compile(regex)
    entities = []
    # Collect fragments and join once at the end instead of growing a string
    # with ``+=`` on every match.
    pieces = []
    clean_length = 0  # length of the clean text assembled so far
    last_end = 0      # position in *text* just past the previous match
    for match in pattern.finditer(text):
        entity_text = match.group(2)
        entity_type = match.group(1).upper()
        # Untagged text between the previous match and this one is kept as-is.
        gap = text[last_end:match.start()]
        start = clean_length + len(gap)
        end = start + len(entity_text)
        pieces.append(gap)
        pieces.append(entity_text)
        entities.append((start, end, entity_type))
        clean_length = end
        last_end = match.end()
    # Trailing text after the final match (or the whole input if no match).
    pieces.append(text[last_end:])
    return "".join(pieces), entities

def convert_to_custom_format(lines, regex):
    """Turn annotated lines into ``{"text", "entities"}`` records.

    Each line is stripped of surrounding whitespace and passed through
    ``extract_entities`` with *regex*. Lines whose cleaned text is empty are
    dropped.

    Args:
        lines: An iterable of annotated strings.
        regex: The entity pattern forwarded to ``extract_entities``.

    Returns:
        A list of dicts with the keys ``"text"`` (cleaned text) and
        ``"entities"`` (the span list returned by ``extract_entities``).
    """
    records = []
    for raw_line in lines:
        plain_text, spans = extract_entities(raw_line.strip(), regex)
        if not plain_text:
            # Nothing left after stripping and markup removal; skip the line.
            continue
        records.append({"text": plain_text, "entities": spans})
    return records

def process_file(file_path, regex):
    """Read a UTF-8 text file and convert its lines to entity records.

    Args:
        file_path: Path of the annotated text file to read.
        regex: The entity pattern forwarded to ``convert_to_custom_format``.

    Returns:
        The list of records produced by ``convert_to_custom_format``.
    """
    with open(file_path, encoding='utf-8') as handle:
        # Materialise all lines before the file is closed.
        raw_lines = list(handle)
    return convert_to_custom_format(raw_lines, regex)

def save_json(data, output_file):
    """Write *data* to *output_file* as indented UTF-8 JSON.

    Missing parent directories of *output_file* are created first, so the
    write does not fail with ``FileNotFoundError`` when the target folder
    has not been created yet.

    Args:
        data: Any JSON-serialisable object.
        output_file: Destination path of the JSON file.
    """
    import os

    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        # A bare file name has no directory component; nothing to create.
        os.makedirs(parent_dir, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as file:
        # ensure_ascii=False keeps non-ASCII characters readable in the file
        # instead of emitting \uXXXX escapes.
        json.dump(data, file, ensure_ascii=False, indent=4)
    print(f"Data saved to {output_file}")

def process_and_save(train_file1, regex1, train_file2, regex2, output_file):
    """Convert two annotated files, merge the results, and save them as JSON.

    Args:
        train_file1: Path of the first annotated file.
        regex1: Entity pattern used for the first file.
        train_file2: Path of the second annotated file.
        regex2: Entity pattern used for the second file.
        output_file: Path of the JSON file to write.

    Returns:
        The combined list of records: those from the first file followed by
        those from the second.
    """
    combined = [
        *process_file(train_file1, regex1),
        *process_file(train_file2, regex2),
    ]
    save_json(combined, output_file)
    return combined

# Input files. Each split has two sources, one per markup pattern below.
data_train1 = 'dataset/data_train.txt'
data_train2 = 'dataset/training_data_enamex.txt'
data_test1 = 'dataset/data_test.txt'
data_test2 = 'dataset/testing_data_enamex.txt'
# regex1: generic `<tag>text</tag>` markup; the \1 backreference requires the
# closing tag to repeat the opening tag name. Group 1 = tag, group 2 = text.
regex1 = r'<(\w+)>([^<]+)</\1>'
# regex2: `<ENAMEX TYPE="...">text</ENAMEX>` markup; the non-greedy `.*?`
# stops at the first closing tag. Group 1 = TYPE value, group 2 = text.
regex2 = r'<ENAMEX TYPE="(\w+)">(.*?)</ENAMEX>'
# Convert both sources of each split and write one merged JSON file per split.
train_data = process_and_save(data_train1, regex1, data_train2, regex2, 'train_data_json/data_train_full.json')
test_data = process_and_save(data_test1, regex1, data_test2, regex2, 'test_data_json/data_test_full.json')

Data saved to train_data_json/data_train_full.json
Data saved to test_data_json/data_test_full.json
