In [78]:
import json
import random
import warnings
from pathlib import Path

import ru_core_news_lg
from datasets import load_dataset
from spacy.training import Example
from tqdm import tqdm
warnings.filterwarnings("ignore")

In [79]:
def read_data(path: str) -> list[dict]:
    """Read a JSON Lines file into a list of parsed records.

    Parameters:
        - path (str): Path to a file holding one JSON document per line.
    Returns:
        - data_array (list): The parsed JSON values, in file order.
    Raises:
        - json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data_array = []
    # Explicit UTF-8 so the result does not depend on the platform's default
    # encoding.
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank lines (e.g. a trailing empty line) carry no record.
            if not line.strip():
                continue
            data_array.append(json.loads(line))
    return data_array

In [80]:
# Read both competition splits (train first, then test) and keep them
# reachable both through the dict and through the plain names.
dataset_codalab = {
    'train': read_data('train.jsonl'),
    'test': read_data('test.jsonl'),
}
train, test = dataset_codalab['train'], dataset_codalab['test']

In [81]:
# Load the "iluvvatar/RuNNE" dataset via `datasets.load_dataset`; later cells
# read its 'train' and 'test' splits.
dataset_hf = load_dataset("iluvvatar/RuNNE")

## Create the training dataset

In [83]:
# Start from a shallow copy of the competition training split so that later
# additions do not modify dataset_codalab['train'] itself.
train_data = list(dataset_codalab['train'])

In [84]:
def compare_and_take(source_dataset, target_dataset=dataset_codalab['test'], threshold: float = 0.5) -> list[dict]:
    """
    Collects the samples of ``source_dataset`` that do not duplicate a test sample.

    A source sample counts as a duplicate of a target sample when both texts
    have the same length and the share of positions holding the same character
    is greater than ``threshold``. Two empty texts are treated as identical.

    Parameters:
        - source_dataset (iterable): Samples with a 'text' string and an
          'entities' list of "start end label" strings.
        - target_dataset (iterable): Test samples whose text is stored under the
          'senences' key (spelled exactly like that in this notebook).
        - threshold (float): Minimal share of matching characters (exclusive)
          for two equally long texts to count as duplicates. Defaults to 0.5.
    Returns:
        - output (list): One dictionary per non-duplicate source sample with
          'sentences' (the text) and 'ners' (``[start, end - 1, label]`` lists).
    """
    # Materialise the target texts once: this also works when target_dataset
    # is a one-shot iterator, and avoids re-reading the key per source sample.
    target_sentences = [target['senences'] for target in target_dataset]
    output = []

    for source in source_dataset:
        # Bound before the inner loop so it exists even with no targets at all.
        source_sentence = source['text']

        is_duplicate = False
        for target_sentence in target_sentences:
            if len(source_sentence) != len(target_sentence):
                continue
            if not source_sentence:
                # Both texts are empty: identical, and the ratio below would
                # divide by zero.
                is_duplicate = True
                break
            same_chars = sum(
                source_char == target_char
                for source_char, target_char in zip(source_sentence, target_sentence)
            )
            if same_chars / len(source_sentence) > threshold:
                is_duplicate = True
                break
        if is_duplicate:
            continue

        ners = []
        for ner in source['entities']:
            start, end, cls = ner.split()
            # The end offset is stored decremented by one; prepare_data adds it
            # back before the spans are handed to spaCy.
            ners.append([int(start), int(end) - 1, cls])
        output.append({'sentences': source_sentence, 'ners': ners})
    return output

In [85]:
# Append the samples that do not duplicate a test sentence:
# first from the 'test' split, then from the 'train' split.
train_data.extend(compare_and_take(dataset_hf['test']))
train_data.extend(compare_and_take(dataset_hf['train']))

In [86]:
# Every entity label (last element of each 'ners' item) seen in the
# competition training split.
unique_classes = {
    ner[-1]
    for sample in dataset_codalab['train']
    for ner in sample['ners']
}

In [94]:
def prepare_data(data):
    """Turn raw samples into ``(text, annotations)`` pairs for NER training.

    Parameters:
        - data (list): Dictionaries with a 'sentences' text and a 'ners' list
          whose items hold start, end and label at positions 0, 1 and 2.
    Returns:
        - prepared_data (list): Tuples ``(text, {"entities": [...]})`` where each
          entity is ``(start, end + 1, label)``.
    """
    return [
        (
            sample['sentences'],
            # The stored end offset is shifted by +1 for every entity.
            {"entities": [(ner[0], ner[1] + 1, ner[2]) for ner in sample['ners']]},
        )
        for sample in data
    ]

In [95]:
def remove_nested_entities(prepared_data):
    """Drop overlapping entities from prepared data, in place.

    Parameters:
        - prepared_data (list): Tuples of a text and a dictionary whose
          'entities' value is a list of ``(start, end, label)`` items.
    Returns:
        - None: The 'entities' list of every dictionary is replaced in place.
    Processing Logic:
        - Entities are visited in their stored order.
        - An entity is kept only when none of the positions in
          ``range(start, end)`` is already covered by an entity kept earlier
          for the same text (the first one wins).
    """
    for _, annotations in prepared_data:
        occupied = set()
        kept = []
        for entity in annotations['entities']:
            covered = set(range(entity[0], entity[1]))
            if occupied.isdisjoint(covered):
                kept.append(entity)
                occupied |= covered
        annotations['entities'] = kept

In [96]:
# Convert the collected samples into (text, {"entities": [...]}) pairs.
prepared_data = prepare_data(train_data)
# Filters overlapping entities in place; the function returns nothing.
remove_nested_entities(prepared_data)

## Train model

In [92]:
nlp = ru_core_news_lg.load()

# Register every label that occurs in the training data on the NER component
# so it can predict the dataset's entity types.
ner = nlp.get_pipe("ner")
for ner_label in sorted({entity[-1] for _, annotations in prepared_data for entity in annotations['entities']}):
    ner.add_label(ner_label)

# resume_training() is spaCy's entry point for continuing to train an already
# trained pipeline; nlp.initialize() is meant for training from scratch and
# would re-initialise the weights that were just loaded.
optimizer = nlp.resume_training()

In [108]:
N_EPOCHS = 40
SHUFFLE_SEED = 42

# Shuffle a copy so prepared_data keeps its original order; the dedicated,
# seeded RNG makes the presentation order repeatable between runs.
shuffle_rng = random.Random(SHUFFLE_SEED)
training_examples = list(prepared_data)

for epoch in range(N_EPOCHS):
    # A new order every epoch, so updates do not always see the samples in the
    # same sequence.
    shuffle_rng.shuffle(training_examples)
    for raw_text, entity_offsets in tqdm(training_examples, desc=f'Epoch {epoch}', leave=False):
        doc = nlp.make_doc(raw_text)
        example = Example.from_dict(doc, entity_offsets)
        nlp.update([example], sgd=optimizer)

                                                             

## Testing

In [116]:
# Predict entities for every test sample and keep only labels known from the
# training split.
answer = []
for entry in tqdm(dataset_codalab['test'], desc='Evaluating'):
    doc = nlp(entry['senences'])
    known_ents = (ent for ent in doc.ents if ent.label_ in unique_classes)
    # The end offset is written as end_char - 1.
    ners = [[ent.start_char, ent.end_char - 1, ent.label_] for ent in known_ents]
    answer.append({'ners': ners, 'id': entry['id']})

Evaluating: 100%|██████████| 65/65 [00:06<00:00,  9.77it/s]


In [117]:
# Write one JSON document per line. The target directory is created first so
# the cell also works when output_data/ does not exist yet.
output_path = Path('output_data/test.jsonl')
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open('w', encoding='utf-8') as outfile:
    for entry in answer:
        outfile.write(json.dumps(entry) + '\n')