In [None]:
%env SAVE_PATH=../data/uzbek_ner.json

In [1]:
from datasets import load_dataset
from copy import deepcopy
from tqdm import tqdm

In [2]:
dataset = load_dataset("risqaliyevds/uzbek_ner", split="train")

In [3]:
ners_tags = [
    "LOC",
    "ORG",
    "PERSON",
    "DATE",
    "MONEY",
    "PERCENT",
    "QUANTITY",
    "TIME",
    "PRODUCT",
    "EVENT",
    "WORK_OF_ART",
    "LANGUAGE",
    "CARDINAL",
    "ORDINAL",
    "NORP",
    "FACILITY",
    "LAW",
    "GPE"
]

In [4]:
def get_loc_ner(text, ner):
    """
    Find the position of ner in the text.
    Returns the start and end positions of the ner.
    """
    start = text.find(ner)
    end = start + len(ner) if start != -1 else -1
    return start, end


def extract_named_entities(data, ners_tags):
    """
    Extract named entities from the data and resolve any overlaps.
    Returns a list of tuples (start, end, tag) representing the named entities.
    """
    text = data["text"]
    new_ner = {}

    for tag, values in data["ner"].items():
        if tag in ners_tags and values:
            for value in values:
                start, end = get_loc_ner(text, value)
                if start != -1:  # Ensure valid positions
                    new_ner[(start, end)] = tag

    new_ner_list = resolve_overlaps(new_ner)
    return new_ner_list


def resolve_overlaps(new_ner):
    """
    Resolve overlapping named entities, keeping the longest one.
    Returns a list of tuples (start, end, tag) representing the named entities.
    """
    new_ner_list = []

    for key in new_ner.keys():
        start, end = key
        tag = new_ner[key]
        if len(new_ner_list) == 0:
            new_ner_list.append((start, end, tag))
        else:
            is_intersection = False
            for i in range(len(new_ner_list)):
                s, e, t = new_ner_list[i]
                if (start >= s and start <= e) or (end >= s and end <= e):
                    is_intersection = True
                    if end - start > e - s:
                        new_ner_list[i] = (start, end, tag)
            if not is_intersection:
                new_ner_list.append((start, end, tag))

    return new_ner_list


def get_labeled_list(chunk):
    chunk_ = deepcopy(chunk)

    text = chunk_["text"]
    words = text.split(" ")

    entities = extract_named_entities(chunk_, ners_tags)
    labels = ["O"] * len(words)

    for start, end, tag in entities:
        entity_text = text[start:end]
        entity_words = entity_text.split(" ")

        for i, word in enumerate(words):
            word_start = text.find(word, 0 if i == 0 else text.find(words[i - 1]) + len(words[i - 1]) + 1)
            word_end = word_start + len(word)

            if word_start >= start and word_end <= end:
                if i == 0 or labels[i - 1] == "O":
                    labels[i] = f"B-{tag}"
                else:
                    labels[i] = f"I-{tag}"

    return labels


def make_dataset(dataset):
    labeled_dataset = []

    for chunk in tqdm(dataset):
        labels = get_labeled_list(chunk)
        labeled_dataset.append({
            "text": chunk["text"],
            "labels": labels
        })

    return labeled_dataset

In [5]:
labeled_dataset = make_dataset(dataset)

100%|███████████████████████████████████████████████████████████████████████████| 19609/19609 [00:04<00:00, 4138.86it/s]


In [6]:
print(labeled_dataset[0])

{'text': "Shvetsiya hukumati Stokholmdagi asosiy piyodalar ko‘chasi Drottninggatanda odamlar ustiga yuk mashinasini haydab borgani gumon qilinayotgan shaxsni qo‘lga oldi. Bu haqda Expressen TV efirida Shvetsiya bosh vaziri Stefan Lyoven ma'lum qildi. Ushbu shaxs surati ijtimoiy tarmoqda tarqalmoqda. U Spendrups kompaniyasiga tegishli yuk mashinasini o‘g‘irlagani aytilmoqda.   Oxirgi ma'lumotlarga ko‘ra, ushbu hujum oqibatida halok bo‘lganlar soni 5 nafarga yetdi, jarohatlanganlar soni aniq emas.", 'labels': ['B-GPE', 'O', 'B-LOC', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON', 'I-PERSON', 'I-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-GPE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}


In [11]:
save_path = os.environ["SAVE_PATH"]

import json

with open(save_path, "w") as f:
    json.dump(labeled_dataset, f, ensure_ascii=False, indent=4)