In [None]:
# Master switch for the notebook: when True the spaCy pipelines and the BERT baseline are trained;
# when False the training steps are skipped and the saved BERT baseline is loaded from disk instead.
DO_TRAIN = True

In [None]:
# Install the Python dependencies used by the notebook and download the large Russian spaCy pipeline.
!pip install spacy spacy-transformers razdel datasets seqeval transformers[torch] torch accelerate==0.21.0
!python -m spacy download ru_core_news_lg

In [None]:
# Load the training dataset.
import pandas as pd
import json


def _parse_entities(raw):
    """Decode one raw ``entities`` cell into a list of entity dicts.

    The cell holds comma-separated JSON objects with escaped commas (``\\,``)
    and doubled backslashes. Both escapes are undone, the text is wrapped in
    brackets and parsed as a JSON array.

    Args:
        raw: Cell value; anything that is not a string (e.g. NaN) is returned unchanged.

    Returns:
        The parsed JSON value for strings, otherwise ``raw`` itself.

    Raises:
        json.JSONDecodeError: If the cleaned text is not valid JSON.
    """
    if not isinstance(raw, str):
        return raw
    # Order matters: un-escape commas first, then collapse doubled backslashes.
    # '\\,' is spelled explicitly because '\,' is an invalid escape sequence.
    unescaped = raw.replace('\\,', ',').replace('\\\\', '\\')
    return json.loads('[' + unescaped + ']')


data = pd.read_csv("ner_data_train.csv")
data_clean = data.copy()
data_clean['entities'] = data_clean['entities'].apply(_parse_entities)

Формирование датасета для Spacy

In [None]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
# Number of leading rows written to the training corpus; the remaining rows form the dev corpus.
spacy_num_train = 4495

# Build spaCy-style training examples: [text, {"entities": [[start, end, label], ...]}]
# with character offsets taken from the parsed annotations.
all_data = []
for _, row in data_clean.iterrows():
    entities = row['entities']
    if not isinstance(entities, list):
        # A missing annotation cell (e.g. NaN) is treated as "no entities" instead of crashing.
        entities = []
    spans = [
        [ent['offset'], ent['offset'] + ent['length'], ent['label']]
        for ent in entities
    ]
    # The text is used verbatim: the former .replace('"', '\"') was a no-op ('\"' == '"'),
    # and changing the text would invalidate the character offsets anyway.
    all_data.append([row['video_info'], {"entities": spans}])

def convert(path, dataset):
    """Serialise annotated examples into a spaCy ``DocBin`` file.

    Args:
        path: Destination of the binary corpus.
        dataset: Iterable of ``[text, {"entities": [[start, end, label], ...]}]``
            items whose offsets are character positions in ``text``.

    Entities for which ``Doc.char_span`` returns ``None`` are left out.
    """
    blank_ru = spacy.blank("ru")
    corpus = DocBin()
    for text, annot in tqdm(dataset):
        doc = blank_ru.make_doc(text)
        candidates = (
            doc.char_span(start, end, label=label, alignment_mode="contract")
            for start, end, label in annot["entities"]
        )
        doc.ents = [span for span in candidates if span is not None]
        corpus.add(doc)
    corpus.to_disk(path)

# Write the first `spacy_num_train` examples as the training corpus and the rest as the dev corpus.
for corpus_path, corpus_examples in (
    ("train.spacy", all_data[:spacy_num_train]),
    ("dev.spacy", all_data[spacy_num_train:]),
):
    convert(corpus_path, corpus_examples)

In [None]:
# Disabled because of a possible problem with the tokenizer
#!python -m spacy train "spacy_deberta-stage-1.cfg" --output "spacy_deberta" --paths.train "train.spacy" --paths.dev "dev.spacy" --training.eval_frequency 1 --training.max_steps 1 --gpu-id 0
#!python -m spacy train "spacy_deberta-stage-2.cfg" --output "spacy_deberta" --paths.train "train.spacy" --paths.dev "dev.spacy" --training.eval_frequency 1 --training.max_steps 1 --gpu-id 0

# Train the spaCy pipelines only when DO_TRAIN is set. Each model is trained with two config files
# in sequence (stage 1, then stage 2), both writing to the same output directory.
if DO_TRAIN:
  !python -m spacy train "spacy_multilingual-uncased-stage-1.cfg" --output "spacy_multilingual" --paths.train "train.spacy" --paths.dev "dev.spacy" --training.eval_frequency 20 --training.max_steps 300 --gpu-id 0
  !python -m spacy train "spacy_multilingual-uncased-stage-2.cfg" --output "spacy_multilingual" --paths.train "train.spacy" --paths.dev "dev.spacy" --training.eval_frequency 20 --training.max_steps 300 --gpu-id 0

  !python -m spacy train "spacy_rubert-tiny2-stage-1.cfg" --output "spacy_rubert" --paths.train "train.spacy" --paths.dev "dev.spacy" --training.eval_frequency 20 --training.max_steps 300 --gpu-id 0
  !python -m spacy train "spacy_rubert-tiny2-stage-2.cfg" --output "spacy_rubert" --paths.train "train.spacy" --paths.dev "dev.spacy" --training.eval_frequency 20 --training.max_steps 300 --gpu-id 0

In [None]:
from spacy.symbols import ORTH
#spacy_deberta = spacy.load(R"spacy_deberta/model-last")
#spacy_deberta.tokenizer.add_special_case(":>", [{ORTH: ":"}, {ORTH: ">"}])

def _load_spacy_pipeline(model_dir):
    """Load a trained spaCy pipeline and make its tokenizer split ":>" into ":" and ">"."""
    nlp = spacy.load(model_dir)
    nlp.tokenizer.add_special_case(":>", [{ORTH: ":"}, {ORTH: ">"}])
    return nlp

spacy_multilingual = _load_spacy_pipeline(R"spacy_multilingual/model-last")
spacy_rubert = _load_spacy_pipeline(R"spacy_rubert/model-last")

# Generic helper that extracts BIO tags from a document processed by a spaCy model.
def get_bio_tags_spacy(doc):
    """Return one BIO tag per token of a spaCy ``Doc``.

    Args:
        doc: Iterable of tokens exposing ``ent_iob_`` and ``ent_type_``.

    Returns:
        list of str: ``'O'`` for tokens outside any entity, otherwise
        ``'<IOB>-<entity type>'`` (for example ``'B-персона'``).
    """
    bio = []
    for token in doc:
        iob = token.ent_iob_
        if not iob or iob == 'O':
            # An empty code means no entity tag was set for the token; report it as
            # outside an entity rather than emitting the malformed tag '-'.
            bio.append('O')
        else:
            bio.append(iob + '-' + token.ent_type_)
    return bio

# Модель из Baseline

Подготовка датасета

In [None]:
from razdel import tokenize
def extract_labels(item):
    """Convert one dataset row into razdel word tokens with word-level BIO tags.

    Args:
        item: Mapping with ``'video_info'`` (the text) and ``'entities'`` (a list of
            dicts with ``offset``, ``length`` and ``label``, a single such dict, or a
            missing value).

    Returns:
        dict: ``{'tokens': [word, ...], 'tags': [tag, ...]}`` with one tag per word.
    """
    text = item['video_info']
    raw_toks = list(tokenize(text))
    words = [tok.text for tok in raw_toks]
    # Every word starts with the 'O' tag, meaning it is not part of an entity.
    word_labels = ['O'] * len(raw_toks)

    # An entity may span several words, so map every character position to the
    # index of the word covering it (None for characters between words).
    char2word = [None] * len(text)
    for i, word in enumerate(raw_toks):
        char2word[word.start:word.stop] = [i] * (word.stop - word.start)

    labels = item['entities']
    if isinstance(labels, dict):
        labels = [labels]
    elif not isinstance(labels, (list, tuple)):
        # Missing annotations arrive from pandas as NaN rather than None.
        labels = []

    for e in labels:
        if e['label'] == 'не найдено':
            continue
        start = e['offset']
        e_words = sorted({idx for idx in char2word[start:start + e['length']] if idx is not None})
        if not e_words:
            continue
        word_labels[e_words[0]] = 'B-' + e['label']
        for idx in e_words[1:]:
            word_labels[idx] = 'I-' + e['label']
    return {'tokens': words, 'tags': word_labels}

from sklearn.model_selection import train_test_split
# Word-level examples ({'tokens': [...], 'tags': [...]}) for every row of the cleaned data.
ner_examples = [extract_labels(item) for i, item in data_clean.iterrows()]
ner_train, ner_test = train_test_split(ner_examples, test_size=0.2, random_state=1)

# Build the tag inventory from ALL examples, not only the train split: a tag that occurs
# only in the test split would otherwise be missing from `label_list` and make the later
# `label_list.index(...)` lookup fail. 'O' is moved to the front so that it gets id 0.
label_list = sorted({label for item in ner_examples for label in item['tags']})
if 'O' in label_list:
    label_list.remove('O')
    label_list = ['O'] + label_list

from datasets import Dataset, DatasetDict
ner_data = DatasetDict({
    'train': Dataset.from_pandas(pd.DataFrame(ner_train)),
    'test': Dataset.from_pandas(pd.DataFrame(ner_test))
})
ner_data

Обучение модели

In [None]:
from transformers import AutoTokenizer
from datasets import load_dataset, load_metric

# Hugging Face checkpoint name; the tokenizer below is loaded from it.
model_checkpoint = "cointegrated/rubert-tiny"

# NOTE(review): tokenizers do not run on a device, so `device='gpu'` looks like a no-op keyword — confirm and drop.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, device='gpu')
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize pre-split words and project word-level tags onto sub-tokens.

    Args:
        examples: Batch with ``"tokens"`` (lists of words) and ``'tags'`` (lists of
            word-level tags, one list per example).
        label_all_tokens: When True every sub-token of a word carries the word's
            label; when False only the first sub-token does and the rest get -100.

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding label ids;
        positions without a word (special tokens) are set to -100.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned_batch = []
    for sample_idx, word_tags in enumerate(examples['tags']):
        aligned = []
        last_word = None
        for word_idx in encoded.word_ids(batch_index=sample_idx):
            starts_word = word_idx != last_word
            if word_idx is not None and (starts_word or label_all_tokens):
                aligned.append(word_tags[word_idx])
            else:
                aligned.append(-100)
            last_word = word_idx

        # String tags become their index in `label_list`; -100 markers pass through.
        aligned_batch.append(
            [label_list.index(tag) if isinstance(tag, str) else tag for tag in aligned]
        )

    encoded["labels"] = aligned_batch
    return encoded
# Apply tokenization and label alignment to every split; `batched=True` hands the function batches of examples.
tokenized_datasets = ner_data.map(tokenize_and_align_labels, batched=True)

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
# Token-classification model sized to the tag inventory; the id <-> tag mappings are
# stored in the model config.
base_model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
)
base_model.config.id2label = {idx: tag for idx, tag in enumerate(label_list)}
base_model.config.label2id = {tag: idx for idx, tag in enumerate(label_list)}

In [None]:
from transformers import DataCollatorForTokenClassification
# Collator used by the Trainer to assemble token-classification batches.
data_collator = DataCollatorForTokenClassification(tokenizer)
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in favour of the
# separate `evaluate` package — confirm the installed version still provides it.
metric = load_metric("seqeval")

In [None]:
import numpy as np

def compute_metrics(p):
    """Compute overall seqeval scores for a batch of predictions.

    Args:
        p: Pair ``(predictions, labels)``; ``predictions`` holds per-token class
            scores (argmax is taken over axis 2) and ``labels`` holds label ids,
            with -100 marking positions to ignore.

    Returns:
        dict with ``precision``, ``recall``, ``f1`` and ``accuracy``.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    decoded_preds = []
    decoded_gold = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only positions with a real label (drop the -100 ignore index).
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        decoded_preds.append([label_list[pred] for pred, _ in kept])
        decoded_gold.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=decoded_preds, references=decoded_gold, zero_division=0)
    summary = {}
    for short_name in ("precision", "recall", "f1", "accuracy"):
        summary[short_name] = results["overall_" + short_name]
    return summary

In [None]:
# Per-device batch size shared by training and evaluation.
batch_size = 32
# Training configuration: outputs go to the "ner" directory, evaluation runs every epoch,
# no checkpoints are saved during training and no external reporting is enabled.
train_args = TrainingArguments(
    "ner",
    evaluation_strategy = "epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=25,
    weight_decay=0.01,
    save_strategy='no',
    report_to='none',
)

# The Trainer evaluates on the held-out "test" split using `compute_metrics`.
trainer = Trainer(
    base_model,
    train_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
# Make sure every parameter of the model is trainable (nothing is frozen).
for param in base_model.parameters():
    param.requires_grad = True
# Training itself only runs when the DO_TRAIN switch is on.
if DO_TRAIN:
  trainer.train()

In [None]:
# After training: evaluate and save model + tokenizer to 'base_model_bert'.
# Without training: restore the label list, model and tokenizer from that directory.
if DO_TRAIN:
  trainer.evaluate()
  base_model.save_pretrained('base_model_bert')
  tokenizer.save_pretrained('base_model_bert')
else:
  # Hard-coded tag inventory; presumably a snapshot of the `label_list` computed when the
  # saved model was trained — TODO confirm the order matches the saved model's label ids.
  label_list = \
  ['O',
 'B-Дата',
 'B-бренд',
 'B-вид спорта',
 'B-видеоигра',
 'B-команда',
 'B-лига',
 'B-локация',
 'B-модель',
 'B-название проекта',
 'B-организация',
 'B-персона',
 'B-сезон',
 'B-серия',
 'I-Дата',
 'I-бренд',
 'I-вид спорта',
 'I-видеоигра',
 'I-команда',
 'I-лига',
 'I-локация',
 'I-модель',
 'I-название проекта',
 'I-организация',
 'I-персона',
 'I-сезон',
 'I-серия']
  base_model = AutoModelForTokenClassification.from_pretrained('base_model_bert', num_labels=len(label_list))
  # NOTE(review): `device='gpu'` looks like a no-op keyword for a tokenizer — confirm and drop.
  tokenizer = AutoTokenizer.from_pretrained('base_model_bert', device='gpu')

In [None]:
import torch
import re
def get_bio_tags_base(text):
    """Predict one BIO tag per word with the BERT baseline model.

    Args:
        text: Either a raw string, which is split into words with razdel's
            ``tokenize``, or an already split sequence of word strings.

    Returns:
        list of str: Exactly one tag per input word. A word takes the tag predicted
        for its first sub-token; words that received no sub-token (for example
        because the input was truncated) are tagged ``'O'``. Empty input gives ``[]``.
    """
    if isinstance(text, str):
        # Iterating a raw string would yield single characters, and every character
        # would then be treated as a separate "word" by the tokenizer below.
        text = [tok.text for tok in tokenize(text)]

    # Soft hyphens and zero-width spaces are replaced by '[UNK]'.
    dismissed_token = re.compile(r'\xad+|\u200b+')
    words = [dismissed_token.sub('[UNK]', tok) for tok in text]
    if not words:
        return []

    tokens = tokenizer(words, truncation=True, is_split_into_words=True, return_tensors='pt')
    word_ids = tokens.word_ids()
    tokens = {k: v.to(base_model.device) for k, v in tokens.items()}

    with torch.no_grad():
        pred = base_model(**tokens)

    tag_ids = pred.logits.argmax(dim=-1)[0].tolist()

    # Index by word id instead of appending sequentially, so a word without
    # sub-tokens cannot shift the tags of the words that follow it.
    labels = ['O'] * len(words)
    seen = set()
    for word_idx, tag_id in zip(word_ids, tag_ids):
        if word_idx is None or word_idx in seen:
            continue
        seen.add(word_idx)
        labels[word_idx] = label_list[tag_id]
    return labels

In [None]:
# Needed because tokens produced by different tokenizers cannot always be matched up.
def validate_and_fix_bio(text, bio_tags):
  """Force a tag sequence to have exactly one tag per razdel token of ``text``.

  Args:
    text: Text whose razdel token count defines the expected length.
    bio_tags: Predicted tags; the list is not modified.

  Returns:
    list: A new list, truncated to the token count or padded at the end with 'O'.
  """
  expected_len = len(list(tokenize(text)))
  # Work on a copy so the caller's list is never mutated.
  fixed = list(bio_tags[:expected_len])
  fixed.extend(['O'] * (expected_len - len(fixed)))
  return fixed

# Quick manual check of the padding / truncation behaviour on a few tiny inputs.
for sample_text, sample_tags in (
    ('Ереван', []),
    ('', ['O']),
    ('Ереван', ['O']),
    ('', ['']),
):
    print(validate_and_fix_bio(sample_text, sample_tags))

Подготовка данных для отправки

In [None]:
def _build_submission(predict_tags, test_path="ner_test.csv"):
    """Run ``predict_tags`` on every ``video_info`` text of the test CSV and collect the results.

    Args:
        predict_tags: Callable mapping a text to its list of BIO tags.
        test_path: CSV file with a ``video_info`` column.

    Returns:
        pandas.DataFrame with the columns ``video_info`` and ``entities_prediction``
        (the tag list rendered with ``str``), indexed like the test file.
    """
    test_frame = pd.read_csv(test_path)
    # Collect plain records and build the frame once instead of growing it row by row.
    records = [
        {'video_info': text, 'entities_prediction': str(predict_tags(text))}
        for text in test_frame['video_info']
    ]
    return pd.DataFrame(records, columns=['video_info', 'entities_prediction'], index=test_frame.index)


def _predict_spacy(spacy_model, text):
    """Tag ``text`` with a spaCy pipeline and fit the result to the razdel token count."""
    return validate_and_fix_bio(text, get_bio_tags_spacy(spacy_model(text)))


def _predict_base(text):
    """Tag ``text`` with the BERT baseline and fit the result to the razdel token count."""
    # The baseline expects a list of words; a raw string would be consumed character by character.
    words = [tok.text for tok in tokenize(text)]
    return validate_and_fix_bio(text, get_bio_tags_base(words))


def submission_spacy(spacy_model, test_path="ner_test.csv"):
    """Build the submission frame from the predictions of one spaCy pipeline.

    Args:
        spacy_model: Loaded spaCy pipeline, called on each text.
        test_path: CSV file with a ``video_info`` column.

    Returns:
        pandas.DataFrame with ``video_info`` and ``entities_prediction`` columns.
    """
    return _build_submission(lambda text: _predict_spacy(spacy_model, text), test_path)


def submission_base(test_path="ner_test.csv"):
    """Build the submission frame from the predictions of the BERT baseline.

    Args:
        test_path: CSV file with a ``video_info`` column.

    Returns:
        pandas.DataFrame with ``video_info`` and ``entities_prediction`` columns.
    """
    return _build_submission(_predict_base, test_path)


def submission_hybrid(test_path="ner_test.csv"):
    """Build the submission frame by combining the baseline with both spaCy pipelines.

    For every token the first tag that is not 'O' wins, in the order
    baseline, ``spacy_multilingual``, ``spacy_rubert``.

    Args:
        test_path: CSV file with a ``video_info`` column.

    Returns:
        pandas.DataFrame with ``video_info`` and ``entities_prediction`` columns.
    """
    def predict(text):
        tags_b = _predict_base(text)
        tags_s1 = _predict_spacy(spacy_multilingual, text)
        tags_s2 = _predict_spacy(spacy_rubert, text)
        return [
            next((tag for tag in votes if tag != 'O'), 'O')
            for votes in zip(tags_b, tags_s1, tags_s2)
        ]

    return _build_submission(predict, test_path)

In [None]:
spacy_multilingual_result = submission_spacy(spacy_multilingual)
spacy_multilingual_result.to_csv('submission_spacy_multilingual.csv', index=False)

# Keep the predictions under their own name: rebinding `spacy_rubert` would replace the loaded
# spaCy pipeline with a DataFrame and break `submission_hybrid()`, which calls `spacy_rubert(text)`.
spacy_rubert_result = submission_spacy(spacy_rubert)
spacy_rubert_result.to_csv('submission_spacy_rubert.csv', index=False)

base_rubert = submission_base()
base_rubert.to_csv('submission_base_rubert.csv', index=False)

hybryd = submission_hybrid()
hybryd.to_csv('submission_hybryd.csv', index=False)