In [None]:
!pip install datasets
!pip install tokenizers
!pip install transformers

In [None]:
from datasets import load_dataset



In [None]:
# Load the "surdan/nerel_short" dataset. Later cells read its "train", "dev"
# and "test" splits and the per-sample "sequences" / "ids" columns.
ru_dataset = load_dataset("surdan/nerel_short")

In [None]:
# Display the loaded dataset object for a quick visual check.
ru_dataset

In [None]:
# Tag names for the token-classification head. Integer label ids are used as
# indices into this list elsewhere in the notebook, so the order of the
# entries must not change.
# NOTE(review): a few B-/I- pairs are listed I-first (AGE, LAW, PRODUCT) and
# 'B-LANGUAGE' has no 'I-LANGUAGE' partner — presumably this mirrors the
# dataset's own id mapping; confirm against the dataset card.
ru_label_names = [
    "O",
    "I-AGE", "B-AGE",
    "B-AWARD", "I-AWARD",
    "B-CITY", "I-CITY",
    "B-COUNTRY", "I-COUNTRY",
    "B-CRIME", "I-CRIME",
    "B-DATE", "I-DATE",
    "B-DISEASE", "I-DISEASE",
    "B-DISTRICT", "I-DISTRICT",
    "B-EVENT", "I-EVENT",
    "B-FACILITY", "I-FACILITY",
    "B-FAMILY", "I-FAMILY",
    "B-IDEOLOGY", "I-IDEOLOGY",
    "B-LANGUAGE",
    "I-LAW", "B-LAW",
    "B-LOCATION", "I-LOCATION",
    "B-MONEY", "I-MONEY",
    "B-NATIONALITY", "I-NATIONALITY",
    "B-NUMBER", "I-NUMBER",
    "B-ORDINAL", "I-ORDINAL",
    "B-ORGANIZATION", "I-ORGANIZATION",
    "B-PENALTY", "I-PENALTY",
    "B-PERCENT", "I-PERCENT",
    "B-PERSON", "I-PERSON",
    "I-PRODUCT", "B-PRODUCT",
    "B-PROFESSION", "I-PROFESSION",
    "B-RELIGION", "I-RELIGION",
    "B-STATE_OR_PROVINCE", "I-STATE_OR_PROVINCE",
    "B-TIME", "I-TIME",
    "B-WORK_OF_ART", "I-WORK_OF_ART",
]


In [None]:
from transformers import AutoTokenizer

# Tokenizer for the same "DeepPavlov/rubert-base-cased" checkpoint that is
# fine-tuned further down.
ru_tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")

In [None]:
def ru_tokenize_adjust_labels(all_samples_per_split, tokenizer=None):
    """Tokenize pre-split sentences and expand word-level labels to tokens.

    Written to be passed to ``map(..., batched=True)``.

    Args:
        all_samples_per_split: Batch mapping with a ``"sequences"`` entry (one
            list of words per sample) and an ``"ids"`` entry (one list of
            integer label ids per sample, one id per word).
        tokenizer: Optional tokenizer to use instead of the notebook-level
            ``ru_tokenizer``. Its output must provide
            ``word_ids(batch_index=...)``.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry. For each sample
        it holds one value per token: ``-100`` for tokens that do not belong
        to any word (``compute_metrics`` filters that value out) and, for all
        other tokens, the label id of the word the token was produced from.
    """
    if tokenizer is None:
        tokenizer = ru_tokenizer

    tokenized_samples = tokenizer(
        all_samples_per_split["sequences"], is_split_into_words=True
    )
    # Fetch the label column once instead of once per sample.
    all_word_labels = all_samples_per_split["ids"]

    total_adjusted_labels = []
    for k in range(len(tokenized_samples["input_ids"])):
        word_labels = all_word_labels[k]
        # Look labels up by the word id itself rather than by counting how
        # often the word id changes: a word that produces no tokens is simply
        # absent from word_ids(), and a running counter would then shift the
        # labels of every following word in the sample.
        total_adjusted_labels.append([
            -100 if wid is None else word_labels[wid]
            for wid in tokenized_samples.word_ids(batch_index=k)
        ])

    tokenized_samples["labels"] = total_adjusted_labels
    return tokenized_samples

# Run the tokenization / label alignment over the dataset in batches.
ru_tokenized_dataset = ru_dataset.map(ru_tokenize_adjust_labels, batched=True)

In [None]:
# Spot-check one processed sample from the "test" split.
ru_tokenized_dataset["test"][1]

In [None]:
from transformers import DataCollatorForTokenClassification

In [None]:
# Collator handed to the Trainer below as `data_collator`.
ru_data_collator = DataCollatorForTokenClassification(ru_tokenizer)

In [None]:
from transformers import AutoModelForTokenClassification, pipeline, TrainingArguments, Trainer
import numpy as np
from datasets import load_metric

In [None]:

# seqeval metric object; compute_metrics() calls metric.compute(...) on it.
# NOTE(review): `load_metric` is reported as deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package — confirm against the
# installed version.
metric = load_metric("seqeval")
def compute_metrics(p):
    """Turn raw model outputs into a flat dict of seqeval scores.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` is reduced with
            an argmax over axis 2 to get one label id per position;
            ``labels`` holds the reference label ids.

    Returns:
        A dict with the four ``overall_*`` entries of the metric result, plus
        one ``"<key>_f1"`` entry for every other key the metric returns.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        predicted_tags = []
        gold_tags = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            # Positions labelled -100 are excluded from scoring.
            if gold_id == -100:
                continue
            predicted_tags.append(ru_label_names[predicted_id])
            gold_tags.append(ru_label_names[gold_id])
        true_predictions.append(predicted_tags)
        true_labels.append(gold_tags)

    results = metric.compute(predictions=true_predictions, references=true_labels)

    overall_keys = (
        "overall_precision",
        "overall_recall",
        "overall_f1",
        "overall_accuracy",
    )
    flattened_results = {key: results[key] for key in overall_keys}

    # Every remaining entry is a nested result; keep only its f1 value.
    for key in results.keys():
        if key in flattened_results.keys():
            continue
        flattened_results[key + "_f1"] = results[key]["f1"]

    return flattened_results


In [None]:
# Token-classification model on top of the RuBERT checkpoint, with one output
# class per entry of ru_label_names.
# (Alternative checkpoint left by the author: "sberbank-ai/sbert_large_nlu_ru".)
ru_model = AutoModelForTokenClassification.from_pretrained(
    "DeepPavlov/rubert-base-cased",
    num_labels=len(ru_label_names),
)

ru_training_args = TrainingArguments(
    # Where the run writes its output, and the name it is tracked under.
    # NOTE(review): the run name mentions "ep_10" while num_train_epochs is 7 —
    # confirm which one is intended.
    output_dir="./fine_tune_bert_output",
    run_name="ep_10_tokenized_11",
    # Optimisation settings.
    num_train_epochs=7,
    learning_rate=2e-5,
    weight_decay=0.01,
    # One sample per device per step, for both training and evaluation.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    eval_accumulation_steps=5,
    # Step-based evaluation, logging every 250 steps, no checkpoints saved.
    evaluation_strategy="steps",
    logging_steps=250,
    save_strategy="no",
)

ru_trainer = Trainer(
    model=ru_model,
    args=ru_training_args,
    data_collator=ru_data_collator,
    tokenizer=ru_tokenizer,
    train_dataset=ru_tokenized_dataset["train"],
    eval_dataset=ru_tokenized_dataset["dev"],
    compute_metrics=compute_metrics,
)

ru_trainer.train()

In [None]:
import torch
# Ask PyTorch to release its cached CUDA memory once training is done.
torch.cuda.empty_cache()

In [None]:
# Smoke-test the freshly trained model: wrap it in a 'ner' pipeline and show
# the raw per-token predictions for one sample sentence.
# NOTE(review): device=0 presumably selects the first GPU, so this cell would
# need CUDA — confirm before running on a CPU-only machine.
text = "Новым послом Южной Кореи в России стал бывший посол в Камбодже Чан Хо Чжин, передает Yonhap."
nerpipeline = pipeline(
    'ner',
    model=ru_model,
    tokenizer=ru_tokenizer,
    device=0,
)
nerpipeline(text)

In [None]:
# Write the fine-tuned model to ./model.
# NOTE(review): only the model is saved here; the reload check further down
# keeps using the in-memory ru_tokenizer.
ru_model.save_pretrained("./model")

In [None]:
# Load the model back from ./model so the saved copy can be checked below.
test_model = AutoModelForTokenClassification.from_pretrained("./model", num_labels=len(ru_label_names))

In [None]:
def parser_predict(pred_res, label_names=None):
    """Merge word-piece predictions into whole words with readable labels.

    Args:
        pred_res: Sequence of per-token dicts, each with a ``"word"`` entry
            (the token text; a leading ``"##"`` marks a continuation piece)
            and an ``"entity"`` entry (the predicted label, either in the
            form ``"LABEL_<index>"`` or already a tag name).
        label_names: Optional list mapping label indices to tag names.
            Defaults to the notebook-level ``ru_label_names``.

    Returns:
        A list of ``{"word": ..., "label": ...}`` dicts, one per reassembled
        word. A word keeps the label predicted for its first piece.
    """
    if label_names is None:
        label_names = ru_label_names

    index_prefix = "LABEL_"
    out_res = []

    for pred in pred_res:
        word = pred["word"]
        is_continuation = word[:2] == '##'

        if is_continuation and out_res:
            # Glue the piece onto the word that is currently being built.
            out_res[-1]["word"] += word[2:]
            continue

        entity = pred["entity"]
        index_text = entity[len(index_prefix):] if entity.startswith(index_prefix) else ""
        if index_text.isdecimal():
            label = label_names[int(index_text)]
        else:
            # Entity is already a tag name, so keep it unchanged.
            label = entity

        # A continuation piece with nothing before it starts a new word
        # (without its "##" marker) instead of failing on an empty list.
        out_res.append({"word": word[2:] if is_continuation else word, "label": label})

    return out_res

In [None]:
# Repeat the smoke test with the model reloaded from disk, this time showing
# the predictions after parser_predict() has merged the word pieces.
# NOTE(review): device=0 presumably selects the first GPU, so this cell would
# need CUDA — confirm before running on a CPU-only machine.
test_text = "Новым послом Южной Кореи в России стал бывший посол в Камбодже Чан Хо Чжин, передает Yonhap."
test_nerpipeline = pipeline(
    'ner',
    model=test_model,
    tokenizer=ru_tokenizer,
    device=0,
)
parser_predict(test_nerpipeline(test_text))