# Named Entity Recognition for Prozhito

In [1]:
import pandas as pd 
import numpy as np 
import random
from collections import defaultdict, Counter
import re

In [2]:
import torch
from torch import nn 
from torch.nn import CrossEntropyLoss
from torch.utils.data import Dataset, DataLoader 
from torch.optim import Adam
from transformers import get_linear_schedule_with_warmup, AutoTokenizer, AutoModelForPreTraining
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from datasets import Dataset, DatasetDict, load_metric
import torch.nn.functional as F
from transformers import pipeline



In [3]:
from transformers import DataCollatorForTokenClassification

In [4]:
label_list = ['B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER', 'B-FAC', 'I-FAC', 'B-CHAR', 'I-CHAR', 'O']
MODEL_PATH = "DeepPavlov/rubert-base-cased-sentence"

OUTPUT_DIR = '.'
res = {}
tokenizer = None 

In [5]:
def set_random_seed(seed):
    torch.backends.cudnn.deterministic = True
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

In [6]:
set_random_seed(42)

# Uploading the Data

In [7]:
!git clone https://github.com/tatnashev/prozhito

Cloning into 'prozhito'...
remote: Enumerating objects: 63, done.[K
remote: Counting objects: 100% (63/63), done.[K
remote: Compressing objects: 100% (37/37), done.[K
remote: Total 63 (delta 32), reused 50 (delta 22), pack-reused 0[K
Unpacking objects: 100% (63/63), 2.15 MiB | 4.75 MiB/s, done.


In [8]:
df_train = pd.read_csv('prozhito/prozhito_data/df_train_prozhito.csv')
df_test = pd.read_csv('prozhito/prozhito_data/df_test_prozhito.csv')
df_val = pd.read_csv('prozhito/prozhito_data/df_val_prozhito.csv')

df_train['BIO_nums'] = df_train['BIO_nums'].apply(lambda x: eval(x))
df_test['BIO_nums'] = df_test['BIO_nums'].apply(lambda x: eval(x))
df_val['BIO_nums'] = df_val['BIO_nums'].apply(lambda x: eval(x))

df_train['BIO_list'] = df_train['BIO_list'].apply(lambda x: eval(x))
df_test['BIO_list'] = df_test['BIO_list'].apply(lambda x: eval(x))
df_val['BIO_list'] = df_val['BIO_list'].apply(lambda x: eval(x))

df_train['tokens'] = df_train['tokens'].apply(lambda x: x.split())
df_test['tokens'] = df_test['tokens'].apply(lambda x: x.split())
df_val['tokens'] = df_val['tokens'].apply(lambda x: x.split())

df_train = df_train[['tokens', 'BIO_list', 'BIO_nums']].rename(columns={'BIO_list': 'ner_bio', 'BIO_nums': 'ner_tags'})
df_test = df_test[['tokens', 'BIO_list', 'BIO_nums']].rename(columns={'BIO_list': 'ner_bio', 'BIO_nums': 'ner_tags'})
df_val = df_val[['tokens', 'BIO_list', 'BIO_nums']].rename(columns={'BIO_list': 'ner_bio', 'BIO_nums': 'ner_tags'})

df_train

Unnamed: 0,tokens,ner_bio,ner_tags
0,"[У, меня, большая, симпатия, к, Лукьянину, —, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1..."
1,"[>, Каким, приговором, ,, указом, каким, >, Ты...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1..."
2,"[Подумал, ,, что, летом, ребята, куда, затащил...","[O, O, O, O, B-CHAR, O, O, O]","[10, 10, 10, 10, 8, 10, 10, 10]"
3,"[Нашел, потрясающие, материалы, о, В, ., М, .,...","[O, O, O, O, B-PER, I-PER, I-PER, I-PER, I-PER...","[10, 10, 10, 10, 4, 5, 5, 5, 5, 10, 10, 8, 8, ..."
4,"[[Без, даты, ., ]]","[O, O, O, O]","[10, 10, 10, 10]"
...,...,...,...
1253,"[Где, граница, между, сегодняшней, жизнью, и, ...","[O, O, O, O, O, O, O, O, O]","[10, 10, 10, 10, 10, 10, 10, 10, 10]"
1254,"[Длится, уже, около, часа, .]","[O, O, O, O, O]","[10, 10, 10, 10, 10]"
1255,"[23, мая, ,, примерно, в, 7, часов, (, 18, .]","[O, O, O, O, O, O, O, O, O, O]","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10]"
1256,"[В, русском, переводе, примерно, такой, :, «, ...","[O, O, O, O, O, O, O, O, O, O, B-CHAR, O]","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8, 10]"


In [9]:
raw_datasets = DatasetDict({
    'train': Dataset.from_pandas(df_train[['tokens', 'ner_tags']]),
    'test': Dataset.from_pandas(df_test[['tokens', 'ner_tags']]),
    'val': Dataset.from_pandas(df_val[['tokens', 'ner_tags']])
})

In [10]:
print(raw_datasets["train"][2]["ner_tags"])

[10, 10, 10, 10, 8, 10, 10, 10]


In [11]:
label_names = {
    0: 'B-LOC',
    1: 'I-LOC',
    2: 'B-ORG',
    3: 'I-ORG',
    4: 'B-PER', 
    5: 'I-PER', 
    6: 'B-FAC',
    7: 'I-FAC',
    8: 'B-CHAR',
    9: 'I-CHAR',
    10: 'O'
}

# Tokenization

In [12]:
from transformers import AutoTokenizer

model_checkpoint = "DeepPavlov/rubert-base-cased-sentence"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [13]:
inputs = tokenizer(raw_datasets["train"][2]["tokens"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'Под',
 '##ума',
 '##л',
 ',',
 'что',
 'летом',
 'ребята',
 'куда',
 'зата',
 '##щил',
 '##и',
 '.',
 '[SEP]']

In [14]:
inputs.word_ids()

[None, 0, 0, 0, 1, 2, 3, 4, 5, 6, 6, 6, 7, None]

In [15]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [16]:
labels = raw_datasets["train"][2]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[10, 10, 10, 10, 8, 10, 10, 10]
[-100, 10, 10, 10, 10, 10, 10, 8, 10, 10, 10, 10, 10, -100]


In [17]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [18]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

  0%|          | 0/2 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [19]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Metrics

In [20]:
!pip install seqeval

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16181 sha256=b24aca7a5d4e81d6a3e1ca47fba2db7040609b0f1c8cd7e34647740f535af75f
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
[0m

In [21]:
!pip install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0
[0m

In [22]:
import evaluate

metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [23]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

# Defining a Model

In [24]:
id2label = {i: label for i, label in label_names.items()}
label2id = {v: k for k, v in id2label.items()}

In [25]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Downloading pytorch_model.bin:   0%|          | 0.00/711M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-sentence and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Fine-tuning the Model

In [26]:
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir='.',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=20,
    weight_decay=1e-4,
    report_to='none',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    load_best_model_at_end=True,
    save_total_limit=1
)

In [27]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.105146,0.372549,0.542857,0.44186,0.957545
2,No log,0.083998,0.589474,0.8,0.678788,0.976004
3,No log,0.067132,0.637363,0.828571,0.720497,0.980618
4,0.133500,0.070132,0.739726,0.771429,0.755245,0.986617
5,0.133500,0.053165,0.75,0.814286,0.780822,0.988002
6,0.133500,0.075026,0.820896,0.785714,0.80292,0.988002
7,0.014300,0.069662,0.797297,0.842857,0.819444,0.98754
8,0.014300,0.069608,0.813333,0.871429,0.841379,0.989848
9,0.014300,0.073488,0.789474,0.857143,0.821918,0.98754
10,0.003500,0.071128,0.828571,0.828571,0.828571,0.989386


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=3160, training_loss=0.024364790111733012, metrics={'train_runtime': 360.0386, 'train_samples_per_second': 69.881, 'train_steps_per_second': 8.777, 'total_flos': 525210426883176.0, 'train_loss': 0.024364790111733012, 'epoch': 20.0})

In [28]:
trainer.train()



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.064448,0.780822,0.814286,0.797203,0.987079
2,No log,0.080635,0.873016,0.785714,0.827068,0.988002
3,No log,0.0603,0.833333,0.857143,0.84507,0.990309
4,0.007300,0.063994,0.775,0.885714,0.826667,0.989386
5,0.007300,0.071958,0.813333,0.871429,0.841379,0.989848
6,0.007300,0.072345,0.783784,0.828571,0.805556,0.988925
7,0.004400,0.064433,0.789474,0.857143,0.821918,0.989848
8,0.004400,0.074768,0.772152,0.871429,0.818792,0.989386
9,0.004400,0.082731,0.810811,0.857143,0.833333,0.988002
10,0.001400,0.067447,0.828571,0.828571,0.828571,0.990309


TrainOutput(global_step=3160, training_loss=0.0023265786810860605, metrics={'train_runtime': 363.6633, 'train_samples_per_second': 69.185, 'train_steps_per_second': 8.689, 'total_flos': 525210426883176.0, 'train_loss': 0.0023265786810860605, 'epoch': 20.0})

# Inference

## Data preparation

In [None]:
df = pd.read_csv('/kaggle/input/prozhito-texts/all_prozhito_texts_markup.csv')['text']
df

In [None]:
def batched(df, batch_size):
    total = len(df)
    n_steps = total // batch_size + (total % batch_size > 0)
    for i in range(n_steps):
        start = i * batch_size
        stop = min((i + 1) * batch_size, total)
        yield df.iloc[start:stop]

In [None]:
from tqdm import tqdm

In [None]:
predictions = []
for batch in tqdm(batched(df, 128)):
    batch_texts = batch.tolist()
    batch_texts_lens = [len(t.split()) for t in batch_texts]
    batch_encoded = {k: v.cuda() for k, v in tokenizer(batch_texts, return_tensors="pt", padding=True).items()}
    
    batch_outputs = model(**batch_encoded)
    batch_predictions = batch_outputs.logits.argmax(-1).detach().cpu().clone().numpy()
    predictions.append(batch_predictions)
predictions = np.concatenate(predictions)

In [None]:
texts = ["Маша приехала в Москву и пошла в парк Горького", 'Маша', 'я живу в России .']
texts_lens = [len(i.split()) for i in texts]
encoding = {k: v.cuda() for k, v in tokenizer(text, return_tensors="pt", padding=True).items()}

In [None]:
encoding

In [None]:
model.cuda()
outputs = model(**encoding)
predictions = outputs.logits.argmax(-1)


In [None]:
predictions = predictions.detach().cpu().clone().numpy()

In [None]:
predictions

In [None]:
def postprocess(predictions, label2id, texts_lens):
    BIOs = []
    for pred, text_len in zip(predictions, text
                              s_lens):
        bio = []
        for label in pred[1:-1]:
            bio.append(id2label[label])
        BIOs.append(bio[:text_len])
    return BIOs

In [None]:
postprocess(predictions, label2id, texts_lens)