In [1]:
!pip install -q datasets transformers seqeval
!pip install -q corus razdel

[K     |████████████████████████████████| 290 kB 5.5 MB/s 
[K     |████████████████████████████████| 3.1 MB 34.7 MB/s 
[K     |████████████████████████████████| 43 kB 1.4 MB/s 
[K     |████████████████████████████████| 1.1 MB 35.5 MB/s 
[K     |████████████████████████████████| 243 kB 47.1 MB/s 
[K     |████████████████████████████████| 132 kB 47.7 MB/s 
[K     |████████████████████████████████| 59 kB 6.6 MB/s 
[K     |████████████████████████████████| 895 kB 37.1 MB/s 
[K     |████████████████████████████████| 596 kB 47.4 MB/s 
[K     |████████████████████████████████| 3.3 MB 32.8 MB/s 
[K     |████████████████████████████████| 192 kB 43.7 MB/s 
[K     |████████████████████████████████| 271 kB 36.6 MB/s 
[K     |████████████████████████████████| 160 kB 45.8 MB/s 
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 83 kB 1.5 MB/s 
[?25h

In [2]:
import pandas as pd
import numpy as np
from datasets import load_dataset, load_metric, Dataset, DatasetDict
from razdel import tokenize
import torch
from sklearn.model_selection import train_test_split
import os

from transformers import set_seed

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
!nvidia-smi

Thu Nov 11 19:20:19 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   68C    P8    32W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
def seed_everything(seed: int):
    """Seed the RNGs this notebook relies on and request deterministic cuDNN.

    Args:
        seed: value handed to NumPy, PyTorch (CPU and CUDA) and
            ``set_seed``; it is also exported as ``PYTHONHASHSEED``.
    """
    # NOTE(review): changing PYTHONHASHSEED at runtime presumably only
    # affects child processes, not the running interpreter — confirm.
    os.environ.update(PYTHONHASHSEED=str(seed))
    torch.backends.cudnn.deterministic = True
    for seeder in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed, set_seed):
        seeder(seed)

# Seed all RNGs once, before any data handling or model initialisation below.
SEED = 3407
seed_everything(SEED)

In [6]:
# Checkpoint name used to load both the tokenizer and the token-classification model.
model_checkpoint = "cointegrated/rubert-tiny2"
# Per-device batch size passed to TrainingArguments (training, and evaluation when enabled).
batch_size = 4

In [7]:
!wget -q -O train.csv https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/fmWGQJvwU5ejog
!wget -q -O train_labels.json https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/0nJ2QTRb9-U7tA
!wget -q https://github.com/cimm-kzn/RuDReC/raw/master/data/rudrec_annotated.json
!wget -q -O labeled_train_by_hand.json https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/Oq154PAxDGeIFg

In [8]:
from corus import load_rudrec
# Materialise the RuDReC records into a list so they can be counted here and
# iterated again when the BIO tags are extracted.
drugs = list(load_rudrec('rudrec_annotated.json'))
print(len(drugs))

4809


In [9]:
# Training texts; the `text` column is indexed by row position later on.
train_df = pd.read_csv('train.csv')
# The labels JSON is transposed so that `span` becomes a column that can be
# indexed by the same row position as train_df.
train_labels = pd.read_json('train_labels.json').T

In [10]:
# Load the hand-labelled examples, then shift their ids down by one and store
# them as integers.
# NOTE(review): presumably this converts 1-based ids to 0-based positions — confirm.
labeled_by_hand_df = pd.read_json('labeled_train_by_hand.json')
labeled_by_hand_df['id'] = (labeled_by_hand_df['id'] - 1).astype(int)

In [11]:
from razdel import tokenize
def extract_labels_drug(item):
    """Convert one RuDReC record into word-level BIO tags for ADR entities.

    The text is split with ``tokenize``; every character offset is mapped to
    the index of the token covering it, and each ``ADR`` entity's character
    span is projected onto those tokens.

    Args:
        item: record exposing ``text`` and ``entities``; each entity is read
            through ``entity_type``, ``start`` and ``end``.

    Returns:
        ``{'tokens': [...], 'tags': [...]}`` when at least one ADR entity was
        tagged, otherwise the integer ``0`` (the caller filters on ``!= 0``).
    """
    raw_toks = list(tokenize(item.text))
    words = [tok.text for tok in raw_toks]
    word_labels = ['O'] * len(raw_toks)

    # char2word[c] is the index of the token covering character c, or None
    # for characters that belong to no token (e.g. whitespace).
    char2word = [None] * len(item.text)
    for i, word in enumerate(raw_toks):
        char2word[word.start:word.stop] = [i] * len(word.text)

    for e in item.entities:
        if e.entity_type != 'ADR':
            continue
        e_words = sorted({idx for idx in char2word[e.start:e.end] if idx is not None})
        if not e_words:
            # The span touches no token (whitespace-only or out of range):
            # there is nothing to tag, and indexing e_words[0] would raise.
            continue
        word_labels[e_words[0]] = 'B-' + e.entity_type
        for idx in e_words[1:]:
            word_labels[idx] = 'I-' + e.entity_type

    if "B-ADR" not in word_labels:
        return 0

    return {'tokens': words, 'tags': word_labels}

In [12]:
def extract_labels(train_index):
    """Build word-level BIO tags for one row of ``train_df``.

    The text comes from ``train_df.text[train_index]`` and the character
    spans from ``train_labels.span[train_index]``; every span is tagged as
    ``ADR``.

    Args:
        train_index: key used to index both ``train_df.text`` and
            ``train_labels.span``.

    Returns:
        ``{'tokens': [...], 'tags': [...]}`` with one tag per token.
    """
    text = train_df.text[train_index]
    raw_toks = list(tokenize(text))
    words = [tok.text for tok in raw_toks]

    word_labels = ['O'] * len(raw_toks)

    # char2word[c] is the index of the token covering character c, or None
    # for characters that belong to no token (e.g. whitespace).
    char2word = [None] * len(text)
    for i, word in enumerate(raw_toks):
        char2word[word.start:word.stop] = [i] * len(word.text)

    entity_type = "ADR"
    for e in train_labels.span[train_index]:
        # Each span is read as (start, end) character offsets.
        e_words = sorted({idx for idx in char2word[e[0]:e[1]] if idx is not None})
        if not e_words:
            # The span touches no token (whitespace-only or out of range):
            # there is nothing to tag, and indexing e_words[0] would raise.
            continue
        word_labels[e_words[0]] = 'B-' + entity_type
        for idx in e_words[1:]:
            word_labels[idx] = 'I-' + entity_type

    return {'tokens': words, 'tags': word_labels}

In [13]:
def extract_labels_by_hand(labeled_by_hand_index):
    """Build word-level BIO tags for one hand-labelled example.

    The text comes from ``labeled_by_hand_df['data'][i]['text']`` and the
    spans from the ``'result'`` list of the first entry in
    ``labeled_by_hand_df['annotations'][i]``; every span is tagged as ``ADR``.

    Args:
        labeled_by_hand_index: key used to index the ``data`` and
            ``annotations`` columns of ``labeled_by_hand_df``.

    Returns:
        ``{'tokens': [...], 'tags': [...]}`` with one tag per token.
    """
    text = labeled_by_hand_df['data'][labeled_by_hand_index]['text']

    raw_toks = list(tokenize(text))
    words = [tok.text for tok in raw_toks]

    word_labels = ['O'] * len(raw_toks)

    # char2word[c] is the index of the token covering character c, or None
    # for characters that belong to no token (e.g. whitespace).
    char2word = [None] * len(text)
    for i, word in enumerate(raw_toks):
        char2word[word.start:word.stop] = [i] * len(word.text)

    # Only the first annotation of the example is used.
    spans = labeled_by_hand_df['annotations'][labeled_by_hand_index][0]['result']

    entity_type = "ADR"
    for e in spans:
        value = e['value']
        e_words = sorted({idx for idx in char2word[value['start']:value['end']] if idx is not None})
        if not e_words:
            # The span touches no token (whitespace-only or out of range):
            # there is nothing to tag, and indexing e_words[0] would raise.
            continue
        word_labels[e_words[0]] = 'B-' + entity_type
        for idx in e_words[1:]:
            word_labels[idx] = 'I-' + entity_type

    return {'tokens': words, 'tags': word_labels}

In [14]:
# Keep only the RuDReC records that produced at least one ADR tag;
# extract_labels_drug returns 0 for all the others.
ner_data_drug = [
    labelled for labelled in map(extract_labels_drug, drugs) if labelled != 0
]

len(ner_data_drug)

376

In [15]:
# Only the first 30 rows of train_df are converted here.
# NOTE(review): presumably those are the rows that have spans in train_labels — confirm.
ner_data_train = list(map(extract_labels, range(30)))
ner_data_by_hand = list(map(extract_labels_by_hand, labeled_by_hand_df.index))

# Full training pool: train_df rows, hand-labelled rows, then RuDReC rows.
ner_data = [*ner_data_train, *ner_data_by_hand, *ner_data_drug]

In [16]:
# TEST_SIZE == 0 means "train on everything"; a single example is then kept as
# a placeholder test split so the later cells still have a 'test' split to use.
TEST_SIZE = 0

if TEST_SIZE != 0:
    ner_train, ner_test = train_test_split(ner_data, test_size=TEST_SIZE, random_state=200)
else:
    ner_train = ner_data
    ner_test = [ner_data[0]]

In [17]:
# Widen the column display so a whole token/tag list is visible, then show one
# random training example.
pd.set_option('display.max_colwidth', 300)
pd.DataFrame(ner_train).sample(n=1)

Unnamed: 0,tokens,tags
188,"[Телёнок, умер, ,, из, заднего, прохода, кал, с, кровью, ,, что, это, может, быть, ?]","[O, O, O, O, O, O, B-ADR, I-ADR, I-ADR, O, O, O, O, O, O]"


In [18]:
# NOTE: this rebinds `ner_data` from the plain list built earlier to a
# DatasetDict, so the cells that index or split that list must not be re-run
# after this one without rebuilding the list first.
ner_data = DatasetDict({
    split_name: Dataset.from_pandas(pd.DataFrame(split_rows))
    for split_name, split_rows in (('train', ner_train), ('test', ner_test))
})
ner_data

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 670
    })
    test: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 1
    })
})

In [19]:
# Tag vocabulary seen in the training examples, with 'O' (when present) forced
# to the front and the remaining tags in alphabetical order.
label_list = sorted(
    {tag for example in ner_train for tag in example['tags']},
    key=lambda tag: (tag != 'O', tag),
)
label_list

['O', 'B-ADR', 'I-ADR']

In [20]:
from transformers import AutoTokenizer
# Tokenizer that matches the checkpoint the model is loaded from.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/401 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.66M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [21]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize pre-split words and spread word-level tags over the sub-tokens.

    Args:
        examples: batch with ``"tokens"`` (lists of words) and ``'tags'``
            (lists of tag strings, one per word).
        label_all_tokens: when true, every sub-token of a word receives the
            word's tag; otherwise only the first sub-token does and the rest
            get ``-100``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding, per
        example, one label id per produced token (``-100`` for positions
        whose word id is ``None``).
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def to_id(tag):
        # Tag strings become their position in label_list; -100 passes through.
        return label_list.index(tag) if isinstance(tag, str) else tag

    aligned = []
    for batch_idx, word_tags in enumerate(examples['tags']):
        row = []
        last_word = None
        for word_idx in encoded.word_ids(batch_index=batch_idx):
            if word_idx is None:
                # No originating word: -100 so the position is ignored by the loss.
                tag = -100
            elif word_idx != last_word or label_all_tokens:
                # First sub-token of a word, or any sub-token when all are labelled.
                tag = word_tags[word_idx]
            else:
                tag = -100
            row.append(tag)
            last_word = word_idx
        aligned.append([to_id(tag) for tag in row])

    encoded["labels"] = aligned
    return encoded

In [22]:
# Tokenize and align labels for every split; batched=True hands the function
# lists of examples rather than one example at a time.
tokenized_datasets = ner_data.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [23]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Token-classification model with one output class per entry of label_list;
# the id <-> tag maps are stored on the model config.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
)
model.config.id2label = {idx: tag for idx, tag in enumerate(label_list)}
model.config.label2id = {tag: idx for idx, tag in enumerate(label_list)}

Downloading:   0%|          | 0.00/715 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112M [00:00<?, ?B/s]

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized 

In [24]:
from transformers import DataCollatorForTokenClassification

# Collator built from the same tokenizer; it is handed to the Trainer as data_collator.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [25]:
# seqeval metric object; compute_metrics calls metric.compute on the tag sequences.
metric = load_metric("seqeval")

Downloading:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

In [26]:
def compute_metrics(p):
    """Turn raw model outputs into overall seqeval scores.

    Args:
        p: pair ``(predictions, labels)``; ``predictions`` is reduced with
            ``argmax`` over axis 2 and ``labels`` holds label ids, with
            ``-100`` marking positions to ignore.

    Returns:
        Dict with ``precision``, ``recall``, ``f1`` and ``accuracy`` taken
        from the metric's ``overall_*`` entries.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Drop every position whose gold label is the ignore index.
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=0)

    report = {}
    report["precision"] = results["overall_precision"]
    report["recall"] = results["overall_recall"]
    report["f1"] = results["overall_f1"]
    report["accuracy"] = results["overall_accuracy"]
    return report

In [27]:
import logging
from transformers.trainer import logger as noisy_logger
# Only let WARNING and above through from the transformers.trainer logger.
noisy_logger.setLevel(logging.WARNING)

In [28]:
# Mark every model parameter as trainable (full fine-tuning, nothing frozen).
# NOTE(review): parameters are presumably already trainable after
# from_pretrained, which would make this loop a no-op — confirm.
for param in model.parameters():
    param.requires_grad = True

In [29]:
# The two configurations share every hyper-parameter. Evaluation is only wired
# in when a real held-out split exists (TEST_SIZE != 0); otherwise the
# evaluation-related arguments are left at their library defaults and no eval
# dataset is passed.
run_evaluation = TEST_SIZE != 0

training_kwargs = dict(
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    num_train_epochs=16,
    weight_decay=0.01,
    save_strategy='no',
    report_to='none',
)
if run_evaluation:
    training_kwargs.update(
        evaluation_strategy="epoch",
        per_device_eval_batch_size=batch_size,
    )

# "ner" is the output directory argument.
args = TrainingArguments("ner", **training_kwargs)

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"] if run_evaluation else None,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [30]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss
500,0.2609
1000,0.067
1500,0.0246
2000,0.0096
2500,0.0053


TrainOutput(global_step=2688, training_loss=0.06858305792723383, metrics={'train_runtime': 112.8432, 'train_samples_per_second': 94.999, 'train_steps_per_second': 23.821, 'total_flos': 15552548671680.0, 'train_loss': 0.06858305792723383, 'epoch': 16.0})

In [31]:
# Evaluate only when a real held-out split exists. The metrics are printed
# explicitly: an expression nested inside an `if` is not echoed as cell output,
# so the returned dict would otherwise be discarded without being shown.
if TEST_SIZE != 0:
    eval_metrics = trainer.evaluate()
    print(eval_metrics)

In [32]:
# Save the fine-tuned model and the tokenizer into the same directory so both
# can be reloaded from one path.
path_to_model = 'ner_bert_top_229/'
model.save_pretrained(path_to_model)
tokenizer.save_pretrained(path_to_model)

Configuration saved in ner_bert_top_229/config.json
Model weights saved in ner_bert_top_229/pytorch_model.bin
tokenizer config file saved in ner_bert_top_229/tokenizer_config.json
Special tokens file saved in ner_bert_top_229/special_tokens_map.json


('ner_bert_top_229/tokenizer_config.json',
 'ner_bert_top_229/special_tokens_map.json',
 'ner_bert_top_229/vocab.txt',
 'ner_bert_top_229/added_tokens.json',
 'ner_bert_top_229/tokenizer.json')

# Test dataset

In [33]:
!wget -q -O test.csv https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/Wo70d4_PAwujqA

In [34]:
# Test set; its `text` column is what the span predictor is applied to below.
test_df = pd.read_csv('test.csv')

In [None]:
# NER comes from a `spans_prediction` module that is not defined in this
# notebook (presumably a project-local file — confirm it is on the path).
# Both constructor arguments receive the directory the model and tokenizer
# were saved to.
from spans_prediction import NER
ner_model = NER(path_to_model, path_to_model)

In [38]:
# for d in range(0, len(test_df)):
#     print(test_df.text[d])
#     print(ner_model.hugginface_pipeline(test_df.text[d].lower()))
#     print()
#     print()

# Save test dataframe with predicted spans

In [39]:
# Run the span predictor on every test text and store the result per row.
test_df['span'] = test_df['text'].apply(ner_model.predict_spans)



In [40]:
# Persist the test frame, including the new `span` column, as JSON.
test_df.to_json('test_df_with_spans229.json')

# Download the model and tokenizer from Google Colab

In [None]:
# print(path_to_model)

In [None]:
# !zip -r /content/ner_bert_top20.zip /content/ner_bert_top20

In [None]:
# from google.colab import files
# files.download("/content/ner_bert_top.zip")