In [1]:
! pip install -q datasets transformers seqeval
!pip install -q corus razdel

[K     |████████████████████████████████| 290 kB 4.1 MB/s 
[K     |████████████████████████████████| 3.1 MB 40.8 MB/s 
[K     |████████████████████████████████| 43 kB 2.0 MB/s 
[K     |████████████████████████████████| 1.1 MB 51.5 MB/s 
[K     |████████████████████████████████| 243 kB 44.6 MB/s 
[K     |████████████████████████████████| 132 kB 46.7 MB/s 
[K     |████████████████████████████████| 59 kB 6.9 MB/s 
[K     |████████████████████████████████| 596 kB 42.1 MB/s 
[K     |████████████████████████████████| 3.3 MB 37.4 MB/s 
[K     |████████████████████████████████| 895 kB 44.9 MB/s 
[K     |████████████████████████████████| 192 kB 41.9 MB/s 
[K     |████████████████████████████████| 271 kB 51.1 MB/s 
[K     |████████████████████████████████| 160 kB 47.9 MB/s 
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 83 kB 1.3 MB/s 
[?25h

In [2]:
import pandas as pd
import numpy as np
from datasets import load_dataset, load_metric, Dataset, DatasetDict
from razdel import tokenize
import torch
from sklearn.model_selection import train_test_split
import os

from transformers import set_seed

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
!nvidia-smi

Sat Nov  6 12:38:06 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P8    27W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [7]:
def seed_everything(seed: int):
    """Seed the random number generators used in this notebook.

    Exports ``PYTHONHASHSEED``, turns on cuDNN deterministic mode and then
    passes ``seed`` to ``np.random.seed``, ``torch.manual_seed``,
    ``torch.cuda.manual_seed`` and ``set_seed`` (imported from transformers).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.deterministic = True
    # The seeding calls all take the same single argument, so apply them in sequence.
    for seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed, set_seed):
        seed_fn(seed)

SEED = 3407
seed_everything(SEED)

In [8]:
model_checkpoint = "cointegrated/rubert-tiny2"
batch_size = 4

In [9]:
!wget -q -O train.csv https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/fmWGQJvwU5ejog
!wget -q -O train_labels.json https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/0nJ2QTRb9-U7tA
!wget -q https://github.com/cimm-kzn/RuDReC/raw/master/data/rudrec_annotated.json
!wget -q -O labeled_train_by_hand.json https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/Oq154PAxDGeIFg

In [10]:
from corus import load_rudrec
drugs = list(load_rudrec('rudrec_annotated.json'))
print(len(drugs))

4809


In [11]:
train_df = pd.read_csv('train.csv')
train_labels = pd.read_json('train_labels.json').T

In [12]:
# Read the hand-labelled annotations, then lower every id by one and store it as an integer.
labeled_by_hand_df = pd.read_json('labeled_train_by_hand.json')
labeled_by_hand_df['id'] = labeled_by_hand_df['id'].sub(1.0).astype(int)

In [13]:
from razdel import tokenize
def extract_labels_drug(item):
    """Convert one RuDReC record into word tokens with BIO tags for ADR entities.

    Parameters
    ----------
    item
        Record exposing ``text`` and ``entities``. Each entity is read through
        its ``entity_type``, ``start`` and ``end`` attributes; ``start``/``end``
        are used as character offsets into ``item.text``.

    Returns
    -------
    dict or int
        ``{'tokens': [...], 'tags': [...]}`` with one ``'O'`` / ``'B-ADR'`` /
        ``'I-ADR'`` tag per token, or ``0`` when no ``B-ADR`` tag was produced
        (the caller filters on ``dct != 0``).
    """
    text = item.text
    raw_toks = list(tokenize(text))
    words = [tok.text for tok in raw_toks]
    word_labels = ['O'] * len(raw_toks)

    # Map every character position to the index of the token covering it
    # (None where no token covers the character, e.g. whitespace).
    char2word = [None] * len(text)
    for i, word in enumerate(raw_toks):
        char2word[word.start:word.stop] = [i] * len(word.text)

    for e in item.entities:
        if e.entity_type != 'ADR':
            continue
        e_words = sorted({idx for idx in char2word[e.start:e.end] if idx is not None})
        if not e_words:
            # The span covers no token (whitespace-only, empty or out-of-range
            # offsets); skip it instead of failing with IndexError below.
            continue
        word_labels[e_words[0]] = 'B-' + e.entity_type
        for idx in e_words[1:]:
            word_labels[idx] = 'I-' + e.entity_type

    if "B-ADR" not in word_labels:
        # Sentinel kept for backward compatibility with the calling cell.
        return 0

    return {'tokens': words, 'tags': word_labels}

In [14]:
def extract_labels(train_index):
    """Build word tokens and BIO tags for one row of the training set.

    The text is read from ``train_df.text[train_index]`` and the annotated
    spans from ``train_labels.span[train_index]``. Each span is indexed as
    ``span[0]`` (start) and ``span[1]`` (end) and used as a character slice of
    the text; every span is tagged as ``ADR``.

    Parameters
    ----------
    train_index
        Index used to look up both ``train_df.text`` and ``train_labels.span``.

    Returns
    -------
    dict
        ``{'tokens': [...], 'tags': [...]}`` with one ``'O'`` / ``'B-ADR'`` /
        ``'I-ADR'`` tag per token.
    """
    text = train_df.text[train_index]
    raw_toks = list(tokenize(text))
    words = [tok.text for tok in raw_toks]
    word_labels = ['O'] * len(raw_toks)

    # Map every character position to the index of the token covering it
    # (None where no token covers the character, e.g. whitespace).
    char2word = [None] * len(text)
    for i, word in enumerate(raw_toks):
        char2word[word.start:word.stop] = [i] * len(word.text)

    entity_type = "ADR"
    for span in train_labels.span[train_index]:
        e_words = sorted({idx for idx in char2word[span[0]:span[1]] if idx is not None})
        if not e_words:
            # The span covers no token (whitespace-only, empty or out-of-range
            # offsets); skip it instead of failing with IndexError below.
            continue
        word_labels[e_words[0]] = 'B-' + entity_type
        for idx in e_words[1:]:
            word_labels[idx] = 'I-' + entity_type

    return {'tokens': words, 'tags': word_labels}

In [15]:
def extract_labels_by_hand(labeled_by_hand_index):
    """Build word tokens and BIO tags for one hand-labelled example.

    The text is read from ``labeled_by_hand_df['data'][i]['text']`` and the
    spans from ``labeled_by_hand_df['annotations'][i][0]['result']``. Each
    result entry is read through ``entry['value']['start']`` and
    ``entry['value']['end']``, used as a character slice of the text; every
    span is tagged as ``ADR``.

    Parameters
    ----------
    labeled_by_hand_index
        Index of the example inside ``labeled_by_hand_df``.

    Returns
    -------
    dict
        ``{'tokens': [...], 'tags': [...]}`` with one ``'O'`` / ``'B-ADR'`` /
        ``'I-ADR'`` tag per token.
    """
    text = labeled_by_hand_df['data'][labeled_by_hand_index]['text']

    raw_toks = list(tokenize(text))
    words = [tok.text for tok in raw_toks]
    word_labels = ['O'] * len(raw_toks)

    # Map every character position to the index of the token covering it
    # (None where no token covers the character, e.g. whitespace).
    char2word = [None] * len(text)
    for i, word in enumerate(raw_toks):
        char2word[word.start:word.stop] = [i] * len(word.text)

    # Only the first annotation of the example is used.
    spans = labeled_by_hand_df['annotations'][labeled_by_hand_index][0]['result']

    entity_type = "ADR"
    for entry in spans:
        value = entry['value']
        e_words = sorted({idx for idx in char2word[value['start']:value['end']] if idx is not None})
        if not e_words:
            # The span covers no token (whitespace-only, empty or out-of-range
            # offsets); skip it instead of failing with IndexError below.
            continue
        word_labels[e_words[0]] = 'B-' + entity_type
        for idx in e_words[1:]:
            word_labels[idx] = 'I-' + entity_type

    return {'tokens': words, 'tags': word_labels}

In [16]:
# Label every RuDReC record and keep only those for which extract_labels_drug
# returned a result (it returns 0 when no B-ADR tag was produced).
ner_data_drug = [labels for labels in map(extract_labels_drug, drugs) if labels != 0]

len(ner_data_drug)

376

In [185]:
# Label rows 0..29 of the training set.
# NOTE(review): 30 is hard-coded — presumably only these rows have spans in train_labels; confirm.
ner_data_train = [extract_labels(i) for i in range(0, 30)]
# Label every example of the hand-annotated frame.
ner_data_by_hand = [extract_labels_by_hand(i) for i in labeled_by_hand_df.index]

# Combined corpus: training rows + hand-labelled examples + RuDReC records.
ner_data = ner_data_train + ner_data_by_hand + ner_data_drug

In [186]:
# Passed to train_test_split as test_size; 0 means "no hold-out split, train on everything".
TEST_SIZE = 0

if TEST_SIZE == 0:
    # No hold-out: all examples go to training. ner_test is a one-example placeholder
    # taken from the training data, so it is not a real evaluation set.
    ner_train = ner_data
    ner_test = [ner_data[0]]

else:   
    # Fixed random_state keeps the split the same across runs.
    ner_train, ner_test = train_test_split(ner_data, test_size=TEST_SIZE, random_state=200)

In [187]:
pd.options.display.max_colwidth = 300
pd.DataFrame(ner_train).sample(1)

Unnamed: 0,tokens,tags
361,"[В, первый, раз, ,, когда, ребенок, заболел, обычным, орви, ,, нам, сразу, же, врач, навыписывал, целый, ряд, лекарств, ,, в, том, числе, и, антибиотик, "", бактрим, "", ,, обычно, должны, сразу, же, выписать, к, нему, Линекс, ,, для, улучшения, пищеварения, ,, но, никто, нам, ничего, не, прописал...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-ADR, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-ADR, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O,..."


In [188]:
# Wrap the train/test lists as a DatasetDict with 'train' and 'test' splits.
# NOTE(review): this rebinds ner_data from the plain list built earlier to a DatasetDict,
# so the split cell (which reads ner_data as a list) cannot be re-run after this one
# without rebuilding the list first.
ner_data = DatasetDict({
    'train': Dataset.from_pandas(pd.DataFrame(ner_train)),
    'test': Dataset.from_pandas(pd.DataFrame(ner_test))
})
ner_data

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 670
    })
    test: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 1
    })
})

In [189]:
# Every tag seen in the training split: 'O' first (when present), the rest in sorted order.
label_list = sorted(
    {tag for example in ner_train for tag in example['tags']},
    key=lambda tag: (tag != 'O', tag),
)
label_list

['O', 'B-ADR', 'I-ADR']

In [190]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

loading file https://huggingface.co/cointegrated/rubert-tiny2/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/16af2afaa4ceaa8d50b689bd4c2f7ef7fe3bfac06c0aac7d82a5c1c72298b62a.cc3312d07ccf88871a3c2b7cb3442138e6785101efead94d9f77e96301cf7f4a
loading file https://huggingface.co/cointegrated/rubert-tiny2/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/cfa6d82dc8ecc7fe3f06deb449f38968cdd188bb84c8da0e06f0bbfddbede1e3.550ab7157d36210bf96c7c3b30e621933d37d635c5f2e290f7e88bd5f7c9198a
loading file https://huggingface.co/cointegrated/rubert-tiny2/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/cointegrated/rubert-tiny2/resolve/main/special_tokens_map.json from cache at /root/.cache/huggingface/transformers/20317640533199c6b37a557395cd5ee5fcb8777be7c89bb1314bfd43058b35e9.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d
loading file https://huggingface.co/cointegrated/rubert-tiny2/resolv

In [191]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize pre-split words and project the word-level tags onto the tokens.

    Parameters
    ----------
    examples
        Batch with ``"tokens"`` (lists of words) and ``"tags"`` (one tag per word).
    label_all_tokens
        When true, every token of a word receives the word's tag; otherwise
        only the first token of each word does and the rest get ``-100``.

    Returns
    -------
    The tokenizer output with an added ``"labels"`` entry: one list of label
    ids per example, where string tags are replaced by their position in
    ``label_list`` and tokens without a word id get ``-100``.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def to_id(tag):
        # String tags are looked up in label_list; anything else (e.g. -100) passes through.
        return label_list.index(tag) if isinstance(tag, str) else tag

    aligned = []
    for batch_idx, word_tags in enumerate(examples['tags']):
        row = []
        last_word = None
        for word_idx in encoded.word_ids(batch_index=batch_idx):
            if word_idx is None:
                # Special tokens have no word id; -100 makes the loss function ignore them.
                tag = -100
            elif word_idx != last_word or label_all_tokens:
                # First token of a word, or a continuation token that should be labelled too.
                tag = word_tags[word_idx]
            else:
                # Continuation token of a word when only first tokens are labelled.
                tag = -100
            row.append(to_id(tag))
            last_word = word_idx
        aligned.append(row)

    encoded["labels"] = aligned
    return encoded

In [192]:
tokenized_datasets = ner_data.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [201]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Load the checkpoint as a token-classification model with one output label per entry of label_list.
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
# Replace the default LABEL_n names in the config with the actual tags (id -> tag and tag -> id).
model.config.id2label = dict(enumerate(label_list))
model.config.label2id = {v: k for k, v in model.config.id2label.items()}

loading configuration file https://huggingface.co/cointegrated/rubert-tiny2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/726139048d10597682731a7a4c0b8ef0382927911bc0f6f050a4f7f0afb04c2a.149cdc07694f3925e290abc5528c57a543bcbc9af955c0202b1028584ad15cb4
Model config BertConfig {
  "_name_or_path": "/gd/MyDrive/models/rubert-tiny-mlm-nli-sentence",
  "architectures": [
    "BertForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "emb_size": 312,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 312,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 600,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 2048,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 3,
  "pad_token_id": 0

In [202]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [203]:
metric = load_metric("seqeval")

In [204]:
def compute_metrics(p):
    """Score predictions against the reference labels with the global ``metric``.

    ``p`` unpacks into ``(predictions, labels)``. The arg-max over axis 2 of
    ``predictions`` gives one label id per token; positions whose reference
    label is ``-100`` are dropped, the remaining ids are mapped to tag names
    through ``label_list`` and handed to ``metric.compute``.

    Returns a dict with the overall ``precision``, ``recall``, ``f1`` and
    ``accuracy`` reported by the metric.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    predicted_tags, gold_tags = [], []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label (ignored index is -100).
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        predicted_tags.append([label_list[pred] for pred, _ in kept])
        gold_tags.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=predicted_tags, references=gold_tags, zero_division=0)

    reported = (
        ("precision", "overall_precision"),
        ("recall", "overall_recall"),
        ("f1", "overall_f1"),
        ("accuracy", "overall_accuracy"),
    )
    return {short: results[full] for short, full in reported}

In [205]:
import logging
from transformers.trainer import logger as noisy_logger
noisy_logger.setLevel(logging.WARNING)

In [206]:
# Mark every model parameter as trainable (nothing is frozen).
for param in model.parameters():
    param.requires_grad = True

In [207]:
# Evaluation is only configured when a real hold-out split exists (TEST_SIZE != 0).
run_evaluation = TEST_SIZE != 0

# Hyper-parameters shared by both configurations.
training_kwargs = dict(
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    num_train_epochs=16,
    weight_decay=0.01,
    save_strategy='no',   # no intermediate checkpoints
    report_to='none',     # no external logging integrations
)
if run_evaluation:
    # Evaluate on the hold-out split after every epoch.
    training_kwargs.update(
        evaluation_strategy="epoch",
        per_device_eval_batch_size=batch_size,
    )

# "ner" is the output directory.
args = TrainingArguments("ner", **training_kwargs)

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"] if run_evaluation else None,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices


In [208]:
trainer.train()

Step,Training Loss
500,0.2653
1000,0.0662
1500,0.023
2000,0.0099
2500,0.0052


TrainOutput(global_step=2688, training_loss=0.06902689832661833, metrics={'train_runtime': 108.0, 'train_samples_per_second': 99.259, 'train_steps_per_second': 24.889, 'total_flos': 15289883012160.0, 'train_loss': 0.06902689832661833, 'epoch': 16.0})

In [209]:
if TEST_SIZE != 0:
    trainer.evaluate()

In [210]:
path_to_model = 'ner_bert_top20/'
model.save_pretrained(path_to_model)
tokenizer.save_pretrained(path_to_model)

Configuration saved in ner_bert_top20/config.json
Model weights saved in ner_bert_top20/pytorch_model.bin
tokenizer config file saved in ner_bert_top20/tokenizer_config.json
Special tokens file saved in ner_bert_top20/special_tokens_map.json


('ner_bert_top20/tokenizer_config.json',
 'ner_bert_top20/special_tokens_map.json',
 'ner_bert_top20/vocab.txt',
 'ner_bert_top20/added_tokens.json',
 'ner_bert_top20/tokenizer.json')

# Test dataset

In [213]:
!wget -q -O test.csv https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/Wo70d4_PAwujqA

In [214]:
test_df = pd.read_csv('test.csv')

In [None]:
from spans_prediction import NER
ner_model = NER(path_to_model, path_to_model)

In [217]:
# for d in range(0, len(test_df)):
#     print(test_df.text[d])
#     print(ner_model.hugginface_pipeline(test_df.text[d].lower()))
#     print()
#     print()

# Save test dataframe with predicted spans

In [218]:
# Predict spans for every test text and store them in a new 'span' column.
test_df['span'] = test_df['text'].apply(ner_model.predict_spans)



In [219]:
test_df.to_json('test_df_with_spans20.json')

# Download the model and tokenizer from Google Colab

In [220]:
print(path_to_model)

ner_bert_top20/


In [221]:
!zip -r /content/ner_bert_top20.zip /content/ner_bert_top20

  adding: content/ner_bert_top20/ (stored 0%)
  adding: content/ner_bert_top20/vocab.txt (deflated 64%)
  adding: content/ner_bert_top20/pytorch_model.bin (deflated 8%)
  adding: content/ner_bert_top20/special_tokens_map.json (deflated 40%)
  adding: content/ner_bert_top20/tokenizer.json (deflated 65%)
  adding: content/ner_bert_top20/tokenizer_config.json (deflated 42%)
  adding: content/ner_bert_top20/config.json (deflated 49%)


In [None]:
# from google.colab import files
# files.download("/content/ner_bert_top.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>