<a href="https://colab.research.google.com/github/schokoro/cnn_crf_nertagger/blob/dataset/NER_tagger.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CNN и CRF для извлечения именованных сущностей



In [0]:
import torch

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Compare the device *type*: a torch.device object does not compare equal to the
# plain string 'cpu', so the previous `device == 'cpu'` check never matched and a
# CPU-only session went on to query a GPU that does not exist.
if device.type == 'cpu':
    print('cpu')
else:
    n_gpu = torch.cuda.device_count()
    print(torch.cuda.get_device_name(0))
    

Tesla T4


In [0]:
# Add the repository checkout directory to the module search path so that the
# project code can be imported by the cells below.
import sys
sys.path.append('/content/cnn_crf_nertagger')

In [0]:
# Remove any previous checkout so that the clone in the next cell starts clean.
!rm -rf cnn_crf_nertagger/

In [0]:
# Clone the `dataset` branch of the project repository (stdout is discarded).
!git clone -b dataset https://github.com/schokoro/cnn_crf_nertagger.git > /dev/null
# Install the third-party dependencies (stdout is discarded).
# NOTE(review): versions are not pinned, so results may drift as packages are updated.
!pip install allennlp wget youtokentome ipymarkup seqeval> /dev/null


Cloning into 'cnn_crf_nertagger'...
remote: Enumerating objects: 210, done.[K
remote: Counting objects: 100% (210/210), done.[K
remote: Compressing objects: 100% (128/128), done.[K
remote: Total 210 (delta 103), reused 160 (delta 65), pack-reused 0[K
Receiving objects: 100% (210/210), 198.63 KiB | 318.00 KiB/s, done.
Resolving deltas: 100% (103/103), done.


In [0]:
%load_ext autoreload
%autoreload 2

import spacy
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from torch.nn import functional as F
from torch.utils.data import TensorDataset
from allennlp.data.dataset_readers.conll2003 import Conll2003DatasetReader
from allennlp.common.util import ensure_list
from allennlp.data.vocabulary import Vocabulary
from allennlp.training.metrics import SpanBasedF1Measure
from torch import nn
from torch.nn import functional as F
# from torch.utils.data import TensorDataset
from pdb import set_trace
from gc import collect
from tqdm.notebook import tqdm
import wget
from multiprocessing import cpu_count
import cnn_crf_nertagger
from cnn_crf_nertagger.modules.modules import NERTaggerModel, NERTagger
from cnn_crf_nertagger.utils.pipeline import train_eval_loop, predict_with_model
from cnn_crf_nertagger.utils.prepare import tag_corpus_to_tensor, tokenize_corpus, make_yttm_tokenizer, highlight_text, tensor_to_tags, ConllDataset
from os import path, listdir
import pandas as pd
import seaborn as sns
from matplotlib import rcParams
from seqeval.metrics import classification_report
# Apply seaborn's default plot theme.
sns.set()
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Default figure size (width, height) for all plots in this notebook.
rcParams['figure.figsize'] = 12, 12
# cuDNN is not forced to use deterministic algorithms.
# NOTE(review): no random seed is set anywhere in view, so runs are not reproducible.
torch.backends.cudnn.deterministic=False 

## Подготовка данных




### Загружаем корпуса

In [0]:
# Delete every file in the data directory (verbosely) before downloading the corpora;
# the directory itself is kept.
!rm -fv /content/cnn_crf_nertagger/data/*


removed '/content/cnn_crf_nertagger/data/readme.txt'


In [0]:
# Local locations of the CoNLL-2003 splits inside the cloned repository.
path_data = '/content/cnn_crf_nertagger/data/'
path_train = '/content/cnn_crf_nertagger/data/eng.train'
path_valid = '/content/cnn_crf_nertagger/data/eng.testa'
path_test = '/content/cnn_crf_nertagger/data/eng.testb'

# Target file name -> URL of the corresponding raw corpus file.
dataset_urls = {
    'eng.testa': 'https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.testa',
    'eng.testb': 'https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.testb',
    'eng.train': 'https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.train',
}

# Download each split into path_data under its own file name.
for file_name, url in dataset_urls.items():
    destination = path.join(path_data, file_name)
    wget.download(url, destination)

In [0]:
%%time
# Parse the three CoNLL-2003 files with AllenNLP's reader (default settings).
# ensure_list is applied to whatever read() returns so that each split supports
# len() and `+` concatenation in the following cells.
conll_reader = Conll2003DatasetReader()
train_conll = ensure_list(conll_reader.read(path_train))
valid_conll = ensure_list(conll_reader.read(path_valid))
test_conll = ensure_list(conll_reader.read(path_test))

14041it [00:01, 12731.85it/s]
3250it [00:00, 9210.15it/s]
3453it [00:00, 25483.36it/s]

CPU times: user 1.52 s, sys: 73.7 ms, total: 1.6 s
Wall time: 1.6 s





In [0]:
# Pool the three splits (train, then valid, then test) into one list of instances.
all_conll = [*train_conll, *valid_conll, *test_conll]
# Cell output: size of the pooled corpus followed by the size of each split.
tuple(len(split) for split in (all_conll, train_conll, valid_conll, test_conll))

(20744, 14041, 3250, 3453)

### Готовим словари соответствия тег-id и BPE-токенизатор



In [0]:
# Build the AllenNLP vocabulary from all three splits pooled together.
vocab = Vocabulary.from_instances(all_conll)
# Register an extra '[PAD]' entry in the 'labels' namespace.
vocab.add_token_to_namespace('[PAD]', 'labels')

# tag -> id mapping required by later cells (model construction, id2tag).
# It was never defined in the notebook, so those cells raised NameError on a
# fresh kernel; it is derived here from the same 'labels' namespace.
# NOTE(review): confirm ConllDataset encodes tags through this namespace as well.
tag2id = vocab.get_token_to_index_vocabulary('labels')

In [0]:

# Longest sentence, measured in tokens, over the pooled corpus (0 if it is empty).
max_sent_len = 0
for instance in all_conll:
    max_sent_len = max(max_sent_len, len(instance['tokens']))

print(f'Максимальная длина предложения: {max_sent_len} токенов')

Максимальная длина предложения: 124 токенов


In [0]:
# Build the BPE tokenizer from the training split only (timed with %time).
# The second argument is 500 — presumably the subword vocabulary size;
# confirm against make_yttm_tokenizer.
%time bpe_tokenizer = make_yttm_tokenizer(train_conll, 500)

CPU times: user 147 ms, sys: 34.1 ms, total: 181 ms
Wall time: 133 ms


In [0]:
%%time
# Flatten the pooled corpus into one list of tokens. `tokens` was never defined
# in the notebook, so this cell raised NameError on a fresh kernel.
# NOTE(review): all splits are scanned because max_token_len is later applied to
# the train, valid and test datasets alike — confirm this matches the intent.
tokens = [token for instance in all_conll for token in instance['tokens'].tokens]

# For BPE-dropout probabilities 0.00, 0.01, ..., 1.00 find the largest number of
# subtokens any single token is split into, and remember the probability at
# which that maximum was first observed.
max_token_len = 0
p_dropout = 0.0  # bound up front so the final print cannot hit an undefined name
for p in tqdm(range(101)):
    dropout_prob = p / 100
    longest = max(
        (len(bpe_tokenizer.encode(token.text, dropout_prob=dropout_prob)) for token in tokens),
        default=0,  # tolerate an empty corpus
    )
    if longest > max_token_len:
        # Report exactly the probability that was passed to the tokenizer.
        p_dropout = dropout_prob
        max_token_len = longest

print(f'Максимальная длина токена: {max_token_len} субтокенов при вероятности дропаута {p_dropout}')

HBox(children=(FloatProgress(value=0.0, max=101.0), HTML(value='')))


Максимальная длина токена: 62 субтокенов при вероятности дропаута 0.1
CPU times: user 10.2 s, sys: 57.3 ms, total: 10.2 s
Wall time: 10.2 s


В датасете присутствует очень мало тегов вида B-XXX. Попробуем чуть размножить предложения с этими тегами. Использование дропаута в bpe внесёт немного разнообразия в эти предложения.

In [0]:
%%time
# BPE-dropout probability passed to the training dataset only.
BPE_DROPOUT = .5
# The training dataset receives two extra positional arguments: 200 and BPE_DROPOUT.
# NOTE(review): the meaning of 200 is not visible here — presumably it controls how
# sentences with B-XXX tags are multiplied (see the markdown above); confirm in ConllDataset.
train_dataset = ConllDataset(train_conll, bpe_tokenizer, vocab, max_sent_len, max_token_len, 200, BPE_DROPOUT)
# Validation and test datasets omit those two arguments and use ConllDataset's defaults.
valid_dataset = ConllDataset(valid_conll, bpe_tokenizer, vocab, max_sent_len, max_token_len)
test_dataset = ConllDataset(test_conll, bpe_tokenizer, vocab, max_sent_len, max_token_len)


CPU times: user 16.4 ms, sys: 0 ns, total: 16.4 ms
Wall time: 16.4 ms


In [0]:
# Cell output: number of items in the train, valid and test datasets.
tuple(len(dataset) for dataset in (train_dataset, valid_dataset, test_dataset))

(24441, 3250, 3453)

## Создаём и обучаем сеть

In [0]:
# File to which the best model's state_dict is saved after training and from
# which it is loaded again before evaluation.
models_path = '/content/cnn_crf_nertagger/models/best_model.pth'

In [0]:
# Drop a model left over from a previous run of this cell so that its memory can
# be reclaimed before a new one is built.
try:
    del model
    collect()
except NameError:
    # Fresh kernel: no model exists yet. Only NameError is handled so that other
    # failures (and KeyboardInterrupt) are no longer silently swallowed.
    print('no model')
finally:
    # Always release cached CUDA memory held by the allocator.
    torch.cuda.empty_cache()


no model


Сеть состоит из двух однотипных свёрточных блоков и CRF. На первом уровне мы осуществляем свёртку над субтокенами и делаем глобальный пулинг. Затем получившиеся эмбеддинги токенов передаём на следующий свёрточный блок. Он значительно глубже, чтобы увеличить рецептивное поле. Выход второго блока передаём в CRF, который возвращает нам `log-likelihood`.

In [0]:
# Release cached CUDA memory before allocating the new model.
torch.cuda.empty_cache()
# Two convolutional backbones configured via keyword dicts: a 3-layer one
# ("single") and a 5-layer one with dilations ("context").
model = NERTaggerModel(
    len(bpe_tokenizer.vocab()), len(tag2id), tag2id, embedding_size=64,
    single_backbone_kwargs=dict(layers_n=3, kernel_size=3, dropout=0.3, dilation=None),
    context_backbone_kwargs=dict(layers_n=5, kernel_size=3, dropout=0.3, dilation=[1, 1, 2, 2, 2]))
# Count parameters with Tensor.numel(): the former np.product alias was removed in
# NumPy 2.0, and numel() gives the same element count without going through NumPy.
print('Количество параметров', sum(t.numel() for t in model.parameters()))

Количество параметров 131621


In [0]:
# try:
#     model.load_state_dict(torch.load(models_path))
# except:
#     print('no model')

In [0]:
# Loss history container; passed to train_eval_loop as `prev_loss` and then
# replaced by the history that train_eval_loop returns.
losses = {}

In [0]:
def make_lr_scheduler(optim):
    """Create the plateau-based learning-rate scheduler for the given optimizer."""
    return torch.optim.lr_scheduler.ReduceLROnPlateau(
        optim,
        patience=4,
        factor=0.1,
        threshold=1e-3,
        verbose=True,
    )


# Train with early stopping; the loop returns the best validation loss, the best
# model and the loss history (which continues from the `losses` passed in).
best_val_loss, best_model, losses = train_eval_loop(
    model,
    train_dataset,
    valid_dataset,
    lr=1e-3,
    epoch_n=200,
    batch_size=256,
    device=device,
    early_stopping_patience=10,
    l2_reg_alpha=1e-6,
    max_batches_per_epoch_train=50,
    max_batches_per_epoch_val=100,
    dataloader_workers_n=cpu_count(),
    lr_scheduler_ctor=make_lr_scheduler,
    verbose_batch=False,
    prev_loss=losses,
)

# Persist the weights of the best model found during training.
torch.save(best_model.state_dict(), models_path)

Эпоха 0
Эпоха: 51 итераций, 27.64 сек
Среднее значение функции потерь на обучении 22.60297296561447
Среднее значение функции потерь на валидации 11.368022918701172
Новая лучшая модель!

Эпоха 1
Эпоха: 51 итераций, 28.58 сек
Среднее значение функции потерь на обучении 9.692762290730196
Среднее значение функции потерь на валидации 8.114393197573149
Новая лучшая модель!

Эпоха 2
Эпоха: 51 итераций, 27.33 сек
Среднее значение функции потерь на обучении 6.5182739800097895
Среднее значение функции потерь на валидации 6.348157809330867
Новая лучшая модель!

Эпоха 3
Эпоха: 51 итераций, 27.67 сек
Среднее значение функции потерь на обучении 4.962647914886475
Среднее значение функции потерь на валидации 5.22773893062885
Новая лучшая модель!

Эпоха 4
Эпоха: 51 итераций, 28.45 сек
Среднее значение функции потерь на обучении 4.01747794712291
Среднее значение функции потерь на валидации 4.534874274180486
Новая лучшая модель!

Эпоха 5
Эпоха: 51 итераций, 27.83 сек
Среднее значение функции потерь на об

In [0]:
# Plot the loss history returned by train_eval_loop; the trailing ';' suppresses the Axes repr.
pd.DataFrame(losses).plot();

In [0]:
# Restore the best weights saved by the training cell. map_location remaps the
# stored tensors onto the device selected at the top of the notebook, so a
# checkpoint written on a GPU can also be loaded in a CPU-only session.
model.load_state_dict(torch.load(models_path, map_location=device))

## Проверки

In [0]:
# Invert the tag -> id mapping.
id2tag = {tag_id: tag for tag, tag_id in tag2id.items()}
# Tags listed in id order; relies on the ids being exactly 0 .. len(tag2id) - 1.
UNIQUE_TAGS = [id2tag[tag_id] for tag_id in range(len(tag2id))]


### Проверка - train

In [0]:
%%time 
# Stack the second element of every training item (the gold tag ids) into one tensor.
train_targets = torch.stack([sample[1] for sample in train_dataset])
train_targets.shape

In [0]:
# Predict on the training dataset, then map gold and predicted ids back to tag strings.
train_pred = predict_with_model(model, train_dataset)
train_golden_tags = tensor_to_tags(train_targets, id2tag)
train_pred_tags = tensor_to_tags(train_pred, id2tag)
# Two seqeval reports: the first with default settings, the second with suffix=True.
print(classification_report(train_golden_tags, train_pred_tags))
print(classification_report(train_golden_tags, train_pred_tags, suffix=True))


### Проверка - valid

In [0]:
%%time 
# Stack the second element of every validation item (the gold tag ids) into one tensor.
valid_targets = torch.stack([sample[1] for sample in valid_dataset])
valid_targets.shape

In [0]:
# Predict on the validation dataset.
valid_pred = predict_with_model(model, valid_dataset)

 
# Map gold and predicted ids back to tag strings.
valid_golden_tags = tensor_to_tags(valid_targets, id2tag)
valid_pred_tags = tensor_to_tags(valid_pred, id2tag)
# Two seqeval reports: the first with default settings, the second with suffix=True.
print(classification_report(valid_golden_tags, valid_pred_tags))
print(classification_report(valid_golden_tags, valid_pred_tags, suffix=True))


### Проверка - test

In [0]:
%%time 
# Stack the second element of every test item (the gold tag ids) into one tensor.
test_targets = torch.stack([sample[1] for sample in test_dataset])
test_targets.shape

In [0]:
# Predict on the test dataset.
test_pred = predict_with_model(model, test_dataset)
 
# Map gold and predicted ids back to tag strings.
test_golden_tags = tensor_to_tags(test_targets, id2tag)
test_pred_tags = tensor_to_tags(test_pred, id2tag)
# The first report (default settings) is broken down by entity type (PER/LOC/ORG/MISC);
# the second (suffix=True) is broken down by I/B, as the output below shows.
print(classification_report(test_golden_tags, test_pred_tags))
print(classification_report(test_golden_tags, test_pred_tags, suffix=True))

           precision    recall  f1-score   support

      PER       0.86      0.84      0.85      1617
      LOC       0.81      0.87      0.84      1668
      ORG       0.72      0.77      0.74      1661
     MISC       0.75      0.73      0.74       702

    micro avg   0.79      0.81      0.80      5648
    macro avg   0.79      0.81      0.80      5648

           precision    recall  f1-score   support

        I       0.91      0.94      0.92      5599
        B       0.11      0.06      0.07        18

    micro avg   0.91      0.94      0.92      5617
    macro avg   0.91      0.94      0.92      5617


## Применение теггера

In [0]:
# Download the small English spaCy model (stdout is discarded) and load it.
# NOTE(review): in the cells in view, `nlp` is referenced only from commented-out code.
!python -m spacy download en_core_web_sm > /dev/null
nlp = spacy.load('en_core_web_sm')

In [0]:

# Wrap the trained model, the BPE tokenizer and the id-ordered tag list into a callable tagger.
# NOTE(review): the meaning of the trailing 0 is not visible here — confirm in NERTagger.
ner_tagger = NERTagger(model, bpe_tokenizer, UNIQUE_TAGS, max_sent_len, max_token_len, 0)

Несколько предложений из новостей с сайта [BBC](https://www.bbc.com/news)

In [0]:
# Raw demo sentences (taken from BBC news, per the markdown above).
test_sentences = [
    'Mr Trump said Mr Linick no longer had his full confidence and that he would be removed in 30 days.',
    'Mr Linick had begun investigating Secretary of State Mike Pompeo for suspected abuse of office, reports say.',
    'Democrats say Mr Trump is retaliating against public servants who want to hold his administration to account.',
    'Donald Trump, who is campaigning for re-election in November, has stepped up his attacks on China in recent weeks, blaming it for the spread of Covid-19.',
    'The team led by Fernando Novas from the Natural Sciences Museum in Buenos Aires discovered many fossils during its month-long field work in Estancia La Anita, in southern Santa Cruz province.',
    "The rehearsal at Kennedy Space Center saw a Falcon-9 vehicle's ascent into the sky deliberately terminated just 80 seconds after lift-off."
]
# Tokenize with the project's own tokenizer; the result is used for highlighting in the next cell.
test_sentences_tokenized = tokenize_corpus(test_sentences)
# Alternative spaCy-based tokenization, kept for reference:
# test_sentences_tokenized = [[token.text for token in nlp.tokenizer(sent) ] for sent in test_sentences]

In [0]:
# Render each demo sentence with its predicted tags highlighted.
# NOTE(review): the tagger is given the raw sentences while highlighting uses the output of
# tokenize_corpus — presumably NERTagger tokenizes the same way internally; confirm,
# otherwise tokens and tags may be misaligned.
for sent_tokens, sent_tags in zip(test_sentences_tokenized, ner_tagger(test_sentences)):
    highlight_text(sent_tokens, sent_tags)