<a href="https://colab.research.google.com/github/schokoro/cnn_crf_nertagger/blob/master/NER_tagger.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CNN и CRF для извлечения именованных сущностей



In [None]:
import torch

In [None]:
# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

In [None]:
# Report which device will be used.
# A torch.device does not compare equal to a plain string, so the check has to
# go through its `.type` attribute; otherwise a CPU-only run falls into the
# CUDA branch and fails on get_device_name().
if device.type == 'cpu':
    print('cpu')
else:
    n_gpu = torch.cuda.device_count()
    print(torch.cuda.get_device_name(0))
    

In [None]:
import sys

# Add the 'cnn_crf_nertagger' directory to the import search path.
# The membership guard keeps the cell idempotent: re-running it no longer
# stacks duplicate entries on sys.path.
if 'cnn_crf_nertagger' not in sys.path:
    sys.path.append('cnn_crf_nertagger')

In [None]:
%load_ext autoreload
%autoreload 2

import spacy
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from torch.nn import functional as F
from torch.utils.data import TensorDataset
from allennlp.data.dataset_readers.conll2003 import Conll2003DatasetReader
from allennlp.common.util import ensure_list
# from allennlp.data.vocabulary import Vocabulary
from allennlp.training.metrics import SpanBasedF1Measure
from torch import nn
from torch.nn import functional as F
# from torch.utils.data import TensorDataset
from pdb import set_trace
from gc import collect
from tqdm.auto import tqdm, trange
import wget
from multiprocessing import cpu_count
import cnn_crf_nertagger
from cnn_crf_nertagger.modules.modules import NERTaggerModel, NERTagger
from cnn_crf_nertagger.utils.pipeline import train_eval_loop, predict_with_model
from cnn_crf_nertagger.utils.prepare import tag_corpus_to_tensor, tokenize_corpus, make_yttm_tokenizer, highlight_text, tensor_to_tags, ConllDataset
from os import path, listdir
import pandas as pd
import seaborn as sns
from matplotlib import rcParams
from seqeval.metrics import classification_report
sns.set()
%matplotlib inline

# Default matplotlib figure size for every plot in the notebook.
rcParams['figure.figsize'] = (12, 12)
# Explicitly switch off cuDNN's deterministic-mode flag.
torch.backends.cudnn.deterministic = False

## Подготовка данных




### Загружаем корпуса

In [None]:
# Locations of the three CoNLL-2003 English split files.
data_path = 'data'
path_data = './data/'
path_train, path_valid, path_test = (
    f'{data_path}/{split_file}'
    for split_file in ('eng.train', 'eng.testa', 'eng.testb')
)

# Optional one-off download of the corpus files into `path_data`
# (left disabled; uncomment to fetch the data).
# dataset_urls = {
#     'eng.testa': 'https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.testa',
#     'eng.testb': 'https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.testb',
#     'eng.train': 'https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.train'}
# for file_name in dataset_urls:
#     wget.download(dataset_urls[file_name], path.join(path_data, file_name))

In [None]:
%%time
# Read each split with one shared reader and materialise it as a list,
# in the order train -> valid -> test.
conll_reader = Conll2003DatasetReader()
train_conll, valid_conll, test_conll = (
    ensure_list(conll_reader.read(split_path))
    for split_path in (path_train, path_valid, path_test)
)

In [None]:
# Concatenate the three splits, then show the sizes: (all, train, valid, test).
all_conll = [*train_conll, *valid_conll, *test_conll]
tuple(map(len, (all_conll, train_conll, valid_conll, test_conll)))

### Готовим словари соответствия тег–id и BPE-токенизатор



In [None]:
# One pass over every split to gather:
#   * the set of distinct tag labels,
#   * the set of distinct token strings,
#   * the length (in tokens) of the longest sentence.
tags = set()
tokens = set()
max_sent_len = 0

for instance in all_conll:
    sentence = instance['tokens']
    max_sent_len = max(max_sent_len, len(sentence))
    tags.update(instance['tags'])
    tokens.update(token.text for token in sentence)

print(f'Максимальная длина предложения: {max_sent_len} токенов')

In [None]:
# Build the BPE tokenizer from the training split only (timed with %time).
# NOTE(review): 400 is presumably the BPE vocabulary size — confirm against make_yttm_tokenizer.
%time bpe_tokenizer = make_yttm_tokenizer(train_conll, 400)

In [None]:
%%time
# Estimate the longest sub-token sequence a single token can produce.
# Every distinct token is encoded once per dropout probability 0.00 .. 1.00
# (step 0.01), and the largest length seen is kept together with the
# probability that produced it.
max_token_len = 0
p_dropout = 0.0  # defined up front so the report below can never hit an unset name
for p in trange(101):
    # Use one value both for encoding and for reporting; the previous
    # `.01 * p` report could differ from `p / 100` by float noise.
    dropout_prob = p / 100
    len_tokens = [len(bpe_tokenizer.encode(token, dropout_prob=dropout_prob)) for token in tokens]
    longest = max(len_tokens)
    if longest > max_token_len:
        p_dropout = dropout_prob
        max_token_len = longest

print(f'Максимальная длина токена: {max_token_len} субтокенов при вероятности дропаута {p_dropout}')

In [None]:
# Map every tag to an integer id; id 0 is reserved for the '<NOTAG>' label.
# The tags are sorted because iterating a set of strings has no stable order
# between interpreter sessions: without sorting, the ids (and therefore the
# meaning of a saved checkpoint's outputs) would change from run to run.
tag2id = {tag: num for num, tag in enumerate(['<NOTAG>'] + sorted(tags))}

В датасете присутствует очень мало тегов вида B-XXX. Попробуем чуть размножить предложения с этими тегами. Использование дропаута в bpe внесёт немного разнообразия в эти предложения.

In [None]:
%%time
# BPE dropout probability, passed to the training dataset only.
BPE_DROPOUT = .25

# Positional arguments common to all three datasets.
shared_args = (bpe_tokenizer, tag2id, max_sent_len, max_token_len)

# NOTE(review): the extra `50` given to the training set is presumably the
# oversampling factor for sentences with rare B-XXX tags — confirm in ConllDataset.
train_dataset = ConllDataset(train_conll, *shared_args, 50, BPE_DROPOUT)
valid_dataset = ConllDataset(valid_conll, *shared_args)
test_dataset = ConllDataset(test_conll, *shared_args)


In [None]:
# Sizes of the (train, valid, test) datasets.
tuple(len(dataset) for dataset in (train_dataset, valid_dataset, test_dataset))

## Создаём и обучаем сеть

In [None]:
# Checkpoint file: the training cell saves the best model's state_dict here
# and a later cell loads it back into `model`.
models_path = './models/best_model.pth'

In [None]:
# Drop a model left over from a previous run so its memory can be reclaimed
# before a new one is built.
try:
    del model
except NameError:
    # Only "model was never defined" is expected here; any other error
    # should surface instead of being silently swallowed.
    print('no model')
else:
    collect()
finally:
    # Always release cached CUDA memory, whether or not a model existed.
    torch.cuda.empty_cache()


Сеть состоит из двух однотипных свёрточных блоков и CRF. На первом уровне мы осуществляем свёртку над субтокенами и делаем глобальный пулинг. Затем получившиеся эмбеддинги токенов передаём на следующий свёрточный блок. Он значительно глубже, чтобы увеличить рецептивное поле. Выход второго блока передаём в CRF, который возвращает нам `log-likelihood`.

In [None]:
# Build the tagger: a shallow backbone (3 layers) configured via
# `single_backbone_kwargs` and a deeper one (6 layers, dilation rising to 2)
# configured via `context_backbone_kwargs`.
torch.cuda.empty_cache()
model = NERTaggerModel(
    len(bpe_tokenizer.vocab()),
    len(tag2id),
    tag2id,
    embedding_size=64,
    single_backbone_kwargs=dict(layers_n=3, kernel_size=3, dropout=0.2, dilation=[1, 1, 1]),
    context_backbone_kwargs=dict(layers_n=6, kernel_size=3, dropout=0.1, dilation=[1, 1, 1, 2, 2, 2]),
)
# Count parameters with Tensor.numel(); np.product was removed in NumPy 2.0.
print('Количество параметров', sum(param.numel() for param in model.parameters()))

In [None]:
# try:
#     model.load_state_dict(torch.load(models_path))
# except:
#     print('no model')

In [None]:
# Start with an empty loss history; it is passed to train_eval_loop as
# `prev_loss` and then rebound to the history that call returns.
losses = {}

In [None]:
# Train the model and persist the best weights.
# The checkpoint directory is created *before* training so that a missing
# folder cannot make torch.save fail after the whole training run.
import os

models_dir = os.path.dirname(models_path)
if models_dir:  # an empty dirname means "current directory": nothing to create
    os.makedirs(models_dir, exist_ok=True)

# train_eval_loop returns a 3-tuple, unpacked here as
# (best validation loss, best model, loss history).
(best_val_loss,
 best_model,
 losses) = train_eval_loop(
    model,
    train_dataset,
    valid_dataset,
    lr=5e-3,
    epoch_n=200,
    batch_size=400,
    device=device,
    early_stopping_patience=8,
    l2_reg_alpha=1e-6,
    max_batches_per_epoch_train=50,
    max_batches_per_epoch_val=50,
    dataloader_workers_n=cpu_count(),
    # optimizer_ctor=lambda params: torch.optim.SGD(
    #     params,
    #     lr=4e-3,
    #     momentum=0.9,
    #     weight_decay=1e-6
    # ),
    # Scale the learning rate by 0.1 (down to min_lr) after 4 epochs
    # without sufficient improvement.
    lr_scheduler_ctor=lambda optim: torch.optim.lr_scheduler.ReduceLROnPlateau(
        optim, patience=4,
        factor=0.1,
        threshold=1e-2,
        verbose=True,
        min_lr=1e-7),
    verbose_batch=False,
    verbose_liveloss=False,
    prev_loss=losses
)

torch.save(best_model.state_dict(), models_path)

In [None]:
# Plot the loss history returned by training; the trailing ';' suppresses the
# textual repr of the plot object.
pd.DataFrame(losses).plot();

In [None]:
# Restore the saved weights into `model`.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU machine also loads on a CPU-only one.
model.load_state_dict(torch.load(models_path, map_location=device))

## Проверки

In [None]:
# Inverse mapping id -> tag, plus the tag names listed in id order.
id2tag = {tag_id: tag for tag, tag_id in tag2id.items()}
UNIQUE_TAGS = [id2tag[tag_id] for tag_id in range(len(tag2id))]


### Проверка - train

In [None]:
%%time 
# Stack the target tensors (element 1 of each dataset item) for the train split.
train_targets = torch.stack([sample[1] for sample in train_dataset])
train_targets.shape

In [None]:
# Predict on the train split, convert gold and predicted id tensors to tag
# strings, and print the seqeval report twice: first with the default prefix
# tag scheme, then with suffix=True.
train_pred = predict_with_model(model, train_dataset)
train_golden_tags, train_pred_tags = (
    tensor_to_tags(tag_ids, id2tag) for tag_ids in (train_targets, train_pred)
)
for use_suffix in (False, True):
    print(classification_report(train_golden_tags, train_pred_tags, digits=4, suffix=use_suffix))


### Проверка - valid

In [None]:
%%time 
# Stack the target tensors (element 1 of each dataset item) for the validation split.
valid_targets = torch.stack([sample[1] for sample in valid_dataset])
valid_targets.shape

In [None]:
# Predict on the validation split, convert gold and predicted id tensors to
# tag strings, and print the seqeval report twice: first with the default
# prefix tag scheme, then with suffix=True.
valid_pred = predict_with_model(model, valid_dataset)
valid_golden_tags, valid_pred_tags = (
    tensor_to_tags(tag_ids, id2tag) for tag_ids in (valid_targets, valid_pred)
)
for use_suffix in (False, True):
    print(classification_report(valid_golden_tags, valid_pred_tags, digits=4, suffix=use_suffix))


### Проверка - test

In [None]:
%%time 
# Stack the target tensors (element 1 of each dataset item) for the test split.
test_targets = torch.stack([sample[1] for sample in test_dataset])
test_targets.shape

In [None]:
# Predict on the test split, convert gold and predicted id tensors to tag
# strings, and print the seqeval report twice: first with the default prefix
# tag scheme, then with suffix=True.
test_pred = predict_with_model(model, test_dataset)
test_golden_tags, test_pred_tags = (
    tensor_to_tags(tag_ids, id2tag) for tag_ids in (test_targets, test_pred)
)
for use_suffix in (False, True):
    print(classification_report(test_golden_tags, test_pred_tags, digits=4, suffix=use_suffix))

## Применение теггера

In [None]:
# Download spaCy's small English pipeline (shell output discarded), then load it.
# NOTE(review): in the visible cells `nlp` is only referenced by a
# commented-out tokenizer line — confirm whether this download is still needed.
!python -m spacy download en_core_web_sm > /dev/null
nlp = spacy.load('en_core_web_sm')

In [None]:

# Wrap the trained model, the BPE tokenizer and the id-ordered tag list into a
# callable tagger that is later applied to raw sentences.
# NOTE(review): the trailing 0 is presumably the BPE dropout probability used
# at inference time — confirm against NERTagger's signature.
ner_tagger = NERTagger(model, bpe_tokenizer, UNIQUE_TAGS, max_sent_len, max_token_len, 0)

Несколько предложений из новостей с сайта [BBC](https://www.bbc.com/news)

In [None]:
# Sample English news sentences used to demonstrate the tagger on unseen text.
test_sentences = [
    'Mr Trump said Mr Linick no longer had his full confidence and that he would be removed in 30 days.',
    'Mr Linick had begun investigating Secretary of State Mike Pompeo for suspected abuse of office, reports say.',
    'Democrats say Mr Trump is retaliating against public servants who want to hold his administration to account.',
    'Donald Trump, who is campaigning for re-election in November, has stepped up his attacks on China in recent weeks, blaming it for the spread of Covid-19.',
    'The team led by Fernando Novas from the Natural Sciences Museum in Buenos Aires discovered many fossils during its month-long field work in Estancia La Anita, in southern Santa Cruz province.',
    "The rehearsal at Kennedy Space Center saw a Falcon-9 vehicle's ascent into the sky deliberately terminated just 80 seconds after lift-off."
]
# Tokenised copy of the sentences; used only for display alongside the predicted tags.
test_sentences_tokenized = tokenize_corpus(test_sentences)
# Alternative spaCy-based tokenisation, kept for reference:
# test_sentences_tokenized = [[token.text for token in nlp.tokenizer(sent) ] for sent in test_sentences]

In [None]:
# Tag the raw sentences once, then render each tokenised sentence together
# with its predicted tags.
predicted_tags = ner_tagger(test_sentences)
for words, labels in zip(test_sentences_tokenized, predicted_tags):
    highlight_text(words, labels)