<a href="https://colab.research.google.com/github/schokoro/cnn_crf_nertagger/blob/master/NER_tagger.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CNN и CRF для извлечения именованных сущностей



In [1]:
import torch

In [2]:
# Run on the first CUDA device when PyTorch can see one, otherwise on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda', index=0)

In [3]:
# A `torch.device` never compares equal to a plain string, so the device kind
# has to be read from its `.type` attribute; comparing against 'cpu' directly
# would send CPU-only machines into the CUDA branch below.
if device.type == 'cpu':
    print('cpu')
else:
    # Number of visible GPUs and the name of the first one.
    n_gpu = torch.cuda.device_count()
    print(torch.cuda.get_device_name(0))
    

Tesla K80


In [4]:
import sys

# Expose the repository checkout (absolute Colab location) to the import system.
sys.path += ['/content/cnn_crf_nertagger']

In [5]:
import sys

# Same checkout, addressed relative to the current working directory.
sys.path += ['cnn_crf_nertagger']

In [6]:
!rm -rf cnn_crf_nertagger/
!git clone https://github.com/schokoro/cnn_crf_nertagger.git > /dev/null  #  -b dev



Cloning into 'cnn_crf_nertagger'...
remote: Enumerating objects: 339, done.[K
remote: Counting objects: 100% (60/60), done.[K
remote: Compressing objects: 100% (39/39), done.[K
remote: Total 339 (delta 23), reused 49 (delta 16), pack-reused 279[K
Receiving objects: 100% (339/339), 1.25 MiB | 9.29 MiB/s, done.
Resolving deltas: 100% (162/162), done.


In [7]:
!pip install -U allennlp pywget youtokentome ipymarkup seqeval livelossplot google-cloud-storage> /dev/null

In [8]:
%load_ext autoreload
%autoreload 2
from pywget import wget
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from allennlp.data.dataset_readers.conll2003 import Conll2003DatasetReader
from allennlp.common.util import ensure_list
from multiprocessing import cpu_count
from modules.modules import CNN_RNN_CRF, NERTagger, CNN_CNN_CRF
from utils.pipeline import train_eval_loop, predict_with_model
from utils.prepare import tag_corpus_to_tensor, tokenize_corpus, highlight_text, tensor_to_tags, ConllDataset
from os import path, listdir
import pandas as pd
import seaborn as sns
from matplotlib import rcParams
from seqeval.metrics import classification_report
sns.set()
%matplotlib inline

rcParams['figure.figsize'] = 12, 12
torch.backends.cudnn.deterministic=False 

## Подготовка данных




### Загружаем корпуса

In [9]:
# Delete every file in the checkout's data directory (`-v` lists what was removed,
# `-f` ignores missing files); the CoNLL-2003 files are fetched separately below.
!rm -fv /content/cnn_crf_nertagger/data/*


removed '/content/cnn_crf_nertagger/data/eng.testa'
removed '/content/cnn_crf_nertagger/data/eng.testb'
removed '/content/cnn_crf_nertagger/data/eng.train'
removed '/content/cnn_crf_nertagger/data/readme.txt'


In [10]:
# Directory that receives the downloaded CoNLL-2003 files.
data_path = '/content/'
path_data = './data/'  # not referenced by the other cells visible in this notebook

# Paths are built with `path.join`, the same helper the download loop uses, so
# they carry no doubled separator ('/content//eng.train').
path_train = path.join(data_path, 'eng.train')
path_valid = path.join(data_path, 'eng.testa')
path_test = path.join(data_path, 'eng.testb')

# File name -> source URL of each corpus split.
dataset_urls = {
    'eng.testa': 'https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.testa',
    'eng.testb': 'https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.testb',
    'eng.train': 'https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.train'}

for file_name, url in dataset_urls.items():
    target = path.join(data_path, file_name)
    # Files already on disk are kept, so re-running the cell does not download them again.
    if not path.exists(target):
        wget.download(url, target)

In [11]:
%%time
conll_reader = Conll2003DatasetReader()
train_conll = ensure_list(conll_reader.read(path_train))
valid_conll = ensure_list(conll_reader.read(path_valid))
test_conll = ensure_list(conll_reader.read(path_test))

CPU times: user 2.14 s, sys: 123 ms, total: 2.26 s
Wall time: 2.27 s


In [12]:
# One list with the sentences of all three splits, then the size of each collection.
all_conll = [*train_conll, *valid_conll, *test_conll]
tuple(len(part) for part in (all_conll, train_conll, valid_conll, test_conll))

(20744, 14041, 3250, 3453)

### Готовим словари соответствия тег–id и BPE-токенизатор



In [13]:
# Scan every sentence once, gathering the distinct tags, the distinct token
# strings and the length of the longest sentence.
tags, tokens = set(), set()
max_sent_len = 0

for instance in all_conll:
    max_sent_len = max(max_sent_len, len(instance['tokens']))
    tags.update(instance['tags'])
    tokens.update(word.text for word in instance['tokens'])

print(f'Максимальная длина предложения: {max_sent_len} токенов')

Максимальная длина предложения: 124 токенов


In [14]:
%%time
chars = set()
for token in tokens:
    chars.update(token)
tag2id = {tag: num for num, tag in enumerate(['<NOTAG>'] + list(tags))}
char2id = {char: num+1 for num, char in enumerate(chars)}
# print(f'Максимальная длина токена: {max_token_len} субтокенов при вероятности дропаута {p_dropout}')

CPU times: user 26.1 ms, sys: 0 ns, total: 26.1 ms
Wall time: 26.1 ms


В датасете присутствует очень мало тегов вида B-XXX. Попробуем чуть размножить предложения с этими тегами. Использование дропаута в bpe внесёт немного разнообразия в эти предложения.

In [15]:
%%time
BPE_DROPOUT = .25
max_token_len = max([len(token) for token in tokens])
train_dataset = ConllDataset(train_conll, char2id, tag2id, max_sent_len, max_token_len)
valid_dataset = ConllDataset(valid_conll, char2id, tag2id, max_sent_len, max_token_len)
test_dataset = ConllDataset(test_conll, char2id, tag2id, max_sent_len, max_token_len)


CPU times: user 14.1 ms, sys: 9 µs, total: 14.1 ms
Wall time: 14.2 ms


In [16]:
# Number of items in each dataset: (train, valid, test).
tuple(len(dataset) for dataset in (train_dataset, valid_dataset, test_dataset))

(14041, 3250, 3453)

## Создаём и обучаем сеть

In [17]:
# File the best model's state_dict is written to after training and read back from later.
models_path = '/content/best_model.pth'

In [18]:
import gc

# Drop a model left over from an earlier run of the notebook so its memory can
# be reclaimed before a new one is built.
try:
    del model
except NameError:
    # Nothing to clean up: no model exists in this session yet.
    print('no model')
else:
    # Run a collection pass only when a model was actually deleted.
    gc.collect()
finally:
    # Release the cached CUDA memory held by PyTorch's allocator.
    torch.cuda.empty_cache()


no model


Сеть состоит из двух однотипных свёрточных блоков и CRF. На первом уровне мы осуществляем свёртку над субтокенами и делаем глобальный пулинг. Затем получившиеся эмбеддинги токенов передаём на следующий свёрточный блок. Он значительно глубже, чтобы увеличить рецептивное поле. Выход второго блока передаём в CRF, который возвращает нам `log-likelihood`.

In [19]:
torch.cuda.empty_cache()
# Alternative model, left disabled:
# model = CNN_RNN_CRF(
#     len(char2id), len(tag2id), tag2id, embedding_size=64,
#     single_backbone_kwargs=dict(layers_n=3, kernel_size=3, dropout=0.2, dilation=[1, 1, 1]),
#     rnn_hidden_size=256, rnn_layer=2, dropout=0)

# Two convolutional backbones: a 3-layer one configured via `single_backbone_kwargs`
# and a 6-layer one (with dilation 2 in its last three layers) via `context_backbone_kwargs`.
model = CNN_CNN_CRF(
    len(char2id),
    len(tag2id),
    tag2id,
    embedding_size=64,
    single_backbone_kwargs={
        'layers_n': 3,
        'kernel_size': 3,
        'dropout': 0.2,
        'dilation': [1, 1, 1],
    },
    context_backbone_kwargs={
        'layers_n': 6,
        'kernel_size': 3,
        'dropout': 0.1,
        'dilation': [1, 1, 1, 2, 2, 2],
    },
    dropout1=0.3,
)

In [20]:
# try:
#     model.load_state_dict(torch.load(models_path))
# except:
#     print('no model')

In [21]:
# Loss history: handed to train_eval_loop as `prev_loss` and rebound to the history it returns.
losses = {}

In [None]:
# Run training; the three results of the call are bound to the best validation
# loss, the best model and the loss history.
best_val_loss, best_model, losses = train_eval_loop(
    model,
    train_dataset,
    valid_dataset,
    lr=1e-3,
    epoch_n=10,
    batch_size=256,
    device=device,
    early_stopping_patience=8,
    l2_reg_alpha=1e-6,
    max_batches_per_epoch_train=50,
    max_batches_per_epoch_val=50,
    dataloader_workers_n=cpu_count(),
    # Alternative optimizer, left disabled:
    # optimizer_ctor=lambda params: torch.optim.SGD(
    #     params,
    #     lr=1e-2,
    #     weight_decay=1e-6
    # ),
    # Multiply the learning rate by 0.1 after 4 epochs without improvement,
    # never going below 1e-5.
    lr_scheduler_ctor=lambda optim: torch.optim.lr_scheduler.ReduceLROnPlateau(
        optim,
        patience=4,
        factor=0.1,
        threshold=1e-2,
        verbose=True,
        min_lr=1e-5,
    ),
    verbose_batch=False,
    verbose_liveloss=False,
    prev_loss=losses,
)

# Persist the weights of the best model so later cells can reload them.
torch.save(best_model.state_dict(), models_path)

Эпоха 0
Эпоха: 51 итераций, 209.99 сек
Среднее значение функции потерь на обучении 19.779133964987363
Среднее значение функции потерь на валидации 10.198091873755821
Новая лучшая модель!

Эпоха 1
Эпоха: 51 итераций, 209.24 сек
Среднее значение функции потерь на обучении 8.831965390373679
Среднее значение функции потерь на валидации 6.679713982802171
Новая лучшая модель!

Эпоха 2


In [None]:
# Plot the loss history returned by training; the trailing `;` hides the Axes repr.
pd.DataFrame(losses).plot();

In [None]:
# `map_location` remaps the stored tensors onto the active device, so a
# checkpoint written on a GPU machine also loads on a CPU-only one.
model.load_state_dict(torch.load(models_path, map_location=device))

## Проверки

In [None]:
# Invert the tag map, then list the tag names in ascending id order.
id2tag = {tag_id: tag for tag, tag_id in tag2id.items()}
UNIQUE_TAGS = list(map(id2tag.__getitem__, range(len(tag2id))))


### Проверка - train

In [None]:
%%time 
train_targets = [item[1] for item in train_dataset]
train_targets = torch.stack(train_targets)
train_targets.shape

In [None]:
# Predict on the training set, convert both id tensors to tags and print the
# seqeval report twice: first with `suffix=False`, then with `suffix=True`.
train_pred = predict_with_model(model, train_dataset)
train_golden_tags = tensor_to_tags(train_targets, id2tag)
train_pred_tags = tensor_to_tags(train_pred, id2tag)
for use_suffix in (False, True):
    print(classification_report(train_golden_tags, train_pred_tags, digits=4, suffix=use_suffix))


### Проверка - valid

In [None]:
%%time 
valid_targets = [item[1] for item in valid_dataset]
valid_targets = torch.stack(valid_targets)
valid_targets.shape

In [None]:
# Predict on the validation set, convert both id tensors to tags and print the
# seqeval report twice: first with `suffix=False`, then with `suffix=True`.
valid_pred = predict_with_model(model, valid_dataset)
valid_golden_tags = tensor_to_tags(valid_targets, id2tag)
valid_pred_tags = tensor_to_tags(valid_pred, id2tag)
for use_suffix in (False, True):
    print(classification_report(valid_golden_tags, valid_pred_tags, digits=4, suffix=use_suffix))


### Проверка - test

In [None]:
%%time 
test_targets = [item[1] for item in test_dataset]
test_targets = torch.stack(test_targets)
test_targets.shape

In [None]:
# Predict on the test set, convert both id tensors to tags and print the
# seqeval report twice: first with `suffix=False`, then with `suffix=True`.
test_pred = predict_with_model(model, test_dataset)
test_golden_tags = tensor_to_tags(test_targets, id2tag)
test_pred_tags = tensor_to_tags(test_pred, id2tag)
for use_suffix in (False, True):
    print(classification_report(test_golden_tags, test_pred_tags, digits=4, suffix=use_suffix))

## Применение теггера

In [None]:
!python -m spacy download en_core_web_sm > /dev/null
nlp = spacy.load('en_core_web_sm')

In [None]:

# NOTE(review): `bpe_tokenizer` is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel — confirm which object NERTagger expects here
# (the datasets above are built from `char2id`).
ner_tagger = NERTagger(model, bpe_tokenizer, UNIQUE_TAGS, max_sent_len, max_token_len, 0)

Несколько предложений из новостей с сайта [BBC](https://www.bbc.com/news)

In [None]:
# Raw example sentences for the demo below.
test_sentences = [
    'Mr Trump said Mr Linick no longer had his full confidence and that he would be removed in 30 days.',
    'Mr Linick had begun investigating Secretary of State Mike Pompeo for suspected abuse of office, reports say.',
    'Democrats say Mr Trump is retaliating against public servants who want to hold his administration to account.',
    'Donald Trump, who is campaigning for re-election in November, has stepped up his attacks on China in recent weeks, blaming it for the spread of Covid-19.',
    'The team led by Fernando Novas from the Natural Sciences Museum in Buenos Aires discovered many fossils during its month-long field work in Estancia La Anita, in southern Santa Cruz province.',
    "The rehearsal at Kennedy Space Center saw a Falcon-9 vehicle's ascent into the sky deliberately terminated just 80 seconds after lift-off."
]
# Tokenized copy of the sentences; used only for display in the final cell
# (the tagger itself receives the raw strings).
test_sentences_tokenized = tokenize_corpus(test_sentences)
# Alternative tokenization through the spaCy pipeline, left disabled:
# test_sentences_tokenized = [[token.text for token in nlp.tokenizer(sent) ] for sent in test_sentences]

In [None]:
# Tag the raw sentences, then pass each tokenized sentence with its tags to highlight_text.
predicted_tags = ner_tagger(test_sentences)
for sent_tokens, sent_tags in zip(test_sentences_tokenized, predicted_tags):
    highlight_text(sent_tokens, sent_tags)