<a href="https://colab.research.google.com/github/schokoro/cnn_crf_nertagger/blob/dev/NER_tagger.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CNN и CRF для извлечения именованных сущностей



In [1]:
import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if device == 'cpu':
    print('cpu')
else:
    n_gpu = torch.cuda.device_count()
    print(torch.cuda.get_device_name(0))
    

Tesla T4


In [0]:
!rm -rf cnn_crf_nertagger/

In [3]:
!git clone -b dev https://github.com/schokoro/cnn_crf_nertagger.git > /dev/null
!pip install allennlp wget> /dev/null


Cloning into 'cnn_crf_nertagger'...
remote: Enumerating objects: 117, done.[K
remote: Counting objects: 100% (117/117), done.[K
remote: Compressing objects: 100% (64/64), done.[K
remote: Total 117 (delta 50), reused 99 (delta 38), pack-reused 0[K
Receiving objects: 100% (117/117), 33.39 KiB | 16.70 MiB/s, done.
Resolving deltas: 100% (50/50), done.


In [0]:
import sys
sys.path.append('/content/cnn_crf_nertagger')

In [0]:
%load_ext autoreload
%autoreload 2

import spacy
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import classification_report
import numpy as np
from torch.nn import functional as F
from torch.utils.data import TensorDataset
from allennlp.data.dataset_readers.conll2003 import Conll2003DatasetReader
from allennlp.common.util import ensure_list
from torch import nn
from torch.nn import functional as F
from torch.utils.data import TensorDataset
from pdb import set_trace
from gc import collect
import wget
import cnn_crf_nertagger
from cnn_crf_nertagger.modules.modules import NERTaggerModel, NERTagger
from cnn_crf_nertagger.utils.pipeline import train_eval_loop, predict_with_model
from cnn_crf_nertagger.utils.prepare import tag_corpus_to_tensor, tokenize_corpus
from os import path, listdir



# torch.backends.cudnn.deterministic=False 

## Загрузка корпусов

In [0]:
!rm /content/cnn_crf_nertagger/data/*


In [0]:
path_data = '/content/cnn_crf_nertagger/data/'
path_train = '/content/cnn_crf_nertagger/data/eng.train'
path_valid = '/content/cnn_crf_nertagger/data/eng.testa'
path_test = '/content/cnn_crf_nertagger/data/eng.testb'

dataset_urls = {
    'eng.testa': 'https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.testa',
    'eng.testb': 'https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.testb',
    'eng.train': 'https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.train'}
for file_name in dataset_urls:
    wget.download(dataset_urls[file_name], path.join(path_data, file_name))

In [8]:
%%time
conll_reader = Conll2003DatasetReader()
train_conll = ensure_list(conll_reader.read(path_train))
valid_conll = ensure_list(conll_reader.read(path_valid))
test_conll = ensure_list(conll_reader.read(path_test))

14041it [00:01, 13300.16it/s]
3250it [00:00, 10023.37it/s]
3453it [00:00, 25732.70it/s]

CPU times: user 1.46 s, sys: 67.8 ms, total: 1.52 s
Wall time: 1.52 s





In [9]:
all_conll = train_conll + valid_conll + test_conll
len(all_conll), len(train_conll), len(valid_conll), len(test_conll)

(20744, 14041, 3250, 3453)

In [10]:
tags = set()
tokens = set()

max_sent_len = 0
for instance in all_conll[: ]:
    if len(instance['tokens']) >  max_sent_len:
        max_sent_len = len(instance['tokens'])
    tags.update(instance['tags'])
    tokens.update(instance['tokens'])
    
print(f'Максимальная длина предложения: {max_sent_len} токенов')

Максимальная длина предложения: 124 токенов


In [11]:
max_token_len = max([len(token.text) for token in tokens])
print(f'Максимальная длина токена: {max_token_len} символов')

Максимальная длина токена: 61 символов


Находим множество всех символов

In [12]:
chars = set()
for token in tokens:
    chars.update(token.text)
    
len(chars)

85

Составляем словари соответсвия тег-id, символ-id



In [0]:
tag2id = {tag: num for num, tag in enumerate(['<NOTAG>'] + list(tags))}
char2id = {char: num+1 for num, char in enumerate(chars)}
id2char = {item[1]: item[0] for item in char2id.items()}

In [14]:
%%time

train_inputs, train_targets = tag_corpus_to_tensor(train_conll, char2id, tag2id, max_sent_len, max_token_len, 50)
valid_inputs, valid_targets = tag_corpus_to_tensor(valid_conll, char2id, tag2id, max_sent_len, max_token_len)
test_inputs, test_targets = tag_corpus_to_tensor(test_conll, char2id, tag2id, max_sent_len, max_token_len)

100%|██████████| 14041/14041 [00:07<00:00, 1759.75it/s]
100%|██████████| 3250/3250 [00:02<00:00, 1367.17it/s]
100%|██████████| 3453/3453 [00:01<00:00, 2125.12it/s]

CPU times: user 12.2 s, sys: 853 ms, total: 13.1 s
Wall time: 13.1 s





In [0]:
train_dataset = TensorDataset(train_inputs, train_targets)
valid_dataset = TensorDataset(valid_inputs, valid_targets)
test_dataset = TensorDataset(test_inputs, test_targets)

In [0]:
models_path = '/content/cnn_crf_nertagger/models/best_model.pth'

In [0]:
try:
    del model    
    collect()
except:
    print('no model')
finally:
    torch.cuda.empty_cache()


In [25]:
torch.cuda.empty_cache()
model = NERTaggerModel(len(char2id), len(tag2id), tag2id, embedding_size=64,
                                              single_backbone_kwargs=dict(layers_n=3, kernel_size=3, dropout=0.3, dilation=None),
                                              context_backbone_kwargs=dict(layers_n=5, kernel_size=3, dropout=0.3, dilation=[1,1,2,2,1]))
print('Количество параметров', sum(np.product(t.shape) for t in model.parameters()))

Количество параметров 105061


In [0]:
# try:
#     model.load_state_dict(torch.load(models_path))
# except:
#     print('no model')

In [27]:
(best_val_loss,
 best_model) = train_eval_loop(model,
                               train_dataset,
                               valid_dataset,
                               lr=1.25e-3,
                               epoch_n=200,
                               batch_size=128,
                               device=device,
                               early_stopping_patience=8,
                               l2_reg_alpha = 1e-6,
                               max_batches_per_epoch_train=100,
                               max_batches_per_epoch_val=100,
                               lr_scheduler_ctor=lambda optim: torch.optim.lr_scheduler.ReduceLROnPlateau(
                                                                                     optim, patience=3,
                                                                                     factor=0.2,
                                                                                     threshold=1e-3,
                                                                                     verbose=True),
                               verbose_batch=False)
 
torch.save(best_model.state_dict(), models_path)

Эпоха 0
Эпоха: 101 итераций, 28.92 сек
Среднее значение функции потерь на обучении 10.75617245400306
Среднее значение функции потерь на валидации 5.743589428754953
Новая лучшая модель!

Эпоха 1
Эпоха: 101 итераций, 29.06 сек
Среднее значение функции потерь на обучении 5.055423805029085
Среднее значение функции потерь на валидации 3.9450427156228285
Новая лучшая модель!

Эпоха 2
Эпоха: 101 итераций, 28.48 сек
Среднее значение функции потерь на обучении 3.7963552923485784
Среднее значение функции потерь на валидации 3.2114748450425954
Новая лучшая модель!

Эпоха 3
Эпоха: 101 итераций, 28.92 сек
Среднее значение функции потерь на обучении 3.1256460340896455
Среднее значение функции потерь на валидации 2.787000495653886
Новая лучшая модель!

Эпоха 4
Эпоха: 101 итераций, 28.77 сек
Среднее значение функции потерь на обучении 2.6602400435079443
Среднее значение функции потерь на валидации 2.4811268872939625
Новая лучшая модель!

Эпоха 5
Эпоха: 101 итераций, 28.32 сек
Среднее значение функции 

In [28]:
model.load_state_dict(torch.load(models_path))

<All keys matched successfully>

## Проверки

In [0]:
id2tag = {item[1]: item[0] for item in tag2id.items()}
UNIQUE_TAGS = [id2tag[i] for i in range(len(tag2id))]


### Проверка - train

In [30]:
train_pred = predict_with_model(model, train_dataset)
print(classification_report(train_targets.view(-1), train_pred.reshape(-1), target_names=UNIQUE_TAGS))

521it [00:20, 25.44it/s]                               


              precision    recall  f1-score   support

     <NOTAG>       1.00      1.00      1.00   1810563
      I-MISC       0.90      0.96      0.93      8356
       B-LOC       1.00      0.09      0.17       561
       I-PER       0.97      0.99      0.98     12278
       I-LOC       0.93      0.98      0.95     10486
       I-ORG       0.91      0.95      0.93     11801
      B-MISC       0.98      0.78      0.87      1887
           O       1.00      1.00      1.00    206328
       B-ORG       1.00      0.50      0.67      1224

    accuracy                           1.00   2063484
   macro avg       0.97      0.81      0.83   2063484
weighted avg       1.00      1.00      1.00   2063484



### Проверка - valid

In [31]:
valid_pred = predict_with_model(model, valid_dataset)


# print(classification_report(valid_targets.view(-1), valid_pred.reshape(-1), target_names=UNIQUE_TAGS))

100%|██████████| 102/101.5625 [00:04<00:00, 24.77it/s]


### Проверка - test

In [32]:
test_pred = predict_with_model(model, test_dataset)

print(classification_report(test_targets.view(-1), test_pred.reshape(-1), target_names=UNIQUE_TAGS))

100%|██████████| 108/107.90625 [00:04<00:00, 26.94it/s]


              precision    recall  f1-score   support

     <NOTAG>       1.00      1.00      1.00    381737
      I-MISC       0.72      0.77      0.75       909
       B-LOC       0.00      0.00      0.00         6
       I-PER       0.87      0.88      0.88      2773
       I-LOC       0.79      0.87      0.83      1919
       I-ORG       0.77      0.80      0.78      2491
      B-MISC       0.00      0.00      0.00         9
           O       0.99      0.98      0.99     38323
       B-ORG       0.00      0.00      0.00         5

    accuracy                           1.00    428172
   macro avg       0.57      0.59      0.58    428172
weighted avg       1.00      1.00      1.00    428172



## Применение теггера

In [0]:
!python -m spacy download en_core_web_sm > /dev/null

In [0]:

ner_tagger = NERTagger(model, char2id, UNIQUE_TAGS, max_sent_len, max_token_len)

In [0]:
nlp = spacy.load('en_core_web_sm')


In [0]:
test_sentences = [
    'Mr Trump said Mr Linick no longer had his full confidence and that he would be removed in 30 days.',
    'Mr Linick had begun investigating Secretary of State Mike Pompeo for suspected abuse of office, reports say.',
    'Democrats say Mr Trump is retaliating against public servants who want to hold his administration to account.',
    'Donald Trump, who is campaigning for re-election in November, has stepped up his attacks on China in recent weeks, blaming it for the spread of Covid-19.'
]
test_sentences_tokenized = tokenize_corpus(test_sentences)
# test_sentences_tokenized = [[token.text for token in nlp.tokenizer(sent) ] for sent in test_sentences]

In [37]:
for sent_tokens, sent_tags in zip(test_sentences_tokenized, ner_tagger(test_sentences)):
    print()
    assert len(sent_tokens) == len(sent_tags)
    print(' '.join('{}-{}'.format(tok, tag) for tok, tag in zip(sent_tokens, sent_tags)))

1it [00:00, 93.97it/s]                   


Mr-O Trump-O said-O Mr-O Linick-I-PER no-O longer-O had-O his-O full-O confidence-O and-O that-O he-O would-O be-O removed-O in-O 30-O days-O .-O

Mr-I-PER Linick-I-PER had-O begun-O investigating-O Secretary-O of-O State-I-ORG Mike-I-ORG Pompeo-I-ORG for-O suspected-O abuse-O of-O office-O ,-O reports-O say-O .-O

Democrats-I-MISC say-O Mr-O Trump-O is-O retaliating-O against-O public-O servants-O who-O want-O to-O hold-O his-O administration-O to-O account-O .-O

Donald-I-PER Trump-I-PER ,-O who-O is-O campaigning-O for-O re-O --O election-O in-O November-O ,-O has-O stepped-O up-O his-O attacks-O on-O China-I-LOC in-O recent-O weeks-O ,-O blaming-O it-O for-O the-O spread-O of-O Covid-19-I-MISC .-O



