<a href="https://colab.research.google.com/github/schokoro/cnn_crf_nertagger/blob/dev/NER_tagger.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Свёрточные нейросети и NER-теггинг

NER-теггинг — распознавание именованных сущностей: каждому токену предложения присваивается метка сущности (персона, организация, локация и т. д.) либо метка «вне сущности».

In [0]:
import torch

# Use the first CUDA device when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# A torch.device never compares equal to a plain string, so the device *type*
# is checked instead. With `device == 'cpu'` a CPU-only runtime fell through to
# the CUDA branch, where get_device_name(0) raises.
if device.type == 'cpu':
    print('cpu')
else:
    n_gpu = torch.cuda.device_count()
    print(torch.cuda.get_device_name(0))
    

In [0]:
# Mount Google Drive into the Colab filesystem at /content/drive; the dataset
# and checkpoint paths used further down are located under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Remove any previous checkout so the `git clone` in the next cell starts from a clean directory.
!rm -rf cnn_crf_nertagger/

In [0]:
# Fetch the `dev` branch of the tagger package and install AllenNLP.
# Only stdout is redirected to /dev/null; anything written to stderr still shows up.
# NOTE(review): the allennlp version is not pinned — consider pinning the
# release this notebook was developed against so that re-runs stay reproducible.
!git clone -b dev https://github.com/schokoro/cnn_crf_nertagger.git > /dev/null
!pip install allennlp > /dev/null

In [0]:
# autoreload mode 2: re-import edited modules before each cell executes, so
# changes to the cloned cnn_crf_nertagger package are picked up without a restart.
%load_ext autoreload
%reload_ext autoreload
%autoreload 2

import warnings
# Silences every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
from sklearn.metrics import classification_report
import numpy as np
from torch.nn import functional as F
from torch.utils.data import TensorDataset
from allennlp.data.dataset_readers.conll2003 import Conll2003DatasetReader
from allennlp.common.util import ensure_list
from torch import nn
# NOTE(review): `F` and `TensorDataset` are imported a second time here, and
# `set_trace` / `collect` are not referenced by any cell visible in this notebook.
from torch.nn import functional as F
from torch.utils.data import TensorDataset
from pdb import set_trace
from gc import collect

import cnn_crf_nertagger
from cnn_crf_nertagger.modules.modules import NERTaggerModel
from cnn_crf_nertagger.utils.pipeline import train_eval_loop, predict_with_model
from cnn_crf_nertagger.utils.prepare import tag_corpus_to_tensor

# Seeding is disabled: the call is commented out and `init_random_seed` is not
# imported above, so runs are not seeded from this cell.
# init_random_seed()

# Do not force cuDNN to pick deterministic algorithms.
torch.backends.cudnn.deterministic=False 

## Загрузка корпусов

In [0]:
# Directory on the mounted Drive that holds the three CoNLL split files.
DATA_DIR = '/content/drive/My Drive/NER_tagger/data'

path_train = f'{DATA_DIR}/eng.train'
path_valid = f'{DATA_DIR}/eng.testa'
path_test = f'{DATA_DIR}/eng.testb'


In [0]:
%%time
conll_reader = Conll2003DatasetReader()
train_conll = ensure_list(conll_reader.read(path_train))
valid_conll = ensure_list(conll_reader.read(path_valid))
test_conll = ensure_list(conll_reader.read(path_test))

In [0]:
# Pool every split; the vocabularies and size limits below are computed over all of them.
all_conll = [*train_conll, *valid_conll, *test_conll]
tuple(len(split) for split in (all_conll, train_conll, valid_conll, test_conll))

In [0]:
tags, tokens = set(), set()
max_sent_len = 0

# One pass over every sentence: track the longest sentence and gather the
# distinct tags and tokens that occur anywhere in the pooled corpus.
for sentence in all_conll:
    sentence_tokens = sentence['tokens']
    max_sent_len = max(max_sent_len, len(sentence_tokens))
    tags.update(sentence['tags'])
    tokens.update(sentence_tokens)

max_sent_len

In [0]:
# Length, in characters, of the longest token text in the pooled corpus.
max_token_len = len(max((token.text for token in tokens), key=len))
max_token_len

In [0]:
# Character inventory: every distinct character appearing in any token text.
chars = {char for token in tokens for char in token.text}

len(chars)

In [0]:
# Build the id mappings from *sorted* collections. `tags` and `chars` are sets
# of strings, and their iteration order changes between interpreter sessions
# (string hashing is randomised), so enumerating them directly gave different
# ids on every kernel restart — which silently misaligns a checkpoint loaded
# from disk with the freshly built vocabularies.
# NOTE(review): checkpoints saved with the previous, order-dependent ids do not
# match this mapping and need to be retrained.
# '<NOTAG>' always gets tag id 0; character ids start at 1, leaving id 0 unused
# here (presumably reserved for padding — confirm against tag_corpus_to_tensor).
tag2id = {tag: idx for idx, tag in enumerate(['<NOTAG>'] + sorted(tags))}
char2id = {char: idx for idx, char in enumerate(sorted(chars), start=1)}
id2char = {idx: char for char, idx in char2id.items()}

In [0]:
%%time

train_inputs, train_targets = tag_corpus_to_tensor(train_conll, char2id, tag2id, max_sent_len, max_token_len)
valid_inputs, valid_targets = tag_corpus_to_tensor(valid_conll, char2id, tag2id, max_sent_len, max_token_len)
test_inputs, test_targets = tag_corpus_to_tensor(test_conll, char2id, tag2id, max_sent_len, max_token_len)

In [0]:
# Wrap each split's (inputs, targets) pair in a TensorDataset.
train_dataset, valid_dataset, test_dataset = (
    TensorDataset(inputs, targets)
    for inputs, targets in (
        (train_inputs, train_targets),
        (valid_inputs, valid_targets),
        (test_inputs, test_targets),
    )
)

In [0]:
# Checkpoint file on the mounted Drive; the cells below load the model weights
# from this path and save the trained weights back to it.
models_path = '/content/drive/My Drive/NER_tagger/models/char_token_level_ner.pth'

In [0]:
# Drop a model left over from an earlier run of the cells below, then release
# cached CUDA memory whether or not such a model existed.
try:
    del sentence_level_model
except NameError:
    # The only expected failure is "the name is not defined yet"; a bare
    # `except:` here would also hide unrelated errors and KeyboardInterrupt.
    print('no model')
finally:
    torch.cuda.empty_cache()


In [0]:
torch.cuda.empty_cache()

# Build the tagger: 64-dimensional embeddings, a 3-layer "single" backbone and a
# 4-layer "context" backbone, each configured with its own per-layer dilations.
# NOTE(review): char2id ids run from 1 to len(char2id), while len(char2id) is
# what is passed as the first argument — confirm that NERTaggerModel reserves an
# extra slot for id 0, otherwise the highest character id is out of range.
sentence_level_model = NERTaggerModel(
    len(char2id),
    len(tag2id),
    tag2id,
    embedding_size=64,
    single_backbone_kwargs=dict(layers_n=3, kernel_size=3, dropout=0.1, dilation=[1, 1, 2]),
    context_backbone_kwargs=dict(layers_n=4, kernel_size=3, dropout=0.1, dilation=[1, 1, 2, 3]),
)

# Count parameters with Tensor.numel(); the previously used np.product alias is
# deprecated and no longer exists in NumPy 2.0.
print('Количество параметров', sum(param.numel() for param in sentence_level_model.parameters()))

In [0]:
# Resume from the checkpoint on Drive. torch.load raises if the file does not
# exist yet, so skip this cell when training from scratch.
# map_location='cpu' lets a checkpoint saved on a GPU runtime be read on a
# CPU-only runtime; load_state_dict then copies the values into the model's
# own parameters on whatever device they live on.
sentence_level_model.load_state_dict(torch.load(models_path, map_location='cpu'))

In [0]:
def _plateau_scheduler(optim):
    """Create the ReduceLROnPlateau scheduler passed to train_eval_loop as lr_scheduler_ctor."""
    return torch.optim.lr_scheduler.ReduceLROnPlateau(
        optim,
        patience=2,
        factor=0.1,
        threshold=1e-4,
        verbose=True,
    )


# Keyword settings for the training run, gathered in one place.
training_options = dict(
    lr=5e-3,
    epoch_n=200,
    batch_size=32,
    device=device,
    early_stopping_patience=8,
    l2_reg_alpha=1e-7,
    max_batches_per_epoch_train=200,
    max_batches_per_epoch_val=200,
    lr_scheduler_ctor=_plateau_scheduler,
    verbose_batch=False,
)

best_val_loss, best_sentence_level_model = train_eval_loop(
    sentence_level_model,
    train_dataset,
    valid_dataset,
    tag2id,
    **training_options,
)

# Persist the weights of the model returned by the training loop.
torch.save(best_sentence_level_model.state_dict(), models_path)

In [0]:
# Original note (translated): "If you run the notebook on Colab, prepend
# /content/stepik-dl-nlp to the path."
# NOTE(review): that note looks stale — models_path is already an absolute path
# on the mounted Drive.
# map_location='cpu' lets a checkpoint saved on a GPU runtime be read on a
# CPU-only runtime; load_state_dict then copies the values into the model's
# own parameters on whatever device they live on.
sentence_level_model.load_state_dict(torch.load(models_path, map_location='cpu'))

In [0]:
# Run the model over the training set and display the shape of the result.
train_pred = predict_with_model(sentence_level_model, train_dataset)
train_pred.shape

In [0]:

# Tag names in id order (dicts keep insertion order); '<NOTAG>' (id 0) stays in
# the list so that the names line up with the label ids.
UNIQUE_TAGS = list(tag2id)

print(train_pred.shape)

# Per-tag precision / recall / F1 on the training set, computed over the
# flattened targets and predictions.
train_report = classification_report(
    train_targets.view(-1),
    train_pred.reshape(-1),
    target_names=UNIQUE_TAGS,
)
print(train_report)
print()

In [0]:
# Number of positions predicted as each tag id on the training set.
# The predictions are compared with the flattened targets as label ids in the
# report cell above, so they are counted directly: applying argmax(1) to them
# would yield positions inside the sentence rather than tag ids.
# The range follows the tag vocabulary instead of a hard-coded 9.
for tag_id in range(len(tag2id)):
    print((train_pred.reshape(-1) == tag_id).sum())

In [0]:
# Predict on the validation set and print the per-tag report, mirroring the
# train and test cells. This cell used to be fully commented out, which left
# `valid_pred` undefined for the counting cell that follows.
valid_pred = predict_with_model(sentence_level_model, valid_dataset)
print(classification_report(valid_targets.view(-1), valid_pred.reshape(-1), target_names=UNIQUE_TAGS))

In [0]:
# Number of positions predicted as each tag id on the validation set.
# The predictions are treated as label ids (as in the classification reports),
# so they are counted directly: applying argmax(1) to them would yield
# positions inside the sentence rather than tag ids.
# The range follows the tag vocabulary instead of a hard-coded 9.
for tag_id in range(len(tag2id)):
    print((valid_pred.reshape(-1) == tag_id).sum())

In [0]:
# Predict on the held-out test set and print the per-tag report, computed over
# the flattened targets and predictions.
test_pred = predict_with_model(sentence_level_model, test_dataset)
test_report = classification_report(
    test_targets.view(-1),
    test_pred.reshape(-1),
    target_names=UNIQUE_TAGS,
)
print(test_report)

In [0]:
# Number of positions predicted as each tag id on the test set.
# The predictions are compared with the flattened targets as label ids in the
# report cell above, so they are counted directly: applying argmax(1) to them
# would yield positions inside the sentence rather than tag ids.
# The range follows the tag vocabulary instead of a hard-coded 9.
for tag_id in range(len(tag2id)):
    print((test_pred.reshape(-1) == tag_id).sum())

## Применение теггера

In [0]:

# Wrap the trained model together with the character vocabulary, tag names and
# size limits into a callable tagger.
# NOTE(review): `POSTagger` is not imported in any cell visible in this
# notebook, so this line raises NameError on a fresh kernel — import it from
# the module that defines it (location to be confirmed).
sentence_level_pos_tagger = POSTagger(sentence_level_model, char2id, UNIQUE_TAGS, max_sent_len, max_token_len)

In [0]:
# Download the small English spaCy pipeline that is loaded in the next cell.
!python -m spacy download en_core_web_sm

In [0]:
# Load the English spaCy pipeline; only its tokenizer is used below.
import spacy
nlp = spacy.load('en_core_web_sm')


In [0]:
# Hand-picked news sentences for a qualitative check of the tagger.
test_sentences = [
    'Mr Trump said Mr Linick no longer had his full confidence and that he would be removed in 30 days.',
    'Mr Linick had begun investigating Secretary of State Mike Pompeo for suspected abuse of office, reports say.',
    'Democrats say Mr Trump is retaliating against public servants who want to hold his administration to account.',
    'Donald Trump, who is campaigning for re-election in November, has stepped up his attacks on China in recent weeks, blaming it for the spread of Covid-19.'
]

# Split every sentence into token strings with the spaCy tokenizer.
test_sentences_tokenized = []
for sentence in test_sentences:
    token_texts = [token.text for token in nlp.tokenizer(sentence)]
    test_sentences_tokenized.append(token_texts)

In [0]:
# Show the tokenisation of each example sentence, one token list per line.
for tokenized_sentence in test_sentences_tokenized:
    print(tokenized_sentence)

In [0]:
# Tag the raw sentences, then print every sentence as space-separated
# "token-tag" pairs followed by an empty line.
predicted_tags = sentence_level_pos_tagger(test_sentences)
for sent_tokens, sent_tags in zip(test_sentences_tokenized, predicted_tags):
    tagged_tokens = [f'{tok}-{tag}' for tok, tag in zip(sent_tokens, sent_tags)]
    print(' '.join(tagged_tokens))
    print()