In [None]:
!pip install pytorch-crf
!pip install deeppavlov
!pip install tensorflow-gpu==1.15.2

Collecting pytorch-crf
  Downloading https://files.pythonhosted.org/packages/96/7d/4c4688e26ea015fc118a0327e5726e6596836abce9182d3738be8ec2e32a/pytorch_crf-0.7.2-py3-none-any.whl
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.2
Collecting deeppavlov
[?25l  Downloading https://files.pythonhosted.org/packages/6d/f6/df4ce4c5c5cafd8d357a4c02cb1ccb5ff1d8f3c21de3e5d02299eef56342/deeppavlov-0.12.1-py3-none-any.whl (948kB)
[K     |████████████████████████████████| 952kB 4.7MB/s 
[?25hCollecting aio-pika==6.4.1
[?25l  Downloading https://files.pythonhosted.org/packages/c8/07/196a4115cbef31fa0c3dabdea146f02dffe5e49998341d20dbe2278953bc/aio_pika-6.4.1-py3-none-any.whl (40kB)
[K     |████████████████████████████████| 51kB 7.3MB/s 
[?25hCollecting rusenttokenize==0.0.5
  Downloading https://files.pythonhosted.org/packages/25/4c/a2f00be5def774a3df2e5387145f1cb54e324607ec4a7e23f573645946e7/rusenttokenize-0.0.5-py3-none-any.whl
Collecting pytelegrambotapi==3.6.

Collecting tensorflow-gpu==1.15.2
[?25l  Downloading https://files.pythonhosted.org/packages/32/ca/58e40e5077fa2a92004f398d705a288e958434f123938f4ce75ffe25b64b/tensorflow_gpu-1.15.2-cp36-cp36m-manylinux2010_x86_64.whl (411.0MB)
[K     |████████████████████████████████| 411.0MB 42kB/s 
Collecting tensorboard<1.16.0,>=1.15.0
[?25l  Downloading https://files.pythonhosted.org/packages/1e/e9/d3d747a97f7188f48aa5eda486907f3b345cd409f0a0850468ba867db246/tensorboard-1.15.0-py3-none-any.whl (3.8MB)
[K     |████████████████████████████████| 3.8MB 46.3MB/s 
Collecting tensorflow-estimator==1.15.1
[?25l  Downloading https://files.pythonhosted.org/packages/de/62/2ee9cd74c9fa2fa450877847ba560b260f5d0fb70ee0595203082dafcc9d/tensorflow_estimator-1.15.1-py2.py3-none-any.whl (503kB)
[K     |████████████████████████████████| 512kB 39.1MB/s 
Collecting keras-applications>=1.0.8
[?25l  Downloading https://files.pythonhosted.org/packages/71/e3/19762fdfc62877ae9102edf6342d71b28fbfd9dea3d2f96a882ce099b

In [None]:
# Mount Google Drive (Colab only) and make the project folder the working
# directory, so that the relative paths used below ("data/...", "tags.json",
# "model.pt") resolve inside it.
from google.colab import drive
drive.mount("/content/drive")
%cd /content/drive/My Drive/Colab Notebooks/RuREBus

Mounted at /content/drive
/content/drive/My Drive/Colab Notebooks/RuREBus


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader
from torchcrf import CRF
from deeppavlov.models.embedders.elmo_embedder import ELMoEmbedder
import re
import json
import nltk
nltk.download("punkt")
from nltk import sent_tokenize, word_tokenize
from nltk.tokenize.util import align_tokens
from glob import glob
from functools import partial
from sklearn.model_selection import train_test_split
# `tqdm.tqdm_notebook` is deprecated (it prints a warning on every use);
# `tqdm.notebook.tqdm` is the same progress bar under its supported name.
from tqdm.notebook import tqdm
from conlleval import evaluate as prec_rec_f
from brat_format import read_file, BratDoc

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package perluniprops to /root/nltk_data...
[nltk_data]   Unzipping misc/perluniprops.zip.
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping corpora/nonbreaking_prefixes.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Data

Data preparation functions.

In [None]:
def span_sentences(text, shift=0):
    """
    Splits text into sentences and locates every sentence inside the text.

    Parameters
    text : str
        Text to split into sentences.
    shift : int
        Offset added to both boundaries of every span, e.g. the position of
        text inside a larger document.

    Returns
    sents : List[str]
        Sentences extracted from text.
    spans : List[Tuple[int, int]]
        (start, end) position of every sentence, already shifted by shift.
    """

    sents = sent_tokenize(text, language="russian")
    spans = []

    # align_tokens reports positions relative to text; move them by shift.
    for start, end in align_tokens(sents, text):
        spans.append((start + shift, end + shift))

    return sents, spans


def span_tokens(text, shift=0):
    """
    Splits text into tokens and locates every token inside the text.

    A token is either a maximal run of word characters other than the
    underscore, or a single non-whitespace character.

    Parameters
    text : str
        Text to tokenize.
    shift : int
        Offset added to both boundaries of every span.

    Returns
    tokens : List[str]
        Tokens extracted from text.
    spans : List[Tuple[int, int]]
        (start, end) position of every token, already shifted by shift.
    """

    matches = list(re.finditer(r"([^\W_]+|\S)", text))
    tokens = [match.group(1) for match in matches]
    spans = [(shift + match.start(1), shift + match.end(1))
             for match in matches]

    return tokens, spans


def to_conll(brat_ners, spans):
    """
    Produces one conll tag per token from named entities in brat format:
    B-named_entity_type - the token starts exactly where the entity starts,
    I-named_entity_type - the token lies inside the entity after its start,
    O - the token is not fully covered by any entity.

    When several entities cover a token, the first one in brat_ners wins.

    Parameters
    brat_ners : List[Dict]
        Named entities in brat format; the keys "start", "end" and
        "ner_type" are read.
    spans : List[Tuple[int, int]]
        Position of tokens in reference text.

    Returns
    conll_ners : List[str]
        Conll tags of the tokens corresponding to spans.
    """

    conll_ners = []

    for token_start, token_end in spans:
        # First entity that fully contains the token, if there is one.
        covering = next(
            (ner for ner in brat_ners
             if ner["start"] <= token_start and ner["end"] >= token_end),
            None,
        )

        if covering is None:
            conll_ners.append("O")
            continue

        prefix = "B" if covering["start"] >= token_start else "I"
        conll_ners.append(prefix + "-" + covering["ner_type"])

    return conll_ners


def to_brat(conll_ners, spans, ner_id=1):
    """
    Converts named entities from conll to brat format. In brat format every 
    named entity is represented with its id, type, and position in reference 
    text.

    A new entity is started by a B- tag, by an I- tag that follows an O tag
    (or opens the sequence), and by an I- tag whose type differs from the
    entity currently being built. An I- tag of the same type extends the
    current entity up to the end of its token.

    Parameters
    conll_ners : List[str]
        Conll tags of the tokens corresponding to spans.
    spans : List[Tuple[int, int]]
        Position of tokens in reference text.
    ner_id : int
        The initial id from which to start counting ner_ids

    Returns
    brat_ners : List[Dict]
        Named entities in brat format, with keys "ner_id", "ner_type",
        "start" and "end". Ids are consecutive, starting at ner_id.
    """

    brat_ners = []
    # Type of the entity the previous token belonged to; None after an O tag.
    open_type = None

    for tag, (token_start, token_end) in zip(conll_ners, spans):
        # Split on the first hyphen only, so that an entity type which
        # itself contains a hyphen stays in one piece.
        prefix, separator, ner_type = tag.partition("-")

        if not separator:
            open_type = None
            continue

        # Only an I- tag of the same type continues the open entity; merging
        # an I- tag of another type would silently drop the predicted type.
        if prefix == "I" and open_type == ner_type:
            brat_ners[-1]["end"] = token_end
            continue

        brat_ners.append({"ner_id": ner_id, 
                          "ner_type": ner_type, 
                          "start": token_start, 
                          "end": token_end})
        open_type = ner_type
        ner_id += 1

    return brat_ners


def extract_data(files):
    """
    Reads brat documents and converts them into tokenized sentences with the
    matching conll tags.

    Every non-empty line of a document is split into sentences, every
    sentence into tokens, and every token is tagged according to the named
    entities annotated in the document.

    Parameters
    files : List[str]
        Paths to .ann files to extract data from.

    Returns
    tokens : List[List[str]]
        Tokenized sentences of all documents.
    tags : List[List[str]]
        Conll tags corresponding to token sequences.
    """

    tokens, tags = [], []

    for file_path in tqdm(files):
        brat_doc = read_file(file_path)

        # Named entities of the document as dicts that to_conll can read.
        doc_ners = []
        for i, idx in brat_doc.ner_id_2_idx.items():
            ner = brat_doc.ners[idx]
            doc_ners.append({"id": i,
                             "ner_type": ner[0],
                             "start": ner[1],
                             "end": ner[2]})

        # Lines are processed one by one; line.start() keeps all spans
        # expressed as positions in the whole document text.
        for line in re.finditer(r"[^\n]+(\n+|$)", brat_doc.txt_data):
            sents, sent_spans = span_sentences(line.group(0),
                                               shift=line.start())

            for sent, sent_span in zip(sents, sent_spans):
                toks, spans = span_tokens(sent, shift=sent_span[0])
                tokens.append(toks)
                tags.append(to_conll(doc_ners, spans))

    return tokens, tags

Extract tokens and corresponding conll tags from files in train directory.

In [None]:
# Collect every .ann file of the training split and convert the documents
# into parallel lists of token sequences and conll tag sequences.
files = glob("data/train/*.ann")
tokens, tags = extract_data(files)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=188.0), HTML(value='')))




In [None]:
# Hold out 10% of the sentences for validation. The split is seeded so that
# re-running the notebook gives the same train/validation partition (and
# therefore comparable validation scores).
SPLIT_SEED = 42
train_tokens, val_tokens, train_tags, val_tags = train_test_split(tokens, tags, 
                                                                  test_size=0.1,
                                                                  random_state=SPLIT_SEED)
len(train_tokens), len(val_tokens)

(19026, 2115)

Compile dataset for NER.

In [None]:
class NER_Dataset(Dataset):
    """
    Dataset of tokenized sentences paired with their encoded conll tags.

    Tokens are lowercased and tags are replaced by their ids from tag2id
    once, when the dataset is created.

    Parameters
    tag2id : Dict[str, int]
        Mapping from conll tag to its id; every tag in seq_tags must be a
        key of this mapping.
    seqs : List[List[str]]
        Tokenized sentences.
    seq_tags : List[List[str]]
        Conll tags corresponding to seqs.
    """

    def __init__(self, tag2id, seqs, seq_tags):
        self.tag2id = tag2id

        self.seqs = []
        for sentence in seqs:
            self.seqs.append([word.lower() for word in sentence])

        self.seq_tags = []
        for sentence_tags in seq_tags:
            self.seq_tags.append([self.tag2id[name] for name in sentence_tags])

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.seqs)

    def __getitem__(self, idx):
        """Returns the (tokens, tag ids) pair stored at idx."""
        sample_tokens = self.seqs[idx]
        sample_tags = self.seq_tags[idx]
        return sample_tokens, sample_tags

In [None]:
# Conll tags encoding.
# The tags are sorted so that the tag -> id mapping is the same in every
# kernel session. Iterating over a set of strings gives an order that can
# change between sessions, and this cell rewrites tags.json each time it
# runs, so an unsorted list could stop matching the ids a previously saved
# model.pt was trained with.
tags = sorted({tag for sent in train_tags for tag in sent})
tag2id = {tag: i for i, tag in enumerate(tags)}
id2tag = {i: tag for i, tag in enumerate(tags)}

# Saved so that the Test section can rebuild the same mapping.
with open("tags.json", "w") as f:
    json.dump(tags, f)

tags

['B-INST',
 'I-INST',
 'I-CMP',
 'B-ACT',
 'I-MET',
 'B-SOC',
 'I-ACT',
 'I-QUA',
 'B-QUA',
 'I-BIN',
 'O',
 'B-BIN',
 'I-SOC',
 'B-ECO',
 'I-ECO',
 'B-MET',
 'B-CMP']

In [None]:
# Both datasets share the tag -> id mapping built from the training tags.
train_ds = NER_Dataset(tag2id, train_tokens, train_tags)
val_ds = NER_Dataset(tag2id, val_tokens, val_tags)

In [None]:
# Sanity check: the first two validation sentences (lowercased tokens) and
# their tag ids.
val_ds[:2]

([['общее', 'образование', ':'],
  ['разработка',
   'и',
   'реализация',
   'краевых',
   'целевых',
   'и',
   'ведомственных',
   'программ',
   ',',
   'проектов',
   ';']],
 [[5, 12, 10], [11, 10, 11, 3, 6, 6, 6, 6, 6, 6, 10]])

# Model

In [None]:
class BiLSTM_CRF(nn.Module):
    """
    Sequence tagger: a 2-layer bidirectional LSTM followed by a single-head
    self-attention layer and a linear projection that produces per-token
    scores for a CRF.

    The CRF layer is stored in self.crf but is not applied in forward();
    callers pass the returned scores to model.crf themselves.

    Parameters
    embedding_size : int
        Size of the input token embeddings.
    hidden_size : int
        Hidden size of the LSTM (per direction).
    feature_dim : int
        Size of the features used by the attention layer.
    num_classes : int
        Number of conll tags.
    dropout : float
        Dropout probability applied to the LSTM output.
    """

    def __init__(self, embedding_size, hidden_size, feature_dim, num_classes, 
                 dropout):
        super().__init__()
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.feature_dim = feature_dim
        self.num_classes = num_classes
        self.dropout = dropout

        self.lstm = nn.LSTM(embedding_size, hidden_size, 2, bidirectional=True, 
                            batch_first=True)
        self.drop = nn.Dropout(dropout)
        self.fc_0 = nn.Linear(2 * hidden_size, feature_dim)
        self.Q = nn.Linear(feature_dim, feature_dim)
        self.K = nn.Linear(feature_dim, feature_dim)
        self.V = nn.Linear(feature_dim, feature_dim)
        self.layer_norm = nn.LayerNorm(feature_dim)
        self.fc_1 = nn.Linear(feature_dim, num_classes)
        self.crf = CRF(num_classes, batch_first=True)


    def forward(self, x, lengths):
        """
        Computes per-token tag scores for a batch of padded sequences.

        Parameters
        x : torch.Tensor
            Padded token embeddings of shape (batch, max_len, embedding_size),
            with sequences sorted by length in descending order.
        lengths : torch.Tensor or List[int]
            True length of every sequence in x; every length must be >= 1.

        Returns
        scores : torch.Tensor
            Scores of shape (batch, max_len, num_classes). Rows at padded
            positions are meaningless and must be masked out by the caller.
        """
        # pack_padded_sequence requires the lengths on the CPU, wherever the
        # caller placed them.
        lengths = torch.as_tensor(lengths, dtype=torch.int64)

        # LSTM
        x_packed = pack_padded_sequence(x, lengths.cpu(), batch_first=True)
        seq_out_packed, _ = self.lstm(x_packed)
        seq_out, _ = pad_packed_sequence(seq_out_packed, batch_first=True)
        seq_out = self.drop(seq_out)
        seq_out = self.fc_0(F.relu(seq_out))

        # Attention
        Q, K, V = self.Q(seq_out), self.K(seq_out), self.V(seq_out)
        attn = torch.bmm(Q, K.transpose(1, 2)) / self.feature_dim ** 0.5

        # Exclude padded positions from the softmax. Without this, real
        # tokens attend to padding and the scores of a sentence depend on
        # how much padding its batch happens to add. Every sequence has at
        # least one real token, so no row ends up fully masked.
        positions = torch.arange(seq_out.size(1), device=seq_out.device)
        is_padding = positions.unsqueeze(0) >= lengths.to(seq_out.device).unsqueeze(1)
        attn = attn.masked_fill(is_padding.unsqueeze(1), float("-inf"))

        attn = F.softmax(attn, dim=-1)
        out = torch.bmm(attn, V)
        out = self.layer_norm(out)

        scores = self.fc_1(out)

        return scores

# Train

In [None]:
# Run on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# ELMo embedder built from the DeepPavlov-hosted archive below; only the
# "elmo" output is requested. It is used as a module-level object by
# collate_fn and by the prediction cells.
elmo_embedder = ELMoEmbedder("http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-news_wmt11-16_1.5M_steps.tar.gz", 
                             elmo_output_names=["elmo"])



















INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore








In [None]:
def collate_fn(batch):
    """
    Turns a list of dataset samples into padded tensors sorted by sequence
    length in descending order.

    Relies on the module-level elmo_embedder to embed the tokens and on the
    module-level device to place the tensors.

    Parameters
    batch : List[Tuple[List[str], List[int]]]
        Samples as returned by NER_Dataset: tokens and their tag ids.

    Returns
    x : torch.Tensor
        Embedded tokens, zero-padded to the longest sequence, on device.
    lengths : torch.Tensor
        Sequence lengths in descending order. Kept on the CPU because
        pack_padded_sequence requires the lengths to be a CPU tensor.
    mask : torch.Tensor
        Boolean mask on device, True at real (non-padded) positions.
    y : torch.Tensor
        Tag ids on device, padded with -1.
    """
    x, y = list(zip(*batch))
    
    # Sort sequences by length (descending)
    lengths = torch.tensor(list(map(len, y)))
    lengths, indices = lengths.sort(0, descending=True)

    # Pad sequences and create mask
    y = pad_sequence(list(map(torch.tensor, y)), padding_value=-1, batch_first=True)
    y = y[indices]
    mask = (y != -1)
    x = pad_sequence(list(map(torch.tensor, elmo_embedder(x))), batch_first=True)
    x = x[indices]
    
    return x.to(device), lengths, mask.to(device), y.to(device)

In [None]:
# Training batches are reshuffled every epoch; validation keeps dataset order.
# collate_fn embeds the tokens and pads every batch.
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_ds, batch_size=64, collate_fn=collate_fn)

In [None]:
def train(loader, model, optimizer):
    """
    Trains the model for one pass over loader and prints the loss averaged
    over batches.

    Parameters
    loader : DataLoader
        Yields (x, lengths, mask, y) batches.
    model : BiLSTM_CRF
        Model to train; its crf layer provides the log-likelihood.
    optimizer : torch.optim.Optimizer
        Optimizer that updates the model parameters.

    Returns
    None
    """
    model.train()
    batch_losses = []

    for x, lengths, mask, y in tqdm(loader):
        optimizer.zero_grad()

        # The CRF returns a log-likelihood, so its negation is minimized.
        loss = -model.crf(model(x, lengths), y, mask)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    print("Train loss {:.4f}".format(sum(batch_losses) / len(loader)))


def evaluate(loader, model):
    """
    Evaluates the model on loader: prints the loss averaged over batches and
    the precision, recall and F-score reported by conlleval.

    Gold and predicted tags of all sentences are concatenated into two flat
    lists before they are scored.

    Parameters
    loader : DataLoader
        Yields (x, lengths, mask, y) batches.
    model : BiLSTM_CRF
        Model to evaluate.

    Returns
    f_score : float
        Last value of the conlleval result (printed as F-score).
    """
    model.eval()
    loss_sum = 0
    gold, predicted = [], []

    with torch.no_grad():

        for x, lengths, mask, y in tqdm(loader):
            scores = model(x, lengths)
            loss_sum += (-model.crf(scores, y, mask)).item()

            # decode returns one list of tag ids per sentence, without padding
            for sent in model.crf.decode(scores, mask):
                predicted.extend(id2tag[i] for i in sent)

            # y[mask] drops the padded positions of the gold tags
            gold.extend(id2tag[i] for i in y[mask].tolist())

    print("Loss {:.4f}".format(loss_sum / len(loader)))
    res = prec_rec_f(gold, predicted, verbose=False)
    print("Precision = {:.2f}%, Recall = {:.2f}%, F-score = {:.2f}%".format(*res))
    return res[-1]

In [None]:
# Hyper-parameters are kept in a dict so the Test section can rebuild the
# same architecture before loading the saved weights.
model_params = {
    "embedding_size": elmo_embedder.dim,
    "hidden_size": 200,
    "feature_dim": 50,
    "num_classes": len(tag2id),
    "dropout": 0.2,
}
model = BiLSTM_CRF(**model_params)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)
best_res = -1

for e in range(5):
    print(f"Epoch {e}")
    train(train_loader, model, optimizer)
    f_score = evaluate(val_loader, model)

    # Keep only the checkpoint with the best validation F-score so far.
    improved = f_score > best_res
    if improved:
        best_res = f_score
        torch.save(model.state_dict(), "model.pt")

    print()

Epoch 0


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=298.0), HTML(value='')))


Train loss 1752.3258


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=34.0), HTML(value='')))


Loss 1451.2795
Precision = 2.08%, Recall = 0.04%, F-score = 0.07%

Epoch 1


HBox(children=(FloatProgress(value=0.0, max=298.0), HTML(value='')))


Train loss 1239.6502


HBox(children=(FloatProgress(value=0.0, max=34.0), HTML(value='')))


Loss 1067.4792
Precision = 15.68%, Recall = 9.76%, F-score = 12.03%

Epoch 2


HBox(children=(FloatProgress(value=0.0, max=298.0), HTML(value='')))


Train loss 1007.2514


HBox(children=(FloatProgress(value=0.0, max=34.0), HTML(value='')))


Loss 891.0007
Precision = 34.25%, Recall = 33.33%, F-score = 33.78%

Epoch 3


HBox(children=(FloatProgress(value=0.0, max=298.0), HTML(value='')))


Train loss 813.9747


HBox(children=(FloatProgress(value=0.0, max=34.0), HTML(value='')))


Loss 768.5622
Precision = 42.51%, Recall = 40.06%, F-score = 41.25%

Epoch 4


HBox(children=(FloatProgress(value=0.0, max=298.0), HTML(value='')))


Train loss 691.6889


HBox(children=(FloatProgress(value=0.0, max=34.0), HTML(value='')))


Loss 710.2327
Precision = 46.88%, Recall = 41.67%, F-score = 44.12%



# Test

In [None]:
# Rebuild the tag <-> id mappings from the list saved during training, so
# that the ids match the ones the checkpoint was trained with.
with open("tags.json", "r") as f:
    tags = json.load(f)

tag2id = {tag: i for i, tag in enumerate(tags)}
id2tag = {i: tag for i, tag in enumerate(tags)}

test_files = glob("data/test/*.ann")

# Restore the best checkpoint into a freshly built model. map_location moves
# the stored tensors to the current device, so a checkpoint saved on a GPU
# runtime can also be loaded on a CPU-only one.
model = BiLSTM_CRF(**model_params)
model.load_state_dict(torch.load("model.pt", map_location=device))
model.to(device)

<All keys matched successfully>

Evaluate on test.

In [None]:
# Token sequences and gold conll tags of the test documents.
test_tokens, test_tags = extract_data(test_files)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




In [None]:
# The test set is evaluated one sentence per batch.
# NOTE(review): evaluate() prints the loss averaged per batch, so the value
# printed for batch_size=1 is presumably not comparable with the losses
# printed for the batch_size=64 validation runs - confirm.
test_ds = NER_Dataset(tag2id, test_tokens, test_tags)
test_loader = DataLoader(test_ds, batch_size=1, collate_fn=collate_fn)

In [None]:
# Prints loss, precision, recall and F-score on the test set; the cell value
# is the F-score returned by evaluate().
evaluate(test_loader, model)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=3189.0), HTML(value='')))


Loss 11.4423
Precision = 47.18%, Recall = 46.39%, F-score = 46.78%


46.78034819937992

Predict named entities for the test files and write them out in brat format.

In [None]:
def predict_tags(tokens, embedder, model, id2tag, device):
    """
    Given text sequence as tokens, predicts corresponding conll tags.

    Tokens are lowercased before they are embedded, because NER_Dataset
    lowercases the tokens the model is trained and evaluated on.

    Parameters
    tokens : List[str]
        Tokens to predict tags for.
    embedder : ElmoEmbedder
        Converts tokens to vectors.
    model : BiLSTM_CRF
        Predicts tags from embedded tokens.
    id2tag : Dict
        Converts predicted tag ids to tag names.
    device : str
        Cuda or cpu.

    Returns
    pred_tags : List[str]
        Predicted tag names, one per token; empty when tokens is empty.
    """

    # Nothing to embed or decode for an empty sentence.
    if not tokens:
        return []

    # Apply the same normalization as at training time.
    lowered = [token.lower() for token in tokens]

    x = torch.tensor(embedder([lowered])[0]).unsqueeze(0).to(device)
    # The batch holds one sentence without padding: its length is x.size(1).
    length = torch.tensor([x.size(1)])
    
    with torch.no_grad():
        scores = model(x, length)
        y_pred = model.crf.decode(scores)
    
    pred_tags = [id2tag[idx] for idx in y_pred[0]]
    
    return pred_tags


def extract_ners(brat_doc, embedder, model, id2tag, device):    
    """
    Extracts named entities from brat_doc text and adds them to brat_doc 
    through brat_doc.add_ner. Entity ids are unique within the document.

    Parameters
    brat_doc : BratDoc
        Document in brat format; its txt_data is tagged sentence by sentence.
    embedder : ElmoEmbedder
        Converts tokens to vectors.
    model : BiLSTM_CRF
        Predicts tags from embedded tokens.
    id2tag : Dict
        Converts predicted tag ids to tag names.
    device : str
        Cuda or cpu.

    Returns
    None
    """
    
    # Id given to the next entity that is found.
    # NOTE(review): numbering starts at 0 while to_brat defaults to 1 -
    # confirm which first id BratDoc.add_ner expects.
    ner_id = 0

    for line in re.finditer(r"[^\n]+(\n+|$)", brat_doc.txt_data):
        sents, sent_spans = span_sentences(line.group(0), line.start())
        
        for sent, (sent_start, _) in zip(sents, sent_spans):
            tokens, spans = span_tokens(sent, sent_start)
            pred_tags = predict_tags(tokens, embedder, model, id2tag, device)
            brat_ners = to_brat(pred_tags, spans, ner_id)
            
            if brat_ners:
                # Continue numbering AFTER the last id used. Reusing the
                # last id would give the first entity of the next sentence
                # the same id as the last entity of this one.
                ner_id = brat_ners[-1]["ner_id"] + 1
                
                for ner in brat_ners:
                    brat_doc.add_ner(ner["ner_id"], ner["ner_type"], 
                                     ner["start"], ner["end"])

In [None]:
# Switch the model to inference mode before predicting.
model.eval()

# Tag every test file and write the predicted entities to data/test_predict/
# under the same file name.
# NOTE(review): test_files holds .ann paths, so the text handed to BratDoc is
# the content of the annotation file; the "[:-3] + 'ann'" renaming suggests
# this loop was written for .txt paths - confirm which file should be read.
# NOTE(review): presumably data/test_predict/ has to exist beforehand - verify.
for file_path in tqdm(test_files):
    print(file_path)
    
    with open(file_path, "r") as f:
        brat_doc = BratDoc(f.read())
    
    extract_ners(brat_doc, elmo_embedder, model, id2tag, device)
    # Replace the last three characters of the file name with "ann".
    brat_doc.write_to_file("data/test_predict/" + file_path.split("/")[-1][:-3] + "ann")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

data/test/20227011054408684110020_10_part_0_.ann
data/test/31339011034587000689004_11_part_1.ann
data/test/31339011027002952877010_7_part_0_.ann
data/test/31228011024401437389022_2_part_1_.ann
data/test/31339011034587000689004_11_part_0.ann
data/test/31339011025603271396015_58_part_0_.ann
data/test/31339011027002952877002_3_part_1_.ann
data/test/20227011054408684110020_10_part_1_.ann
data/test/31339011061685002510004_3_part_1.ann
data/test/31339011027002952877002_3_part_0_.ann
data/test/31339011061685002510004_3_part_2.ann
data/test/31339221023601033818162_13_part_0.ann
data/test/31339011027002952877002_3_part_2_.ann
data/test/31339291061674000266032_22_part_2.ann
data/test/31339011061685002510004_3_part_0.ann
data/test/32339011021100517140001_5_part_0.ann
data/test/20336011161101050428003_13_part_1_.ann
data/test/31339011024501525289014_6_part_0_.ann
data/test/31339011061674001290010_6_part_0_.ann
data/test/31228011067014000019047_6_part_1_.ann
data/test/20336011161101050428003_13_par