In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import warnings
# Silence FutureWarning messages so they do not clutter the cell outputs.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Seed PyTorch's default random number generator with a fixed value.
torch.manual_seed(1)

<torch._C.Generator at 0x7f60d01e7930>

In [2]:
!nvidia-smi

Mon Dec 28 12:09:02 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.80.02    Driver Version: 450.80.02    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 208...  Off  | 00000000:1A:00.0 Off |                  N/A |
| 68%   67C    P2   255W / 260W |  10586MiB / 11019MiB |     95%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  Off  | 00000000:1B:00.0 Off |                  N/A |
| 71%   68C    P2   260W / 260W |  10586MiB / 11019MiB |     93%      Default |
|       

In [3]:
# Device the model is moved to further down. The index 2 is hard-coded and is
# not checked against the device count queried on the next line.
cuda_device = torch.device('cuda:2')
n_gpu = torch.cuda.device_count()

# Print the name of every CUDA device PyTorch can see.
# NOTE(review): the loop variable `i` stays defined at notebook scope after this
# cell, so any later cell that reads `i` without assigning it will silently
# pick up the value left here.
for i in range(n_gpu):
    print(torch.cuda.get_device_name(i))

GeForce RTX 2080 Ti
GeForce RTX 2080 Ti
GeForce RTX 2080 Ti
GeForce RTX 2080 Ti
GeForce RTX 2080 Ti
GeForce RTX 2080 Ti
GeForce RTX 2080 Ti
GeForce RTX 2080 Ti


In [4]:
from allennlp.data.token_indexers import PretrainedTransformerMismatchedIndexer
from allennlp.data.vocabulary import Vocabulary


# Name of the pretrained transformer shared by the token indexer (here) and the
# token embedder (later cell). The BERT alternative is kept for reference.
#BERT_MODEL = 'bert-base-cased'
BERT_MODEL = 'google/electra-base-discriminator'
# Token indexer for the chosen model; it is handed to the dataset readers below
# under the 'tokens' key.
indexer = PretrainedTransformerMismatchedIndexer(model_name=BERT_MODEL)

In [2]:
from typing import Dict, List, Sequence, Iterable
import itertools
import logging

from overrides import overrides

from allennlp.common.checks import ConfigurationError
from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.dataset_readers.dataset_utils import to_bioul
from allennlp.data.fields import TextField, SequenceLabelField, Field, MetadataField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

logger = logging.getLogger(__name__)


def _is_divider(line: str) -> bool:
    empty_line = line.strip() == ""
    if empty_line:
        return True
    else:
        first_token = line.split()[0]
        if first_token == "-DOCSTART-":
            return True
        else:
            return False
        



In [33]:
from allennlp.data.dataset_readers import Conll2003DatasetReader
from allennlp.data.dataset_readers import SequenceTaggingDatasetReader

# CoNLL-2003 reader wired to the transformer indexer defined above.
# NOTE(review): the dataset-loading calls below are commented out, so this cell
# does not define `train_dataset` / `dev_dataset`, although later cells use them.
reader = Conll2003DatasetReader(token_indexers={'tokens': indexer})
#reader = ConllUniversalReader(token_indexers={'tokens': indexer})
#train_dataset = reader.read('train.tsv')
#dev_dataset = reader.read('dev.tsv')
#test_dataset = reader.read('data_1/test_no_answers.tsv')

In [4]:
# Build the vocabulary from the training instances and index both splits with it.
# NOTE(review): `train_dataset` and `dev_dataset` are not assigned in any visible
# cell — confirm where they come from before a Restart & Run All.
vocab = Vocabulary.from_instances(train_dataset.instances)
train_dataset.index_with(vocab)
dev_dataset.index_with(vocab)
#test_dataset.index_with(vocab)

building vocab:   0%|          | 0/2334 [00:00<?, ?it/s]

In [6]:
from allennlp.modules.token_embedders import PretrainedTransformerMismatchedEmbedder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.seq2seq_encoders import PassThroughEncoder


# Transformer embedder for the same pretrained model the indexer was built for.
embedder = PretrainedTransformerMismatchedEmbedder(model_name=BERT_MODEL)
# Register the embedder under the 'tokens' key, the same key the readers use
# for the indexer.
text_field_embedder = BasicTextFieldEmbedder({'tokens': embedder})
# Encoder constructed with the embedder's output dimension as its input dimension.
seq2seq_encoder = PassThroughEncoder(input_dim=embedder.get_output_dim())

In [20]:
from allennlp.models import SimpleTagger
from allennlp.data.vocabulary import Vocabulary

# Load a previously saved vocabulary from disk.
# NOTE(review): this rebinds `vocab`; any vocabulary built earlier in the
# session under the same name is replaced.
vocab= Vocabulary.from_files("./vocab_dir")

# Tagger assembled from the embedder and encoder built above, with span-F1
# calculation requested for IOB1 labels, then moved to the selected CUDA device.
model = SimpleTagger(text_field_embedder=text_field_embedder, 
                      vocab=vocab, 
                      encoder=seq2seq_encoder,
                      calculate_span_f1=True,
                      label_encoding='IOB1').cuda(device=cuda_device)

In [22]:
# Load the saved weights directly onto the device the model lives on.
# Without `map_location`, torch.load restores every tensor to the device it was
# saved from, which can allocate memory on a different (possibly busy or
# missing) GPU before the values are copied into the model.
# NOTE(review): the checkpoint is named "roberta.hdf5" while BERT_MODEL points to
# an ELECTRA model — confirm this is the intended checkpoint.
model.load_state_dict(torch.load("roberta.hdf5", map_location=cuda_device))

<All keys matched successfully>

In [23]:
type(model)

allennlp.models.simple_tagger.SimpleTagger

In [29]:
from typing import Dict, List, Sequence, Iterable
import itertools
import logging

from overrides import overrides
from allennlp.data.dataset_readers.dataset_reader import DatasetReader

from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.instance import Instance

from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.fields import TextField, SequenceLabelField, Field, MetadataField

def _is_divider(line: str) -> bool:
    empty_line = line.strip() == ""
    if empty_line:
        return True
    else:
        first_token = line.split()[0]
        if first_token == "-DOCSTART-":
            return True
        else:
            return False
        

class ConllUniversalReader(DatasetReader):
    """Dataset reader for column-formatted, CoNLL-style tagging files.

    Every non-divider line is split on whitespace: the first column is the
    token and the remaining columns are candidate tag columns. Runs of lines
    between dividers (see ``_is_divider``) each become one ``Instance``.

    Parameters
    ----------
    token_indexers:
        Indexers attached to the ``TextField``. When falsy, a single
        ``SingleIdTokenIndexer`` under the key ``"tokens"`` is used.
    tag_index:
        Position of the tag column among the columns that follow the token
        column. A negative value makes ``_read`` produce instances without tags.
    coding_scheme:
        ``"IOB1"`` keeps the tags as read; ``"BIOUL"`` converts them with
        ``to_bioul``. Any other value raises ``ConfigurationError``.
    label_namespace:
        Namespace given to the ``SequenceLabelField`` that holds the tags.
    **kwargs:
        Forwarded unchanged to ``DatasetReader.__init__``.
    """

    def __init__(
        self,
        token_indexers: Dict[str, TokenIndexer] = None,
        tag_index: int = 0,
        coding_scheme: str = "IOB1",
        label_namespace: str = "labels",
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)

        if token_indexers:
            self._token_indexers = token_indexers
        else:
            self._token_indexers = {"tokens": SingleIdTokenIndexer()}

        supported_schemes = ("IOB1", "BIOUL")
        if coding_scheme not in supported_schemes:
            raise ConfigurationError("unknown coding_scheme: {}".format(coding_scheme))

        # Tags in the input files are treated as IOB1 when recoding to BIOUL.
        self._original_coding_scheme = "IOB1"
        self.label_namespace = label_namespace
        self.coding_scheme = coding_scheme
        self.tag_index = tag_index

    @overrides
    def _read(self, file_path: str) -> Iterable[Instance]:
        """Yield one ``Instance`` per sentence block found in ``file_path``."""
        # A URL is redirected to its locally cached copy.
        file_path = cached_path(file_path)

        with open(file_path, "r") as data_file:
            logger.info("Reading instances from lines in file at: %s", file_path)

            # groupby alternates between runs of divider lines and runs of
            # sentence lines; only the sentence runs are turned into instances.
            for divider_run, chunk in itertools.groupby(data_file, _is_divider):
                if divider_run:
                    continue

                rows = [row.strip().split() for row in chunk]
                # Transpose rows into columns; zip yields tuples, Fields need lists.
                columns = [list(column) for column in zip(*rows)]

                words = columns[0]
                tag_columns = columns[1:]
                ner_tags = tag_columns[self.tag_index] if self.tag_index >= 0 else None

                # TextField works on `Token` objects, not raw strings.
                yield self.text_to_instance([Token(word) for word in words], ner_tags)

    def text_to_instance(  # type: ignore
        self,
        tokens: List[Token],
        ner_tags: List[str] = None,
    ) -> Instance:
        """Build an ``Instance`` from text that is already tokenized.

        The instance always has a ``"tokens"`` text field and a ``"metadata"``
        field holding the token strings under ``"words"``. A ``"tags"`` field is
        added only when a non-empty tag list is available.
        """
        text_field = TextField(tokens, self._token_indexers)
        words = [token.text for token in tokens]
        fields: Dict[str, Field] = {
            "tokens": text_field,
            "metadata": MetadataField({"words": words}),
        }

        # Recode only when BIOUL was requested and tags were given; otherwise
        # the tags are used exactly as passed in (the default IOB1).
        tags = ner_tags
        if self.coding_scheme == "BIOUL" and ner_tags is not None:
            tags = to_bioul(ner_tags, encoding=self._original_coding_scheme)

        if tags:
            fields["tags"] = SequenceLabelField(tags, text_field, self.label_namespace)

        return Instance(fields)

In [30]:
# Rebind `reader` to the custom reader defined above, using the same
# transformer indexer under the 'tokens' key.
reader = ConllUniversalReader(token_indexers={'tokens': indexer})

In [36]:
from allennlp.predictors import SentenceTaggerPredictor

# Wrap the model and reader in a predictor and tag a single example sentence.
predictor = SentenceTaggerPredictor(model, reader)
preds = predictor.predict('Python is better than C++ for developers')
# Pair each word with its predicted tag; zip stops at the shorter of the two lists.
list(zip(preds['words'], preds['tags']))

[('Python', 'B-Object'),
 ('is', 'O'),
 ('better', 'B-Predicate'),
 ('than', 'O'),
 ('C++', 'B-Object'),
 ('for', 'O'),
 ('developers', 'O')]

In [35]:
preds['tags']

['B-Object', 'O', 'B-Predicate', 'O', 'B-Object']

In [20]:
#with open("/notebook/NLU_last_version/models/model_warm_up.th", 'wb') as f:
    #torch.save(model.state_dict(), f)

#vocab.save_to_files("/notebook/NLU_last_version/models/vocabulary_warm_up")

#vocab2 = Vocabulary.from_files("/tmp/vocabulary")

#model2 = LstmTagger(word_embeddings, lstm, vocab2)
#with open("/tmp/model.th", 'rb') as f:
    #model2.load_state_dict(torch.load(f))

# Eval on the dev dataset

In [21]:
from seqeval.metrics import classification_report, f1_score

In [22]:
# Gold tag sequences for the dev set: one list per element of `dev_dataset`,
# taken from that element's 'tags' entry.
dev_labels = [list(e['tags']) for e in dev_dataset]

In [23]:
# NOTE(review): `CustomSentenceTaggerPredictor` and `DataLoader` are not defined
# or imported in any visible cell — confirm where they come from.
predictor = CustomSentenceTaggerPredictor(model, reader)

# Unshuffled loader over the dev set. collate_fn is the identity, so batches are
# presumably passed on un-collated (as the raw items) — TODO confirm.
dev_eval_loader = DataLoader(dataset=dev_dataset, batch_size=100, 
                             shuffle=False, collate_fn=lambda a: a)

# Accumulate the predictions of every batch into one flat list.
all_preds = []
for batch in dev_eval_loader:
    all_preds += predictor.predict_batch_instance(batch)
    
# Trim each predicted tag sequence to the number of words in that prediction.
pred_tags = [pred['tags'][:len(pred['words'])] for pred in all_preds]

Encountered the loss key in the model's return dictionary which couldn't be split by the batch size. Key will be ignored.


In [24]:
from seqeval.metrics import classification_report, f1_score

# Overall F1 first, then the per-label report, both comparing the gold dev
# labels against the trimmed predictions.
print(f1_score(dev_labels, pred_tags))
print(classification_report(dev_labels, pred_tags))

0.7181756296800544
           precision    recall  f1-score   support

   Object       0.66      0.71      0.69       781
   Aspect       0.55      0.57      0.56       257
Predicate       0.87      0.91      0.89       386

micro avg       0.70      0.74      0.72      1424
macro avg       0.70      0.74      0.72      1424



In [25]:
# Inspect a single prediction: pick its index, then show its (word, tag) pairs.
sent_num = 47
list(zip(all_preds[sent_num]['words'], all_preds[sent_num]['tags']))

[('georgia', 'B-Object'),
 ('has', 'O'),
 ('a', 'O'),
 ('higher', 'B-Predicate'),
 ('percentage', 'B-Aspect'),
 ('of', 'I-Aspect'),
 ('blacks', 'B-Aspect'),
 ('(', 'O'),
 ('30', 'O'),
 ('.', 'O'),
 ('5', 'O'),
 (')', 'O'),
 ('than', 'O'),
 ('new', 'B-Object'),
 ('york', 'B-Object'),
 ('(', 'O'),
 ('15', 'O'),
 ('.', 'O'),
 ('9', 'O'),
 (')', 'O'),
 ('or', 'O'),
 ('california', 'B-Object'),
 ('(', 'O'),
 ('6', 'O'),
 ('.', 'O'),
 ('2', 'O'),
 (').', 'O')]

# Predict on the test set

In [27]:
# A negative tag_index makes ConllUniversalReader build instances without tags,
# which is what the unlabelled test file needs.
reader = ConllUniversalReader(token_indexers={'tokens': indexer}, tag_index=-1)
test_dataset_no_answers = reader.read('test_no_answers.tsv')
# Unshuffled loader with an identity collate_fn.
# NOTE(review): `DataLoader` is not imported in any visible cell — confirm its origin.
predict_data_loader = DataLoader(dataset=test_dataset_no_answers, batch_size=100, 
                                 shuffle=False, collate_fn=lambda a: a)


HBox(children=(HTML(value='reading instances'), FloatProgress(value=1.0, bar_style='info', layout=Layout(width…




In [33]:
# NOTE(review): `CustomSentenceTaggerPredictor` and `DataLoader` are not defined
# or imported in any visible cell — confirm where they come from.
predictor = CustomSentenceTaggerPredictor(model, reader) 

# Unshuffled loader over the unlabelled test set with an identity collate_fn.
predict_data_loader = DataLoader(dataset=test_dataset_no_answers, batch_size=100, 
                                 shuffle=False, collate_fn=lambda a: a)

# Accumulate the predictions of every batch into one flat list. This rebinds
# `all_preds`, so any earlier contents under that name are discarded.
all_preds = []
for batch in predict_data_loader:
    all_preds += predictor.predict_batch_instance(batch)
    
# Trim each predicted tag sequence to the number of words in that prediction.
pred_tags = [pred['tags'][:len(pred['words'])] for pred in all_preds]

In [38]:
# Print the words of the first three predictions.
for pred in all_preds[:3]:
    print (pred['words'])
    # "word<TAB>tag" strings for this prediction. `a` is rebound on every
    # iteration, so only the last of the three remains after the loop.
    a = [f'{w}\t{t}' for w, t in zip(pred['words'], pred['tags'])]

['plus', ',', 'android', 'is', 'developing', 'a', 'way', 'faster', 'than', 'ios', 'so', 'it', 'has', 'chances', 'to', 'become', 'a', 'laptop', 'replacement', 'earlier', 'than', 'ios', '.']
['went', 'to', 'android', 'earlier', 'this', 'year', 'after', 'being', 'convinced', 'its', 'better', 'then', 'ios', 'apple', '.']
['the', 'version', 'we', 'showed', 'here', 'is', 'ios', 'only', ',', 'because', 'the', 'ios', 'code', 'supported', 'ibeacons', 'earlier', 'than', 'android', ',', 'but', 'we', 'are', 'almost', 'finished', 'with', 'an', 'android', 'one', 'as', 'well', '.']


In [36]:
a

['the\tO',
 'version\tO',
 'we\tO',
 'showed\tO',
 'here\tO',
 'is\tO',
 'ios\tB-Object',
 'only\tO',
 ',\tO',
 'because\tO',
 'the\tO',
 'ios\tB-Object',
 'code\tO',
 'supported\tO',
 'ibeacons\tO',
 'earlier\tB-Predicate',
 'than\tO',
 'android\tB-Object',
 ',\tO',
 'but\tO',
 'we\tO',
 'are\tO',
 'almost\tO',
 'finished\tO',
 'with\tO',
 'an\tO',
 'android\tB-Object',
 'one\tO',
 'as\tO',
 'well\tO',
 '.\tO']

# Save the results

In [29]:
# Write the predictions as "word<TAB>tag" lines, one block per sentence, with a
# blank line between consecutive sentences and nothing after the last one.
with open('answers_Artem.tsv', 'w') as f:
    # The sentence index must come from this loop: an `i` left over from some
    # other cell would make the "is this the last sentence?" check meaningless.
    for i, pred in enumerate(all_preds):
        f.write('\n'.join([f'{w}\t{t}' for w, t in zip(pred['words'], pred['tags'])]))

        if i < len(all_preds) - 1:
            f.write('\n\n')

In [24]:
# Write every prediction as "word<TAB>tag" lines to 'answers_test.tsv'.
# NOTE(review): consecutive sentences are joined with a single '\n', so the file
# contains no blank line between sentences — confirm this is the format the
# consumer expects (`_is_divider` treats blank lines as sentence boundaries).
with open('answers_test.tsv', 'w') as f:
    for i, pred in enumerate(all_preds):
        f.write('\n'.join([f'{w}\t{t}' for w, t in zip(pred['words'], pred['tags'])]))
        
        if i < len(all_preds) - 1:
            f.write('\n')

In [None]:
from extractorRoberta