In [1]:
# Install the AllenNLP checkout at ../github/allennlp/ in editable (development) mode.
# NOTE(review): the recorded output shows pip rejecting this path as "not a valid editable requirement" — confirm the checkout location.
!pip install -e ../github/allennlp/

[31mERROR: ../github/allennlp/ is not a valid editable requirement. It should either be a path to a local project or a VCS URL (beginning with svn+, git+, hg+, or bzr+).[0m
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import warnings
# Suppress every FutureWarning raised by the libraries used below.
warnings.simplefilter('ignore', FutureWarning)

# Seed PyTorch's global random number generator.
torch.manual_seed(1)

<torch._C.Generator at 0x7f3fae1fad90>

In [2]:
# Show the NVIDIA driver/CUDA versions and the current load of each GPU.
!nvidia-smi

Fri Dec 18 13:18:18 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.100      Driver Version: 440.100      CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 208...  Off  | 00000000:1A:00.0 Off |                  N/A |
| 70%   68C    P2   267W / 260W |  10682MiB / 11019MiB |     94%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  Off  | 00000000:1B:00.0 Off |                  N/A |
| 67%   67C    P2   211W / 260W |   8646MiB / 11019MiB |     91%      Default |
+-------------------------------+----------------------+----------------------+
|   2  GeForce RTX 208...  Off  | 00000000:1C:00.0 Off |                  N/A |
| 28%   

In [3]:
# Print the metadata of the installed `transformers` package (version, location, dependencies).
!pip3 show transformers

Name: transformers
Version: 4.0.1
Summary: State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch
Home-page: https://github.com/huggingface/transformers
Author: Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Sam Shleifer, Patrick von Platen, Sylvain Gugger, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors
Author-email: thomas@huggingface.co
License: Apache
Location: /opt/.pyenv/versions/3.7.4/lib/python3.7/site-packages
Requires: numpy, packaging, tokenizers, regex, requests, sacremoses, filelock, tqdm
Required-by: flair, allennlp


In [4]:
# Device handle for the GPU with index 3.
cuda_device = torch.device('cuda:3')
# Number of CUDA devices visible to PyTorch.
n_gpu = torch.cuda.device_count()

# Print the name of every visible GPU.
# NOTE: at cell top level the loop variable `i` remains defined in the notebook namespace afterwards.
for i in range(n_gpu):
    print(torch.cuda.get_device_name(i))

GeForce RTX 2080 Ti
GeForce RTX 2080 Ti
GeForce RTX 2080 Ti
GeForce RTX 2080 Ti
GeForce RTX 2080 Ti
GeForce RTX 2080 Ti
GeForce RTX 2080 Ti
GeForce RTX 2080 Ti


In [5]:
from allennlp.data.token_indexers import PretrainedTransformerMismatchedIndexer
from allennlp.data.vocabulary import Vocabulary


# Name of the pretrained transformer; the same constant is passed to the indexer here and to the embedder later on.
# Despite the variable name, the active checkpoint is ELECTRA; the BERT alternative is kept commented out.
#BERT_MODEL = 'bert-base-cased'
BERT_MODEL = 'google/electra-base-discriminator'
# Token indexer built for that pretrained model.
indexer = PretrainedTransformerMismatchedIndexer(model_name=BERT_MODEL)

In [6]:
from typing import Dict, List, Sequence, Iterable
import itertools
import logging

from overrides import overrides

from allennlp.common.checks import ConfigurationError
from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.dataset_readers.dataset_utils import to_bioul
from allennlp.data.fields import TextField, SequenceLabelField, Field, MetadataField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

logger = logging.getLogger(__name__)


def _is_divider(line: str) -> bool:
    empty_line = line.strip() == ""
    if empty_line:
        return True
    else:
        first_token = line.split()[0]
        if first_token == "-DOCSTART-":
            return True
        else:
            return False
        

class ConllUniversalReader(DatasetReader):
    """Dataset reader for whitespace-separated, CoNLL-style column files.

    Every non-divider line holds one token in its first column, followed by
    the tag columns. Sentences are delimited by divider lines (see
    `_is_divider`). Each sentence becomes one `Instance` with a ``tokens``
    text field, a ``metadata`` field holding the raw words and, when tags are
    available, a ``tags`` sequence-label field.

    Parameters
    ----------
    token_indexers :
        Mapping from indexer name to `TokenIndexer`. A falsy value selects
        ``{"tokens": SingleIdTokenIndexer()}``.
    tag_index :
        Position of the tag column among the columns that follow the token
        column. A negative value makes `_read` yield instances without tags.
    coding_scheme :
        ``"IOB1"`` keeps the tags as read; ``"BIOUL"`` converts them with
        `to_bioul`. Any other value raises `ConfigurationError`.
    label_namespace :
        Namespace handed to the ``tags`` `SequenceLabelField`.
    """

    def __init__(
        self,
        token_indexers: Dict[str, TokenIndexer] = None,
        tag_index: int = 0,
        coding_scheme: str = "IOB1",
        label_namespace: str = "labels",
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        if token_indexers:
            self._token_indexers = token_indexers
        else:
            self._token_indexers = {"tokens": SingleIdTokenIndexer()}

        supported_schemes = ("IOB1", "BIOUL")
        if coding_scheme not in supported_schemes:
            raise ConfigurationError("unknown coding_scheme: {}".format(coding_scheme))

        # Encoding passed to `to_bioul` as the source scheme of the tags in the file.
        self._original_coding_scheme = "IOB1"
        self.coding_scheme = coding_scheme
        self.label_namespace = label_namespace
        self.tag_index = tag_index

    @overrides
    def _read(self, file_path: str) -> Iterable[Instance]:
        """Yield one `Instance` per sentence found in `file_path`."""
        # `cached_path` resolves URLs to a locally cached copy.
        file_path = cached_path(file_path)

        with open(file_path, "r") as data_file:
            logger.info("Reading instances from lines in file at: %s", file_path)

            # groupby alternates between runs of divider lines and runs of token lines.
            for is_divider, chunk in itertools.groupby(data_file, _is_divider):
                if is_divider:
                    continue

                rows = [raw_line.strip().split() for raw_line in chunk]
                # Transpose rows into columns; zip yields tuples, the fields need lists.
                columns = [list(column) for column in zip(*rows)]
                words = columns[0]
                ner_tags = columns[1:][self.tag_index] if self.tag_index >= 0 else None

                # TextField requires `Token` objects
                yield self.text_to_instance([Token(word) for word in words], ner_tags)

    def text_to_instance(  # type: ignore
        self,
        tokens: List[Token],
        ner_tags: List[str] = None,
    ) -> Instance:
        """
        Build an `Instance` from pre-tokenized input (this class has no tokenizer).

        The ``tags`` field is only added when `ner_tags` is given and non-empty.
        """
        text_field = TextField(tokens, self._token_indexers)
        instance_fields: Dict[str, Field] = {
            "tokens": text_field,
            "metadata": MetadataField({"words": [token.text for token in tokens]}),
        }

        # Recode the labels only when BIOUL was requested and tags exist;
        # otherwise the tags (or None) pass through unchanged.
        if self.coding_scheme == "BIOUL" and ner_tags is not None:
            coded_ner = to_bioul(ner_tags, encoding=self._original_coding_scheme)
        else:
            coded_ner = ner_tags

        if coded_ner:
            instance_fields["tags"] = SequenceLabelField(coded_ner, text_field, self.label_namespace)

        return Instance(instance_fields)

In [7]:
#from allennlp.data.dataset_readers import Conll2003DatasetReader
from allennlp.data.dataset_readers import SequenceTaggingDatasetReader

#reader = Conll2003DatasetReader(token_indexers={'tokens': indexer})
# Read the train and dev splits with the custom reader; with its default tag_index=0
# the first column after the token is used as the label.
reader = ConllUniversalReader(token_indexers={'tokens': indexer})
train_dataset = reader.read('train.tsv')
dev_dataset = reader.read('dev.tsv')
#test_dataset = reader.read('data_1/test_no_answers.tsv')

HBox(children=(HTML(value='reading instances'), FloatProgress(value=1.0, bar_style='info', layout=Layout(width…




HBox(children=(HTML(value='reading instances'), FloatProgress(value=1.0, bar_style='info', layout=Layout(width…




In [8]:
# Build the vocabulary from the training instances only, then attach it to both datasets via index_with.
vocab = Vocabulary.from_instances(train_dataset.instances)
train_dataset.index_with(vocab)
dev_dataset.index_with(vocab)
#test_dataset.index_with(vocab)

HBox(children=(HTML(value='building vocab'), FloatProgress(value=0.0, max=2334.0), HTML(value='')))




In [9]:
from allennlp.modules.token_embedders import PretrainedTransformerMismatchedEmbedder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.seq2seq_encoders import PassThroughEncoder


# Transformer embedder for the same pretrained model as the indexer; it is registered under the
# 'tokens' key, matching the key used in the reader's token_indexers.
embedder = PretrainedTransformerMismatchedEmbedder(model_name=BERT_MODEL)
text_field_embedder = BasicTextFieldEmbedder({'tokens': embedder})
# PassThroughEncoder configured with the embedder's output dimension as its input_dim.
seq2seq_encoder = PassThroughEncoder(input_dim=embedder.get_output_dim())

In [10]:
from allennlp.models import crf_tagger

ImportError: cannot import name 'crf_tagger' from 'allennlp.models' (/opt/.pyenv/versions/3.7.4/lib/python3.7/site-packages/allennlp/models/__init__.py)

In [9]:
from allennlp.models import SimpleTagger
from allennlp.models import crf_tagger


# SimpleTagger needs the text-field embedder as well as the encoder. It was
# built in the embedder cell but never passed in, so the constructor call
# was missing a required argument.
# calculate_span_f1 with label_encoding='IOB1' matches the reader's default coding scheme.
model = SimpleTagger(vocab=vocab,
                     text_field_embedder=text_field_embedder,
                     encoder=seq2seq_encoder,
                     calculate_span_f1=True,
                     label_encoding='IOB1').cuda(device=cuda_device)

NameError: name 'vocab' is not defined

NameError: name 'metrics' is not defined

In [11]:
# Scratch check: 10% of the total step count, i.e. the value of the commented-out
# `warmup_steps` argument in the training cells.
(steps_per_epoch*num_epochs)*0.1

NameError: name 'steps_per_epoch' is not defined

In [None]:
# Scratch check: display the number of training batches per epoch.
steps_per_epoch

In [None]:
# Scratch check: display the configured number of epochs.
num_epochs

In [11]:
import torch.optim as optim
from transformers import AdamW
import transformers
from datetime import datetime

from allennlp.training.learning_rate_schedulers import LinearWithWarmup
from torch.utils.data import DataLoader
from allennlp.training import GradientDescentTrainer
from allennlp.training.learning_rate_schedulers import SlantedTriangular
from allennlp.data import allennlp_collate

import math

def train_roberta_triangular(lr, batch_size):
    """Train the notebook-level `model` with AdamW and a slanted-triangular LR schedule.

    The function reads `model`, `train_dataset`, `dev_dataset` and
    `cuda_device` from the notebook namespace.

    Args:
        lr: Learning rate passed to AdamW.
        batch_size: Batch size used for both the training and the validation loader.

    Returns:
        A ``(metrics, parameter_tuple)`` pair. ``metrics`` is the value
        returned by ``trainer.train()``, or ``None`` when training was stopped
        with a KeyboardInterrupt. ``parameter_tuple`` is
        ``(lr, batch_size, 0.1 * steps_per_epoch * num_epochs)``.
    """
    num_epochs = 3
    # Gradient accumulation is disabled: one optimizer step per batch.
    accum = 1
    # Number of training batches per epoch (the last batch may be partial).
    steps_per_epoch = math.ceil(len(train_dataset) / batch_size)

    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
    train_data_loader = DataLoader(dataset=train_dataset, batch_size=batch_size,
                                   collate_fn=allennlp_collate, shuffle=True)
    val_data_loader = DataLoader(dataset=dev_dataset, batch_size=batch_size, collate_fn=allennlp_collate)
    lr_scheduler = SlantedTriangular(optimizer,
                                     num_epochs=num_epochs,
                                     num_steps_per_epoch=steps_per_epoch)

    # Each run gets its own serialization directory. The slashes inside
    # `date_str` act as path separators, so the path is
    # ./workdir/MM/DD/YYYY/HH:MM:SS.
    date_time = datetime.now()
    date_str = date_time.strftime('%m/%d/%Y')
    time_str = date_time.strftime('%H:%M:%S')

    trainer = GradientDescentTrainer(
        model=model,
        optimizer=optimizer,
        patience=1,
        data_loader=train_data_loader,
        validation_data_loader=val_data_loader,
        num_epochs=num_epochs,
        cuda_device=cuda_device,
        learning_rate_scheduler=lr_scheduler,
        num_gradient_accumulation_steps=accum,
        serialization_dir=f'./workdir/{date_str}/{time_str}',
        grad_clipping=1.
    )
    parameter_tuple = (lr, batch_size, (steps_per_epoch*num_epochs)*0.1)

    # Bind `metrics` before training so that a manual interrupt still returns
    # a well-defined value instead of failing on an unbound local.
    metrics = None
    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # Deliberate: allow the run to be stopped by hand and still return.
        pass
    return metrics, parameter_tuple
    

In [10]:
import torch.optim as optim
from transformers import AdamW
import transformers
from datetime import datetime

from allennlp.training.learning_rate_schedulers import LinearWithWarmup
from torch.utils.data import DataLoader
from allennlp.training import GradientDescentTrainer
from allennlp.training.learning_rate_schedulers import SlantedTriangular
from allennlp.data import allennlp_collate

import math


num_epochs = 3
batch_size = 16
#batch_size = 2
#accum = 4
# Gradient accumulation is disabled: one optimizer step per batch.
accum = 1
# Number of training batches per epoch (the last batch may be partial).
steps_per_epoch = math.ceil(len(train_dataset) / batch_size)

optimizer = optim.AdamW(model.parameters(), lr=2e-4, weight_decay=0.01)
train_data_loader = DataLoader(dataset=train_dataset, batch_size=batch_size,
                               collate_fn=allennlp_collate, shuffle=True)
# Validation uses a fixed batch size of 100, independent of the training batch size.
val_data_loader = DataLoader(dataset=dev_dataset, batch_size=100, collate_fn=allennlp_collate)
lr_scheduler = SlantedTriangular(optimizer,
                                 num_epochs=num_epochs,
                                 #warmup_steps=(steps_per_epoch*num_epochs)*0.1,
                                 num_steps_per_epoch=steps_per_epoch)

# Each run gets its own serialization directory. The slashes inside
# `date_str` act as path separators, so the path is
# ./workdir/MM/DD/YYYY/HH:MM:SS.
date_time = datetime.now()
date_str = date_time.strftime('%m/%d/%Y')
time_str = date_time.strftime('%H:%M:%S')


trainer = GradientDescentTrainer(
    model=model,
    optimizer=optimizer,
    patience = 1,
    data_loader=train_data_loader,
    validation_data_loader=val_data_loader,
    #validation_data_loader=None,
    num_epochs=num_epochs,
    cuda_device=cuda_device,
    learning_rate_scheduler=lr_scheduler,
    num_gradient_accumulation_steps=accum,
    serialization_dir=f'./workdir/{date_str}/{time_str}',
    grad_clipping=1.
)

# Bind `metrics` before training so the name exists (as None) even when the
# run is interrupted by hand; otherwise later cells fail with a NameError.
metrics = None
try:
    metrics = trainer.train()
except KeyboardInterrupt:
    # Deliberate: allow the run to be stopped by hand without a traceback.
    pass

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=146.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=146.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [14]:
from allennlp.training.util import evaluate

# Score `model` on the dev split; device index 3 is the same GPU that `cuda_device` ('cuda:3') names.
dev_dataloader = DataLoader(dev_dataset, batch_size=100, collate_fn=allennlp_collate)
evaluate(model, dev_dataloader, cuda_device=3)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))




{'accuracy': 0.9179720196101877,
 'accuracy3': 0.9900753318187253,
 'precision-overall': 0.7098479841374752,
 'recall-overall': 0.7542134831460674,
 'f1-measure-overall': 0.7313585291112882,
 'loss': 0.26040369272232056}

In [26]:
# Print the notebook's current working directory.
!pwd

/notebook/NLU_last_version


In [27]:
#with open("/notebook/NLU_last_version/models/model_triang.th", 'wb') as f:
#    torch.save(model.state_dict(), f)

#vocab.save_to_files("/notebook/NLU_last_version/models/vocabulary_triang")

#vocab2 = Vocabulary.from_files("/tmp/vocabulary")

#model2 = LstmTagger(word_embeddings, lstm, vocab2)
#with open("/tmp/model.th", 'rb') as f:
#    model2.load_state_dict(torch.load(f))

In [15]:
from allennlp.predictors import SentenceTaggerPredictor

class CustomSentenceTaggerPredictor(SentenceTaggerPredictor):
    """Sentence tagger predictor whose JSON input carries an already tokenized sentence."""

    @overrides
    def _json_to_instance(self, json_dict):
        """Wrap each element of ``json_dict["sentence"]`` in a `Token` and build an instance from them."""
        words = json_dict["sentence"]
        return self._dataset_reader.text_to_instance([Token(word) for word in words])

In [16]:
# Tag a pre-tokenized sentence: the custom predictor turns each list element into a Token.
predictor = CustomSentenceTaggerPredictor(model, reader)
preds = predictor.predict(['Python', 'is', 'better', 'than', 'C++'])
# Pair every word with its predicted tag.
list(zip(preds['words'], preds['tags'])) 

[('Python', 'B-Object'),
 ('is', 'O'),
 ('better', 'B-Predicate'),
 ('than', 'O'),
 ('C++', 'B-Object')]

In [17]:
from allennlp.predictors import SentenceTaggerPredictor

# Same sentence through the stock SentenceTaggerPredictor, this time passed as a raw string instead of a token list.
predictor = SentenceTaggerPredictor(model, reader)
preds = predictor.predict('Python is better than C++')
# Pair every word with its predicted tag.
list(zip(preds['words'], preds['tags']))

[('Python', 'B-Object'),
 ('is', 'O'),
 ('better', 'B-Predicate'),
 ('than', 'O'),
 ('C++', 'B-Object')]

# Evaluate on the dev dataset

In [18]:
from seqeval.metrics import classification_report, f1_score

In [19]:
# Reference tag sequences of the dev split, one list per instance.
dev_labels = [list(instance['tags']) for instance in dev_dataset]

In [20]:
predictor = CustomSentenceTaggerPredictor(model, reader)

# The identity collate function hands each batch over as a plain list of
# instances, the form `predict_batch_instance` is called with below.
dev_eval_loader = DataLoader(
    dataset=dev_dataset,
    batch_size=100,
    shuffle=False,
    collate_fn=lambda instances: instances,
)

# Collect the prediction dicts of all batches in dataset order.
all_preds = []
for batch in dev_eval_loader:
    all_preds.extend(predictor.predict_batch_instance(batch))

# Cut every tag sequence to the number of words of its sentence.
pred_tags = [p['tags'][:len(p['words'])] for p in all_preds]

Encountered the loss key in the model's return dictionary which couldn't be split by the batch size. Key will be ignored.


In [21]:
from seqeval.metrics import classification_report, f1_score

# seqeval F1 score of the predictions against the dev labels, followed by the per-label breakdown.
print(f1_score(dev_labels, pred_tags))
print(classification_report(dev_labels, pred_tags))

0.731358529111338
           precision    recall  f1-score   support

Predicate       0.87      0.93      0.90       386
   Object       0.67      0.73      0.70       781
   Aspect       0.58      0.57      0.58       257

micro avg       0.71      0.75      0.73      1424
macro avg       0.71      0.75      0.73      1424



In [22]:
# Inspect the predictions for a single sentence, chosen by its index in `all_preds`.
sent_num = 47
list(zip(all_preds[sent_num]['words'], all_preds[sent_num]['tags']))

[('georgia', 'B-Object'),
 ('has', 'O'),
 ('a', 'O'),
 ('higher', 'B-Predicate'),
 ('percentage', 'O'),
 ('of', 'O'),
 ('blacks', 'B-Aspect'),
 ('(', 'O'),
 ('30', 'O'),
 ('.', 'O'),
 ('5', 'O'),
 (')', 'O'),
 ('than', 'O'),
 ('new', 'B-Object'),
 ('york', 'B-Object'),
 ('(', 'O'),
 ('15', 'O'),
 ('.', 'O'),
 ('9', 'O'),
 (')', 'O'),
 ('or', 'O'),
 ('california', 'B-Object'),
 ('(', 'O'),
 ('6', 'O'),
 ('.', 'O'),
 ('2', 'O'),
 (').', 'O')]

# Predict on the test set

In [23]:
# A negative tag_index makes the reader skip the tag columns, so the instances carry no 'tags' field.
# NOTE: this rebinds `reader`, replacing the reader that was created for the train/dev splits.
reader = ConllUniversalReader(token_indexers={'tokens': indexer}, tag_index=-1)
test_dataset_no_answers = reader.read('test_no_answers.tsv')

HBox(children=(HTML(value='reading instances'), FloatProgress(value=1.0, bar_style='info', layout=Layout(width…




In [24]:
predictor = CustomSentenceTaggerPredictor(model, reader)

# The identity collate function hands each batch over as a plain list of
# instances, the form `predict_batch_instance` is called with below.
predict_data_loader = DataLoader(
    dataset=test_dataset_no_answers,
    batch_size=100,
    shuffle=False,
    collate_fn=lambda instances: instances,
)

# Collect the prediction dicts of all batches in dataset order.
all_preds = []
for batch in predict_data_loader:
    all_preds.extend(predictor.predict_batch_instance(batch))

# Cut every tag sequence to the number of words of its sentence.
pred_tags = [p['tags'][:len(p['words'])] for p in all_preds]

# Save the results

In [25]:
# Write the predictions as "word<TAB>tag" lines, with one blank line between
# consecutive sentences and no separator after the last one.
with open('answers_test_triangular.tsv', 'w') as f:
    # enumerate() supplies the sentence index. Without it, `i` was whatever
    # value an earlier cell had left in the notebook namespace, so the
    # last-sentence check below did not follow this loop.
    for i, pred in enumerate(all_preds):
        f.write('\n'.join([f'{w}\t{t}' for w, t in zip(pred['words'], pred['tags'])]))

        if i < len(all_preds) - 1:
            f.write('\n\n')

In [None]:
# Write the predictions as "word<TAB>tag" lines to answers_test.tsv.
# NOTE(review): consecutive sentences are joined with a single '\n', so no blank line
# separates them in the output — confirm this is the intended file format.
with open('answers_test.tsv', 'w') as f:
    for i, pred in enumerate(all_preds):
        f.write('\n'.join([f'{w}\t{t}' for w, t in zip(pred['words'], pred['tags'])]))
        
        # No separator after the last sentence.
        if i < len(all_preds) - 1:
            f.write('\n')