In [17]:
import pandas as pd 

In [34]:
mypath = './data/QQP/paws_qqp/output/'
train_data = pd.read_table(mypath+'train.tsv',header=0)
test_data = pd.read_table(mypath+'dev_and_test.tsv',header=0)

In [28]:
sentence1 = [x[2:-1] for x in train_data['sentence1']]
sentence2 = [x[2:-1] for x in train_data['sentence2']]
label = [x for x in train_data['label']]

In [30]:
sentence1

['Will a message still say blocked if you were delivered on an iPhone ?',
 'How can you treat ocd ? Is there any helpful suggestions on how to keep your ocd on a way that you can control it ?',
 'If you do not do anything how you are motivated to seek your daily works ?',
 'Why is new in system verily constructor not a task ?',
 'What are the most common traffic convictions in Arizona , and how does the severity of the convictions differ in Arkansas ?',
 'What should it cost to fix a 1 foot other scratch on a car door ? It is far from the key panels so should not cause blending issues .',
 'How should one write academic papers to make them more parse-able to machine learning/NLP algorithms that mass-analyze papers ?',
 'What are some beautiful lines to comment on Beautiful pictures ?',
 'Can small dogs breed with large dogs ?',
 'Do arteries carry deoxygenated blood or oxygenated blood ? How do they do it ?',
 'Which are in your opinion the most beautiful and the most depressing truths

# Data loader 

In [5]:
class Config(dict):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        for k, v in kwargs.items():
            setattr(self, k, v)
    
    def set(self, key, val):
        self[key] = val
        setattr(self, key, val)
        
config = Config(
    testing=True,
    seed=1,
    batch_size=64,
    lr=3e-4,
    epochs=2,
    hidden_sz=64,
    max_seq_len=100, # necessary to limit memory usage
    max_vocab_size=100000,
)

In [6]:
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.tokenizers import Token, Tokenizer, WordTokenizer
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data import Instance
from allennlp.data.fields import TextField, LabelField, ArrayField
import numpy as np
from typing import Iterator, List, Dict, Optional


class BertPawsReader(DatasetReader):
    def __init__(
        self,
        tokenizer: Tokenizer = None,
        token_indexers: Dict[str, TokenIndexer] = None,
        max_seq_len: Optional[int]=config.max_seq_len
    ) -> None:
        super().__init__(lazy=False)
        self._tokenizer = tokenizer 
        self._token_indexers = token_indexers
        self.max_seq_len = max_seq_len
               
    def text_to_instance(
        self,  # type: ignore
        sentence1: str,
        sentence2: str,
        label: str=None,
    ) -> Instance:
        
        fields: Dict[str, Field] = {}
        sentence1 = self._tokenizer.tokenize(sentence1)
        fields["sentence1"] = TextField(sentence1, {"bert": self._token_indexers})
        
        sentence2 = self._tokenizer.tokenize(sentence2)
        fields["sentence2"] = TextField(sentence2, {"bert": self._token_indexers})
        if label:
            fields["label"] = LabelField(label)
        return Instance(fields)
    def _read(self, file_path: str) -> Iterator[Instance]:
        df = pd.read_table(file_path)
#         if config.testing: df = df.head(100)
        for i, row in df.iterrows():
            yield self.text_to_instance(
                row['sentence1'],row['sentence2'],str(row['label']),
            )

In [21]:
from allennlp.nn import InitializerApplicator, RegularizerApplicator
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from sklearn.metrics.pairwise import cosine_distances
from allennlp.modules import FeedForward
from allennlp.modules.text_field_embedders import TextFieldEmbedder
from allennlp.models import Model
from allennlp.data.vocabulary import Vocabulary
import torch
import torch.nn.functional as F

class ParagraphIdentification(Model):
    """
    Model to classify venue based on input title and abstract
    """
    def __init__(self, 
                 vocab: Vocabulary,
                 word_embeddings: TextFieldEmbedder,
                 sentence1_encoder: Seq2VecEncoder,
                 sentence2_encoder: Seq2VecEncoder,
                 classifier_feedforward: FeedForward,
                 initializer: InitializerApplicator = InitializerApplicator()) -> None:
        super(ParagraphIdentification, self).__init__(vocab)
        self.word_embeddings = word_embeddings
        self.num_classes = self.vocab.get_vocab_size("label")
        self.sentence1_encoder = sentence1_encoder
        self.sentence2_encoder = sentence2_encoder
        self.classifier_feedforward = classifier_feedforward
        self.loss = torch.nn.BCELoss()
        initializer(self)
    
    def forward(self, 
                sentence1: Dict[str, torch.LongTensor],
                sentence2: Dict[str, torch.LongTensor],
                label: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
        
        embedded_sentence1 = self.word_embeddings(sentence1)
        sentence1_mask = get_text_field_mask(sentence1)
        encoded_sentence1 = self.sentence1_encoder(embedded_sentence1, sentence1_mask)

        embedded_sentence2 = self.word_embeddings(sentence2)
        sentence2_mask = get_text_field_mask(sentence2)
        encoded_sentence2 = self.sentence1_encoder(embedded_sentence2, sentence2_mask)
        
        logits = self.classifier_feedforward(torch.cat([encoded_sentence1, encoded_sentence2], dim=-1))
        class_probabilities = F.softmax(logits, dim=-1)
        output_dict = {'class_probabilities': class_probabilities}

        argmax_indices = np.argmax(class_probabilities.cpu().data.numpy(), axis=-1)
        labels = [self.vocab.get_token_from_index(x, namespace="labels") for x in argmax_indices]
        output_dict['predicted_label'] = labels
        print(output_dict)
        if label is not None:
            loss = self.loss(class_probabilities, label)
            output_dict["loss"] = loss

        return output_dict

In [22]:
from allennlp.data.token_indexers.wordpiece_indexer import PretrainedBertIndexer
from allennlp.data.tokenizers.word_splitter import BertBasicWordSplitter
from allennlp.data.fields import TextField, LabelField
from allennlp.data.vocabulary import Vocabulary
from allennlp.data.iterators import BucketIterator

token_indexer = PretrainedBertIndexer(pretrained_model="bert-base-cased",max_pieces=config.max_seq_len, do_lowercase=False,)
tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
reader = BertPawsReader(tokenizer,token_indexer)
train_dataset = reader.read(mypath+'train.tsv')
vocab = Vocabulary.from_instances(train_dataset)
iterator = BucketIterator(batch_size=config.batch_size, 
                          biggest_batch_first=True,
                          sorting_keys=[("sentence1", "num_tokens"), 
                                        ("sentence2", "num_tokens")],
                         )
iterator.index_with(vocab)
# vars(train_dataset[0])
# next(iter(iterator(train_dataset)))

11988it [00:06, 1730.89it/s]
100%|██████████| 11988/11988 [00:00<00:00, 163442.59it/s]


In [23]:
# bert_model = BertModel.from_pretrained(pretrained_model_name_or_path="bert-base-cased")
# token_embedder = BertEmbedder(bert_model)

from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders.bert_token_embedder import PretrainedBertEmbedder
USE_GPU = torch.cuda.is_available()
bert_embedder = PretrainedBertEmbedder(
        pretrained_model="bert-base-uncased",
        top_layer_only=True, # conserve memory
)
word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder({"bert": bert_embedder},
                                                            # we'll be ignoring masks so we'll need to set this to True
                                                           allow_unmatched_keys = True)

EMBEDDING_DIM = 768
HIDDEN_DIM = 100
num_classes = len(vocab.get_index_to_token_vocabulary('label'))
sentence1_lstm = PytorchSeq2VecWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, 
                                                 batch_first=True, bidirectional=True))
sentence2_lstm = PytorchSeq2VecWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, 
                                                    batch_first=True, bidirectional=True))

feed_forward = torch.nn.Linear(2 * 2 * HIDDEN_DIM, num_classes)
model = ParagraphIdentification(vocab,
                                word_embeddings, 
                                sentence1_lstm, 
                                sentence2_lstm,
                                feed_forward)
if USE_GPU: model.cuda()
else: model

In [24]:
from allennlp.training.trainer import Trainer
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=0.005)

train_dataset = reader.read(mypath+'train.tsv')
validation_dataset = reader.read(mypath+'dev_and_test.tsv')

trainer = Trainer(
    model=model,
    optimizer=optimizer,
    iterator=iterator,
    train_dataset=train_dataset,
    validation_dataset=validation_dataset,
    cuda_device=0 if USE_GPU else -1,
    patience=2,
    num_epochs=5,
    serialization_dir='output'
)
trainer.train()

11988it [00:06, 1738.03it/s]
677it [00:00, 1693.97it/s]


{'best_epoch': 4, 'best_validation_loss': 0.6363336443901062}

# Prediction

In [25]:
from allennlp.common.util import JsonDict
from allennlp.predictors.predictor import Predictor

In [26]:
class ParagraphIdentificationPredictor(Predictor):
    """"
    Predictor wrapper for the ParagraphIdentificationClassifier
    """
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        sentence1 = json_dict['sentence1']
        sentence2 = json_dict['sentence2']
        instance = self._dataset_reader.text_to_instance(sentence1=sentence1, sentence2=sentence2)
        return instance

In [27]:
reader = BertPawsReader(tokenizer,token_indexer)
predictor = ParagraphIdentificationPredictor(model, dataset_reader=reader)

In [15]:
train_data

Unnamed: 0,id,sentence1,sentence2,label
0,1,b'Will a message still say blocked if you were...,b'Will a message still say delivered if you we...,0
1,2,b'How can you treat ocd ? Is there any helpful...,b'How can you treat OCD ? Is there any helpful...,1
2,3,b'If you do not do anything how you are motiva...,b'If you do not seek anything how you are moti...,0
3,4,b'Why is new in system verily constructor not ...,b'Why constructor new in system verilog is not...,0
4,5,b'What are the most common traffic convictions...,b'What are the most common traffic convictions...,0
...,...,...,...,...
11983,11984,b'Do you think Hillary Clinton had Seth Rich k...,b'Do you think Seth Rich had Hillary Clinton k...,0
11984,11985,b'How do you become attractive/pretty ?',b'How do you become pretty/attractive ?',1
11985,11986,"b""`` What do a Pakistani will tell when someon...",b'What do a Pakistani will ask when someone te...,0
11986,11987,b'Are professional traders better at investing...,b'Are non-professional traders better at inves...,0


In [16]:
from allennlp.nn.util import get_text_field_mask
test_data_json = test_data[['sentence1','sentence2']].to_dict(orient='records')
train_data_json = train_data[['sentence1','sentence2']].to_dict(orient='records')
prediction_output = predictor.predict_batch_json(train_data_json[:100])

print([x['predicted_label'] for x in prediction_output])

['1', '0', '0', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '0', '0', '1', '1', '1', '1', '1', '1', '1', '1', '0', '1', '1', '1', '1', '1', '1', '1', '1', '0', '0', '0', '1', '1', '1', '1', '1', '0', '0', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '0', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '0', '0', '1', '1', '1', '1']


# Sanity check 

In [193]:
from allennlp.nn import util as nn_util
USE_GPU = torch.cuda.is_available()
batch = next(iter(iterator(train_dataset)))
batch = nn_util.move_to_device(batch, 0 if USE_GPU else -1)

In [195]:
tokens = batch["sentence1"]
labels = batch

In [216]:
USE_GPU

True

In [218]:
embeddings = model.word_embeddings(tokens)
# state = model.encoder(embeddings, mask)
# class_logits = model.projection(state)
# class_logits

In [219]:
embeddings

tensor([[[-0.5718,  0.4497, -0.2918,  ...,  0.0904,  0.4740,  0.6834],
         [ 0.0674,  0.1834,  0.0511,  ..., -0.0967,  0.4437,  0.1413],
         [-0.1015,  0.1788, -0.0693,  ..., -0.1066,  0.3961, -0.2464],
         ...,
         [-0.1450,  0.3277,  0.0532,  ..., -0.0693,  0.5113,  0.2490],
         [-0.1644,  0.4830, -0.0431,  ..., -0.1120,  0.6098,  0.2662],
         [-0.0993,  0.4373,  0.1460,  ..., -0.1039,  0.3325,  0.1892]],

        [[-0.0158,  0.0557,  0.1627,  ...,  0.0305,  0.3749,  0.2665],
         [ 0.0723, -0.2047,  0.4992,  ...,  0.2407,  0.4124, -0.0924],
         [ 0.1071, -0.2139,  0.2627,  ...,  0.0952,  0.3373, -0.1341],
         ...,
         [ 0.0388, -0.0748,  0.2566,  ...,  0.0635,  0.0031, -0.0814],
         [-0.0628, -0.1459,  0.3133,  ...,  0.0393,  0.2716, -0.0148],
         [ 0.1226, -0.0794,  0.4338,  ...,  0.1507,  0.4670, -0.1460]],

        [[-0.2382,  0.1446,  0.0568,  ..., -0.0434,  0.5593,  0.5779],
         [ 0.1535,  0.2408,  0.3716,  ...,  0