Packages that were installed before running this notebook:
    
    pip3 install pytorch-pretrained-bert
    pip3 install pytorch-nlp
    pip3 install transformers

### Imports

In [112]:
import json
import os
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import torch
from transformers import BertTokenizer, BertModel

In [113]:
# Put the parent of the current directory at the front of sys.path so that
# the project-local `utils` module below can be imported from this notebook.
import os,sys,inspect
# Directory of the file that the current frame belongs to.
# NOTE(review): a notebook cell has no real source file, so this presumably
# resolves relative to the kernel's working directory -- confirm.
current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parent_dir = os.path.dirname(current_dir)
sys.path.insert(0, parent_dir) 
from utils import HotPotDataHandler

### Encoder

In [114]:
# Load the pretrained 'bert-base-uncased' tokenizer and model.
# The model is configured to also return the hidden states of all layers
# and the attention weights, not only its default outputs.
tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')
model=BertModel.from_pretrained('bert-base-uncased',
                                 output_hidden_states=True,
                                 output_attentions=True)

In [115]:
# Cache for the default tokenizer/model so that they are loaded at most once,
# and only if a caller actually relies on the defaults.
_ENCODE_DEFAULTS = {}


def encode(text, tokenizer=None, model=None):
    '''
    Encode a text with BERT and return the embedding of its first token.

    Parameters: text - the string to encode
                tokenizer - object with an `encode(text)` method that
                            returns a list of token ids; if None, the
                            pretrained 'bert-base-uncased' BertTokenizer
                            is loaded on first use and cached
                model - callable that takes a tensor of token ids with
                        shape (1, number_of_tokens) and returns a sequence
                        whose second-to-last element holds the hidden
                        states of all layers; if None, the pretrained
                        'bert-base-uncased' BertModel (with hidden states
                        and attentions enabled) is loaded on first use
                        and cached
    Output: the last layer's hidden state of the first token of the
            (single) input sequence

    The defaults used to be created in the signature, which loaded BERT
    when the function was defined, even if the caller supplied its own
    tokenizer and model.

    NOTE(review): callers in this notebook pass text that already contains
    literal "[CLS]"/"[SEP]" markers; depending on the installed transformers
    version `tokenizer.encode` may add special tokens of its own as well --
    confirm.
    '''
    if tokenizer is None:
        if 'tokenizer' not in _ENCODE_DEFAULTS:
            _ENCODE_DEFAULTS['tokenizer'] = BertTokenizer.from_pretrained('bert-base-uncased')
        tokenizer = _ENCODE_DEFAULTS['tokenizer']
    if model is None:
        if 'model' not in _ENCODE_DEFAULTS:
            _ENCODE_DEFAULTS['model'] = BertModel.from_pretrained('bert-base-uncased',
                                                                  output_hidden_states=True,
                                                                  output_attentions=True)
        model = _ENCODE_DEFAULTS['model']

    # The model expects a batch dimension, hence the extra list level.
    input_ids = torch.tensor([tokenizer.encode(text)])
    # The last two outputs are (all hidden states, all attentions);
    # only the hidden states are needed here.
    all_hidden_states = model(input_ids)[-2]

    # This is the embedding of the [CLS] token.
    # [-1] is the last hidden state (list of sentences)
    # first [0] - first (and only) sentence
    # second [0] - first ([CLS]) token of the sentence
    return all_hidden_states[-1][0][0]
    

### Paragraph selector class

In [116]:
class ParagraphSelector_non_final(torch.nn.Module):
    '''
    Early version of the paragraph scoring network: one linear layer
    followed by a sigmoid.

    Parameters: input_size - size of the input embedding (default 768)
                output_size - number of output scores (default 1)
    '''

    def __init__(self, input_size=768, output_size=1):
        # torch.nn.Module must be initialised before submodules can be
        # assigned as attributes; without this call the assignment below
        # raises an AttributeError.
        super().__init__()
        self.linear = torch.nn.Linear(input_size, output_size)

    def forward(self, embedding):
        '''
        Map an embedding to a score between 0 and 1.

        Parameters: embedding - tensor whose last dimension has size
                                input_size
        Output: sigmoid of the linear layer's output
        '''
        output = self.linear(embedding)
        output = torch.sigmoid(output)

        return output


In [117]:
def make_training_data_from_filedata(hotpot_train, max_items=10):
    '''
    Make a dataframe with training data for selecting relevant paragraphs.

    Every paragraph of every processed item becomes one row. The dataframe
    has three columns:
        1. id - running row number
        2. label - 1 if the paragraph title occurs among the item's
                   supporting facts, otherwise 0
        3. text - "[CLS] <question> [SEP] <paragraph> [SEP]"

    Parameters: hotpot_train - list of dicts with the keys 'question',
                               'context' ([[title, [sentence, ...]], ...])
                               and 'supporting_facts' (entries whose first
                               element is a paragraph title)
                max_items - number of items taken from the start of
                            hotpot_train (default 10); None uses all items
    Output: a pandas DataFrame (empty if there is nothing to process)
    '''
    # Collected across ALL items; previously the lists were reset and the
    # dataframe returned inside the loop, so only the first item was used.
    labels = []
    datapoints = []
    for item in hotpot_train[:max_items]:
        query = item['question']
        paragraphs = item['context']
        supporting_facts = [fact[0] for fact in item['supporting_facts']]

        for para in paragraphs:
            # para[0] is the paragraph title, para[1] the list of sentences
            labels.append(int(para[0] in supporting_facts))
            datapoints.append("[CLS] " + query + " [SEP] " + ("").join(para[1]) + " [SEP]")

    df = pd.DataFrame({
        'id': range(len(labels)),
        'label': labels,
        'text': datapoints
    })
    return df

### Test Training 

In [225]:
"""
This module implements the Paragraph Selector from the paper, Section 3.1
"""

import torch
from transformers import BertTokenizer, BertModel

class ParagraphSelector():
    """
    Paragraph Selector (Section 3.1 of the paper).

    A BERT encoder turns "query + paragraph" into the embedding of the
    first token; a small network (one linear layer followed by a sigmoid)
    turns that embedding into a relevance score between 0 and 1.
    Only the small network is trained by `train`; the optimizer never
    receives the encoder's parameters.
    """
    
    def __init__(self,
                 tokenizer=None,
                 model=None):
        """
        Set up the encoder and a freshly initialised scoring network.

        Parameters: tokenizer - object with an `encode(text)` method that
                                returns a list of token ids; if None, the
                                pretrained 'bert-base-uncased'
                                BertTokenizer is loaded
                    model - callable that takes a tensor of token ids and
                            returns a sequence whose second-to-last element
                            holds the hidden states of all layers; if None,
                            the pretrained 'bert-base-uncased' BertModel
                            (hidden states and attentions enabled) is loaded
        """
        # Compare against None explicitly so that a supplied object is
        # never replaced just because it happens to evaluate as false.
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') if tokenizer is None else tokenizer
        self.model = BertModel.from_pretrained('bert-base-uncased',
                                 output_hidden_states=True,
                                 output_attentions=True) if model is None else model
        
        class ParagraphSelectorNet(torch.nn.Module):
            """
            Scoring network: linear layer followed by a sigmoid.
            """
            def __init__(self, input_size=768, output_size=1):
                super().__init__()
                self.linear  = torch.nn.Linear(input_size, output_size)

            def forward(self, embedding):
                output = self.linear(embedding)
                output = torch.sigmoid(output)

                return output 
            
        self.net = ParagraphSelectorNet()
    
    def encode(self, text):
        """
        Encode a text and return the embedding of its first token.

        Parameters: text - the string to encode
        Output: the last layer's hidden state of the first token of the
                (single) input sequence

        NOTE(review): make_context passes text that already contains
        literal "[CLS]"/"[SEP]" markers; depending on the installed
        transformers version `tokenizer.encode` may add special tokens of
        its own as well -- confirm.
        """
        # The model expects a batch dimension, hence the extra list level.
        input_ids = torch.tensor([self.tokenizer.encode(text)])
        # The last two outputs are (all hidden states, all attentions);
        # only the hidden states are needed here.
        all_hidden_states = self.model(input_ids)[-2]

        # This is the embedding of the [CLS] token.
        # [-1] is the last hidden state (list of sentences)
        # first [0] - first (and only) sentence
        # second [0] - first ([CLS]) token of the sentence
        return all_hidden_states[-1][0][0]

    def train(self, train_data, labels, epochs, learning_rate=0.0001):
        """
        Train the scoring network on precomputed embeddings.

        Parameters: train_data - iterable of embedding tensors
                    labels - iterable of float tensors (0. or 1.) with the
                             same shape as the network output, aligned
                             with train_data
                    epochs - number of passes over the data
                    learning_rate - learning rate of the Adam optimizer
        Output: list with the loss of every training step
        """
        # Use Binary Cross Entropy as a loss function instead of MSE
        # There are papers on why MSE is bad for classification
        criterion = torch.nn.BCELoss()
        optimizer = torch.optim.Adam(self.net.parameters(), lr=learning_rate)

        losses = []
        
        # Set the network into train mode
        self.net.train()

        print("Training...")

        # Iterate over the epochs
        for epoch in range(epochs):
            print('Epoch %d/%d' % (epoch + 1, epochs))
            for inputs, label in zip(train_data, labels):
                # The embeddings are fixed features: only self.net is
                # optimized. Detaching them keeps backward() from running
                # through the encoder's graph on every step, which is also
                # why retain_graph=True is no longer needed.
                inputs = inputs.detach()

                optimizer.zero_grad()
                outputs = self.net(inputs)
                loss = criterion(outputs, label)
                loss.backward()
                losses.append(loss.item())
                optimizer.step()

        return losses

    def predict(self, p):
        """
        Score one encoded paragraph.

        Parameters: p - embedding tensor as returned by encode()
        Output: tensor with the relevance score (between 0 and 1),
                computed without tracking gradients
        """
        self.net.eval()
        with torch.no_grad():
            score = self.net(p)
        return score

    def test(self, test_data, labels, threshold=0.5):
        """
        Compute the accuracy of the scoring network.

        Parameters: test_data - iterable of embedding tensors
                    labels - iterable of labels (0 or 1), aligned with
                             test_data
                    threshold - scores above this value count as
                                prediction 1, all others as 0
        Output: fraction of correct predictions; 0.0 if no data is given
        """
        # set the model into evaluation mode and turn off autograd to save memory
        self.net.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, label in zip(test_data, labels):
                outputs = self.net(inputs)
                # The network emits a single sigmoid score, so the class is
                # obtained by thresholding, not by an argmax over classes.
                predicted = (outputs > threshold).to(outputs.dtype)
                label = torch.as_tensor(label, dtype=predicted.dtype)
                total += predicted.numel()
                correct += (predicted == label).sum().item()
        if total == 0:
            print('No test data given.')
            return 0.0
        print('Accuracy of the network: %d %%' % (100 * correct / total))
        return correct / total
    
    def make_context(self, paragraphs, query, threshold):
        """
        Keep the paragraphs that the network considers relevant to a query.
        
        Parameters: paragraphs - [[p1_title, [p1_s1, p1_s2 ...]],
                                  [p2_title, [p2_s1, p2_s2, ...]],
                                   ...]
                    query - the query as a string
                    threshold - a float between zero and one;
                                paragraphs that get a score above the
                                threshold, become part of the context
        Output: context: [[p1_title, [p1_s1, p1_s2 ...]],
                          [p2_title, [p2_s1, p2_s2, ...]],
                           ...]
        """
        context = []
        for p in paragraphs:
            # p[0] is the paragraph title, p[1] is the list of sentences in the paragraph
            # Pure inference: no autograd graph is needed for the encoder.
            with torch.no_grad():
                encoded_p = self.encode("[CLS] " + query + " [SEP] " + ("").join(p[1]) + " [SEP]")
            score = self.predict(encoded_p)
            if score > threshold:
                context.append(p)
        return context


In [168]:
# Create the project's data handler for the HotpotQA training file that
# lives under <parent_dir>/data.
dh = HotPotDataHandler(parent_dir + "/data/hotpot_train_v1.1.json")

In [169]:
# Data prepared for the paragraph selector. The cells below index each entry
# as [0] supporting-fact titles, [1] question, [2] paragraphs
# (NOTE(review): inferred from usage in this notebook -- confirm in utils).
data = dh.data_for_paragraph_selector()

In [170]:
# data[:1]

In [171]:
def make_training_data(data, tokenizer=None, model=None):
    '''
    Make a dataframe with training data for selecting relevant paragraphs.

    Every paragraph of every data point becomes one row. The dataframe has
    three columns:
        1. id - running row number
        2. label - tensor [1.] if the paragraph title is in the supporting
                   facts of its data point, otherwise tensor [0.]
        3. text - the embedding returned by encode() for
                  "[CLS] <question> [SEP] <paragraph> [SEP]"

    Parameters: data - list of data points; point[0] holds the supporting
                       facts, point[1] the question and point[2] the
                       paragraphs as [[title, [sentence, ...]], ...]
                tokenizer - passed on to encode(); if None, encode() uses
                            its own default
                model - passed on to encode(); if None, encode() uses its
                        own default
    Output: a pandas DataFrame (empty if data is empty)

    The defaults used to be created in the signature, which loaded BERT
    once more when this function was defined, even if the caller supplied
    its own tokenizer and model.
    '''
    # Only forward what the caller actually supplied, so that encode()
    # falls back to its own defaults otherwise.
    encoder_kwargs = {}
    if tokenizer is not None:
        encoder_kwargs['tokenizer'] = tokenizer
    if model is not None:
        encoder_kwargs['model'] = model

    labels = []
    datapoints = []
    for point in data:
        for para in point[2]:
            labels.append(torch.Tensor([int(para[0] in point[0])])) # Label 1: if paragraph title is in supporting facts, otherwise 0
            text = "[CLS] " + point[1] + " [SEP] " + ("").join(para[1]) + " [SEP]"
            datapoints.append(encode(text, **encoder_kwargs))

    df = pd.DataFrame({
        'id': range(len(labels)),
        'label': labels,
        'text': datapoints
    })
    return df

In [172]:
# Build embeddings for the first two data points only (presumably to keep
# this trial run small), then hold out 20% of the rows for testing.
# random_state is fixed so that the split is reproducible.
training_data = make_training_data(data[:2])
X_train, X_test, y_train, y_test = train_test_split(training_data[["id", "text"]], training_data["label"], test_size=0.2, random_state=42, shuffle=True)

In [226]:
# With no arguments the selector loads the pretrained 'bert-base-uncased'
# tokenizer and model itself.
ps = ParagraphSelector()

In [227]:
# Train for 10 epochs on the training embeddings; `losses` receives one
# value per training step.
losses = ps.train(X_train["text"], y_train, 10)

Training...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [228]:
X_test

Unnamed: 0,id,text
0,0,"[tensor(-0.7114, grad_fn=<SelectBackward>), te..."
17,17,"[tensor(-0.0008, grad_fn=<SelectBackward>), te..."
15,15,"[tensor(-0.4962, grad_fn=<SelectBackward>), te..."
1,1,"[tensor(-0.0445, grad_fn=<SelectBackward>), te..."


In [229]:
# Print the relevance score of every held-out embedding, in row order.
for embedding in X_test["text"]:
    print(ps.predict(embedding))

tensor([0.3437], grad_fn=<SigmoidBackward>)
tensor([0.3113], grad_fn=<SigmoidBackward>)
tensor([0.2753], grad_fn=<SigmoidBackward>)
tensor([0.3116], grad_fn=<SigmoidBackward>)


In [244]:
# Select the paragraphs of the third data point (paragraphs = data[2][2],
# question = data[2][1]) whose score is above 0.3.
context = ps.make_context(data[2][2], data[2][1], 0.3)

In [245]:
context

[['Marge Simpson',
  ['Marjorie Jacqueline "Marge" Simpson (née Bouvier) is a fictional character in the American animated sitcom "The Simpsons" and part of the eponymous family.',
   ' She is voiced by Julie Kavner and first appeared on television in "The Tracey Ullman Show" short "Good Night" on April 19, 1987.',
   " Marge was created and designed by cartoonist Matt Groening while he was waiting in the lobby of James L. Brooks' office.",
   ' Groening had been called to pitch a series of shorts based on "Life in Hell" but instead decided to create a new set of characters.',
   ' He named the character after his mother Margaret Groening.',
   ' After appearing on "The Tracey Ullman Show" for three seasons, the Simpson family received their own series on Fox, which debuted December 17, 1989.']],
 ['Allie Goertz',
  ['Allison Beth "Allie" Goertz (born March 2, 1991) is an American musician.',
   ' Goertz is known for her satirical songs based on various pop culture topics.',
   ' Her v

In [246]:
len(context)

3