Installations done before
    
    pip3 install pytorch-pretrained-bert
    pip3 install pytorch-nlp
    pip3 install transformers

### Load first query and paragraphs from HotPotQA

In [12]:
import json
import os
import pandas as pd

In [13]:
with open(os.path.abspath('../data/hotpot_train_v1.1.json')) as json_file:
    hotpot_train = json.load(json_file)

In [14]:
query = hotpot_train[0]['question']
paragraphs = hotpot_train[0]['context']
query

"Which magazine was started first Arthur's Magazine or First for Women?"

In [15]:
paragraphs[0]

['Radio City (Indian radio station)',
 ["Radio City is India's first private FM radio station and was started on 3 July 2001.",
  ' It broadcasts on 91.1 (earlier 91.0 in most cities) megahertz from Mumbai (where it was started in 2004), Bengaluru (started first in 2001), Lucknow and New Delhi (since 2003).',
  ' It plays Hindi, English and regional songs.',
  ' It was launched in Hyderabad in March 2006, in Chennai on 7 July 2006 and in Visakhapatnam October 2007.',
  ' Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features.',
  ' The Radio station currently plays a mix of Hindi and Regional music.',
  ' Abraham Thomas is the CEO of the company.']]

### Using BERT

In [1]:
import torch
from transformers import BertTokenizer, BertModel

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed',)).History will not be written to the database.


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Load pre-trained model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

model = BertModel.from_pretrained('bert-base-uncased')

model = BertModel.from_pretrained('bert-base-uncased',
                                 output_hidden_states=True,
                                 output_attentions=True)

In [3]:
# Input text
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was not a puppeteer. He died too early. [SEP]"

In [4]:
# input_ids must be a list of lists
input_ids = torch.tensor([tokenizer.encode(text)])
input_ids

tensor([[  101,   101,  2040,  2001,  3958, 27227,  1029,   102,  3958, 27227,
          2001,  2025,  1037, 13997, 11510,  1012,  2002,  2351,  2205,  2220,
          1012,   102,   102]])

In [5]:
all_hidden_states, all_attentions = model(input_ids)[-2:]

In [6]:
# We have 12 "attentions" - one for each "encoder layer"; each of these layers
# has 12 attention heads. Each attention head has a dimention n x n,
# where n is the number of words in the sentence.
all_attentions[0].shape

torch.Size([1, 12, 23, 23])

In [7]:
len(all_attentions)

12

In [8]:
all_hidden_states[-1].shape

torch.Size([1, 23, 768])

In [9]:
len(all_hidden_states)

13

In [16]:
# add [CLS] and [SEP] to paragraph and join sentences for each paragraph together
test_paragraph = "[CLS] " + (" [SEP]").join(paragraphs[0][1]) + " [SEP]"
test_paragraph

"[CLS] Radio City is India's first private FM radio station and was started on 3 July 2001. [SEP] It broadcasts on 91.1 (earlier 91.0 in most cities) megahertz from Mumbai (where it was started in 2004), Bengaluru (started first in 2001), Lucknow and New Delhi (since 2003). [SEP] It plays Hindi, English and regional songs. [SEP] It was launched in Hyderabad in March 2006, in Chennai on 7 July 2006 and in Visakhapatnam October 2007. [SEP] Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features. [SEP] The Radio station currently plays a mix of Hindi and Regional music. [SEP] Abraham Thomas is the CEO of the company. [SEP]"

In [17]:
tokenized_paragraph = tokenizer.tokenize(test_paragraph)
tokenized_paragraph[:10]

['[CLS]', 'radio', 'city', 'is', 'india', "'", 's', 'first', 'private', 'fm']

### Encoder

In [18]:
def encode(text,
           tokenizer=BertTokenizer.from_pretrained('bert-base-uncased'),
           model=BertModel.from_pretrained('bert-base-uncased',
                                 output_hidden_states=True,
                                 output_attentions=True)):
    ''' TODO: document
    '''
    
    input_ids = torch.tensor([tokenizer.encode(text)])
    all_hidden_states, all_attentions = model(input_ids)[-2:]
    
    # This is the embedding of the [CLS] token.
    # [-1] is the last hidden state (list of sentences)
    # first [0] - first (and only) sentence
    # second [0] - first ([CLS]) token of the sentence
    return all_hidden_states[-1][0][0]
    

### Paragraph selector class

In [30]:
class ParagraphSelector_non_final(torch.nn.Module):
    '''
    
    '''
    
    def __init__(self, input_size=768, output_size=1):
        self.linear  = nn.Linear(input_size, output_size)
    
    def forward(self, embedding):
        output = self.linear(embedding)
        output = torch.sigmoid(output)
        
        return output 


In [64]:
def make_training_data_from_filedata(hotpot_train):
    '''
    Make a dataframe with training data for selecting relevant paragraphs
    Each entry in the dataframe has three columns:
        1. Query - the question
        2. Paragraphs - the paragraphs
        3. Label - 0 (unrelated) or 1 (related)
    '''
    for item in hotpot_train[:10]:
        query = item['question']
        paragraphs = item['context']
        supporting_facts = [i[0] for i in item['supporting_facts']]
        
        labels = []
        datapoints = []
        for para in paragraphs:
            labels.append(int(para[0] in supporting_facts))
            datapoints.append("[CLS] " + query + " [SEP] " + ("").join(para[1]) + " [SEP]")
        
        df = pd.DataFrame({
            'id': range(len(labels)),
            'label': labels,
            'text': datapoints
        })
        return df   

In [22]:
data = make_training_data(hotpot_train)

In [23]:
data[:20]

Unnamed: 0,id,label,text
0,0,0,[CLS] Which magazine was started first Arthur'...
1,1,0,[CLS] Which magazine was started first Arthur'...
2,2,0,[CLS] Which magazine was started first Arthur'...
3,3,0,[CLS] Which magazine was started first Arthur'...
4,4,0,[CLS] Which magazine was started first Arthur'...
5,5,1,[CLS] Which magazine was started first Arthur'...
6,6,0,[CLS] Which magazine was started first Arthur'...
7,7,1,[CLS] Which magazine was started first Arthur'...
8,8,0,[CLS] Which magazine was started first Arthur'...
9,9,0,[CLS] Which magazine was started first Arthur'...


In [24]:
data['text'][0]

"[CLS] Which magazine was started first Arthur's Magazine or First for Women? [SEP] Radio City is India's first private FM radio station and was started on 3 July 2001. It broadcasts on 91.1 (earlier 91.0 in most cities) megahertz from Mumbai (where it was started in 2004), Bengaluru (started first in 2001), Lucknow and New Delhi (since 2003). It plays Hindi, English and regional songs. It was launched in Hyderabad in March 2006, in Chennai on 7 July 2006 and in Visakhapatnam October 2007. Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features. The Radio station currently plays a mix of Hindi and Regional music. Abraham Thomas is the CEO of the company. [SEP]"

### Test Training 

In [25]:
train_data = []
labels = []
for point in data.loc[:,['label','text']].values:
#     print(point)
    train_data.append(encode(point[1]))
    labels.append(point[0])

In [26]:
labels

[0, 0, 0, 0, 0, 1, 0, 1, 0, 0]

In [27]:
def train(train_data, labels, net, epochs, learning_rate=0.0001):
    
    # Use Binary Cross Entropy as a loss function instead of MSE
    # There are papers on why MSE is bad for classification
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.Adam(lr=learning_rate)
    
    losses = []
    
    print("Training...")
    
    # Iterate over the epochs
    for epoch in range(epochs):
        print('Epoch %d/%d' % (epoch + 1, epochs))
        for inputs, label in zip(train_data, labels):
            
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = criterion(outputs, label)
            loss.backward()
            losses.append(loss.item())
            optimizer.step()

    return losses

In [28]:
def test(net, test_data):
# set the model into evaluation mode and turn off autograd to save memory
    net.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for i, data in enumerate(test_data):
            images, labels = data
            outputs = net(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print('Accuracy of the network: %d %%' % (100 * correct / total))
    return correct / total

In [31]:
"""
This module implements the Paragraph Selector from the paper, Section 3.1
"""

import torch
from transformers import BertTokenizer, BertModel

class ParagraphSelector():
    """
    TODO: write docstring
    """
    
    def __init__(self,
                 tokenizer=None),
                 model=None):
        """
        TODO: write docstring
        """
        if not tokenizer:
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        if not model:
            self.model = BertModel.from_pretrained('bert-base-uncased',
                                 output_hidden_states=True,
                                 output_attentions=True)
        
        class ParagraphSelectorNet(torch.nn.Module):
            """
            TODO: write docstring
            """
            s
            def __init__(self, input_size=768, output_size=1):
                self.linear  = nn.Linear(input_size, output_size)

            def forward(self, embedding):
                output = self.linear(embedding)
                output = torch.sigmoid(output)

                return output 
            
        self.net = ParagraphSelectorNet()
    
    def encode(text, tokenizer = self.tokenizer, model = self.model):
        ''' TODO: document
        '''

        input_ids = torch.tensor([tokenizer.encode(text)])
        all_hidden_states, all_attentions = model(input_ids)[-2:]

        # This is the embedding of the [CLS] token.
        # [-1] is the last hidden state (list of sentences)
        # first [0] - first (and only) sentence
        # second [0] - first ([CLS]) token of the sentence
        return all_hidden_states[-1][0][0]

    def train(self, train_data, labels, epochs, learning_rate=0.0001):
        """
        TODO: write docstring
        """
        # Use Binary Cross Entropy as a loss function instead of MSE
        # There are papers on why MSE is bad for classification
        criterion = torch.nn.BCELoss()
        optimizer = torch.optim.Adam(lr=learning_rate)

        losses = []

        print("Training...")

        # Iterate over the epochs
        for epoch in range(epochs):
            print('Epoch %d/%d' % (epoch + 1, epochs))
            for inputs, label in zip(train_data, labels):

                optimizer.zero_grad()
                outputs = self.net(inputs)
                loss = criterion(outputs, label)
                loss.backward()
                losses.append(loss.item())
                optimizer.step()

        return losses

    def predict(self, p):
        """
        TODO: write docstring
        """
        self.net.eval()
        score = self.net(inputs)
        return score

    def test(self, test_data):
        """
        TODO: write docstring
        """
        # set the model into evaluation mode and turn off autograd to save memory
        self.net.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for i, data in enumerate(test_data):
                images, labels = data
                outputs = net(images)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        print('Accuracy of the network: %d %%' % (100 * correct / total))
        return correct / total
    
    def make_context(self, paragraphs, threshold):
        """
        TODO: write docstring
        """
        # TODO: write main function for building context
        # Output:
        # [[p1_title, [p1_s1, p1_s2 ...]],
        #  [p2_title, [p2_s1, p2_s2, ...]],
        #  ...]
        context = []
        for p in paragraphs:
            score = self.predict(p)
            if score > threshold:
                context.append(p)
        return context


In [53]:
import os,sys,inspect
current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parent_dir = os.path.dirname(current_dir)
sys.path.insert(0, parent_dir) 
from utils import HotPotDataHandler

In [59]:
dh = HotPotDataHandler(parent_dir + "/data/hotpot_train_v1.1.json")

In [60]:
data = dh.data_for_paragraph_selector()

In [62]:
data[:1]

[(["Arthur's Magazine", 'First for Women'],
  "Which magazine was started first Arthur's Magazine or First for Women?",
  [['Radio City (Indian radio station)',
    ["Radio City is India's first private FM radio station and was started on 3 July 2001.",
     ' It broadcasts on 91.1 (earlier 91.0 in most cities) megahertz from Mumbai (where it was started in 2004), Bengaluru (started first in 2001), Lucknow and New Delhi (since 2003).',
     ' It plays Hindi, English and regional songs.',
     ' It was launched in Hyderabad in March 2006, in Chennai on 7 July 2006 and in Visakhapatnam October 2007.',
     ' Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features.',
     ' The Radio station currently plays a mix of Hindi and Regional music.',
     ' Abraham Thomas is the CEO of the company.']],
   ['History of Albanian football',
    ['Football in Albania

In [69]:
def make_training_data(data):
    '''
    Make a dataframe with training data for selecting relevant paragraphs
    Each entry in the dataframe has three columns:
        1. Query - the question
        2. Paragraphs - the paragraphs
        3. Label - 0 (unrelated) or 1 (related)
    '''
    for point in data:        
        labels = []
        datapoints = []
        for para in point[2]:
            labels.append(int(para[0] in point[0])) # Label 1: if paragraph title is in supporting facts, otherwise 0
            datapoints.append("[CLS] " + query + " [SEP] " + ("").join(para[1]) + " [SEP]")
        
        df = pd.DataFrame({
            'id': range(len(labels)),
            'label': labels,
            'text': datapoints
        })
        return df   

In [70]:
make_training_data(data[:10])

Unnamed: 0,id,label,text
0,0,0,[CLS] Which magazine was started first Arthur'...
1,1,0,[CLS] Which magazine was started first Arthur'...
2,2,0,[CLS] Which magazine was started first Arthur'...
3,3,0,[CLS] Which magazine was started first Arthur'...
4,4,0,[CLS] Which magazine was started first Arthur'...
5,5,1,[CLS] Which magazine was started first Arthur'...
6,6,0,[CLS] Which magazine was started first Arthur'...
7,7,1,[CLS] Which magazine was started first Arthur'...
8,8,0,[CLS] Which magazine was started first Arthur'...
9,9,0,[CLS] Which magazine was started first Arthur'...
