Installations done before
    
    pip3 install pytorch-pretrained-bert
    pip3 install pytorch-nlp

### Load first query and paragraphs from HotPotQA

In [11]:
import json
import os
import pandas as pd

In [12]:
with open(os.path.abspath('../data/hotpot_train_v1.1.json')) as json_file:
    hotpot_train = json.load(json_file)

In [13]:
query = hotpot_train[0]['question']
paragraphs = hotpot_train[0]['context']
query

"Which magazine was started first Arthur's Magazine or First for Women?"

In [14]:
paragraphs[0]

['Radio City (Indian radio station)',
 ["Radio City is India's first private FM radio station and was started on 3 July 2001.",
  ' It broadcasts on 91.1 (earlier 91.0 in most cities) megahertz from Mumbai (where it was started in 2004), Bengaluru (started first in 2001), Lucknow and New Delhi (since 2003).',
  ' It plays Hindi, English and regional songs.',
  ' It was launched in Hyderabad in March 2006, in Chennai on 7 July 2006 and in Visakhapatnam October 2007.',
  ' Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features.',
  ' The Radio station currently plays a mix of Hindi and Regional music.',
  ' Abraham Thomas is the CEO of the company.']]

### Using BERT

In [15]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertConfig

In [16]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [17]:
# Tokenized input
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)

In [33]:
tokenizer.convert_tokens_to_ids(tokenized_text)

[101,
 2040,
 2001,
 3958,
 27227,
 1029,
 102,
 3958,
 27227,
 2001,
 1037,
 13997,
 11510,
 102]

In [48]:
# from keras.preprocessing.sequence import pad_sequences

tokenized_texts = [tokenizer.tokenize(sent) for sent in paragraphs[0][1]]
# Set the maximum sequence length. 
# MAX_LEN = 128
# Pad our input tokens
# input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
# input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
# input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
# input_ids[:1]

array([[2557, 2103, 2003, 2634, 1005, 1055, 2034, 2797, 4718, 2557, 2276,
        1998, 2001, 2318, 2006, 1017, 2251, 2541, 1012,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0]])

In [18]:
# add [CLS] and [SEP] to paragraph and join sentences for each paragraph together
test_paragraph = "[CLS] " + (" [SEP]").join(paragraphs[0][1]) + " [SEP]"
test_paragraph

"[CLS] Radio City is India's first private FM radio station and was started on 3 July 2001. [SEP] It broadcasts on 91.1 (earlier 91.0 in most cities) megahertz from Mumbai (where it was started in 2004), Bengaluru (started first in 2001), Lucknow and New Delhi (since 2003). [SEP] It plays Hindi, English and regional songs. [SEP] It was launched in Hyderabad in March 2006, in Chennai on 7 July 2006 and in Visakhapatnam October 2007. [SEP] Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features. [SEP] The Radio station currently plays a mix of Hindi and Regional music. [SEP] Abraham Thomas is the CEO of the company. [SEP]"

In [19]:
tokenized_paragraph = tokenizer.tokenize(test_paragraph)
tokenized_paragraph[:10]

['[CLS]', 'radio', 'city', 'is', 'india', "'", 's', 'first', 'private', 'fm']

In [20]:
tokenized_text

['[CLS]',
 'who',
 'was',
 'jim',
 'henson',
 '?',
 '[SEP]',
 'jim',
 'henson',
 'was',
 'a',
 'puppet',
 '##eer',
 '[SEP]']

### Paragraph selector class

In [21]:
class ParagraphSelector():
    '''
    
    '''
    
    def __init__(self):
        pass
    
    def train(self, training_data, labels):
        '''
        training_data - query embedding, paragraph_embedding
        labels - 0 (unrelated) or 1 (related)
        '''
        pass
    
    def predict(self, paragraphs, supporting_facts, treshold=0.1, tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')):
        pass


In [32]:
def make_training_data(hotpot_train):
    '''
    Make a dataframe with training data for selecting relevant paragraphs
    Each entry in the dataframe has three columns:
        1. Query - the question
        2. Paragraphs - the paragraphs
        3. Label - 0 (unrelated) or 1 (related)
    '''
    for item in hotpot_train[:10]:
        query = item['question']
        paragraphs = item['context']
        supporting_facts = [i[0] for i in item['supporting_facts']]
        
        labels = []
        all_paragraphs = []
        all_queries = []
        for para in paragraphs:
            labels.append(int(para[0] in supporting_facts))
            all_paragraphs.append("[CLS] " + (" [SEP]").join(para[1]) + " [SEP]")
            all_queries.append("[CLS] " + query + " [SEP]")
        
        df = pd.DataFrame(list(zip(all_paragraphs, all_queries, labels)), 
               columns =['Query', 'Paragraphs', 'Related'])
        return df   

In [30]:
data = make_training_data(hotpot_train)

In [31]:
data[:20]

Unnamed: 0,Query,Paragraphs,Related
0,[CLS] Radio City is India's first private FM r...,[CLS] Which magazine was started first Arthur'...,0
1,[CLS] Football in Albania existed before the A...,[CLS] Which magazine was started first Arthur'...,0
2,"[CLS] Echosmith is an American, Corporate indi...",[CLS] Which magazine was started first Arthur'...,0
3,[CLS] Women's colleges in the Southern United ...,[CLS] Which magazine was started first Arthur'...,0
4,[CLS] The First Arthur County Courthouse and J...,[CLS] Which magazine was started first Arthur'...,0
5,[CLS] Arthur's Magazine (1844–1846) was an Ame...,[CLS] Which magazine was started first Arthur'...,1
6,[CLS] The 2014–15 Ukrainian Hockey Championshi...,[CLS] Which magazine was started first Arthur'...,0
7,[CLS] First for Women is a woman's magazine pu...,[CLS] Which magazine was started first Arthur'...,1
8,[CLS] The Freeway Complex Fire was a 2008 wild...,[CLS] Which magazine was started first Arthur'...,0
9,[CLS] William Rast is an American clothing lin...,[CLS] Which magazine was started first Arthur'...,0


### Paragraph Selector

In [57]:
def make_context(paragraphs, supporting_facts, query, treshold=0.1, tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')):
    '''Takes a list of paragraphs, a query and a treshold relevance score
    and produces a context - a list consisting of all paragraphs whose relevance
    score to the query is higher than the treshold
    Parameters: paragraphs - a list where the 0th element is the paragraph topic and the 1st one is a list of sentences
                supporting_facts - a list of strings
                query - a string representing the question (query)
                treshold - a treshold relevance score
                tokenizer - 
    Output:     context - a list of relevant paragraphs
    '''
    # Create labeled data
    labels = []
    for para in pragraphs:
        labels.append(int(para[0] in supporting_facts))
    
    context = []
    for para in paragraphs:
        # BERTify paragraph and query (separately)
        joined_para = "[CLS] " + (" [SEP]").join(para[1]) + " [SEP]"
        tokenized_para = tokenizer.tokenize(joined_para)
        
        tokenized_query = tokenizer.tokenize("[CLS] " + query + " [SEP]")
        
        
        
        # Sentence classification layer with sigmoid prediction
        
        # if score for para is more than r (default of treshold = 0.1), add p to context
        if r > treshold:
            context.append(para)
    return context