In [74]:
import ujson as json
import import_ipynb
import prepro
import random
from prepro import *
from collections import Counter

In [6]:
# File names of the HotpotQA JSON splits used in this notebook. They are plain
# relative names, so the files are resolved against the kernel's working directory.
train_filename = 'hotpot_train_v1.1.json'
dev_distractor_filename = 'hotpot_dev_distractor_v1.json'
dev_filename = 'hotpot_dev_fullwiki_v1.json'
test_filename = 'hotpot_test_fullwiki_v1.json'

In [34]:
# Read the pre-trained GloVe vectors into an in-memory {token: vector} lookup.
embedding_size = 300  # number of trailing float fields on every line of the file
glove_embeddings_dict = {}
with open('glove.840B.300d.txt', 'r', encoding="utf-8") as glove_file:
    for row in glove_file:
        fields = row.split()
        # Everything in front of the last `embedding_size` fields is the token.
        # A token that itself contains whitespace is glued back together
        # WITHOUT the spaces, so distinct tokens may collapse onto one key.
        token_parts = fields[:len(fields) - embedding_size]
        vector_parts = fields[-embedding_size:]
        glove_embeddings_dict[''.join(token_parts)] = np.asarray(vector_parts, "float32")
print(len(glove_embeddings_dict))

2195892


In [47]:
# Inspect the context paragraphs of one dev-distractor article.
# NOTE(review): `dd` is only assigned in the cell that loads the JSON files,
# which appears further down in this notebook. On a fresh kernel this cell
# raises NameError when the notebook is run top to bottom - run the loading
# cell first (or move this cell below it).
dd[100]['context']

[['Mascogos',
  ['The Mascogos (also known as "negros mascagos") are an afrodescendant group in Coahuila, Mexico.',
   ' Centered on the town of El Nacimiento in Múzquiz Municipality, the group are descendants of Black Seminoles escaping the threat of slavery in the United States.']],
 ['Dhoolpet',
  ['Dhoolpet is one of the old suburbs in Hyderabad, India.',
   ' It is part of the old city of Hyderabad.',
   ' This place is inhabited by people who migrated from Uttar Pradesh during the Nizam rule.',
   ' The Nizam helped these people settle in this area.',
   ' The area is notorious for bootlegging and has witnessed attacks on policemen or excise department officials during raids.']],
 ['Seminole music',
  ['Seminole music is the music of the Seminole people, an indigenous people of the Americas who formed in Florida in the 18th century.',
   ' Today most live in Oklahoma, but a minority continue in Florida.',
   ' They have three federally recognized tribes, and some people belong to

In [7]:
def _load_json(path):
    """Parse the JSON file at `path` and return the decoded object.

    The file is opened in a `with` block so the handle is closed as soon as
    parsing finishes instead of lingering until garbage collection. The
    encoding is given explicitly (as in the GloVe loading cell) so the result
    does not depend on the platform's default encoding.
    """
    with open(path, 'r', encoding='utf-8') as fh:
        return json.load(fh)


# Load every dataset split into memory; `dd` is the dev "distractor" split.
train = _load_json(train_filename)
dd = _load_json(dev_distractor_filename)
dev = _load_json(dev_filename)
test = _load_json(test_filename)

In [8]:
# Show which fields are present on a training article (first article only).
train[0].keys()

dict_keys(['supporting_facts', 'level', 'question', 'context', 'answer', '_id', 'type'])

In [9]:
# Collect the question type of every training article, then report the
# distinct types and how many articles fall into each one.
qa_types = [article['type'] for article in train]
type_counts = Counter(qa_types)

print(set(qa_types))
print({qa_type: type_counts[qa_type] for qa_type in set(qa_types)})

{'bridge', 'comparison'}
{'bridge': 72991, 'comparison': 17456}


# Preprocessing

In [72]:
def process_article(article):
    """Turn one raw article into a model-input example and an evaluation record.

    All paragraphs of the article are flattened into one text / token stream.
    Each paragraph contributes its title wrapped in ``<t> ... </t>`` markers,
    followed by its sentences in order.

    Args:
        article: dict with at least 'context' (a list of
            ``[title, [sentence, ...]]`` pairs), 'question' and '_id'.
            'supporting_facts' (a list of ``[title, sentence_index]`` pairs)
            and 'answer' are optional. An empty 'context' is replaced IN PLACE
            by a single placeholder paragraph.

    Returns:
        A tuple ``(example, eval_example)`` of dicts.

        ``example`` holds 'context_tokens', 'context_chars', 'ques_tokens',
        'ques_chars', 'id', 'start_end_facts' and the answer labels 'y1s' /
        'y2s' (one-element lists). The labels are ``-1`` for a "yes" answer,
        ``-2`` for "no", ``0`` / ``1`` when there is no answer or it does not
        occur in the context text, and otherwise the indices of the first and
        last context token overlapping the located answer span.
        'start_end_facts' has one ``(start, end, is_supporting_fact)`` entry per
        title / sentence, where ``context_tokens[start:end]`` are its tokens.

        ``eval_example`` holds the flattened 'context' text, the per-token
        character 'spans', the 'answer' (one-element list), 'id' and
        'sent2title_ids' (one ``(title, sentence_index)`` entry per title /
        sentence; titles use index -1).
    """
    # Fill context if empty. A paragraph is [title, [sentences]], so the
    # placeholder body must be a list; a bare string would be iterated
    # character by character by the sentence loop below.
    if len(article['context']) == 0:
        article['context'] = [['some random title', ['some random stuff']]]

    # Supporting facts as a set of (title, sentence_index) tuples; empty if absent
    sp_set = set(map(tuple, article['supporting_facts'])) if 'supporting_facts' in article else set()

    # Accumulators for the flattened context
    text_context, context_tokens, context_chars = '', [], []
    offsets = []          # per sentence: list of [char_start, char_end] for each token
    flat_offsets = []     # the same spans, flattened over all sentences
    start_end_facts = []  # per sentence: (token_start, token_end, is_sup_fact)
    sent2title_ids = []

    def _process(sent, is_sup_fact, is_title=False):
        """Append one title or sentence to the flattened context."""
        nonlocal text_context
        n_chars = len(text_context)      # characters already in the context
        n_tokens = len(context_tokens)   # tokens already in the context

        sent_tokens = word_tokenize(sent)
        if is_title:
            sent = '<t> {} </t>'.format(sent)
            sent_tokens = ['<t>'] + sent_tokens + ['</t>']
        sent_chars = [list(token) for token in sent_tokens]

        # convert_idx yields spans relative to `sent`; shift them so they are
        # relative to the whole context text.
        sent_spans = [[n_chars + e[0], n_chars + e[1]] for e in convert_idx(sent, sent_tokens)]

        text_context += sent
        context_tokens.extend(sent_tokens)
        context_chars.extend(sent_chars)
        # The token range must start at the token count taken BEFORE this
        # sentence was appended; reading len(context_tokens) after the extend
        # shifts every range forward by the sentence length.
        start_end_facts.append((n_tokens, n_tokens + len(sent_tokens), is_sup_fact))
        offsets.append(sent_spans)
        flat_offsets.extend(sent_spans)

    for para in article['context']:
        cur_title, cur_para = para[0], para[1]
        _process(prepro_sent(cur_title), False, is_title=True)
        sent2title_ids.append((cur_title, -1))  # Titles have index -1, sentences start at 0
        for sent_id, sent in enumerate(cur_para):
            is_sup_fact = (cur_title, sent_id) in sp_set
            _process(prepro_sent(sent), is_sup_fact)
            sent2title_ids.append((cur_title, sent_id))

    # Calculate best possible answer span
    if 'answer' in article:
        # Answer can be 'yes', 'no' or free text which may or may not occur in the context
        answer = article['answer'].strip()

        if answer.lower() == 'yes':
            best_indices = [-1, -1]
        elif answer.lower() == 'no':
            best_indices = [-2, -2]
        elif answer not in text_context:
            best_indices = (0, 1)
        else:
            # Locate the answer in the context text (fix_span presumably returns
            # the character span of the best match - defined in prepro), then
            # map that character span onto token indices.
            _, best_indices, _ = fix_span(text_context, offsets, article['answer'])
            answer_span = [idx for idx, span in enumerate(flat_offsets)
                           if not (best_indices[1] <= span[0] or best_indices[0] >= span[1])]
            best_indices = (answer_span[0], answer_span[-1])
    else:
        # Article carries no answer: use placeholder answer and span
        answer = 'random'
        best_indices = (0, 1)

    ques_tokens = word_tokenize(article['question'])
    ques_chars = [list(token) for token in ques_tokens]

    example = {'context_tokens': context_tokens,
               'context_chars': context_chars,
               'ques_tokens': ques_tokens,
               'ques_chars': ques_chars,
               'y1s': [best_indices[0]],
               'y2s': [best_indices[1]],
               'id': article['_id'],
               'start_end_facts': start_end_facts}
    eval_example = {'context': text_context,
                    'spans': flat_offsets,
                    'answer': [answer],
                    'id': article['_id'],
                    'sent2title_ids': sent2title_ids}

    return example, eval_example

In [87]:
# This function processes each article in required dataset by applying the process_article function

def process_data(data, word_counter = None, char_counter = None):
    """Run process_article over every article and gather the results.

    Args:
        data: iterable of raw articles.
        word_counter, char_counter: optional counters that are incremented in
            place for every question/context token and every character of
            those tokens. Counting only happens when BOTH are supplied.

    Returns:
        (examples, eval_examples, word_counter, char_counter): `examples` is
        the shuffled list of model examples, `eval_examples` maps an example
        id to its evaluation record, and the counters are handed back as given.
    """
    #outputs = Parallel(n_jobs=12, verbose=10)(delayed(process_article)(article) for article in data)
    processed = [process_article(article) for article in data]

    examples = []
    eval_examples = {}
    for example, eval_example in processed:
        examples.append(example)
        if eval_example is not None:
            eval_examples[eval_example['id']] = eval_example

    # only count during training
    counting = word_counter is not None and char_counter is not None
    if counting:
        for example in examples:
            all_tokens = example['ques_tokens'] + example['context_tokens']
            for token in all_tokens:
                word_counter[token] += 1
                for char in token:
                    char_counter[char] += 1

    random.shuffle(examples)
    print("{} questions in total".format(len(examples)))

    return examples, eval_examples, word_counter, char_counter

In [106]:
# Function to get word embeddings

def get_embeddings(counter, data_type, emb_file, size, vec_size, token2idx_dict = None, limit = -1):
    """Build an embedding matrix and index mappings for the counted tokens.

    Args:
        counter: mapping token -> frequency; only tokens with a frequency
            strictly greater than `limit` are considered.
        data_type: label used in the progress messages (e.g. "word", "char").
        emb_file: mapping token -> pre-trained vector, or None to draw a random
            vector (normal, scale 0.01) of length `vec_size` for every token.
        size: accepted for interface compatibility; not read by this function.
        vec_size: length of each embedding vector; also used for the all-zero
            "--NULL--" and "--OOV--" rows.
        token2idx_dict: optional existing token -> index mapping. When given it
            is used as is and the "--NULL--" (0) / "--OOV--" (1) entries are
            written INTO it.
        limit: frequency threshold (exclusive).

    Returns:
        (emb_mat, token2idx_dict, idx2token_dict) where `emb_mat[idx]` is the
        vector of the token with index `idx`.
    """
    print("Generating {} embedding...".format(data_type))

    embedding_dict = {}
    filtered_elements = [k for k, v in counter.items() if v > limit]

    if emb_file is None:
        assert vec_size is not None
        for token in filtered_elements:
            embedding_dict[token] = [np.random.normal(
                scale=0.01) for _ in range(vec_size)]
        print("{} tokens have corresponding embedding vector".format(
            len(filtered_elements)))
    else:
        # Test membership on the mapping itself. Materialising the keys into a
        # list first turns every lookup into a linear scan over the whole
        # vocabulary (millions of entries for GloVe), i.e. quadratic work.
        for token in filtered_elements:
            if token in emb_file:
                embedding_dict[token] = emb_file[token]

    print("{} / {} tokens have corresponding {} embedding vector".format(
        len(embedding_dict), len(filtered_elements), data_type))

    # Create embeddings for NULL and Out-of-Vocabulary
    NULL = "--NULL--"
    OOV = "--OOV--"
    # Real tokens are numbered from 2 because 0 and 1 are reserved below
    token2idx_dict = {token: idx for idx, token in enumerate(
        embedding_dict.keys(), 2)} if token2idx_dict is None else token2idx_dict
    token2idx_dict[NULL] = 0
    token2idx_dict[OOV] = 1
    embedding_dict[NULL] = [0. for _ in range(vec_size)]
    embedding_dict[OOV] = [0. for _ in range(vec_size)]
    idx2emb_dict = {idx: embedding_dict[token]
                    for token, idx in token2idx_dict.items()}
    emb_mat = [idx2emb_dict[idx] for idx in range(len(idx2emb_dict))]

    idx2token_dict = {idx: token for token, idx in token2idx_dict.items()}

    return emb_mat, token2idx_dict, idx2token_dict

In [113]:
# This function is to convert all paragraphs and questions into indexes form

def build_features_examples(examples, data_type, out_file, word2idx_dict, char2idx_dict,
                            para_limit = 1000, ques_limit = 80, char_limit = 16):
    """Convert processed examples into padded index tensors and save them.

    Args:
        examples: list of example dicts as produced by process_article
            ('context_tokens', 'context_chars', 'ques_tokens', 'ques_chars',
            'y1s', 'y2s', 'id', 'start_end_facts').
        data_type: split label. For 'test' the paragraph and question limits
            are replaced by the longest lengths found in `examples`, so no
            example exceeds them; the `para_limit` / `ques_limit` arguments
            are ignored in that case.
        out_file: destination handed to `torch.save`.
        word2idx_dict: token -> index mapping (index 1 is used for unknowns).
        char2idx_dict: character -> index mapping (index 1 is used for unknowns).
        para_limit: maximum number of context tokens; longer examples are dropped.
        ques_limit: maximum number of question tokens; longer examples are dropped.
        char_limit: number of characters kept per token.

    Returns:
        None. The list of feature dicts is written to `out_file`.
    """
    if data_type == 'test':
        para_limit, ques_limit = 0, 0
        for example in tqdm(examples):
            para_limit = max(para_limit, len(example['context_tokens']))
            ques_limit = max(ques_limit, len(example['ques_tokens']))

    # True for examples whose context or question exceeds the length limits
    def filter_func(example):
        return len(example["context_tokens"]) > para_limit or len(example["ques_tokens"]) > ques_limit

    # Index of a word, trying a few casings before falling back to 1 (unknown)
    def _get_word(word):
        for each in (word, word.lower(), word.capitalize(), word.upper()):
            if each in word2idx_dict:
                return word2idx_dict[each]
        return 1

    # Index of a character, 1 (unknown) when it is not in the mapping
    def _get_char(char):
        if char in char2idx_dict:
            return char2idx_dict[char]
        return 1

    # Write the (truncated) character indices of every token into `target`
    def _fill_chars(target, tokens_chars):
        for i, token in enumerate(tokens_chars):
            kept = min(len(token), char_limit)
            target[i, :kept] = [_get_char(char) for char in token[:kept]]

    print("Processing {} examples...".format(data_type))
    datapoints = []
    total = 0   # examples kept
    total_ = 0  # examples seen
    for example in tqdm(examples):
        total_ += 1

        # Filter the examples with respect to length
        if filter_func(example):
            continue

        total += 1

        # Zero-initialised arrays: positions past the real tokens keep index 0
        context_idxs = np.zeros(para_limit, dtype=np.int64)
        context_char_idxs = np.zeros((para_limit, char_limit), dtype=np.int64)
        ques_idxs = np.zeros(ques_limit, dtype=np.int64)
        ques_char_idxs = np.zeros((ques_limit, char_limit), dtype=np.int64)

        # Fill the arrays
        context_idxs[:len(example['context_tokens'])] = [_get_word(token) for token in example['context_tokens']]
        ques_idxs[:len(example['ques_tokens'])] = [_get_word(token) for token in example['ques_tokens']]
        _fill_chars(context_char_idxs, example["context_chars"])
        _fill_chars(ques_char_idxs, example["ques_chars"])

        # Start and end labels of the answer (last entry of each label list)
        y1, y2 = example["y1s"][-1], example["y2s"][-1]

        # Collate into one list - result: a list of dictionaries
        datapoints.append({'context_idxs': torch.from_numpy(context_idxs),
            'context_char_idxs': torch.from_numpy(context_char_idxs),
            'ques_idxs': torch.from_numpy(ques_idxs),
            'ques_char_idxs': torch.from_numpy(ques_char_idxs),
            'y1': y1,
            'y2': y2,
            'id': example['id'],
            'start_end_facts': example['start_end_facts']})
    print("Build {} / {} instances of features in total".format(total, total_))
    torch.save(datapoints, out_file)

In [88]:
# Pre-process a random sample of 1000 training articles. Fresh counters are
# passed in so that token and character frequencies are collected on the way;
# process_data hands the same counter objects back.
train_sample = random.sample(train, 1000)
examples, eval_examples, word_counter, char_counter = process_data(
    train_sample, word_counter = Counter(), char_counter = Counter())
# Variant without counters (e.g. for the test split):
#examples, eval_examples = process_data(random.sample(test,1000))

1000 questions in total


In [96]:
# Build the word embedding matrix from the GloVe lookup: only counted tokens
# that have a GloVe entry receive a row (plus the two reserved all-zero rows).
# `size` is accepted by get_embeddings but never read by it, so the value
# passed here has no effect.
word_emb_mat, word2idx_dict, idx2word_dict = get_embeddings(word_counter, "word", emb_file = glove_embeddings_dict,
                                                size = int(2.2e6), vec_size = 300)

Generating word embedding...
56903 / 64871 tokens have corresponding word embedding vector


In [107]:
# Build the character embedding matrix. With emb_file=None get_embeddings
# draws a random vector (normal, scale 0.01) of length vec_size for every
# counted character. `size` is not read by get_embeddings.
char_emb_mat, char2idx_dict, idx2char_dict = get_embeddings(
            char_counter, "char", emb_file=None, size = 94, vec_size = 8)

Generating char embedding...
1461 tokens have corresponding embedding vector
1461 / 1461 tokens have corresponding char embedding vector


In [111]:
# Display the character -> index mapping. Indices 0 and 1 are reserved by
# get_embeddings for "--NULL--" and "--OOV--", so real characters start at 2.
# NOTE(review): this renders the entire mapping; consider showing only a
# small slice to keep the saved output short.
char2idx_dict

{'T': 2,
 'h': 3,
 'e': 4,
 'O': 5,
 'l': 6,
 'd': 7,
 'S': 8,
 't': 9,
 'y': 10,
 'a': 11,
 'o': 12,
 'n': 13,
 'N': 14,
 '.': 15,
 '1': 16,
 '0': 17,
 ',': 18,
 'i': 19,
 's': 20,
 'c': 21,
 'D': 22,
 'w': 23,
 'u': 24,
 'k': 25,
 'U': 26,
 'f': 27,
 'C': 28,
 '?': 29,
 '<': 30,
 '>': 31,
 'L': 32,
 'r': 33,
 'v': 34,
 'I': 35,
 'R': 36,
 '/': 37,
 'm': 38,
 'W': 39,
 'p': 40,
 'b': 41,
 'g': 42,
 'A': 43,
 ' ': 44,
 'x': 45,
 '6': 46,
 'M': 47,
 'G': 48,
 '4': 49,
 '8': 50,
 '2': 51,
 '-': 52,
 '(': 53,
 ':': 54,
 '"': 55,
 'á': 56,
 ';': 57,
 ')': 58,
 '7': 59,
 'H': 60,
 'ú': 61,
 'z': 62,
 'ŋ': 63,
 'ȟ': 64,
 'F': 65,
 'P': 66,
 'B': 67,
 '9': 68,
 '5': 69,
 'K': 70,
 'E': 71,
 "'": 72,
 'q': 73,
 'J': 74,
 'V': 75,
 '3': 76,
 'Y': 77,
 'j': 78,
 'X': 79,
 '&': 80,
 '#': 81,
 '–': 82,
 '—': 83,
 'ü': 84,
 'İ': 85,
 '’': 86,
 ']': 87,
 'ı': 88,
 'ğ': 89,
 'Ç': 90,
 'Z': 91,
 'Ü': 92,
 '$': 93,
 '소': 94,
 '주': 95,
 '燒': 96,
 '酒': 97,
 '%': 98,
 'í': 99,
 'Þ': 100,
 'ó': 101,
 'ō': 

In [114]:
# Convert the processed training examples into padded index tensors and write
# them to 'train_record.pkl' (saved with torch.save). For any data_type other
# than 'test', examples with more than 1000 context tokens or more than 80
# question tokens are dropped - see the "Build N / M instances" line in the output.
build_features_examples(examples, 'train', 'train_record.pkl', word2idx_dict, char2idx_dict)


  0%|                                                                                         | 0/1000 [00:00<?, ?it/s][A
  3%|██▋                                                                            | 34/1000 [00:00<00:02, 338.28it/s][A

Processing train examples...



  6%|████▎                                                                          | 55/1000 [00:00<00:03, 284.29it/s][A
  9%|██████▉                                                                        | 88/1000 [00:00<00:03, 287.13it/s][A
 11%|████████▎                                                                     | 107/1000 [00:00<00:03, 236.33it/s][A
 13%|██████████▎                                                                   | 132/1000 [00:00<00:03, 238.03it/s][A
 15%|███████████▊                                                                  | 152/1000 [00:00<00:03, 217.51it/s][A
 19%|██████████████▌                                                               | 186/1000 [00:00<00:03, 240.99it/s][A
 22%|█████████████████▍                                                            | 224/1000 [00:00<00:02, 264.86it/s][A
 25%|███████████████████▌                                                          | 251/1000 [00:00<00:02, 255.30it/s][A
 28%|██████████

Build 290 / 1000 instances of features in total
