In [29]:
import ujson as json
import ipynb
import import_ipynb
import prepro
import random
import numpy as np
from ipynb.fs.full.prepro import *
import torch
from collections import Counter
import joblib
import sys
import time
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
import gc

In [22]:
# Relative paths of the HotpotQA JSON files read by the loading cell below.
train_filename = 'hotpot_train_v1.1.json'
dev_distractor_filename = 'hotpot_dev_distractor_v1.json'
dev_filename = 'hotpot_dev_fullwiki_v1.json'
test_filename = 'hotpot_test_fullwiki_v1.json'

In [3]:
# Parse the GloVe text file into a {token: float32 vector} dictionary.
glove_embeddings_dict = {}
embedding_size = 300
with open('glove.840B.300d.txt', 'r', encoding="utf-8") as glove_file:
    for glove_line in glove_file:
        fields = glove_line.split()
        # The trailing `embedding_size` fields are the vector components; whatever
        # precedes them is the token (its pieces are glued back together).
        n_token_fields = len(fields) - embedding_size
        token = ''.join(fields[:n_token_fields])
        glove_embeddings_dict[token] = np.asarray(fields[-embedding_size:], "float32")
print(len(glove_embeddings_dict))

2195892


In [23]:
# Load the sampled training articles and the dev / test JSON files.
# NOTE: joblib.load unpickles its input, which can execute arbitrary code;
# only load 'train_sample.pkl' if it comes from a trusted source.
train = joblib.load('train_sample.pkl')

# Context managers close each file as soon as it has been parsed instead of
# leaving the handles open until garbage collection.
with open(dev_distractor_filename, 'r') as fh:
    dd = json.load(fh)
with open(dev_filename, 'r') as fh:
    dev = json.load(fh)
with open(test_filename, 'r') as fh:
    test = json.load(fh)

In [None]:
# Show which fields the first training article provides.
train[0].keys()

In [None]:
# Collect the 'type' field of every training article and report how often
# each distinct value occurs.
qa_types = [article['type'] for article in train]

distinct_qa_types = set(qa_types)
print(distinct_qa_types)
print({qa_type: qa_types.count(qa_type) for qa_type in distinct_qa_types})

# Preprocessing

In [17]:
def process_article(article):
    """Turn one HotpotQA-style article into a model example and an evaluation record.

    Every paragraph title (wrapped as ``<t> title </t>``) and every paragraph
    sentence is tokenized and appended to a single running context.

    Args:
        article: dict with '_id', 'question' and 'context' (a list of
            ``[title, sentences]`` pairs). 'answer' and 'supporting_facts'
            (``[title, sentence_index]`` pairs) are optional.

    Returns:
        Tuple ``(example, eval_example)``:
            example: context/question tokens and characters, the answer token
                span in 'y1s'/'y2s', the article id and 'start_end_facts', a list
                of ``(start_token, end_token, is_sup_fact)`` tuples (end
                exclusive) with one entry per title and per sentence.
            eval_example: the concatenated context text, the character span of
                every context token, the answer text, the article id and
                'sent2title_ids' (``(title, sentence_index)``; -1 for titles).
    """
    # Fall back to a placeholder paragraph when the context is empty. A local
    # variable keeps the caller's article untouched, and the placeholder body is
    # a list with one sentence so it is iterated per sentence, not per character.
    paragraphs = article['context']
    if len(paragraphs) == 0:
        paragraphs = [['some random title', ['some random stuff']]]

    # Supporting facts as a set of (title, sentence_index) tuples; empty if absent
    if 'supporting_facts' in article:
        sp_set = set(map(tuple, article['supporting_facts']))
    else:
        sp_set = set()

    text_context = ''       # all titles and sentences concatenated
    context_tokens = []     # word tokens of the whole context
    context_chars = []      # per-token character lists
    offsets = []            # per-sentence lists of [char_start, char_end] token spans
    flat_offsets = []       # the same spans, flattened over all sentences
    start_end_facts = []    # (start_token, end_token, is_sup_fact) per sentence
    sent2title_ids = []     # (title, sentence_index) per sentence

    def _process(sent, is_sup_fact, is_title=False):
        # Only text_context is rebound; the lists are mutated in place.
        nonlocal text_context
        N_chars = len(text_context)     # characters already in the context
        N_tokens = len(context_tokens)  # tokens already in the context

        sent_tokens = word_tokenize(sent)
        if is_title:
            sent = '<t> {} </t>'.format(sent)
            sent_tokens = ['<t>'] + sent_tokens + ['</t>']
        sent_chars = [list(token) for token in sent_tokens]
        sent_spans = convert_idx(sent, sent_tokens)

        # Shift sentence-local character spans to positions in the full context
        sent_spans = [[N_chars + e[0], N_chars + e[1]] for e in sent_spans]

        text_context += sent
        context_tokens.extend(sent_tokens)
        context_chars.extend(sent_chars)
        # The token range must start at the count taken BEFORE this sentence was
        # appended; measuring after the extend shifts every range by one sentence.
        start_end_facts.append((N_tokens, N_tokens + len(sent_tokens), is_sup_fact))
        offsets.append(sent_spans)
        flat_offsets.extend(sent_spans)

    for para in paragraphs:
        cur_title, cur_para = para[0], para[1]
        _process(prepro_sent(cur_title), False, is_title=True)
        sent2title_ids.append((cur_title, -1))  # titles carry sentence index -1
        for sent_id, sent in enumerate(cur_para):
            is_sup_fact = (cur_title, sent_id) in sp_set
            _process(prepro_sent(sent), is_sup_fact)
            sent2title_ids.append((cur_title, sent_id))

    # Work out the answer span in token indices
    if 'answer' in article:
        answer = article['answer'].strip()
        lowered_answer = answer.lower()

        if lowered_answer == 'yes':
            best_indices = [-1, -1]  # 'yes' is encoded as -1
        elif lowered_answer == 'no':
            best_indices = [-2, -2]  # 'no' is encoded as -2
        elif answer not in text_context:
            # The answer text does not occur in the context: use a dummy span
            best_indices = (0, 1)
        else:
            # Character span of the answer, then every token overlapping it
            _, best_indices, _ = fix_span(text_context, offsets, article['answer'])
            answer_span = []
            for idx, span in enumerate(flat_offsets):
                if not (best_indices[1] <= span[0] or best_indices[0] >= span[1]):
                    answer_span.append(idx)
            best_indices = (answer_span[0], answer_span[-1])
    else:
        # No answer in the article: placeholder answer and dummy span
        answer = 'random'
        best_indices = (0, 1)

    ques_tokens = word_tokenize(article['question'])
    ques_chars = [list(token) for token in ques_tokens]

    example = {'context_tokens': context_tokens,
               'context_chars': context_chars,
               'ques_tokens': ques_tokens,
               'ques_chars': ques_chars,
               'y1s': [best_indices[0]],
               'y2s': [best_indices[1]],
               'id': article['_id'],
               'start_end_facts': start_end_facts}
    eval_example = {'context': text_context,
                    'spans': flat_offsets,
                    'answer': [answer],
                    'id': article['_id'],
                    'sent2title_ids': sent2title_ids}

    return example, eval_example

In [16]:
# This function processes each article in required dataset by applying the process_article function

def process_data(data, word_counter = None, char_counter = None):
    """Run ``process_article`` over every article and gather the results.

    Args:
        data: iterable of article dicts accepted by ``process_article``.
        word_counter: optional counter updated in place with token frequencies.
        char_counter: optional counter updated in place with character frequencies.
            Counting only happens when both counters are supplied.

    Returns:
        ``(examples, eval_examples, word_counter, char_counter)`` where
        ``examples`` is a shuffled list of model examples and ``eval_examples``
        maps an article id to its evaluation record. The counters are returned
        as passed in (``None`` if they were not supplied).
    """
    examples, eval_examples = [], {}

    for article in data:
        model_example, eval_record = process_article(article)
        examples.append(model_example)
        if eval_record is not None:
            eval_examples[eval_record['id']] = eval_record

    # Frequencies are only gathered when the caller supplied both counters
    # (i.e. for the training split).
    counting_enabled = not (word_counter is None or char_counter is None)
    if counting_enabled:
        for model_example in examples:
            for token_list in (model_example['ques_tokens'], model_example['context_tokens']):
                for token in token_list:
                    word_counter[token] += 1
                    for char in token:
                        char_counter[char] += 1

    random.shuffle(examples)
    print("{} questions in total".format(len(examples)))

    return examples, eval_examples, word_counter, char_counter

In [4]:
# Function to get word embeddings

def get_embeddings(counter, data_type, emb_file, size, vec_size, token2idx_dict = None, limit = -1):
    """Build an embedding matrix and index lookups for the tokens in ``counter``.

    Args:
        counter: mapping token -> frequency (anything with ``.items()``).
        data_type: label used in the progress messages (e.g. "word", "char").
        emb_file: mapping token -> pretrained vector, or None to draw a random
            vector (normal, scale 0.01) for every kept token.
        size: unused; kept so existing calls keep working.
        vec_size: embedding dimension; required (also sizes the NULL/OOV rows).
        token2idx_dict: ignored; the mapping is always rebuilt from the kept
            tokens. Kept so existing calls keep working.
        limit: tokens are kept only if their frequency is strictly greater.

    Returns:
        ``(emb_mat, token2idx_dict, idx2token_dict)``. Index 0 is "--NULL--" and
        index 1 is "--OOV--" (both zero vectors); the remaining tokens are
        numbered from 2 in the iteration order of ``counter``.
    """
    print("Generating {} embedding...".format(data_type))

    embedding_dict = {}
    filtered_elements = [k for k, v in counter.items() if v > limit]

    if emb_file is None:
        assert vec_size is not None
        for token in filtered_elements:
            embedding_dict[token] = [np.random.normal(
                scale=0.01) for _ in range(vec_size)]
        print("{} tokens have corresponding embedding vector".format(
            len(filtered_elements)))
    else:
        # Walk the kept tokens in counter order and test membership directly.
        # A set intersection would yield the tokens in hash order, which for
        # strings changes between interpreter runs, so the token -> index
        # mapping would not be reproducible. This also avoids materialising a
        # list and a set of every key in the embedding dictionary.
        for token in filtered_elements:
            if token in emb_file:
                embedding_dict[token] = emb_file[token]

    print("{} / {} tokens have corresponding {} embedding vector".format(
        len(embedding_dict), len(filtered_elements), data_type))

    # Reserve indices 0 and 1 for NULL (padding) and Out-of-Vocabulary
    NULL = "--NULL--"
    OOV = "--OOV--"
    token2idx_dict = {token: idx for idx, token in enumerate(
        embedding_dict.keys(), 2)}
    token2idx_dict[NULL] = 0
    token2idx_dict[OOV] = 1
    embedding_dict[NULL] = [0. for _ in range(vec_size)]
    embedding_dict[OOV] = [0. for _ in range(vec_size)]
    print('Done')

    # Row i of the matrix is the vector of the token whose index is i
    idx2emb_dict = {idx: embedding_dict[token]
                    for token, idx in token2idx_dict.items()}
    emb_mat = [idx2emb_dict[idx] for idx in range(len(idx2emb_dict))]
    print('Done')
    idx2token_dict = dict(zip(token2idx_dict.values(), token2idx_dict.keys()))

    return emb_mat, token2idx_dict, idx2token_dict

In [15]:
# This function is to convert all paragraphs and questions into indexes form

def build_features_examples(examples, data_type, out_file, word2idx_dict, char2idx_dict):
    """Convert tokenized examples to index arrays and save them with ``torch.save``.

    Args:
        examples: list of example dicts as produced by ``process_article``.
        data_type: split label. For 'test' the length limits are the longest
            context/question found in ``examples``; otherwise they are fixed at
            1000 context tokens and 80 question tokens.
        out_file: destination passed to ``torch.save``.
        word2idx_dict: token -> index lookup.
        char2idx_dict: character -> index lookup.

    Examples longer than the limits are skipped. Unknown words and characters
    map to index 1, padding positions stay 0, and at most 16 characters are
    kept per token. Nothing is returned; the list of feature dicts is written
    to ``out_file``.
    """
    max_chars = 16

    # Length limits: data-driven for the test split, fixed otherwise
    if data_type != 'test':
        max_para, max_ques = 1000, 80
    else:
        max_para = max_ques = 0
        for example in tqdm(examples):
            max_para = max(max_para, len(example['context_tokens']))
            max_ques = max(max_ques, len(example['ques_tokens']))

    def word_index(word):
        # Try the word as written, then lower-cased, capitalised and upper-cased
        for variant in (word, word.lower(), word.capitalize(), word.upper()):
            if variant in word2idx_dict:
                return word2idx_dict[variant]
        return 1

    def char_index(char):
        return char2idx_dict[char] if char in char2idx_dict else 1

    def fill_char_rows(matrix, tokens_as_chars):
        # One row per token, truncated to the first `max_chars` characters
        for row, chars in enumerate(tokens_as_chars):
            kept = chars[:max_chars]
            matrix[row, :len(kept)] = [char_index(char) for char in kept]

    print("Processing {} examples...".format(data_type))
    datapoints = []
    seen = 0
    for example in tqdm(examples):
        seen += 1
        context_tokens = example["context_tokens"]
        ques_tokens = example["ques_tokens"]

        # Skip examples whose context or question exceeds the limits
        if len(context_tokens) > max_para or len(ques_tokens) > max_ques:
            continue

        # Zero-initialised (i.e. padded) index arrays
        context_idxs = np.zeros(max_para, dtype=np.int64)
        context_char_idxs = np.zeros((max_para, max_chars), dtype=np.int64)
        ques_idxs = np.zeros(max_ques, dtype=np.int64)
        ques_char_idxs = np.zeros((max_ques, max_chars), dtype=np.int64)

        context_idxs[:len(context_tokens)] = [word_index(token) for token in context_tokens]
        ques_idxs[:len(ques_tokens)] = [word_index(token) for token in ques_tokens]
        fill_char_rows(context_char_idxs, example["context_chars"])
        fill_char_rows(ques_char_idxs, example["ques_chars"])

        # The last entry of y1s / y2s is the answer start / end
        datapoints.append({
            'context_idxs': torch.from_numpy(context_idxs),
            'context_char_idxs': torch.from_numpy(context_char_idxs),
            'ques_idxs': torch.from_numpy(ques_idxs),
            'ques_char_idxs': torch.from_numpy(ques_char_idxs),
            'y1': example["y1s"][-1],
            'y2': example["y2s"][-1],
            'id': example['id'],
            'start_end_facts': example['start_end_facts'],
        })

    print("Build {} / {} instances of features in total".format(len(datapoints), seen))
    torch.save(datapoints, out_file)

In [None]:
# Sample train data
#train = random.sample(train, 10000)

In [None]:
# Trial run: process a random sample of 1000 training articles and count
# their word / character frequencies.
word_counter, char_counter = Counter(), Counter()
train_subset = random.sample(train, 1000)

examples, eval_examples, word_counter, char_counter = process_data(
    train_subset, word_counter, char_counter)
#examples, eval_examples = process_data(random.sample(test,1000))

In [None]:
# Word embeddings for the sampled vocabulary, taken from the loaded GloVe vectors.
word_emb_mat, word2idx_dict, idx2word_dict = get_embeddings(
    word_counter,
    "word",
    emb_file=glove_embeddings_dict,
    size=int(2.2e6),
    vec_size=300,
)

In [None]:
# Character embeddings for the sampled vocabulary; no pretrained file is
# given, so get_embeddings draws random 8-dimensional vectors.
char_emb_mat, char2idx_dict, idx2char_dict = get_embeddings(
    char_counter,
    "char",
    emb_file=None,
    size=94,
    vec_size=8,
)

In [None]:
# Index the sampled examples and save the resulting features to 'train_record.pkl'.
build_features_examples(examples, 'train', 'train_record.pkl', word2idx_dict, char2idx_dict)

In [None]:
### Export all necessary files (embeddings, features, processed text)

# Process the complete training set, accumulating word and character counts.
word_counter_train, char_counter_train = Counter(), Counter()

(examples_train,
 eval_examples_train,
 word_counter_train,
 char_counter_train) = process_data(train, word_counter_train, char_counter_train)

In [None]:
#joblib.dump(train, 'train_sample.pkl')

#with open('examples_train.json', "w") as fh:
#    json.dump(examples_train, fh)

#with open('eval_examples_train.json', "w") as fh:
#    json.dump(eval_examples_train, fh)

#with open('word_counter_train.json', "w") as fh:
#    json.dump(word_counter_train, fh)

#with open('char_counter_train.json', "w") as fh:
#    json.dump(char_counter_train, fh)

In [30]:
# Process the test and dev articles. No counters are passed, so process_data
# skips frequency counting and returns None in both counter positions.
examples_test, eval_examples_test, word_counter_test, char_counter_test = process_data(test)
print('Test Done')
examples_dev, eval_examples_dev, word_counter_dev, char_counter_dev = process_data(dev)
print('Dev Done')

7405 questions in total
Test Done
7405 questions in total
Dev Done


In [31]:
# Write the processed test examples and their evaluation records to JSON files.
with open('examples_test.json', "w") as fh:
    json.dump(examples_test, fh)

with open('eval_examples_test.json', "w") as fh:
    json.dump(eval_examples_test, fh)

In [32]:
# Write the processed dev examples and their evaluation records to JSON files.
with open('examples_dev.json', "w") as fh:
    json.dump(examples_dev, fh)

with open('eval_examples_dev.json', "w") as fh:
    json.dump(eval_examples_dev, fh)

In [5]:
# Reload the saved training word counts; the context manager closes the file
# once it has been parsed instead of leaving the handle open.
with open('word_counter_train.json', 'r') as fh:
    word_counter_train = json.load(fh)

In [6]:
# Build the word embeddings for the full training vocabulary and report how
# many seconds the call took.
start_time = time.time()
word_emb_mat, word2idx_dict, idx2word_dict = get_embeddings(
    word_counter_train,
    "word",
    emb_file=glove_embeddings_dict,
    size=int(2.2e6),
    vec_size=300,
)
print(time.time() - start_time)

Generating word embedding...
178725 / 241010 tokens have corresponding word embedding vector
Done
Done
0.5842909812927246


In [8]:
#del glove_embeddings_dict
#gc.collect()

#joblib.dump(word_emb_mat, 'word_emb.pkl')
#joblib.dump(word2idx_dict, 'word2idx.pkl')
#joblib.dump(idx2word_dict, 'idx2word.pkl')
print('Done')
# NOTE(review): the joblib.dump calls above are commented out, so this cell
# drops word_emb_mat without writing it anywhere. word2idx_dict stays in
# memory and is used by the feature-building cells further down.
del word_emb_mat
gc.collect()

Done


0

In [11]:
# Reload the saved training character counts; the context manager closes the
# file once it has been parsed instead of leaving the handle open.
with open('char_counter_train.json', 'r') as fh:
    char_counter_train = json.load(fh)

In [13]:
# Character embeddings for the full training vocabulary; no pretrained file
# is given, so get_embeddings draws random 8-dimensional vectors.
char_emb_mat, char2idx_dict, idx2char_dict = get_embeddings(
    char_counter_train,
    "char",
    emb_file=None,
    size=94,
    vec_size=8,
)

Generating char embedding...
3798 tokens have corresponding embedding vector
3798 / 3798 tokens have corresponding char embedding vector
Done
Done


In [14]:
# Persist the character embedding matrix and both lookup tables with joblib.
joblib.dump(char_emb_mat, 'char_emb.pkl')
joblib.dump(char2idx_dict, 'char2idx.pkl')
joblib.dump(idx2char_dict, 'idx2char.pkl')
print('Done')

Done


In [21]:
# Reload the processed training examples (closing the file after parsing)
# and save their indexed features to 'train_record.pkl'.
with open('examples_train.json', 'r') as fh:
    examples_train = json.load(fh)
build_features_examples(examples_train, 'train', 'train_record.pkl', word2idx_dict, char2idx_dict)

  3%|▎         | 288/10000 [00:00<00:07, 1228.14it/s]

Processing train examples...


100%|██████████| 10000/10000 [00:06<00:00, 1466.72it/s]


Build 3068 / 10000 instances of features in total


In [33]:
# Reload the processed test examples (closing the file after parsing)
# and save their indexed features to 'test_record.pkl'.
with open('examples_test.json', 'r') as fh:
    examples_test = json.load(fh)
build_features_examples(examples_test, 'test', 'test_record.pkl', word2idx_dict, char2idx_dict)

100%|██████████| 7405/7405 [00:00<00:00, 285708.70it/s]
  1%|          | 52/7405 [00:00<00:32, 226.74it/s]

Processing test examples...


100%|██████████| 7405/7405 [00:24<00:00, 296.83it/s]


Build 7405 / 7405 instances of features in total


In [34]:
# Reload the processed dev examples (closing the file after parsing)
# and save their indexed features to 'dev_record.pkl'.
with open('examples_dev.json', 'r') as fh:
    examples_dev = json.load(fh)
build_features_examples(examples_dev, 'dev', 'dev_record.pkl', word2idx_dict, char2idx_dict)

  1%|          | 92/7405 [00:00<00:07, 914.97it/s]

Processing dev examples...


100%|██████████| 7405/7405 [00:04<00:00, 1558.20it/s]


Build 2038 / 7405 instances of features in total
