#### Load in data

In [1]:
import json
import csv

# Collect the QA pairs stored under ./run1/ ... ./run5/ into a single list.
data = []
for i in range(1, 6):
    filename = './run{}/'.format(i) + 'qa_pairs.json'
    # Name the encoding explicitly so the read does not depend on the platform's
    # locale default (the CSV written later in this notebook is UTF-8 as well).
    with open(filename, 'r', encoding='utf-8') as f:
        data.extend(json.load(f))

#### Load in tokenizer

In [2]:
import en_core_web_sm
# Load the en_core_web_sm pipeline once; tokenize_en below uses its `.tokenizer`.
spacy_en = en_core_web_sm.load()

In [3]:
def tokenize_en(text):
    """
    Split an English string into tokens with the spaCy tokenizer.

    Returns the text of every token, in order, as a list of strings.
    """
    doc = spacy_en.tokenizer(text)
    return [token.text for token in doc]

#### Some basic stats

In [4]:
# Single pass over the QA pairs: count questions, answers and tokens, collect
# the distinct token strings, and write one CSV row per (question, answer).
q_count = 0
a_count = 0
word_count = 0
word_types = set()
# newline='' lets the csv module control line endings itself; without it rows
# can be followed by blank lines on Windows and embedded newlines get mangled.
with open('all_qa_pairs.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['question', 'answer', 'key phrases'])
    for pair in data:
        question = pair['question']
        q_count += 1
        tokenized_question = tokenize_en(question)
        word_count += len(tokenized_question)
        # update() grows the set in place instead of copying it every iteration.
        word_types.update(tokenized_question)
        for answer in pair['answers']:
            tokenized_answer = tokenize_en(answer)
            word_count += len(tokenized_answer)
            word_types.update(tokenized_answer)
            a_count += 1
            # The 'key phrases' column is written empty.
            writer.writerow([question, answer, ''])

print('total question count:', q_count)
print('total answer count:', a_count)
print('number of word tokens:', word_count)
print('number of word types:', len(word_types))

total question count: 800
total answer count: 3815
number of word tokens: 110305
number of word types: 7507


In [5]:
# Persist the merged list of QA pairs as one JSON file.
with open('all_qa_pairs.json', mode='w') as json_out:
    json.dump(data, fp=json_out)