forked from kingsman142/CS7650-term-project
-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_squad_to_csv.py
60 lines (49 loc) · 1.89 KB
/
parse_squad_to_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import json
import csv
def parse_to_dict_list(data):
    """Flatten SQuAD-style articles into one row dict per distinct answer.

    Args:
        data: Iterable of article dicts. Each article is read via its
            'title' and 'paragraphs' keys; each paragraph via 'context' and
            'qas'; each qa via 'question' and 'answers'; each answer via
            'text'.

    Returns:
        A list of dicts with the keys 'id', 'title', 'context', 'question'
        and 'answer' (in that order). 'id' is a running counter starting at
        0 that increases by one per emitted row. Duplicate answer texts for
        the same question yield a single row; a question with no answers
        yields no rows.

    Raises:
        KeyError: If any of the keys listed above is missing.
    """
    final_list = []
    for article in data:
        title_name = article['title']
        for para in article['paragraphs']:
            context = para['context']
            for qas in para['qas']:
                question = qas['question']
                # De-duplicate while keeping first-seen order. A set would
                # iterate strings in hash order, which changes between
                # interpreter runs and would make row order and ids
                # non-reproducible. Relies on dicts preserving insertion
                # order (Python 3.7+).
                unique_answers = dict.fromkeys(
                    ans['text'] for ans in qas['answers']
                )
                for answer in unique_answers:
                    final_list.append({
                        # The next id is always the number of rows so far.
                        'id': len(final_list),
                        'title': title_name,
                        'context': context,
                        'question': question,
                        'answer': answer,
                    })
    return final_list
def _load_squad_articles(json_path):
    """Load a SQuAD JSON file and return the list stored under its 'data' key.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default text encoding.
    """
    with open(json_path, encoding="utf-8") as f:
        return json.load(f)['data']


def _write_rows_to_csv(rows, csv_path):
    """Write a list of row dicts to csv_path as UTF-8 CSV with a header row.

    The header is taken from the keys of the first row. If rows is empty
    there is no header to derive, so a message is printed and no file is
    written. I/O errors are reported with print() rather than raised, so one
    failed output does not stop the rest of the script.
    """
    if not rows:
        print("No rows to write to " + csv_path)
        return
    try:
        # newline='' lets the csv module control line endings itself;
        # without it Windows output gains blank rows and newlines embedded
        # in quoted fields are altered.
        with open(csv_path, 'w', encoding="utf-8", newline='') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=list(rows[0].keys()))
            writer.writeheader()
            writer.writerows(rows)
    except IOError as e:
        print(e)


if __name__ == "__main__":
    # Parse squad json to csv
    dev_data = _load_squad_articles('./squad_dataset/dev-v1.1.json')
    print("Types of paragraphs in Squad Dataset:\n")
    for article in dev_data:
        print(article['title'])
    _write_rows_to_csv(parse_to_dict_list(dev_data),
                       './squad_dataset/dev-v1.1.csv')

    train_data = _load_squad_articles('./squad_dataset/train-v1.1.json')
    _write_rows_to_csv(parse_to_dict_list(train_data),
                       './squad_dataset/train-v1.1.csv')