In [1]:
import os
import json
from urllib import request
import requests
import zipfile
import io

import numpy as np

In [2]:
# Move two levels up from the notebook's directory so that the relative
# 'data/external/...' paths used later resolve (presumably the repository root).
# The starting directory is remembered on first execution so that re-running
# this cell does not climb two more levels each time.
try:
    NOTEBOOK_DIR
except NameError:
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, '..', '..'))

In [3]:
# Base location of the SQuAD v1.1 JSON files; the train and dev URLs are derived from it.
SQUAD_BASE_URL = "https://rajpurkar.github.io/SQuAD-explorer/dataset"
SQUAD_TRAIN_URL = SQUAD_BASE_URL + "/train-v1.1.json"
SQUAD_DEV_URL = SQUAD_BASE_URL + "/dev-v1.1.json"

# ZIP archive with the AdversarialQA v1.0 data.
ADVERSARIAL_QA_URL = "https://adversarialqa.github.io/data/aqa_v1.0.zip"

View SQuAD example

In [4]:
# Fetch the SQuAD v1.1 training file over HTTP, then parse the decoded text as JSON.
with request.urlopen(SQUAD_TRAIN_URL) as response:
    squad_train_text = response.read().decode()
squad_train = json.loads(squad_train_text)

In [5]:
# Inspect the top-level keys of the parsed SQuAD training JSON.
squad_train.keys()

dict_keys(['data', 'version'])

In [6]:
# Version string recorded in the SQuAD file.
squad_train['version']

'1.1'

In [7]:
# Number of top-level entries under 'data'.
len(squad_train['data'])

442

In [8]:
# Peek at the first paragraph of the first entry: a 'context' string plus its 'qas' list.
squad_train['data'][0]['paragraphs'][0]

{'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'qas': [{'answers': [{'answer_start': 515,
     'text': 'Saint Bernadette Soubirous'}],
   'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
   'id': '5733be284776f41900661182'},
  {'answers': [{'answer_start': 188, 'text': 'a copper statue of Christ

In [10]:
# Flatten the data -> paragraphs -> qas nesting into a flat list of question IDs.
squad_train_ids = [
    qa['id']
    for article in squad_train['data']
    for paragraph in article['paragraphs']
    for qa in paragraph['qas']
]

In [11]:
# Total number of question IDs collected from the SQuAD training set.
len(squad_train_ids)

87599

In [12]:
# Distinct ID count; matching len(squad_train_ids) means every ID is unique.
len(set(squad_train_ids))

87599

Extract Adversarial QA ZIP file

In [10]:
# Download the AdversarialQA archive into memory.
# A timeout is set because requests waits indefinitely by default, and
# raise_for_status() turns an HTTP error into an exception here rather than
# letting the extraction cell fail later on a non-ZIP error body.
r = requests.get(ADVERSARIAL_QA_URL, timeout=60)
r.raise_for_status()
r.ok

True

In [15]:
# Open the downloaded bytes as an in-memory ZIP and unpack it under data/external/.
# The context manager closes the archive once extraction finishes.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    z.extractall("data/external/aqa_v1.0")

View Adversarial QA example

In [14]:
# Load the `1_dbidaf` training split from the extracted archive.
# The encoding is passed explicitly: JSON is UTF-8, and open() would otherwise
# fall back to the platform's locale encoding.
with open('data/external/aqa_v1.0/1_dbidaf/train.json', encoding='utf-8') as f:
    dbidaf_train = json.load(f)

In [15]:
# Inspect the top-level keys of the parsed `1_dbidaf` training JSON.
dbidaf_train.keys()

dict_keys(['data', 'version'])

In [16]:
# Version string recorded in the `1_dbidaf` file.
dbidaf_train['version']

''

In [17]:
# Keys of the first entry under 'data'.
dbidaf_train['data'][0].keys()

dict_keys(['title', 'paragraphs'])

In [18]:
# Peek at the first paragraph of the first entry.
dbidaf_train['data'][0]['paragraphs'][0]

{'context': "Slack (2003) compares three groups that conducted biological research at Yale during overlapping periods between 1910 and 1970. Yale proved important as a site for this research. The leaders of these groups were Ross Granville Harrison, Grace E. Pickford, and G. Evelyn Hutchinson, and their members included both graduate students and more experienced scientists. All produced innovative research, including the opening of new subfields in embryology, endocrinology, and ecology, respectively, over a long period of time. Harrison's group is shown to have been a classic research school; Pickford's and Hutchinson's were not. Pickford's group was successful in spite of her lack of departmental or institutional position or power. Hutchinson and his graduate and postgraduate students were extremely productive, but in diverse areas of ecology rather than one focused area of research or the use of one set of research tools. Hutchinson's example shows that new models for research grou

In [19]:
# Flatten the data -> paragraphs -> qas nesting into a flat list of question IDs.
dbidaf_train_ids = [
    qa['id']
    for article in dbidaf_train['data']
    for paragraph in article['paragraphs']
    for qa in paragraph['qas']
]

In [20]:
# Total number of question IDs collected from the `1_dbidaf` training split.
len(dbidaf_train_ids)

10000

In [21]:
# Distinct ID count; matching len(dbidaf_train_ids) means every ID is unique.
len(set(dbidaf_train_ids))

10000

In [22]:
# Question IDs present in both the `1_dbidaf` and SQuAD v1.1 training sets (an empty set means none).
set(dbidaf_train_ids) & set(squad_train_ids)

set()

No overlapping IDs between SQuAD v1.1 and any of the adversarial datasets (the check shown above covers the `1_dbidaf` training split).

`aqa_v1.0/combined/train.json` has duplicate IDs:

In [23]:
# Load the combined training split from the extracted archive.
# The encoding is passed explicitly: JSON is UTF-8, and open() would otherwise
# fall back to the platform's locale encoding.
with open('data/external/aqa_v1.0/combined/train.json', encoding='utf-8') as f:
    combined_train = json.load(f)

In [24]:
# Flatten the data -> paragraphs -> qas nesting into a flat list of question IDs.
combined_train_ids = [
    qa['id']
    for article in combined_train['data']
    for paragraph in article['paragraphs']
    for qa in paragraph['qas']
]

In [25]:
# Total number of question IDs collected from the combined training split.
len(combined_train_ids)

30000

In [26]:
# Distinct ID count; a value below len(combined_train_ids) means some IDs repeat.
len(set(combined_train_ids))

29965

In [27]:
# Distinct IDs (sorted) together with how many times each one occurs.
ids, counts = np.unique(combined_train_ids, return_counts=True)

In [28]:
# IDs that occur more than once in the combined training split.
ids[counts > 1]

array(['06f0fc11ac4afbd691e3318326f505c204f40bd4',
       '17c4021092c54a222b3357ece608acbe63d932e3',
       '202b2db2a023b1328bc31de7d52a5661ae4d9a98',
       '36b1366067af52a5d96893a3f71b89b88b174c34',
       '37b2f1140a24eb1c750c75fced296c998a7619b3',
       '3d2501fbc9220aa407693fcedf05ed14f0d3b729',
       '497fbc529dae06475d48ead100e0c71cb4c8591e',
       '4d3cb5677211ee32895ca9c66dad04d7152254d4',
       '5117a4d8df0e323d2ae36f4dae676c0747a32870',
       '57c2c33b7d89ce702818e98d05d353dec4e4cd3e',
       '5be45686c520b4a199fb72872450ed29b568de66',
       '6195f231e14123c1ae84d44c23d907b1d4936748',
       '624f30226a8faceda9717ad02c0d1f191c9ae09d',
       '6a342115c27fce6e4e0764ba00709839a61d1e70',
       '6ace7f0a1d6ca12f1a82aa4b6ed9b880c4d22154',
       '6d6d369fa0b39f2681e1beb36cccf235b1109b86',
       '6e223ba9d4b9a915cd805f5d9f1849361ac9b3e6',
       '70109ff57a67952323971bc21e7f174f337d8b17',
       '76b62ed4eb083826cf89b07afe51b3d0ffd86b8a',
       '794e467afa80c99cbfdfcba

In [29]:
# Check whether this particular ID is among the duplicated ones.
'4d3cb5677211ee32895ca9c66dad04d7152254d4' in ids[counts > 1]

True

^ This led to the following PR: https://github.com/huggingface/datasets/pull/2433

It is possible to have duplicate IDs in the dictionary, so long as the ID in the tuple is unique.