In [3]:
import nltk
from datasets import load_dataset

squad = load_dataset('squad_v2')

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

!python -m spacy download en_core_web_sm

Found cached dataset squad_v2 (/Users/v/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d)
100%|██████████| 2/2 [00:00<00:00, 144.86it/s]
[nltk_data] Downloading package punkt to /Users/v/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/v/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [11]:
wikipedia_paragraphs = squad['train']['context']
wikipedia_paragraphs[0]

'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'

In [12]:
import spacy
from spacy import displacy

ner = spacy.load('en_core_web_sm')

doc = ner('European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices')
print([(X.text, X.label_) for X in doc.ents])

[('European', 'NORP'), ('Google', 'ORG'), ('$5.1 billion', 'MONEY'), ('Wednesday', 'DATE')]


In [13]:
from tqdm import tqdm

def create_NE_collection( paragraphs ):
    output = {}
    for p in tqdm(paragraphs):
        tokens = ner(p)

        for ne in tokens.ents:

            if ne.label_ in output:
                output[ne.label_].append(ne.text)
            else:
                output[ne.label_] = [ne.text]

    for k in output.keys():
        output[k] = list(set(output[k]))
        
    return output

ne_collection = create_NE_collection(wikipedia_paragraphs)
ne_collection


100%|██████████| 130319/130319 [33:31<00:00, 64.78it/s]


{'PERSON': ['Mosasaurs',
  'Yoshimasa',
  'Louis Aragon',
  'Bourbon',
  'Cherry Blossom',
  'Methodius',
  'Frederick W. Baldwin',
  'Dolby Surround',
  'Hall',
  'Jan Baudouin de Courtenay',
  'Kim Kardashian',
  'Selim the Grim',
  'Drama',
  "Arthur] Miller's",
  'Qīng Cháo',
  'Skylab',
  'Dimotiki',
  'Martin Javeršek',
  'Francis Collins',
  'Danté',
  'Alexander Bard',
  'Ronald Dworkin',
  'Ain',
  'Kota',
  'Ruili',
  'Neognathae',
  'Serapis',
  'Adolph Zukor',
  'Mario Palanti',
  'Hindi',
  'Hemudu',
  'West Campus',
  'John Spilsbury',
  'George III',
  'Arne Harris',
  'Tusculan',
  'Yucatec Maya',
  'Ahmose',
  'Струмички Карневал',
  'Mel',
  'Pedro Reinel',
  'Photius',
  'Xog Ogaal',
  'Yasser Arafat',
  'Monophthalmus',
  'Stacey Snider',
  'Robert Stephens',
  'Johann Wolfgang von Goethe',
  'Kampuchea',
  'George Michael',
  'Utrecht Vaartsche Rijn',
  'Supercor',
  'William Fraser',
  'Juno',
  'Mehrangarh Fort',
  'Projet de',
  'Shitao',
  'Stelios Kazantzidis'

In [14]:
import pickle

with open('ne_collection.obj', 'wb') as handle:
    pickle.dump(ne_collection, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [15]:
question = squad['train']['question'][0]
question

'When did Beyonce start becoming popular?'

In [25]:
import random
for question in squad['train']['question'][1000:1010]:
    question_chunks = question.split()
    doc = ner( question )
    res = [(X.text, X.label_) for X in doc.ents]

    if not res:
        continue 

    idx = random.randint(0, len(res) - 1)

    text, label = res[idx]

    substitution = random.choice(ne_collection[label])

    new_question = question.replace( text, substitution )
    print( f'{question} vs {new_question}')

What nationality were the two friends who served as a pivotal influence in Frédéric's life while in Paris? vs What nationality were the two friends who served as a pivotal influence in Frédéric's life while in Roman Hispania?
Julian Fontana tried to find his way where before moving to Paris? vs Käfigturm tried to find his way where before moving to Paris?
Where did Julian Fontana fail to get established? vs Where did Tito fail to get established?
Who did Chopin know that became rich in Paris? vs Who did Chopin know that became rich in Assassin?
Which friend of Chopin became like an older brother to him? vs Which friend of the RMS St Helena became like an older brother to him?
Which friend took on the role of several jobs to help Chopin including copyist? vs Which friend took on the role of several jobs to help Ali including copyist?
Where were Chopin and Fontana students together? vs Where were Chopin and Ye Olde students together?
Who gave Frédéric his first significant public approva