In [1]:
# Load all relevant Python libraries and a SpaCy language model.
import json
import spacy

In [2]:
# Load the "en_core_web_sm" pipeline once; later cells call `nlp` to process text.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Open the provided JSON file. It contains a list of dictionaries with summaries from Wikipedia articles, 
# where each dictionary has three key-value pairs. The keys title, text and url correspond to:
# 
# - Title of a Wikipedia article the text is taken from.
# - Wikipedia article text (in this dataset we included only the summary).
# - Link to the Wikipedia article.
# Decode explicitly as UTF-8: the summaries contain non-ASCII text (e.g. Greek),
# and the platform default encoding used by open() is not UTF-8 everywhere.
with open("./data.json", encoding="utf-8") as f:
    wiki_data = json.load(f)
# Show the keys of the first record to confirm the expected schema.
wiki_data[0].keys()

dict_keys(['title', 'text', 'url'])

In [4]:
sample_text = wiki_data[0]["text"]
doc = nlp(sample_text.lower())
# Inspect the first eight tokens, skipping any whose lemma is only whitespace.
for tok in doc[:8]:
    lemma = tok.lemma_.strip()
    if lemma:
        print(len(lemma), tok.lemma_, tok.is_stop, tok.is_punct, tok.dep_)

1 a True False det
8 pandemic False False nsubj
1 ( False True punct
4 from True False prep
5 greek False False amod
3 πᾶν False False pobj
1 , False True punct
3 pan False False conj


In [5]:
# Create a Python function that takes in a text string, 
# performs all operations described in the previous step and outputs a list of tokens (lemmas).
def tokenizer(text):
    """Lowercase ``text``, run it through ``nlp`` and return the kept lemmas.

    A token is dropped when it is a stop word, is punctuation, has a lemma
    that is empty after stripping whitespace, or has an empty ``dep_`` label.

    Args:
        text: The raw string to process.

    Returns:
        A list of lemma strings, in document order.
    """
    lemmas = []
    for tok in nlp(text.lower()):
        if tok.is_stop or tok.is_punct:
            continue
        if not tok.lemma_.strip():
            continue
        # Mirrors the original second-pass filter on the dependency label.
        if tok.dep_:
            lemmas.append(tok.lemma_)
    return lemmas

In [6]:
# Print the first eight lemmas produced for the sample text, one per line.
first_lemmas = tokenizer(sample_text)[:8]
for lemma in first_lemmas:
    print(lemma)

pandemic
greek
πᾶν
pan
δῆμος
demos
people
epidemic


In [7]:
# Use this function to preprocess all text documents in the dataset (text field only), 
# and add the resulting lists to the dictionaries from step 1. You should end up with a list of dictionaries, each of which now has four key-value pairs:
# 
# - title: Title of a Wikipedia article the text is taken from.
# - text: Wikipedia article text (in this dataset we included only the summary).
# - tokenized_text: Tokenized Wikipedia article text.
# - url: Link to the Wikipedia article.
# Each record is a dict, so adding the key mutates it in place inside wiki_data;
# no index or write-back into the list is needed.
for article in wiki_data:
    article["tokenized_text"] = tokenizer(article["text"])

In [8]:
# Spot-check the first eight lemmas stored for the first article.
wiki_data[0]["tokenized_text"][:8]

['pandemic', 'greek', 'πᾶν', 'pan', 'δῆμος', 'demos', 'people', 'epidemic']

In [9]:
# Write the enriched list of dictionaries to disk in JSON format.
with open("wiki_data.json", "w") as out_file:
    out_file.write(json.dumps(wiki_data))