In [1]:
# Load all relevant Python libraries and a SpaCy language model.
import json
import itertools
import numpy as np
import spacy

from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
# Access the tokenized text in your new dataset from milestone. 
# Each document dictionary should now include a new key-value pair with the lemmatized text of the articles.
with open("./wiki_data.json") as f:
    wiki_data = json.load(f)
wiki_data[0].keys()

dict_keys(['title', 'text', 'url', 'tokenized_text', 'tf_idf'])

In [4]:
# Create a corpus vocabulary. 
# It should simply be a list of unique tokens in the provided set of documents. 
tokenized_texts = [t["tokenized_text"] for t in wiki_data]
vocab = list(set(itertools.chain(*tokenized_texts)))

with open("vocab.json", "w") as f:
    json.dump(vocab, f)
vocab[:8]

['winter', '1', 'scholar', 'visit', '1854', 'set', 'worshipful', '12']

In [5]:
# Count how many times each unique tokens appears in the corpus, 
# you will need these counts for the next step.
docs_count = []
for doc in wiki_data:
    docs_count.append(Counter(doc["tokenized_text"]))
docs_count[0].most_common(8)

[('pandemic', 7),
 ('people', 5),
 ('disease', 4),
 ('number', 4),
 ('spread', 2),
 ('large', 2),
 ('region', 2),
 ('worldwide', 2)]

In [6]:
docs_with_token = {}
for token in vocab:
    docs_with_token[token] = sum([1 for doc in docs_count if token in doc.keys()])
docs_with_token["pandemic"]

17

In [7]:
# Calculate Tf-Idf vectors for every article in the dataset
# and add the add these vectors to the article dictionaries.
# You should end up the same list a list of dictionaries as before,
# but with a new key-value pair containing Tf-Idf vectors:
#
# title: Title of a Wikipedia article the text is taken from.
# text: Wikipedia article text (in this dataset we included only the summary).
# tokenized_text: Tokenized Wikipedia article text.
# url: Link to the Wikipedia article.
# tf_idfs: Tf_Idf vector.
n_wiki_data = len(wiki_data)

In [8]:
def vectorize(doc, tokenized_text, vocab):
    tfidf_vec = []
    n_tokenized_text = len(tokenized_text)
    for token in vocab:
        # Compute a term frequency (Tf) per document
        tf = doc[token] / n_tokenized_text
        
        # Compute a log of inverse document frequency (Idf) per document
        idf = np.log(n_wiki_data / docs_with_token[token])

        tfidf = tf * idf
        tfidf_vec.append(tfidf)
    return tfidf_vec

In [9]:
for i, doc in enumerate(docs_count):
    # Update the original list of dictionaries by adding a new field to each document dictionary called tf_idf, 
    # containing a list of Tf-Idf values for the words in vocabulary.
    wiki_data[i]["tf_idf"] = vectorize(doc, wiki_data[i]["tokenized_text"], vocab)
    
wiki_data[0]["tf_idf"][:8]

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

In [10]:
# Save this new list of dictionaries as a JSON file.
with open("wiki_data.json", "w") as f:
    json.dump(wiki_data, f)

In [11]:
# Now we can try to search our list of dictionaries using this Tf-Idf field using existing tools for similarity. 
# We suggest to use Scikit-Learn library and its cosine_similarity function.

In [12]:
def tokenizer(text):
    doc = nlp(text.lower())
    tokens = [w for w in doc if not (w.is_stop or w.is_punct or len(w.lemma_.strip()) == 0)]
    return [t.lemma_ for t in tokens if t.dep_]

In [13]:
def search_tfidf(query, docs):
    # Vectorize query
    query_tkn = tokenizer(query)
    query_vec = vectorize(Counter(query_tkn), query_tkn, vocab)
    query_arr = np.array(query_vec)
    
    # Create a function to calculate cosine similarity between each document's Tf-Idf array
    # and the Tf-Idf array of the query.
    rankings = []
    for doc in docs:
        doc_arr = np.array(doc["tf_idf"])
        rank = cosine_similarity(query_arr.reshape(1,-1), doc_arr.reshape(1, -1))[0][0]
        if rank <= 0:
            continue
        rankings.append((rank, doc["title"]))

    # Return the dictionary sorted by the cosine similarity value in reverse order.
    return sorted(rankings, reverse=True)

In [14]:
search_tfidf("When COVID-19 will be ended?", wiki_data)

[(0.12086063518833431, 'COVID-19 pandemic'),
 (0.06488220500470956, 'Pandemic'),
 (0.06114716796110625, 'Crimson Contagion'),
 (0.05408174997825669, 'Disease X'),
 (0.04268474318671646, 'Pandemic Severity Assessment Framework'),
 (0.0406346645234315, 'Science diplomacy and pandemics')]

In [15]:
search_tfidf("coronavirus", wiki_data)

[(0.26854372399806203, 'COVID-19 pandemic')]

In [16]:
for doc in wiki_data:
    if doc["title"] == "COVID-19 pandemic":
        print(doc["text"])
        break

The COVID-19 pandemic, also known as the coronavirus pandemic, is an ongoing pandemic of coronavirus disease 2019 (COVID-19) caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2). The disease was first identified in December 2019 in Wuhan, China. The outbreak was declared a Public Health Emergency of International Concern in January 2020, and a pandemic in March 2020. As of 17 October 2020, more than 39.5 million cases have been confirmed, with more than 1.1 million deaths attributed to COVID-19.

Common symptoms include fever, cough, fatigue, breathing difficulties, and loss of smell. Complications may include pneumonia and acute respiratory distress syndrome. The incubation period is typically around five days but may range from one to 14 days. There are several vaccine candidates in development, although none have proven their safety and efficacy. There is no known specific antiviral medication, so primary treatment is currently symptomatic.
Recommended preventive m