In [None]:
%%capture
%pip install ir_datasets
%pip install demoji
%pip install pycld3
%pip install langdetect

import re
import numpy as np
import pandas as pd
import ir_datasets
import demoji

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

from langdetect import detect
from cld3 import get_language

from wordcloud import WordCloud

import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display
from tqdm.notebook import tqdm

from collections import Counter

from my_utils import *

pd.set_option('max_colwidth', 800)

In [None]:
# load dataset
dataset = ir_datasets.load("cord19/trec-covid")
print(f"queries: {dataset.queries_count()}, docs: {dataset.docs_count()}, qrels: {dataset.qrels_count()}")

In [None]:
%%capture
# convert the collection in a dataframe
queries = pd.DataFrame(dataset.queries_iter())
docs = pd.DataFrame(dataset.docs_iter())
qrels = pd.DataFrame(dataset.qrels_iter())

docs['date'] = pd.to_datetime(docs['date'])

# Part 1: Analysis of Queries and Documents

In [None]:
display(queries.sample(5))
display(docs.sample(5))
display(qrels.sample(5))

## Analysis of Documents

### Filter Covid documents

Visualize years distribution

In [None]:
years = docs["date"].dt.year
years.dropna(inplace=True)
years = years.astype(int)

u_years, u_counts = np.unique(years, return_counts=True)

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(25, 5))

sns.barplot(x=u_years, y=u_counts, ax=ax1)
ax1.set_xticklabels(ax1.get_xticklabels(), rotation=90)

sns.boxplot(x=years, ax=ax2, orient="h")
ax2.margins(0)

fig.tight_layout()
fig.savefig("out/dates.png")

In [None]:
keywords = [
    "covid 19",
    "covid-19",
    "covid19",
    "sars-cov-2",
    "sarscov2",
    "sars cov 2",
    "2019-nCoV",
    "Wuhan virus",
    "Chinese flu",
    "COronaVIrusDisease",
]

keep = set()
for index, row in tqdm(docs.iterrows(), total=len(docs)):
    text = " ".join([row["title"], row["abstract"]])
    if any(re.findall(r"|".join(keywords), text, re.IGNORECASE)):
        keep.add(index)

    # the few documents without date contain at least one keyword
    if row["date"] and row["date"] is not pd.NaT:
        if row["date"].year >= 2019:
            if any(
                re.findall(
                    r"coronavirus | corona",
                    " ".join([row["title"], row["abstract"]]),
                    re.IGNORECASE,
                )
            ):
                keep.add(index)

print(f"{len(keep)}, {round(len(keep) / len(docs) * 100)}%")

docs = docs.iloc[list(keep)]

### Missing Values

In [None]:
docs.replace("", pd.NA, inplace=True)

for col in docs:
  m = sum(docs[col].isna())
  p = m / len(docs) * 100
  print(f"{col:10} {m:5} ({p:.2f}%)")

In [None]:
docs[docs['title'].isna()]

In [None]:
docs[docs['abstract'].isna()].sample(5)

### Duplicated Values

In [None]:
duplicated = sum(docs.duplicated())
duplicated_perc = duplicated / len(docs) * 100
print(f"duplicated rows: {duplicated}, {duplicated_perc:.2f}%")

In [None]:
def print_duplicates(docs, columns=None):
  if columns is None:
    columns = docs.columns

  for col in columns:
    d = sum(docs[~docs[col].isna()][col].duplicated())
    p = d / len(docs) * 100
    print(f"{col:10} {d:5} ({p:.2f}%)")

In [None]:
print("Duplicates for each column:")
print_duplicates(docs, columns=["doc_id", "title", "doi", "abstract"])

In [None]:
docs[docs["doc_id"] == "uym826bh"]

In [None]:
docs.drop_duplicates(inplace=True)
docs[docs["doc_id"] == "uym826bh"]

In [None]:
print_duplicates(docs, columns=["doc_id", "title", "doi", "abstract"])

In [None]:
duplicated = sum(docs.duplicated(["title", "abstract"]))
print(f"Duplicates (title, abstract): {duplicated} ({duplicated / len(docs) * 100:.2f}%)")

duplicated = sum(docs.duplicated(["doc_id", "title", "abstract"]))
print(f"Duplicates (doc_id, title, abstract): {duplicated} ({duplicated / len(docs) * 100:.2f}%)")

duplicated = sum(docs.duplicated(["doc_id", "title", "abstract", "doi"]))
print(f"Duplicates (doc_id, title, abstract, doi): {duplicated} ({duplicated / len(docs) * 100:.2f}%)")

Example of the same document with different dates

In [None]:
docs[docs['doc_id'] == "0fbmelx0"]

### Detect Languages
https://modelpredict.com/language-identification-survey

In [None]:
# download the pretrained model for fasttext language identification
import requests
pretrained_model = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"

results = requests.get(pretrained_model)
with open('/tmp/lid.176.bin', 'wb') as f:
    f.write(results.content)

In [None]:
from lang_identification import LanguageIdentification
lang_id = LanguageIdentification()

languages = []
for i, doc in tqdm(docs.iterrows(), total=len(docs)):
    text = doc['abstract']

    if text is pd.NA or text == "":
        text = doc['title']
    try:                                                          
        # lang = detect(text)        # language_detect  
        # lang = get_language(text)  # cld3 
        # lang = lang.language  
        lang = lang_id.predict_lang(text)                         
    except Exception as e:                                                
        lang='unknown'
    languages.append(lang)

languages = np.asarray(languages)
v, c = np.unique(languages, return_counts=True)

In [None]:
Counter(languages).most_common(20)

In [None]:
fig = plt.figure(figsize=(15, 4))
sns.barplot(x=v, y=c, palette="Blues_d")
fig.axes[0].set_xticklabels(fig.axes[0].get_xticklabels(), rotation=90)
fig.tight_layout()
fig.savefig("out/languages.png")

In [None]:
print(len(docs.loc[languages == "en"]))
en_docs = docs.loc[languages == "en"]

In [None]:
print(len(docs.loc[languages != "en"]))
not_en_docs = docs.loc[languages != "en"]

### Find Characters

In [None]:
emojis = []
for index, row in tqdm(docs.iterrows(), total=len(docs)):
    e1 = demoji.findall(row["title"])
    if e1:
      emojis.extend(list(e1.keys()))
  
    e2 = demoji.findall(str(row["abstract"]))
    if e2:
      emojis.extend(list(e2.keys()))

Counter(emojis)

In [None]:
import sys
from unicodedata import category
punctuation_chars =  [chr(i) for i in range(sys.maxunicode) if category(chr(i)).startswith("P")]

P = []
for index, row in tqdm(docs.iterrows(), total=len(docs)):
  if row["title"] is not pd.NA:
    t_chars = [c for c in punctuation_chars if c in row["title"]]
    if len(t_chars) > 0:
      P.extend(t_chars)

  if row["abstract"] is not pd.NA:
    a_chars = [c for c in punctuation_chars if c in row["abstract"]]
    if len(a_chars) > 0:
      P.extend(a_chars)

Counter(P)

In [None]:
c = Counter(P)
for word in list(c):
    if word in string.punctuation + "–—‐“”″„’‘•′·«»§¶":
        del c[word]
c.items()

In [None]:
WordCloud().fit_words(Counter(P)).to_image()

## Preprocessing

Apply the preprocess on the documents and the queries

In [None]:
docs = en_docs  # test only english docs
# docs = not_en_docs

In [None]:
nltk.download("omw-1.4")
# global lists of tokens for wordclouds
titles = []
abstracts = []

docs_preprocessed = docs.copy()
for index, row in tqdm(docs.iterrows(), total=len(docs)):
    if row["title"] is not pd.NA:
      title_tokens = preprocess(row["title"])
      docs_preprocessed.loc[index, "title"] = " ".join(title_tokens)
      titles.extend(title_tokens)

    if row["abstract"] is not pd.NA:
      abstract_tokens = preprocess(row["abstract"])
      docs_preprocessed.loc[index, "abstract"] = " ".join(abstract_tokens)
      abstracts.extend(abstract_tokens)

In [None]:
docs_preprocessed.to_pickle("data/docs_processed.pkl")
docs_preprocessed.to_pickle("data/en_docs_processed.pkl")

In [None]:
queries_preprocessed = queries.copy()
for index, row in tqdm(queries.iterrows(), total=len(queries)):
    queries_preprocessed.loc[index, "title"] = " ".join(preprocess(row["title"]))
    queries_preprocessed.loc[index, "description"] = " ".join(
        preprocess(row["description"])
    )
    queries_preprocessed.loc[index, "narrative"] = " ".join(
        preprocess(row["narrative"])
    )

queries_preprocessed.to_pickle("data/queries_processed.pkl")

## Word Cloud

In [None]:
counter_title = Counter(titles)
counter_abstract = Counter(abstracts)

wc1 = WordCloud(collocations=False, background_color="white", width=600, height=400)
wc2 = WordCloud(collocations=False, background_color="white", width=600, height=400)

wc_titles = wc1.fit_words(counter_title)
wc_abstracts = wc2.fit_words(counter_abstract)

wc_titles.to_file("out/wc_titles.png")
wc_abstracts.to_file("out/wc_abstracts.png")

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

ax1.imshow(wc_titles)
ax1.set_title("Titles", fontsize=20)
ax1.axis("off")

ax2.imshow(wc_abstracts)
ax2.set_title("Abstracts", fontsize=20)
ax2.axis("off")

fig.tight_layout()

## Token Statistics

In [None]:
print("\n[Number of Tokens]")
print("Titles:", len(counter_title.keys()))
print("Abstracts:", len(counter_abstract.keys()))

k = 10
top_k_titles = [word for word, _ in counter_title.most_common(k)]
top_k_abstracts = [word for word, _ in counter_abstract.most_common(k)]

print(f"\n[Top {k} Most Common Tokens]")
print(f"Titles: {', '.join(top_k_titles)}")
print(f"Abstracts: {', '.join(top_k_abstracts)}")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

k = 30

common_titles = dict(counter_title.most_common(k))
x_title = list(common_titles.keys())
y_title = list(common_titles.values())

common_abstracts = dict(counter_abstract.most_common(k))
x_abstract = list(common_abstracts.keys())
y_abstract = list(common_abstracts.values())

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(20, 6))
sns.barplot(x=x_title, y=y_title, ax=ax1)
sns.barplot(x=x_abstract, y=y_abstract, ax=ax2)
ax1.set_xticklabels(ax1.get_xticklabels(), rotation=30)
ax2.set_xticklabels(ax2.get_xticklabels(), rotation=30)
ax1.set_title("Title Tokens")
ax2.set_title("Abstract Tokens")
fig.tight_layout()

fig.savefig("out/most_common_en.png")

# Part 2: Basic Search

## Preprocessing of the query
Eliminate stopwords, remove punctuation. 
You can also perform any other pre-processing step.


* Lowcase
* Remove punctuation
* Remove extra whitespaces
* Normalize covid words
* Stop words
* Lemmatization
* Remove HTML tags
* Remove URLs
* Remove emojis



In [None]:
test_query = [
    "CoVid",
    "covid    19",
    "covid-19  influenza",
    "covid, influenza",
    "a covid flue is going on"
]

check_query = ['covid', 
'covid 19', 
'covid19 influenza', 
'covid influenza', 
'covid flue go'
]


In [None]:
preprocessed_query = []
for query in test_query:
    p = " ".join(preprocess(query, covid_normalization=False))
    preprocessed_query.append(p)
print(preprocessed_query)

In [None]:
assert check_query == preprocessed_query

## Index the documents

Use PyTerrier to index the documents.

You can choose freely the indexing configurations.
You can index either the document’s titles, abstract, or both.

### Libraries

In [None]:
%%capture
%pip install python-terrier

import pyterrier as pt
import pandas as pd
import numpy as np

### Data

In [None]:
docs_preprocessed = pd.read_pickle("data/docs_processed.pkl")

In [None]:
empty_titles = docs_preprocessed["title"] == ""

print(
    "The dataset has {} rows of ID empty".format(
        len(docs_preprocessed[docs_preprocessed["doc_id"] == ""])
    )
)
print(
    "The dataset has {} rows of TITLE empty".format(
        len(docs_preprocessed[empty_titles])
    )
)
print(
    "The dataset has {} rows of DATE empty".format(
        len(docs_preprocessed[docs_preprocessed["date"].isnull()])
    )
)
print(
    "The dataset has {} rows of ABSTRACT empty".format(
        len(docs_preprocessed[docs_preprocessed["abstract"] == ""])
    )
)
print(
    "The dataset has {} rows of DOI empty".format(
        len(docs_preprocessed[docs_preprocessed["doi"] == ""])
    )
)

print("\ndocs with empty title: " + ", ".join(docs_preprocessed[empty_titles]["doc_id"]))

In [None]:
docs_preprocessed["title"].replace({"": "Vuoto"}, inplace=True)
docs_preprocessed["abstract"].replace({"": "Vuoto"}, inplace=True)
docs_preprocessed["doi"].replace({"": "Vuoto"}, inplace=True)
docs_preprocessed["date"].fillna(pd.Timestamp(0, unit="s"), inplace=True)

In [None]:
print("The dataset has {} rows of ID empty".format(len(docs_preprocessed[docs_preprocessed["doc_id"] == ""])))
print(
    "The dataset has {} rows of TITLE empty".format(
        len(docs_preprocessed[docs_preprocessed["title"] == ""])
    )
)
print(
    "The dataset has {} rows of DATE empty".format(
        len(docs_preprocessed[docs_preprocessed["date"].isnull()])
    )
)
print(
    "The dataset has {} rows of ABSTRACT empty".format(
        len(docs_preprocessed[docs_preprocessed["abstract"] == ""])
    )
)
print("The dataset has {} rows of DOI empty".format(len(docs_preprocessed[docs_preprocessed["doi"] == ""])))

In [None]:
np.where(pd.isnull(docs_preprocessed))

In [None]:
num_id = [str(i) for i in range(1, len(docs_preprocessed.index) + 1)]
# print(num_id)
docs_preprocessed.insert(0, "docno", num_id, True)

### Index

In [None]:
# import os
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-1.11.0-openjdk-amd64/"

if not pt.started():
    pt.init()

In [None]:
def create_index(docs_df, mode="both", indexer_path="./pd_index"):

    #num_id = [str(i) for i in range(1, len(docs_df.index) + 1)]
    # print(num_id)
    #docs_df.insert(0, "docno", num_id, True)

    docs_df.rename(columns={'doc_id':'docno'}, inplace=True)

    pd_indexer = pt.DFIndexer(indexer_path, remove_stopwords=False, overwrite=True)

    if mode == "title":
        index_ref = pd_indexer.index(docs_df["title"], docs_df["docno"])
    elif mode == "abstract":
        index_ref = pd_indexer.index(docs_df["abstract"], docs_df["docno"])
    else:  # both or else
        index_ref = pd_indexer.index(
            docs_df["title"], docs_df["abstract"], docs_df["docno"]
        )

    return index_ref


In [None]:
# docs_preprocessed['title'].replace("", pd.NA, inplace=True)
# docs_preprocessed['abstract'].replace("", pd.NA, inplace=True)
# docs_preprocessed['doi'].replace("", pd.NA, inplace=True)
# docs_preprocessed['doc_id'].replace("", pd.NA, inplace=True)
# docs_preprocessed.info()

In [None]:
docs_preprocessed.rename(columns={'doc_id':'docno'}, inplace=True)
pd_indexer = pt.DFIndexer("./index", remove_stopwords=False, overwrite=True)
pd_indexer.index(docs_preprocessed["title"], docs_preprocessed["docno"])

In [None]:
# index the text, record the docnos as metadata
index_ref = create_index(docs_preprocessed, mode="both")

In [None]:
!ls -lh pd_index/

In [None]:
index = pt.IndexFactory.of(index_ref)

In [None]:
print(index.getCollectionStatistics().toString())

i = 0
max = 10
for kv in index.getLexicon():
    i = i + 1
    if i < max:
        print("%s -> %s" % (kv.getKey(), kv.getValue().toString()))
    if i == max:
        print(".....")
        break


In [None]:
def index_model(index, model="TF_IDF"):
    """
    Create and return an indexing model.
    http://terrier.org/docs/current/javadoc/org/terrier/matching/models/package-summary.html
    Model can be such as:
    "TF_IDF", "PL2", "DLH",
    "Hiemstra_LM", "Dirichlet_LM",
    "BM25", etc
    """
    list_model = ["TF_IDF", "PL2", "DLH", "Hiemstra_LM", "Dirichlet_LM", "BM25"]

    indexing_model = pt.BatchRetrieve(index, wmodel=model)
    return indexing_model

In [None]:
model = index_model(index, model="TF_IDF")

https://github.com/terrier-org/terrier-core/blob/5.x/doc/querylanguage.md

In [None]:
# A single query (that I manually provide)
retr = model.search("introduction modern climate change")
display(retr.head(10))

docids = retr[retr["rank"] < 5]["docid"]
for id in docids:
  print("[" + str(id) + "] TITLE: " + docs_preprocessed.iloc[id]["title"])
  print("[" + str(id) + "] ABSTRACT: " + docs_preprocessed.iloc[id]["abstract"][:100] + "...")

In [None]:
print("+woman covid")
retr = model.search("+woman covid")
print(len(retr.index))
display(retr.head(3))

print("+woman covid")
retr = model.search("+woman covid")
print(len(retr.index))
display(retr.head(3))

print("+woman +covid")
retr = model.search("+woman +covid")
print(len(retr.index))
display(retr.head(3))

print("+woman +covid")
retr = model.search("+woman +covid")
print(len(retr.index))
display(retr.head(3))

print("-woman +covid")
retr = model.search("-woman +covid")
print(len(retr.index))
display(retr.head(3))

print("-woman +covid")
retr = model.search("-woman +covid")
print(len(retr.index))
display(retr.head(3))

print("{woman covid}")
retr = model.search("{woman covid}")
print(len(retr.index))
display(retr.head(3))

print("{ woman term2 }")
retr = model.search("{ woman covid }")
print(len(retr.index))
display(retr.head(3))


To do indexing just choose the model to use through the function:

`index_ref = create_index(docs_df, mode="both")`

mode decide if indexing just title, abstract or both.

`index = pt.IndexFactory.of(index_ref)`

`tfidf = index_model(index, model="TF_IDF")`

and then use the function:

`tfidf.search("your query")`


To know more about the query look [here](https://github.com/terrier-org/terrier-core/blob/5.x/doc/querylanguage.md)

To know more about the model you can use look [here](http://terrier.org/docs/current/javadoc/org/terrier/matching/models/package-summary.html). You can use model as "TF_IDF", "PL2", "DLH", "Hiemstra_LM", "Dirichlet_LM", etc.


# Part 3: Advanced Search (Optional)

## Try to expand the ad-hoc queries with the most similar words.

We need to define a function that can calculate the most similar word to another.
To do this we use Word2Vec.

In [1]:
from gensim.models import Word2Vec
import pandas as pd
import nltk

In [2]:
# Load the data pre-processed
docs_preprocessed = pd.read_pickle("data/en_docs_processed.pkl")

In [3]:
print("The dataset has {} rows of ID empty".format(len(docs_preprocessed[docs_preprocessed["doc_id"] == ""])))
print(
    "The dataset has {} rows of TITLE empty".format(
        len(docs_preprocessed[docs_preprocessed["title"] == ""])
    )
)
print(
    "The dataset has {} rows of DATE empty".format(
        len(docs_preprocessed[docs_preprocessed["date"].isnull()])
    )
)
print(
    "The dataset has {} rows of ABSTRACT empty".format(
        len(docs_preprocessed[docs_preprocessed["abstract"] == ""])
    )
)
print("The dataset has {} rows of DOI empty".format(len(docs_preprocessed[docs_preprocessed["doi"] == ""])))

The dataset has 0 rows of ID empty
The dataset has 4 rows of TITLE empty
The dataset has 17 rows of DATE empty
The dataset has 0 rows of ABSTRACT empty
The dataset has 0 rows of DOI empty


In [4]:
# We don't need all the columns
docs_preprocessed.drop(['doc_id','date', 'doi'], axis=1, inplace=True)

In [5]:
print( docs_preprocessed.columns )
print( docs_preprocessed.head(5) )

Index(['title', 'abstract'], dtype='object')
                                                  title  \
3705  genetic control mouse hdl proteome defines hdl...   
3747  potent antiviral activity carbohydratespecific...   
4403  pioneer experience uniportal videoassisted tho...   
4456             note editor novel coronavirus 2019ncov   
4532  epidemiological clinical characteristic health...   

                                               abstract  
3705  hdl nanoparticles 80 associated protein phosph...  
3747  brazil one large biodiversity world search new...  
4403  optimal way treat severe thoracic scoliosis re...  
4456                                               <NA>  
4532  background analyze result 3year surveillance s...  


In [6]:
docs_preprocessed_abstract = docs_preprocessed[docs_preprocessed['abstract'].notna()]
docs_preprocessed_title = docs_preprocessed[docs_preprocessed['title'].notna()]

print( len(docs_preprocessed_abstract) )
print( len(docs_preprocessed_title) )

print( len(docs_preprocessed) )

49631
81823
81823


In [7]:
docs_preprocessed_abstract["abstract_data"] = docs_preprocessed_abstract.abstract
docs_preprocessed_abstract.abstract_data = docs_preprocessed_abstract.abstract_data.apply(lambda input_text : [t.split() for t in  nltk.sent_tokenize(input_text)])

print( len(docs_preprocessed_abstract) )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  docs_preprocessed_abstract["abstract_data"] = docs_preprocessed_abstract.abstract


49631


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [8]:
docs_preprocessed_title["title_data"] = docs_preprocessed_title.title
docs_preprocessed_title.title_data = docs_preprocessed_title.title_data.apply(lambda input_text : [t.split() for t in  nltk.sent_tokenize(input_text)])

print( len(docs_preprocessed_title) )

81823


In [9]:
# is a list of all the sentences in the abstracts
abstract_data = docs_preprocessed_abstract.abstract_data.sum()
title_data = docs_preprocessed_title.title_data.sum()

In [10]:
data = abstract_data + title_data

In [11]:
# N.B.
develop = 0
developed = 0
protein = 0
proteins = 0

for phrase in data:
    for word in phrase:
        if word == 'develop':
            develop += 1
        if word == 'developed':
            developed += 1 
        if word == 'protein':
            protein += 1
        if word == 'proteins':
            proteins += 1  

print(f"{develop} develop")
print(f"{developed} developed")
print(f"{protein} protein")
print(f"{proteins} proteins")

9447 develop
320 developed
8155 protein
423 proteins


In [12]:
print( type(data) )
print( type(data[0]) )
print( type(data[0][0]) )
print( data[0][:20] )

<class 'list'>
<class 'list'>
<class 'str'>
['hdl', 'nanoparticles', '80', 'associated', 'protein', 'phospholipid', 'cholesterol', 'cholesteryl', 'ester', 'potential', 'inverse', 'relation', 'hdl', 'coronary', 'artery', 'disease', 'cad', 'effect', 'hdl', 'myriad']


In [13]:
model = Word2Vec(sentences=data, window=10, sg=1, seed=1) # sg=1 for skipgram ; sg=0 for CBOW

In [14]:
# Save the model
model.save("data/word2vec.model")
# We can load the model like that:
# model = Word2Vec.load("data/word2vec.model")

In [15]:
model.wv.most_similar('regulation', topn=1)

[('complies', 0.6962115168571472)]

In [16]:
from my_utils import most_similar, query_similar_words
mv_model = Word2Vec.load("data/word2vec.model")

In [25]:
assert most_similar('doggy doggy', mv_model) == None
assert most_similar('italy', mv_model) == 'lombardy'
assert most_similar('doggy doggy') == None
assert most_similar('italy') == 'lombardy'
assert most_similar('acute') == 'syndrome'

In [26]:
query_similar_words('coronavirus')

'coronavirus 2019'

# NOTE

prof: importate la precision. l'obiettivo é riportare la risposta migliore
