# Part 3: Validation

In [3]:
%%capture
%pip install python-terrier
%pip install gensim

import pyterrier as pt
from pyterrier.measures import *
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [4]:
import os
# Tell the Java bridge where the JDK lives; this is assigned before pt.init()
# is called below.
# NOTE(review): hard-coded, machine-specific path (OpenJDK 11 on a Debian-style
# layout) — adjust when running elsewhere.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-1.11.0-openjdk-amd64/"

# Only initialise PyTerrier when it has not been started yet, so re-running
# this cell in a live kernel does not call pt.init() a second time.
if not pt.started():
    pt.init()

PyTerrier 0.7.2 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)


No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [5]:
def create_index(docs_df, mode="both", indexer_path="./pd_index"):
    """Build a Terrier index from a DataFrame of documents.

    Parameters
    ----------
    docs_df : pandas.DataFrame
        Must contain a ``docno`` column plus the text column(s) required by
        ``mode`` (``title`` and/or ``abstract``).
    mode : str
        ``"title"`` indexes the titles only, ``"abstract"`` indexes the
        abstracts only. Any other value (including the default ``"both"``)
        indexes the title and the abstract of each document together.
    indexer_path : str
        Directory in which the index is written; an existing index at that
        location is overwritten.

    Returns
    -------
    The index reference returned by ``DFIndexer.index``.
    """
    pd_indexer = pt.DFIndexer(indexer_path, remove_stopwords=False, overwrite=True)
    # Empty term pipeline: the text is indexed as given, without Terrier's own
    # stopword removal or stemming.
    pd_indexer.setProperty("termpipelines", "")
    pd_indexer.setProperty("metaindex.compressed.reverse.allow.duplicates", "true")

    if mode == "title":
        text = docs_df["title"]
    elif mode == "abstract":
        text = docs_df["abstract"]
    else:  # both or else
        # DFIndexer.index() only indexes its first argument; every further
        # positional Series is stored as metadata. Passing the abstract as a
        # second argument therefore left it out of the index, so the two
        # fields are concatenated into a single text column instead.
        # Missing values are treated as empty text so that one absent field
        # does not wipe out the other.
        text = (
            docs_df["title"].fillna("") + " " + docs_df["abstract"].fillna("")
        ).rename("text")

    return pd_indexer.index(text, docs_df["docno"])

In [6]:
def index_model(index, model="TF_IDF"):
    """
    Create and return a PyTerrier retriever for ``index``.

    ``model`` is forwarded unchanged as the ``wmodel`` argument of
    ``pt.BatchRetrieve``. It names a Terrier weighting model, see
    http://terrier.org/docs/current/javadoc/org/terrier/matching/models/package-summary.html
    Model can be such as:
    "TF_IDF", "PL2", "DLH",
    "Hiemstra_LM", "DirichletLM",
    "BM25", etc

    The name is not validated here; an unknown name is reported by PyTerrier.
    """
    return pt.BatchRetrieve(index, wmodel=model)

In [7]:
# preprocessing: ...
# topics: {title, description, narrative}
# indexing: {title, abstract, both}
# models: {tf_idf, bm25, dirichlet_lm}
# similar queries: ...

# Load the preprocessed documents and expose the id column under the name
# "docno", which is the column create_index() reads.
docs = pd.read_pickle('./data/en_docs_processed.pkl').rename(
    columns={"doc_id": "docno"}
)

# Build the title-only index and open it.
index_ref_docs_title = create_index(
    docs,
    indexer_path="./indexes/title",
    mode="title",
)
index_docs_title = pt.IndexFactory.of(index_ref_docs_title)

In [None]:
# Build the abstract-only index under ./indexes/abstract and open it; both
# names are used by the abstract experiments further down.
index_ref_docs_abstract = create_index(docs, mode="abstract", indexer_path="./indexes/abstract")
index_docs_abstract = pt.IndexFactory.of(index_ref_docs_abstract)

In [None]:
# Build the combined index under ./indexes/both and open it.
index_ref_docs_both = create_index(docs, mode="both", indexer_path="./indexes/both")
# Open the reference created on the line above. The previous version opened
# index_ref_docs_title here, so every "both" experiment silently ran against
# the title-only index.
index_docs_both = pt.IndexFactory.of(index_ref_docs_both)

In [8]:
# Load the preprocessed queries.
queries = pd.read_pickle('./data/queries_processed.pkl')

# One topics frame per query field, each reduced to the two columns
# "qid" and "query".
pq_title, pq_desc, pq_narr = (
    queries[['query_id', field]].rename(columns={'query_id': 'qid', field: 'query'})
    for field in ('title', 'description', 'narrative')
)

# Preview the first five rows of each frame, in title/description/narrative order.
for topic_frame in (pq_title, pq_desc, pq_narr):
    display(topic_frame.head(5))

Unnamed: 0,qid,query
0,1,coronavirus origin
1,2,coronavirus response weather change
2,3,coronavirus immunity
3,4,people die coronavirus
4,5,animal model covid19


Unnamed: 0,qid,query
0,1,origin covid19
1,2,coronavirus respond change weather
2,3,sarscov2 infect people develop immunity cross ...
3,4,cause death covid19
4,5,drug active sarscov sarscov2 animal study


Unnamed: 0,qid,query
0,1,seek range information sarscov2 virus origin i...
1,2,seek range information sarscov2 virus viabilit...
2,3,seek study immunity develop due infection sars...
3,4,study look mechanism death covid19
4,5,paper describe result test drug bind spike pro...


In [9]:
cord19 = pt.get_dataset("irds:cord19/trec-covid")
# Fetch the relevance judgements once instead of once per experiment.
qrels = cord19.get_qrels()

# Retrievers over the title-only index, one per weighting model.
model_names = ["TF_IDF", "BM25", "DirichletLM"]
tfidf, bm25, dirichlet_lm = (
    index_model(index_docs_title, model=name) for name in model_names
)

# Run the same evaluation for each query field (title, description,
# narrative) and show each result table as soon as it is available.
experiment_tables = []
for topics in (pq_title, pq_desc, pq_narr):
    table = pt.Experiment(
        [tfidf, bm25, dirichlet_lm],
        topics,
        qrels,
        eval_metrics=[P@5, P@10, nDCG@10, RR(rel=2)],
        names=model_names,
    )
    display(table)
    experiment_tables.append(table)

# Keep the original names: exp1 = title, exp2 = description, exp3 = narrative.
exp1, exp2, exp3 = experiment_tables

Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.568,0.56,0.519932,0.622459
1,BM25,0.576,0.546,0.509432,0.613922
2,DirichletLM,0.54,0.508,0.468562,0.551351


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.632,0.546,0.535308,0.671064
1,BM25,0.64,0.548,0.52152,0.611938
2,DirichletLM,0.44,0.428,0.396092,0.488072


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.468,0.418,0.383628,0.496482
1,BM25,0.448,0.432,0.386476,0.477315
2,DirichletLM,0.252,0.226,0.213848,0.337682


In [None]:
# Fetch the relevance judgements once instead of once per experiment
# (cord19 is defined in the title-index experiment cell).
qrels = cord19.get_qrels()

# Retrievers over the abstract-only index, one per weighting model.
model_names = ["TF_IDF", "BM25", "DirichletLM"]
tfidf, bm25, dirichlet_lm = (
    index_model(index_docs_abstract, model=name) for name in model_names
)

# Run the same evaluation for each query field (title, description,
# narrative) and show each result table as soon as it is available.
experiment_tables = []
for topics in (pq_title, pq_desc, pq_narr):
    table = pt.Experiment(
        [tfidf, bm25, dirichlet_lm],
        topics,
        qrels,
        eval_metrics=[P@5, P@10, nDCG@10, RR(rel=2)],
        names=model_names,
    )
    display(table)
    experiment_tables.append(table)

# Keep the original names: exp1 = title, exp2 = description, exp3 = narrative.
exp1, exp2, exp3 = experiment_tables

In [None]:
# Fetch the relevance judgements once instead of once per experiment
# (cord19 is defined in the title-index experiment cell).
qrels = cord19.get_qrels()

# Retrievers over the combined index, one per weighting model.
model_names = ["TF_IDF", "BM25", "DirichletLM"]
tfidf, bm25, dirichlet_lm = (
    index_model(index_docs_both, model=name) for name in model_names
)

# Run the same evaluation for each query field (title, description,
# narrative) and show each result table as soon as it is available.
experiment_tables = []
for topics in (pq_title, pq_desc, pq_narr):
    table = pt.Experiment(
        [tfidf, bm25, dirichlet_lm],
        topics,
        qrels,
        eval_metrics=[P@5, P@10, nDCG@10, RR(rel=2)],
        names=model_names,
    )
    display(table)
    experiment_tables.append(table)

# Keep the original names: exp1 = title, exp2 = description, exp3 = narrative.
exp1, exp2, exp3 = experiment_tables

In [None]:
def plot_dendrogram(model, **kwargs):
    """Draw a dendrogram for a fitted hierarchical clustering model.

    ``model`` must expose ``children_``, ``distances_`` and ``labels_``.
    A linkage matrix with the columns (child a, child b, merge distance,
    number of samples in the merged node) is assembled from them and handed
    to ``scipy.cluster.hierarchy.dendrogram``; ``kwargs`` are forwarded to
    that call. Nothing is returned.
    """
    merges = model.children_
    leaf_total = len(model.labels_)

    # subtree_sizes[k] is the number of original samples below merge k.
    # Indices below leaf_total are single samples; larger indices refer to the
    # earlier merge number (index - leaf_total).
    subtree_sizes = np.zeros(merges.shape[0])
    for row, pair in enumerate(merges):
        subtree_sizes[row] = sum(
            1 if node < leaf_total else subtree_sizes[node - leaf_total]
            for node in pair
        )

    linkage_matrix = np.column_stack(
        [merges, model.distances_, subtree_sizes]
    ).astype(float)

    dendrogram(linkage_matrix, **kwargs)

In [None]:
"""Write a function that measures the pairwise similarity of the provided 
queries. Create a list of similar queries and a word-cloud of their qids."""

# Turn each query description into a dense TF-IDF vector (one row per query).
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(queries['description']).toarray()

# Average-linkage agglomerative clustering on cosine distance. With
# n_clusters=None the number of clusters is not fixed up front; merging stops
# at distance_threshold instead.
# NOTE(review): the `affinity` keyword was renamed to `metric` in
# scikit-learn 1.2 and removed in 1.4 — confirm the installed version.
cluster = AgglomerativeClustering(n_clusters=None, affinity='cosine', linkage='average', distance_threshold=0.9)
labels = cluster.fit_predict(vectors)

In [None]:
def compute_freqs(queries, labels):
    """Map every query id to the size of the cluster it belongs to.

    Parameters
    ----------
    queries : pandas.DataFrame
        Must contain a ``query_id`` column; row ``k`` corresponds to
        ``labels[k]``.
    labels : array-like of int
        Cluster label of each row of ``queries``.

    Returns
    -------
    dict
        ``{query_id: number of queries sharing its cluster}``. Empty when
        ``labels`` is empty.
    """
    labels = np.asarray(labels)
    freqs = {}
    # Visit every label that actually occurs. The previous
    # range(max(labels)) stopped one short and dropped the highest-numbered
    # cluster (and raised on empty input).
    for cluster_id in np.unique(labels):
        similar_queries = queries.loc[labels == cluster_id, 'query_id'].values
        for qid in similar_queries:
            freqs[qid] = len(similar_queries)

    return freqs

In [None]:
import wordcloud
# Word cloud of query ids, using the values from compute_freqs() (the size of
# each query's cluster) as word frequencies. The image is the cell's output.
wordcloud.WordCloud().fit_words(compute_freqs(queries, labels)).to_image()

In [None]:
# Train a Doc2Vec model on the query descriptions (whitespace-tokenised, each
# tagged with its row position) and infer one embedding per description.
# NOTE(review): seed=1 alone may not make training fully repeatable; gensim
# documents that a single worker thread is also required — confirm if exact
# reproducibility matters.
tagged_data = [TaggedDocument(words=doc.split(), tags=[str(i)]) for i, doc in enumerate(queries['description'])]
doc_model = Doc2Vec(tagged_data, window=5, seed=1)
d2v_doc_embeddings = np.array([doc_model.infer_vector(doc.split()) for doc in queries['description']])
print(d2v_doc_embeddings.shape)

# Average-linkage agglomerative clustering on cosine distance; the number of
# clusters is determined by distance_threshold.
cluster = AgglomerativeClustering(n_clusters=None, affinity='cosine', linkage='average', distance_threshold=0.8)
labels = cluster.fit_predict(d2v_doc_embeddings)

# Labels run from 0 to labels.max() inclusive; the previous
# range(max(labels)) never printed the last cluster.
for i in range(int(labels.max()) + 1):
    print(f"cluster {i:2} {queries.loc[labels==i]['description'].values}")

In [None]:
# Same word cloud as before, built from whatever `labels` currently holds
# (presumably the Doc2Vec clustering from the preceding cell — depends on
# execution order).
wordcloud.WordCloud().fit_words(compute_freqs(queries, labels)).to_image()