# Part 3: Validation

In [2]:
!pip install python-terrier

import pyterrier as pt
from pyterrier.measures import *
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity



In [3]:
if not pt.started():
    pt.init()

  warn("From PyTerrier 0.8, Python 3.7 will be required, you currently have %s" % platform.python_version())


PyTerrier 0.7.2 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)


In [4]:
def create_index(docs_df, mode="both", indexer_path="./pd_index"):

    pd_indexer = pt.DFIndexer(indexer_path, remove_stopwords=False, overwrite=True)

    if mode == "title":
        index_ref = pd_indexer.index(docs_df["title"], docs_df["docno"])
    elif mode == "abstract":
        index_ref = pd_indexer.index(docs_df["abstract"], docs_df["docno"])
    else:  # both or else
        index_ref = pd_indexer.index(
            docs_df["title"], docs_df["abstract"], docs_df["docno"]
        )

    return index_ref

In [5]:
def index_model(index, model="TF_IDF"):
    """
    Create and return an indexing model.
    http://terrier.org/docs/current/javadoc/org/terrier/matching/models/package-summary.html
    Model can be such as:
    "TF_IDF", "PL2", "DLH",
    "Hiemstra_LM", "Dirichlet_LM",
    "BM25", etc
    """
    list_model = ["TF_IDF", "PL2", "DLH", "Hiemstra_LM", "Dirichlet_LM", "BM25"]

    indexing_model = pt.BatchRetrieve(index, wmodel=model)
    return indexing_model

In [6]:
# preprocessing: ...
# topics: {title, description, narrative}
# indexing: {title, abstract, both}
# models: tf_idf, bm25, dirichlet_lm
# similar queries: ...

# read preprocessed documents
docs = pd.read_pickle('./data/docs_processed.pkl')
docs.rename(columns={"doc_id":"docno"}, inplace=True)

# index documents
index_ref_docs = create_index(docs, mode="abstract")
index_docs = pt.IndexFactory.of(index_ref_docs)

tfidf = index_model(index_docs, model="TF_IDF")
bm25 = index_model(index_docs, model="BM25")
dirichlet_lm = index_model(index_docs, model="DirichletLM")

19:10:37.201 [main] WARN org.terrier.structures.indexing.Indexer - Indexed 34085 empty documents
19:10:37.304 [main] ERROR org.terrier.structures.indexing.Indexer - Could not finish MetaIndexBuilder: 
java.io.IOException: Key jquazfc8 is not unique: 35598,5061
For MetaIndex, to suppress, set metaindex.compressed.reverse.allow.duplicates=true
	at org.terrier.structures.collections.FSOrderedMapFile$MultiFSOMapWriter.mergeTwo(FSOrderedMapFile.java:1374)
	at org.terrier.structures.collections.FSOrderedMapFile$MultiFSOMapWriter.close(FSOrderedMapFile.java:1308)
	at org.terrier.structures.indexing.BaseMetaIndexBuilder.close(BaseMetaIndexBuilder.java:321)
	at org.terrier.structures.indexing.classical.BasicIndexer.createDirectIndex(BasicIndexer.java:346)
	at org.terrier.structures.indexing.Indexer.index(Indexer.java:369)


In [7]:
# read preprocessed queries
queries = pd.read_pickle('./data/queries_processed.pkl')
display(queries.head(5))

pq_title = queries[['query_id', 'title']]
pq_title = pq_title.rename(columns={'query_id':'qid', 'title':'query'})
display(pq_title.head(5))

pq_desc = queries[['query_id', 'description']]
pq_desc = pq_desc.rename(columns={'query_id':'qid', 'description':'query'})
display(pq_desc.head(5))

pq_narr = queries[['query_id', 'narrative']]
pq_narr = pq_narr.rename(columns={'query_id':'qid', 'narrative':'query'})
display(pq_narr.head(5))

Unnamed: 0,query_id,title,description,narrative
0,1,coronavirus origin,origin covid19,seek range information sarscov2 virus origin i...
1,2,coronavirus response weather change,coronavirus respond change weather,seek range information sarscov2 virus viabilit...
2,3,coronavirus immunity,sarscov2 infect people develop immunity cross ...,seek study immunity develop due infection sars...
3,4,people die coronavirus,cause death covid19,study look mechanism death covid19
4,5,animal model covid19,drug active sarscov sarscov2 animal study,paper describe result test drug bind spike pro...


Unnamed: 0,qid,query
0,1,coronavirus origin
1,2,coronavirus response weather change
2,3,coronavirus immunity
3,4,people die coronavirus
4,5,animal model covid19


Unnamed: 0,qid,query
0,1,origin covid19
1,2,coronavirus respond change weather
2,3,sarscov2 infect people develop immunity cross ...
3,4,cause death covid19
4,5,drug active sarscov sarscov2 animal study


Unnamed: 0,qid,query
0,1,seek range information sarscov2 virus origin i...
1,2,seek range information sarscov2 virus viabilit...
2,3,seek study immunity develop due infection sars...
3,4,study look mechanism death covid19
4,5,paper describe result test drug bind spike pro...


In [8]:
cord19 = pt.get_dataset("irds:cord19/trec-covid")

exp1 = pt.Experiment(
    [tfidf, bm25, dirichlet_lm],
    pq_title,
    cord19.get_qrels(),
    eval_metrics=[P@5, P@10, nDCG@10, RR(rel=2)],
    names=["TF_IDF", "BM25", "DirichletLM"]
)
display(exp1)

exp2 = pt.Experiment(
    [tfidf, bm25, dirichlet_lm],
    pq_desc,
    cord19.get_qrels(),
    eval_metrics=[P@5, P@10, nDCG@10, RR(rel=2)],
    names=["TF_IDF", "BM25", "DirichletLM"]
)
display(exp2)

exp3 = pt.Experiment(
    [tfidf, bm25, dirichlet_lm],
    pq_narr,
    cord19.get_qrels(),
    eval_metrics=[P@5, P@10, nDCG@10, RR(rel=2)],
    names=["TF_IDF", "BM25", "DirichletLM"]
)
display(exp3)

Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.68,0.684,0.610636,0.634274
1,BM25,0.72,0.7,0.626108,0.650905
2,DirichletLM,0.648,0.676,0.586329,0.589279


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.764,0.72,0.662809,0.76321
1,BM25,0.744,0.72,0.657527,0.742917
2,DirichletLM,0.668,0.664,0.608927,0.695802


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.604,0.614,0.543766,0.605863
1,BM25,0.608,0.604,0.540071,0.606204
2,DirichletLM,0.484,0.502,0.439649,0.499229


In [9]:
"""Write a function that measures the pairwise similarity of the provided 
queries. Create a list of similar queries and a word-cloud of their qids."""

def cosine_sim(queries):
    vectorizer = TfidfVectorizer(use_idf=True)
    vectors = vectorizer.fit_transform(queries['title'])
    return cosine_similarity(vectors)

z = cosine_sim(queries)