# Part 2: Basic Search

In [1]:
import os
import pyterrier as pt
import pandas as pd
from my_utils import preprocess_queries, create_index, experiment

# Widen column display so long query/narrative strings are not truncated.
# The fully-qualified option key is used: abbreviated keys are resolved by
# pattern matching and can become ambiguous when pandas adds new options.
pd.set_option('display.max_colwidth', 800)

In [2]:
# JAVA_HOME is set before PyTerrier is initialised. The OpenJDK 11 location
# below is machine-specific, so it is only applied when that directory
# actually exists; on any other machine the JAVA_HOME already present in the
# environment (if any) is left untouched instead of being overwritten with a
# path that is not there.
JAVA_HOME_CANDIDATE = "/usr/lib/jvm/java-1.11.0-openjdk-amd64/"
if os.path.isdir(JAVA_HOME_CANDIDATE):
    os.environ["JAVA_HOME"] = JAVA_HOME_CANDIDATE

# initialise PyTerrier once per kernel; re-running this cell is a no-op
if not pt.started():
    pt.init()

PyTerrier 0.8.0 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


## Experiments

- preprocessing: {all languages, only English}
- topics: {title, description, narrative}
- indexing: {title, abstract, both}
- models: {tf_idf, bm25, dirichlet_lm}

In [3]:
# read preprocessed en documents and expose the id column as "docno"
en_docs = pd.read_pickle('./data/en_docs_processed.pkl').rename(
    columns={"doc_id": "docno"}
)

# read preprocessed documents, with the same column renaming
docs = pd.read_pickle('./data/docs_processed.pkl').rename(
    columns={"doc_id": "docno"}
)

In [4]:
# row counts before de-duplication by docno: (docs, en_docs)
len(docs), len(en_docs)

(84195, 81823)

In [5]:
# check if any docno dated after the last TREC-COVID release (2020-07-16)
# occurs more than once, i.e. has several versions
docs.loc[docs['date'] > '2020-07-16', 'docno'].value_counts().gt(1).any()

False

In [6]:
def keep_recent_versions(df):
    """Return one row per ``docno``, keeping the row with the latest ``date``.

    The input frame is left untouched: sorting and de-duplication are done on
    new frames rather than in place, so re-running a cell that calls this
    function (or keeping a reference to the raw frame) is safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``date`` and ``docno``.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` ordered by ``date`` descending, with only the first
        (most recent) row kept for each ``docno``. Original index labels are
        preserved.
    """
    # A stable sort keeps rows that share the same date in their original
    # relative order, so the row chosen among same-date duplicates is
    # well-defined rather than depending on the sort algorithm.
    newest_first = df.sort_values(by="date", ascending=False, kind="mergesort")
    return newest_first.drop_duplicates(subset=["docno"])


# reduce both corpora to a single row per docno (the one with the latest date)
docs = keep_recent_versions(docs)
en_docs = keep_recent_versions(en_docs)

In [7]:
# row counts after keeping one row per docno: (docs, en_docs)
len(docs), len(en_docs)

(83159, 80847)

In [8]:
# NOTE(review): this import would conventionally sit in the first cell with
# the other imports, so all dependencies are visible in one place.
import ir_datasets
# handle to the "cord19/trec-covid" dataset provided by ir_datasets
dataset = ir_datasets.load("cord19/trec-covid")

# load queries: one DataFrame row per item yielded by queries_iter()
queries = pd.DataFrame(dataset.queries_iter())

In [9]:
# load qrels through PyTerrier's dataset interface
cord19 = pt.get_dataset("irds:cord19/trec-covid")
qrels = cord19.get_qrels()
# drop every judgment whose label equals -1
qrels = qrels[qrels["label"].ne(-1)]

In [10]:
def _topics_for(queries_df, field):
    """Build a two-column topics frame from one field of the queries.

    Selects ``query_id`` and ``field`` from ``queries_df`` and renames them to
    ``qid`` and ``query``.
    """
    return queries_df[['query_id', field]].rename(
        columns={'query_id': 'qid', field: 'query'}
    )


# preprocess queries by (title, description, narrative)
queries = preprocess_queries(queries, augmented=False, mv_model=None)
display(queries.head(5))

# one topics frame per query field, each previewed right after it is built
pq_title = _topics_for(queries, 'title')
display(pq_title.head(5))

pq_desc = _topics_for(queries, 'description')
display(pq_desc.head(5))

pq_narr = _topics_for(queries, 'narrative')
display(pq_narr.head(5))

  0%|          | 0/50 [00:00<?, ?it/s]

Unnamed: 0,query_id,title,description,narrative
0,1,coronavirus origin,origin covid19,seek range information sarscov2 virus origin include evolution animal source first transmission human
1,2,coronavirus response weather change,coronavirus respond change weather,seek range information sarscov2 virus viability different weatherclimate condition well information relate transmission virus different climate condition
2,3,coronavirus immunity,sarscov2 infect people develop immunity cross protection possible,seek study immunity develop due infection sarscov2 cross protection gain due infection coronavirus type
3,4,people die coronavirus,cause death covid19,study look mechanism death covid19
4,5,animal model covid19,drug active sarscov sarscov2 animal study,paper describe result test drug bind spike protein virus drug animal model paper sarscov2 infection cell culture assay also relevant


Unnamed: 0,qid,query
0,1,coronavirus origin
1,2,coronavirus response weather change
2,3,coronavirus immunity
3,4,people die coronavirus
4,5,animal model covid19


Unnamed: 0,qid,query
0,1,origin covid19
1,2,coronavirus respond change weather
2,3,sarscov2 infect people develop immunity cross protection possible
3,4,cause death covid19
4,5,drug active sarscov sarscov2 animal study


Unnamed: 0,qid,query
0,1,seek range information sarscov2 virus origin include evolution animal source first transmission human
1,2,seek range information sarscov2 virus viability different weatherclimate condition well information relate transmission virus different climate condition
2,3,seek study immunity develop due infection sarscov2 cross protection gain due infection coronavirus type
3,4,study look mechanism death covid19
4,5,paper describe result test drug bind spike protein virus drug animal model paper sarscov2 infection cell culture assay also relevant


### Index All Documents and Experiments

In [None]:
%%capture

# index documents by (title, abstract, both)
# Each create_index call (from my_utils) is given its own directory under
# ./indexes/all/, and its return value is passed to pt.IndexFactory.of to
# obtain the index object later handed to experiment(...).
# NOTE(review): whether create_index overwrites or reuses an index that
# already exists at indexer_path is not visible here — confirm before
# re-running this cell.
index_ref_docs_title = create_index(docs, mode="title", indexer_path="./indexes/all/title")
index_docs_title = pt.IndexFactory.of(index_ref_docs_title)

index_ref_docs_abstract = create_index(docs, mode="abstract", indexer_path="./indexes/all/abstract")
index_docs_abstract = pt.IndexFactory.of(index_ref_docs_abstract)

index_ref_docs_both = create_index(docs, mode="both", indexer_path="./indexes/all/both")
index_docs_both = pt.IndexFactory.of(index_ref_docs_both)

In [12]:
# Run every (query field x indexed field) combination on the full corpus.
# For each query field (title, description, narrative) the title, abstract
# and combined indexes are evaluated in turn, and each result of
# `experiment` (from my_utils) is displayed as soon as it is available.
all_docs_indexes = {
    "title": index_docs_title,
    "abstract": index_docs_abstract,
    "both": index_docs_both,
}
topic_sets = {
    "title": pq_title,
    "description": pq_desc,
    "narrative": pq_narr,
}

# Results keyed by (query field, indexed field). Unlike exp1..exp9, which the
# English-only experiments further down assign again, this dict keeps the
# full-corpus results addressable for later comparison.
all_docs_results = {}
for query_field, topics in topic_sets.items():
    for indexed_field, index in all_docs_indexes.items():
        result = experiment(index, topics, qrels)
        display(result)
        all_docs_results[(query_field, indexed_field)] = result

# exp1..exp9 keep their previous meaning and order:
# exp1-3 -> title queries, exp4-6 -> description queries,
# exp7-9 -> narrative queries; within each group title/abstract/both index.
exp1, exp2, exp3, exp4, exp5, exp6, exp7, exp8, exp9 = all_docs_results.values()

Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.572,0.564,0.523648,0.622441
1,BM25,0.576,0.548,0.511069,0.615557
2,DirichletLM,0.524,0.502,0.460846,0.547692


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.732,0.692,0.634998,0.724053
1,BM25,0.664,0.646,0.589287,0.695181
2,DirichletLM,0.672,0.664,0.590112,0.590859


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.62,0.58,0.548256,0.656998
1,BM25,0.628,0.574,0.540893,0.645207
2,DirichletLM,0.564,0.54,0.495286,0.599223


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.632,0.546,0.535381,0.670968
1,BM25,0.636,0.548,0.52081,0.610791
2,DirichletLM,0.44,0.426,0.394819,0.488832


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.736,0.7,0.644396,0.784202
1,BM25,0.68,0.642,0.567955,0.633944
2,DirichletLM,0.692,0.658,0.607796,0.675928


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.648,0.576,0.554443,0.67553
1,BM25,0.656,0.564,0.545712,0.673496
2,DirichletLM,0.496,0.458,0.417476,0.511969


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.456,0.418,0.378497,0.472017
1,BM25,0.448,0.43,0.383058,0.474036
2,DirichletLM,0.256,0.224,0.214163,0.342773


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.584,0.574,0.519034,0.602448
1,BM25,0.552,0.548,0.49388,0.614167
2,DirichletLM,0.432,0.432,0.384271,0.506665


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.48,0.426,0.399389,0.528927
1,BM25,0.476,0.432,0.404772,0.535345
2,DirichletLM,0.244,0.208,0.202026,0.322534


### Index English Documents and Experiments

In [None]:
%%capture

# index en-documents by (title, abstract, both)
# Each create_index call (from my_utils) is given its own directory under
# ./indexes/en/, and its return value is passed to pt.IndexFactory.of to
# obtain the index object later handed to experiment(...).
# NOTE(review): whether create_index overwrites or reuses an index that
# already exists at indexer_path is not visible here — confirm before
# re-running this cell.
index_ref_en_docs_title = create_index(en_docs, mode="title", indexer_path="./indexes/en/title")
index_en_docs_title = pt.IndexFactory.of(index_ref_en_docs_title)

index_ref_en_docs_abstract = create_index(en_docs, mode="abstract", indexer_path="./indexes/en/abstract")
index_en_docs_abstract = pt.IndexFactory.of(index_ref_en_docs_abstract)

index_ref_en_docs_both = create_index(en_docs, mode="both", indexer_path="./indexes/en/both")
index_en_docs_both = pt.IndexFactory.of(index_ref_en_docs_both)

In [16]:
# Run every (query field x indexed field) combination on the English-only
# corpus. For each query field (title, description, narrative) the title,
# abstract and combined indexes are evaluated in turn, and each result of
# `experiment` (from my_utils) is displayed as soon as it is available.
en_docs_indexes = {
    "title": index_en_docs_title,
    "abstract": index_en_docs_abstract,
    "both": index_en_docs_both,
}
topic_sets = {
    "title": pq_title,
    "description": pq_desc,
    "narrative": pq_narr,
}

# Results keyed by (query field, indexed field), kept under a name specific
# to the English-only runs so they can be compared with other runs even
# though exp1..exp9 are assigned by more than one cell in this notebook.
en_docs_results = {}
for query_field, topics in topic_sets.items():
    for indexed_field, index in en_docs_indexes.items():
        result = experiment(index, topics, qrels)
        display(result)
        en_docs_results[(query_field, indexed_field)] = result

# exp1..exp9 keep their previous meaning and order:
# exp1-3 -> title queries, exp4-6 -> description queries,
# exp7-9 -> narrative queries; within each group title/abstract/both index.
exp1, exp2, exp3, exp4, exp5, exp6, exp7, exp8, exp9 = en_docs_results.values()

Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.572,0.562,0.521739,0.622446
1,BM25,0.576,0.55,0.512641,0.615589
2,DirichletLM,0.54,0.508,0.468562,0.55133


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.732,0.688,0.633095,0.724063
1,BM25,0.668,0.648,0.589952,0.700447
2,DirichletLM,0.676,0.67,0.594187,0.591624


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.624,0.586,0.552496,0.657
1,BM25,0.632,0.578,0.54377,0.645228
2,DirichletLM,0.564,0.54,0.495286,0.599225


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.632,0.546,0.535381,0.671064
1,BM25,0.636,0.548,0.52096,0.611938
2,DirichletLM,0.44,0.428,0.395899,0.489001


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.736,0.7,0.644823,0.784202
1,BM25,0.676,0.64,0.5662,0.633944
2,DirichletLM,0.696,0.66,0.610008,0.685928


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.648,0.576,0.554599,0.67553
1,BM25,0.652,0.564,0.545502,0.673774
2,DirichletLM,0.496,0.458,0.417565,0.511969


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.468,0.418,0.381715,0.486482
1,BM25,0.448,0.432,0.386476,0.477303
2,DirichletLM,0.248,0.226,0.213624,0.337663


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.584,0.574,0.519124,0.602461
1,BM25,0.548,0.548,0.493708,0.614169
2,DirichletLM,0.436,0.436,0.387638,0.506611


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.48,0.426,0.399389,0.528925
1,BM25,0.48,0.432,0.405134,0.535337
2,DirichletLM,0.244,0.206,0.200178,0.322545


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.572,0.562,0.521739,0.622446
1,BM25,0.576,0.55,0.512641,0.615589
2,DirichletLM,0.54,0.508,0.468562,0.55133


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.732,0.688,0.633095,0.724063
1,BM25,0.668,0.648,0.589952,0.700447
2,DirichletLM,0.676,0.67,0.594187,0.591624


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.624,0.586,0.552496,0.657
1,BM25,0.632,0.578,0.54377,0.645228
2,DirichletLM,0.564,0.54,0.495286,0.599225


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.632,0.546,0.535381,0.671064
1,BM25,0.636,0.548,0.52096,0.611938
2,DirichletLM,0.44,0.428,0.395899,0.489001


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.736,0.7,0.644823,0.784202
1,BM25,0.676,0.64,0.5662,0.633944
2,DirichletLM,0.696,0.66,0.610008,0.685928


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.648,0.576,0.554599,0.67553
1,BM25,0.652,0.564,0.545502,0.673774
2,DirichletLM,0.496,0.458,0.417565,0.511969


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.468,0.418,0.381715,0.486482
1,BM25,0.448,0.432,0.386476,0.477303
2,DirichletLM,0.248,0.226,0.213624,0.337663


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.584,0.574,0.519124,0.602461
1,BM25,0.548,0.548,0.493708,0.614169
2,DirichletLM,0.436,0.436,0.387638,0.506611


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.48,0.426,0.399389,0.528925
1,BM25,0.48,0.432,0.405134,0.535337
2,DirichletLM,0.244,0.206,0.200178,0.322545


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.572,0.562,0.521739,0.622446
1,BM25,0.576,0.55,0.512641,0.615589
2,DirichletLM,0.54,0.508,0.468562,0.55133


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.732,0.688,0.633095,0.724063
1,BM25,0.668,0.648,0.589952,0.700447
2,DirichletLM,0.676,0.67,0.594187,0.591624


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.624,0.586,0.552496,0.657
1,BM25,0.632,0.578,0.54377,0.645228
2,DirichletLM,0.564,0.54,0.495286,0.599225


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.632,0.546,0.535381,0.671064
1,BM25,0.636,0.548,0.52096,0.611938
2,DirichletLM,0.44,0.428,0.395899,0.489001


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.736,0.7,0.644823,0.784202
1,BM25,0.676,0.64,0.5662,0.633944
2,DirichletLM,0.696,0.66,0.610008,0.685928


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.648,0.576,0.554599,0.67553
1,BM25,0.652,0.564,0.545502,0.673774
2,DirichletLM,0.496,0.458,0.417565,0.511969


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.468,0.418,0.381715,0.486482
1,BM25,0.448,0.432,0.386476,0.477303
2,DirichletLM,0.248,0.226,0.213624,0.337663


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.584,0.574,0.519124,0.602461
1,BM25,0.548,0.548,0.493708,0.614169
2,DirichletLM,0.436,0.436,0.387638,0.506611


Unnamed: 0,name,P@5,P@10,nDCG@10,RR(rel=2)
0,TF_IDF,0.48,0.426,0.399389,0.528925
1,BM25,0.48,0.432,0.405134,0.535337
2,DirichletLM,0.244,0.206,0.200178,0.322545
