In [1]:
!pip install --upgrade git+https://github.com/Georgetown-IR-Lab/OpenNIR
!pip install --upgrade git+https://github.com/terrierteam/pyterrier_t5

[0mCollecting git+https://github.com/Georgetown-IR-Lab/OpenNIR
  Cloning https://github.com/Georgetown-IR-Lab/OpenNIR to /tmp/pip-req-build-_ts7g2pw
  Running command git clone --filter=blob:none --quiet https://github.com/Georgetown-IR-Lab/OpenNIR /tmp/pip-req-build-_ts7g2pw
  Resolved https://github.com/Georgetown-IR-Lab/OpenNIR to commit 88a4679372f471a04d284a99404ffce2b7a1dc49
  Preparing metadata (setup.py) ... [?25ldone
Collecting pytorch-pretrained-bert==0.6.1
  Downloading pytorch_pretrained_bert-0.6.1-py3-none-any.whl (114 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.3/114.3 KB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytorch-transformers==1.1.0
  Downloading pytorch_transformers-1.1.0-py3-none-any.whl (158 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.1/158.1 KB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting colorlog==4.0.2
  Downloading colorlog-4.0.2-py2.py3-none-any.whl (17 kB)
Collecting termin

## Preliminary steps

Initialise PyTerrier (which loads Terrier) and import `onir_pt`, the OpenNIR bindings for PyTerrier.

In [1]:
import pyterrier as pt
# Only initialise PyTerrier when it has not been started yet, so this cell can be re-run safely.
if not pt.started():
    pt.init(tqdm='notebook')  # presumably selects notebook-style tqdm progress bars
# NOTE(review): onir_pt is imported after PyTerrier has been started; confirm
# whether that order is required before moving this import.
import onir_pt

PyTerrier 0.8.1 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [2]:
import pandas as pd

def _read_and_preview(csv_path):
    """Read a CSV with every column as a string, print its shape and head, and return it."""
    frame = pd.read_csv(csv_path, dtype=str)
    print(frame.shape)
    print(frame.head())
    return frame


# Corpus: the documents that will be indexed.
docs_df = _read_and_preview('data/lab_docs.csv')

# Topics: the queries used in the experiments.
topics_df = _read_and_preview('data/lab_topics.csv')

# Relevance judgements: read as strings, then cast `label` to int32.
qrels_df = pd.read_csv('data/lab_qrels.csv', dtype=str).astype({'label': 'int32'})
qrels_df.head()

(2453, 2)
     docno                                               text
0   935016  he emigrated to france with his family in 1956...
1  2360440  after being ambushed by the germans in novembe...
2   347765  she was the second ship named for captain alex...
3  1969335  world war ii was a global war that was under w...
4  1576938  the ship was ordered on 2 april 1942 laid down...
(9, 2)
       qid                 query
0  1015979    president of chile
1     2674    computer animation
2   340095  2020 summer olympics
3  1502917         train station
4     2574       chinese cuisine


Unnamed: 0,qid,docno,label,iteration
0,1015979,1015979,2,0
1,1015979,2226456,1,0
2,1015979,1514612,1,0
3,1015979,1119171,1,0
4,1015979,1053174,1,0


In [4]:
# Build index

def data_generate():
    """Yield one ``{"docno": ..., "text": ...}`` dict per row of ``docs_df``.

    Reads the notebook-level ``docs_df`` frame once iteration starts and walks
    its ``docno`` and ``text`` columns in row order.

    The two columns are zipped directly instead of going through
    ``DataFrame.iterrows()``, which builds a full ``Series`` for every row just
    to read two values out of it.
    """
    for docno, text in zip(docs_df['docno'], docs_df['text']):
        yield {"docno": docno, "text": text}

# Index the generated documents under ./indexes/default, replacing any existing index there.
iter_indexer = pt.IterDictIndexer("./indexes/default", overwrite=True)
# NOTE(review): the saved output of this cell echoes the `.index(...)` call below,
# which looks like a Python warning raised from it; check what the installed
# PyTerrier version warns about here.
indexref = iter_indexer.index(
    data_generate(),
    meta=["docno", "text"],
)

# Open the freshly built index and print its collection statistics.
index = pt.IndexFactory.of(indexref)
collection_stats = index.getCollectionStatistics()
print(collection_stats.toString())

  indexref = iter_indexer.index(data_generate(), meta=["docno","text"])


Number of documents: 2453
Number of terms: 23693
Number of postings: 208487
Number of fields: 1
Number of tokens: 273373
Field names: [text]
Positions:   false



In [5]:
# First stage: retrieve from the index with a rank cutoff of 100 (`% 100`).
br = pt.BatchRetrieve(index) % 100
# Second step: look up the 'text' field for every retrieved document.
add_text = pt.text.get_text(index, 'text')
pipeline = br >> add_text

# Quick check of the pipeline with a single ad-hoc query.
pipeline.search("wall")

Unnamed: 0,qid,docid,docno,rank,score,query,text
0,1,1172,679402,0,6.766137,wall,prior to the construction of the berlin wall i...
1,1,2110,2391064,1,5.944105,wall,in 2013 the rifle range which was constructed ...
2,1,1452,702865,2,5.241024,wall,it was one of a number of highly experimental ...
3,1,1357,1221197,3,5.184876,wall,he was inspired to climb during a cycling holi...
4,1,1845,1151865,4,5.151107,wall,designed in the shape of a five pointed americ...
5,1,293,243238,5,5.124795,wall,it was created in 1942 by members of the ak wa...
6,1,592,692168,6,5.055496,wall,in washington d c the memorial commemorates ja...
7,1,319,1607882,7,3.875393,wall,josef kaplick from 1959 he was an art teacher ...
8,1,1609,2411328,8,3.815203,wall,an advantage of a wooden mine is that it is ha...
9,1,1775,354066,9,3.688716,wall,it served garrison duty on the west wall until...


In [7]:
# Rerank from scratch

# KNRM re-ranker with the 'wordvec_hash' vocabulary, scoring the 'text' field.
# No checkpoint is loaded here, so the model is presumably untrained (TODO confirm).
knrm = onir_pt.reranker('knrm', 'wordvec_hash', text_field='text')

br = pt.BatchRetrieve(index) % 100
pipeline = (
    br
    >> pt.text.get_text(index, 'text')
    >> knrm
)

# Evaluate the first stage on its own against the first stage followed by KNRM.
pt.Experiment(
    [br, pipeline],
    topics_df,
    qrels_df,
    eval_metrics=["map", "ndcg", 'ndcg_cut.10', 'P.10', 'mrt'],
    names=['DPH', 'DPH >> KNRM'],
)

[2022-05-10 16:00:33,122][WordvecHashVocab][DEBUG] [starting] reading cached at /home/wzm289/data/onir/vocab/wordvec_hash/fasttext-wiki-news-300d-1M.p
[2022-05-10 16:00:34,185][WordvecHashVocab][DEBUG] [finished] reading cached at /home/wzm289/data/onir/vocab/wordvec_hash/fasttext-wiki-news-300d-1M.p [1.06s]
[2022-05-10 16:00:38,317][onir_pt][DEBUG] using GPU (deterministic)
[2022-05-10 16:00:45,878][onir_pt][DEBUG] [starting] batches


batches:   0%|          | 0/185 [24ms<?, ?it/s]

[2022-05-10 16:00:48,579][onir_pt][DEBUG] [finished] batches: [2.70s] [185it] [68.59it/s]


Unnamed: 0,name,map,ndcg,ndcg_cut.10,P.10,mrt
0,DPH,0.539915,0.711595,0.840109,0.766667,34.317518
1,DPH >> KNRM,0.149483,0.357949,0.22936,0.288889,1413.056607


## Vanilla BERT

Contextualized language models, such as [BERT](https://arxiv.org/abs/1810.04805), are much more powerful neural models that have been shown to be effective for ranking.

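We'll try using a "vanilla" (or "mono") version of the BERT model. BERT is pre-trained on the tasks of language modeling and next-sentence prediction; the model loaded in the next cell has not been tuned for ranking, so it should not be expected to beat the first-stage retriever yet.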

In [8]:
import gc

# Clear out memory from KNRM before loading BERT.
# Deleting the `knrm` name alone does not release the model: the previous
# `pipeline` was composed with it and keeps it alive, so drop that reference too.
pipeline = None
try:
    del knrm
except NameError:
    pass  # already deleted by an earlier run of this cell
gc.collect()

# Vanilla BERT re-ranker over the 'text' field; no checkpoint is loaded here.
vbert = onir_pt.reranker('vanilla_transformer', 'bert', text_field='text', vocab_config={'train': True})

# `br` already carries the `% 100` rank cutoff, so it is not applied a second time.
pipeline = br >> pt.text.get_text(index, 'text') >> vbert
pt.Experiment(
    [br, pipeline],
    topics_df,
    qrels_df,
    names=['DPH', 'DPH >> VBERT'],
    baseline=0,  # compare every system against the first one (plain `br`)
    eval_metrics=["map", "ndcg", 'ndcg_cut.10', 'P.10', 'mrt']
)

100%|██████████| 231508/231508 [634ms<0ms, 365142.45B/s]  
100%|██████████| 433/433 [1ms<0ms, 514631.24B/s]
100%|██████████| 440473133/440473133 [16.49s<0ms, 26706362.99B/s]  


[2022-05-10 16:06:09,750][onir_pt][DEBUG] using GPU (deterministic)
[2022-05-10 16:06:09,817][onir_pt][DEBUG] [starting] batches


batches:   0%|          | 0/185 [23ms<?, ?it/s]

[2022-05-10 16:06:13,294][onir_pt][DEBUG] [finished] batches: [3.47s] [185it] [53.26it/s]


Unnamed: 0,name,map,P.10,ndcg,ndcg_cut.10,mrt,map +,map -,map p-value,P.10 +,P.10 -,P.10 p-value,ndcg +,ndcg -,ndcg p-value,ndcg_cut.10 +,ndcg_cut.10 -,ndcg_cut.10 p-value
0,DPH,0.539915,0.766667,0.711595,0.840109,30.337489,,,,,,,,,,,,
1,DPH >> VBERT,0.136422,0.211111,0.337665,0.177415,423.723623,0.0,9.0,0.00419,0.0,9.0,0.000543,0.0,9.0,0.001491,0.0,9.0,0.000172


In [9]:
import gc

# Drop the previously loaded re-ranker before fetching the checkpoint, so that two
# models are not held at the same time. Both `vbert` and the old `pipeline`
# reference it, so both are cleared.
pipeline = None
vbert = None
gc.collect()

# Load a re-ranker from a published checkpoint; the archive is checked against
# the expected MD5 hash.
vbert = onir_pt.reranker.from_checkpoint(
    'https://macavaney.us/scibert-medmarco.tar.gz',
    text_field='text',
    expected_md5="854966d0b61543ffffa44cea627ab63b",
)

# `br` already carries the `% 100` rank cutoff, so it is not applied a second time.
pipeline = br >> pt.text.get_text(index, 'text') >> vbert
pt.Experiment(
    [br, pipeline],
    topics_df,
    qrels_df,
    names=['DPH', 'DPH >> VBERT'],
    baseline=0,  # compare every system against the first one (plain `br`)
    eval_metrics=["map", "ndcg", 'ndcg_cut.10', 'P.10', 'mrt']
)

                                                                                        

[2022-05-10 16:11:11,257][onir.util.download][DEBUG] downloaded https://macavaney.us/scibert-medmarco.tar.gz [20.41s] [499M] [41.1MB/s] [md5 hash verified]


                                                                                                                                                   

[2022-05-10 16:12:02,711][onir.util.download][DEBUG] downloaded https://s3-us-west-2.amazonaws.com/ai2-s2-research/scibert/pytorch_models/scibert_scivocab_uncased.tar [40.34s] [411M] [17.3MB/s] [md5 hash verified]


extracting: 411MB [4.23s, 97.0MB/s]                                                                                                                  
extracting: 821MB [9.66s, 85.0MB/s] 


[2022-05-10 16:12:30,994][onir_pt][DEBUG] using GPU (deterministic)
[2022-05-10 16:12:31,057][onir_pt][DEBUG] [starting] batches


batches:   0%|          | 0/185 [13ms<?, ?it/s]

[2022-05-10 16:12:33,420][onir_pt][DEBUG] [finished] batches: [2.36s] [185it] [78.36it/s]


Unnamed: 0,name,map,P.10,ndcg,ndcg_cut.10,mrt,map +,map -,map p-value,P.10 +,P.10 -,P.10 p-value,ndcg +,ndcg -,ndcg p-value,ndcg_cut.10 +,ndcg_cut.10 -,ndcg_cut.10 p-value
0,DPH,0.539915,0.766667,0.711595,0.840109,16.458479,,,,,,,,,,,,
1,DPH >> VBERT,0.507537,0.766667,0.656305,0.783515,286.879604,4.0,5.0,0.298367,3.0,2.0,1.0,4.0,5.0,0.184318,3.0,4.0,0.31386
