In [1]:
import csv
from pathlib import Path

import numpy as np
from fast_forward.ranking import Ranking
from fast_forward.index import Mode, InMemoryIndex
from fast_forward.encoder import TCTColBERTQueryEncoder as TCTColBERTQueryEncoderFF
from pyserini.search.faiss import FaissSearcher, TctColBertQueryEncoder
from ir_measures import read_trec_qrels, calc_aggregate, nDCG, RR

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Sparse first-stage runs for the 2019 and 2020 test sets.
run_file_2019 = Path("msmarco-passage-test2019-sparse10000.txt")
run_file_2020 = Path("msmarco-passage-test2020-sparse10000.txt")
sparse_ranking_2019 = Ranking.from_file(run_file_2019)
sparse_ranking_2020 = Ranking.from_file(run_file_2020)

# The return value of cut() is ignored, so this relies on cut() shortening
# each ranking in place (cut-off value 5000).
for sparse_ranking in (sparse_ranking_2019, sparse_ranking_2020):
    sparse_ranking.cut(5000)

# One set of IDs per query, across both runs; their union is the collection
# of IDs that the vector-extraction cell filters on.
ids_per_query = [
    set(ranking[q_id].keys())
    for ranking in (sparse_ranking_2019, sparse_ranking_2020)
    for q_id in ranking.q_ids
]
all_ids = set.union(*ids_per_query)
print(f"indexing {len(all_ids)} documents or passages")

indexing 440079 documents or passages


In [3]:
# Names of the pre-built Pyserini dense index and of the query encoder checkpoint.
prebuilt_index_name = "msmarco-passage-tct_colbert-bf"
encoder_checkpoint = 'castorini/tct_colbert-msmarco'

# The encoder is created inline (not bound to its own name) so that deleting
# `searcher` later does not leave an extra reference to the model behind.
searcher = FaissSearcher.from_prebuilt_index(
    prebuilt_index_name,
    TctColBertQueryEncoder(encoder_checkpoint),
)

Some weights of the model checkpoint at castorini/tct_colbert-msmarco were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Attempting to initialize pre-built index msmarco-passage-tct_colbert-bf.
/home/leonhardt/.cache/pyserini/indexes/dindex-msmarco-passage-tct_colbert-bf-20210112-be7119.7312e0e7acec2a686e994902ca064fc5 already exists, skipping download.
Initializing msmarco-passage-tct_colbert-bf...


In [4]:
# Pull the stored vectors of every entry whose ID occurs in one of the sparse runs.
vectors = []
psg_ids = []
doc_ids = []
for position, psg_id in enumerate(searcher.docids):
    # Everything before the first "#" (the whole ID if there is no "#").
    doc_id = psg_id.partition("#")[0]
    if doc_id not in all_ids:
        continue
    # `position` is the entry's offset in searcher.docids, used as the row to reconstruct.
    vectors.append(searcher.index.reconstruct(position))
    psg_ids.append(psg_id)
    doc_ids.append(doc_id)

# The searcher is not used after this point; drop the name so it can be freed.
del searcher

In [5]:
# Pack the collected per-entry vectors into a single NumPy array, then drop
# the Python list so only one copy of the data stays referenced.
vectors_np = np.asarray(vectors)
del vectors

In [6]:
# Build the in-memory Fast-Forward index in passage mode, fill it with the
# extracted vectors and their IDs, and write it to disk.
query_encoder_ff = TCTColBERTQueryEncoderFF("castorini/tct_colbert-msmarco")
index = InMemoryIndex(mode=Mode.PASSAGE, encoder=query_encoder_ff)
index.add(vectors_np, doc_ids=doc_ids, psg_ids=psg_ids)

index_file = Path("ffindex_passage_2019_2020.pkl")
index.save(index_file)

Some weights of the model checkpoint at castorini/tct_colbert-msmarco were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Read the 2019 test queries: a two-column, tab-separated file (query ID, query text).
queries = {}
with open("msmarco-test2019-queries.tsv", encoding="utf-8", newline="") as fp:
    for q_id, q in csv.reader(fp, delimiter="\t"):
        queries[q_id] = q
print(f"loaded {len(queries)} queries")

# Score the 2019 sparse run against the index. `alpha` is handed to
# get_scores and is also the key under which the resulting ranking is stored.
alpha = 0.2
result = index.get_scores(
    sparse_ranking_2019, queries, alpha=alpha, cutoff=10, early_stopping=False
)

# Report the same two measures for the sparse run and for the re-scored run.
qrels = list(read_trec_qrels("2019qrels-pass.txt"))
metrics = [nDCG@10, RR(rel=2)@10]
labelled_runs = (
    ("BM25", sparse_ranking_2019.run),
    (f"BM25, TCTColBERT (alpha={alpha})", result[alpha].run),
)
for label, run in labelled_runs:
    print(label, calc_aggregate(metrics, qrels, run))

loaded 200 queries


100%|██████████| 43/43 [00:01<00:00, 35.36it/s]


BM25 {RR(rel=2)@10: 0.7024178663713547, nDCG@10: 0.5058310024399072}
BM25, TCTColBERT (alpha=0.2) {RR(rel=2)@10: 0.901937984496124, nDCG@10: 0.7158066715626034}


In [8]:
# Read the 2020 test queries: a two-column, tab-separated file (query ID, query text).
queries = {}
with open("msmarco-test2020-queries.tsv", encoding="utf-8", newline="") as fp:
    for q_id, q in csv.reader(fp, delimiter="\t"):
        queries[q_id] = q
print(f"loaded {len(queries)} queries")

# Score the 2020 sparse run against the index. `alpha` is handed to
# get_scores and is also the key under which the resulting ranking is stored.
alpha = 0.2
result = index.get_scores(
    sparse_ranking_2020, queries, alpha=alpha, cutoff=10, early_stopping=False
)

# Report the same two measures for the sparse run and for the re-scored run.
qrels = list(read_trec_qrels("2020qrels-pass.txt"))
metrics = [nDCG@10, RR(rel=2)@10]
labelled_runs = (
    ("BM25", sparse_ranking_2020.run),
    (f"BM25, TCTColBERT (alpha={alpha})", result[alpha].run),
)
for label, run in labelled_runs:
    print(label, calc_aggregate(metrics, qrels, run))

loaded 200 queries


100%|██████████| 54/54 [00:01<00:00, 32.92it/s]


BM25 {RR(rel=2)@10: 0.6554012345679013, nDCG@10: 0.4875508583120806}
BM25, TCTColBERT (alpha=0.2) {RR(rel=2)@10: 0.7975308641975309, nDCG@10: 0.6714497182268859}
