# Preparation

In [2]:
import pyterrier as pt
import pandas as pd
import csv
import os
import shutil

# Initialise PyTerrier only if it has not been started yet, so that
# re-running this cell in the same kernel does not call pt.init() twice.
if not pt.started():
    pt.init()

# Download data

In [24]:
#!wget -O data/queries.tar.gz  https://msmarco.blob.core.windows.net/msmarcoranking/queries.tar.gz
#!wget -O data/qrels.dev.tsv https://msmarco.blob.core.windows.net/msmarcoranking/qrels.dev.tsv
#!wget -O data/qrels.train.tsv https://msmarco.blob.core.windows.net/msmarcoranking/qrels.train.tsv
#!wget -O data/collection.tar.gz https://msmarco.blob.core.windows.net/msmarcoranking/collection.tar.gz
#!wget -O data19/2019qrels-pass.txt https://trec.nist.gov/data/deep/2019qrels-pass.txt
#!wget -O data19/queries.tar.gz https://msmarco.blob.core.windows.net/msmarcoranking/queries.tar.gz

--2020-08-06 18:13:13--  https://msmarco.blob.core.windows.net/msmarcoranking/qrels.train.tsv
Resolving msmarco.blob.core.windows.net (msmarco.blob.core.windows.net)... 40.112.152.16
Connecting to msmarco.blob.core.windows.net (msmarco.blob.core.windows.net)|40.112.152.16|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10589532 (10M) [text/tab-separated-values]
Saving to: ‘data/qrels.train.tsv’


2020-08-06 18:13:14 (11.0 MB/s) - ‘data/qrels.train.tsv’ saved [10589532/10589532]



# Unzip data

In [5]:
import os
import tarfile

def unzip_all_data():
    """
    Extracts the downloaded archives into the directory they were saved in.

    Relative to the current working directory, the following archives are
    unpacked in place:

    - data/queries.tar.gz
    - data/collection.tar.gz
    - data19/queries.tar.gz

    The working directory is never changed, so a missing or unreadable
    archive cannot leave the notebook stranded in another directory.

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        If one of the archives does not exist.
    tarfile.TarError
        If one of the archives cannot be read.
    """
    cwd = os.getcwd()
    data_dir = os.path.join(cwd, 'data')   # Change according to your path
    data19_dir = os.path.join(cwd, 'data19')

    # (target directory, archive name), processed in this order:
    # queries, passages, then the 2019 queries.
    archives = [
        (data_dir, 'queries.tar.gz'),
        (data_dir, 'collection.tar.gz'),
        (data19_dir, 'queries.tar.gz'),
    ]

    for directory, archive_name in archives:
        with tarfile.open(os.path.join(directory, archive_name), 'r:gz') as tar:
            # NOTE(review): extractall() trusts the member paths stored in the
            # archive; only run this on archives from a source you trust.
            tar.extractall(path=directory)

    return None

# uncomment if it is the first time unzipping the data
# unzip_all_data()


# Create Index

In [3]:
def passages_generator(filepath : str, delimiter : str, verbose : bool=False, encoding : str='utf-8'):
    """
    Generator of passages dataset. Generates 1 passage at a time.

    The file is opened lazily on the first iteration and is closed as soon as
    the generator is exhausted or closed.

    Parameters
    ----------
    filepath : str
        path of file that contains passages in a csv format 
        with two fields: [passage_id] and [passage_text].

    delimiter : str
        delimiter of csv file that contains the passages

    verbose: bool, default=False
        Whether or not to log progress frequently.

    encoding : str, default='utf-8'
        Text encoding used to read the file.

    Yields
    ------
    {'docno': docno, 'text': text}

    Raises
    ------
    ValueError
        If a non-empty row does not contain exactly two fields.
    """
    # newline='' lets the csv module handle line endings itself; the `with`
    # block guarantees the file handle is released.
    with open(filepath, newline='', encoding=encoding) as csv_file:
        read_csv = csv.reader(csv_file, delimiter=delimiter)

        for i, row in enumerate(read_csv):
            if not row:
                # csv.reader yields [] for a blank line; there is no passage to emit.
                continue
            docno, text = row
            if i % 200000 == 0 and verbose:
                print(f'Processing passage {i}')
            yield {'docno': docno, 'text': text}



def create_index():
    """
    Builds a fresh pyterrier index of the passage collection.

    An existing 'index' directory in the current working directory is removed
    first. The passages are then read from data/collection.tsv (tab separated)
    and handed to pyterrier's IterDictIndexer, which writes to the 'index'
    directory. Prints "done" when indexing has finished and returns None.
    """
    # Start from a clean slate: drop any previously built index.
    if os.path.exists('index'):
        shutil.rmtree('index')

    working_dir = os.getcwd()
    indexer = pt.IterDictIndexer(os.path.join(working_dir, 'index'))

    passages = passages_generator(
        os.path.join(working_dir, 'data', 'collection.tsv'),
        '\t',
        verbose=True,
    )
    indexer.index(passages)
    print("done")
    return None
    
# uncomment to create index if index is not yet created.
# create_index()

Processing passage 0
Processing passage 200000
Processing passage 400000
Processing passage 600000
Processing passage 800000
Processing passage 1000000
Processing passage 1200000
Processing passage 1400000
Processing passage 1600000
Processing passage 1800000
Processing passage 2000000
Processing passage 2200000
Processing passage 2400000
Processing passage 2600000
Processing passage 2800000
Processing passage 3000000
Processing passage 3200000
Processing passage 3400000
Processing passage 3600000
Processing passage 3800000
Processing passage 4000000
Processing passage 4200000
Processing passage 4400000
Processing passage 4600000
Processing passage 4800000
Processing passage 5000000
Processing passage 5200000
Processing passage 5400000
Processing passage 5600000
Processing passage 5800000
Processing passage 6000000
Processing passage 6200000
Processing passage 6400000
Processing passage 6600000
Processing passage 6800000
Processing passage 7000000
Processing passage 7200000
Processing 

# Connect to index, qrels, topics

In [4]:
# Locations on disk, all resolved against the current working directory.
index_path = os.path.join(os.getcwd(), 'index')

# Relevance judgements (qrels).
qrels_path = os.path.join(os.getcwd(), 'data')
qrels_train_path = os.path.join(qrels_path, 'qrels.train.tsv')
qrels_dev_path = os.path.join(qrels_path, 'qrels.dev.tsv')
qrels_eval19_path = os.path.join(os.getcwd(), 'data19', '2019qrels-pass.txt')

# Queries (topics).
topics_path = os.path.join(os.getcwd(), 'data')
topics_train_path = os.path.join(topics_path, 'queries.train.tsv')
topics_dev_path = os.path.join(topics_path, 'queries.dev.tsv')
topics_eval_path = os.path.join(topics_path, 'queries.eval.tsv')
topics_eval19_path = os.path.join(os.getcwd(), 'data19', 'queries.eval.tsv')

# Load every topic file into a dataframe.
topics_train = pt.io.read_topics(topics_train_path, format='singleline')
topics_dev = pt.io.read_topics(topics_dev_path, format='singleline')
topics_eval = pt.io.read_topics(topics_eval_path, format='singleline')
topics_eval19 = pt.io.read_topics(topics_eval19_path, format='singleline')

# Load every qrels file into a dataframe.
qrels_train = pt.io.read_qrels(qrels_train_path)
qrels_dev = pt.io.read_qrels(qrels_dev_path)
qrels_eval19 = pt.io.read_qrels(qrels_eval19_path)

# NOTE(review): this is an indexer object pointing at index_path, not an
# index reference; the cells below pass it to pt.BatchRetrieve. Confirm
# that the installed PyTerrier version accepts it.
indexRef = pt.TRECCollectionIndexer(index_path)

# Preprocess empty topics

Some topics have empty queries, which causes pyterrier_bert to crash, so we replace those queries with placeholder text.

In [5]:
def fill_empty_queries(df: pd.DataFrame, placeholder: str = 'nova') -> pd.DataFrame:
    """
    Fills all empty queries with some text so that pyterrier_bert does not crash.

    A query counts as empty when it is missing (NaN/None), the empty string,
    or made up of whitespace only.

    Parameters
    ----------
    df : pd.DataFrame
        Topics dataframe with a string 'query' column.

    placeholder : str, default='nova'
        Text written into every empty query.

    Returns
    -------
    pd.DataFrame
        A copy of `df` with the empty queries replaced; `df` itself is not
        modified.
    """
    df_copy = df.copy()
    queries = df_copy['query']
    # .str.len() is NaN for missing values, so they need their own check.
    is_empty = queries.isna() | (queries.str.strip().str.len() == 0)
    df_copy.loc[is_empty, 'query'] = placeholder
    return df_copy


# Replace the empty queries in every topic set. The tuple on the right is
# fully evaluated, in the listed order, before any name is rebound.
topics_train, topics_dev, topics_eval, topics_eval19 = (
    fill_empty_queries(topics_train),
    fill_empty_queries(topics_dev),
    fill_empty_queries(topics_eval),
    fill_empty_queries(topics_eval19),
)

# Experiments

In [7]:
def retrieve_from_multiple_models(indexref, models : list, num_results_per_model : int, query : str) -> pd.DataFrame:
    """
    Runs the same query against several weighting models and stacks the results.

    Parameters
    ----------
    indexref
        Index-like object handed to pt.BatchRetrieve for every model.

    models : list
        Names of the weighting models to run (e.g. 'BM25'); must not be empty.

    num_results_per_model : int
        Number of results requested from each model.

    query : str
        Query passed to the retriever's transform().

    Returns
    -------
    pd.DataFrame
        Results of all models, in the order given by `models`, indexed by
        ('model', 'rank').

    Raises
    ------
    ValueError
        If `models` is empty.
    """
    if not models:
        raise ValueError('models must contain at least one weighting model name')

    # Collect one frame per model and concatenate once at the end.
    per_model_results = []
    for model in models:
        retriever = pt.BatchRetrieve(indexref, controls={'wmodel': model}, num_results=num_results_per_model)
        df = retriever.transform(query)
        df['model'] = model
        per_model_results.append(df)

    all_dfs = pd.concat(per_model_results)
    all_dfs = all_dfs.set_index(['model', 'rank'])
    return all_dfs


# Weighting model names that can be passed as the 'wmodel' control.
all_retrieval_weighting_models = [
    'BB2',
    'BM25',
    'DFI0',
    'DFR_BM25',
    'DLH',
    'DLH13',
    'DPH',
    'DFRee',
    'Hiemstra_LM',
    'DirichletLM',
    'IFB2',
    'In_expB2',
    'In_expC2',
    'InL2',
    'LemurTF_IDF',
    'LGD',
    'PL2',
    'TF_IDF',
    'DFRWeightingModel',
]

# run the following line to test retrieval
# retrieve_from_multiple_models(indexRef, [all_retrieval_weighting_models[0]], 10, 'math')

# Bert4IR

In [None]:
from pyterrier_bert.bert4ir import *

results_path = os.path.join(os.getcwd(), 'results')
# Create the output directory before training, so that the to_csv call at the
# end of this cell cannot fail on a missing directory after the long fit.
os.makedirs(results_path, exist_ok=True)

# BM25 first-stage retrieval followed by the BERT re-ranker.
BM25_br = pt.BatchRetrieve(indexRef, controls={"wmodel" : "BM25"}, verbose=True)
bertpipe = BM25_br >> BERTPipeline()

# Train on the train split, validate on the dev split.
bertpipe.fit(topics_train, qrels_train, topics_dev, qrels_dev)



# baseline for 2019 test set
# The block below is wrapped in a string literal, so it is NOT executed.
"""
df_result_eval19 = pt.pipelines.Experiment([BM25_br],
                        topics_dev,
                        qrels_dev,
                        ['map','ndcg'],  
                        names=["BM25 + bert4ir"])

df_result_eval19.to_csv(
    os.path.join(results_path, 'eval19_baseline_retrieval__bm25_bert4ir.csv'), \
    index=False)
print('Finished eval 19')
"""

# baseline for 2020 eval

df_baseline_retrieval_2020 = bertpipe.transform(topics_eval)
df_baseline_retrieval_2020.to_csv(
    os.path.join(results_path, 'eval20_baseline_retrieval__bm25_bert4ir.csv'), \
    index=False)
print('Finished eval 20')


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [5]:
from pyterrier_bert.pyt_cedr import CEDRPipeline

# Directory that the result CSV files are written to.
results_path = os.path.join(os.getcwd(), 'results')

# BM25 first-stage retrieval feeding the CEDR re-ranker.
BM25_br = pt.BatchRetrieve(
    indexRef,
    controls={'wmodel': 'BM25'},
    verbose=True,
)
cedrpipe = BM25_br >> CEDRPipeline(max_valid_rank=20)

In [6]:
from pyterrier_bert.pyt_cedr import CEDRPipeline
from pyterrier_bert.bert4ir import *

results_path = os.path.join(os.getcwd(), 'results')
# Create the output directory before training, so that the to_csv calls below
# cannot fail on a missing directory after the long fit.
os.makedirs(results_path, exist_ok=True)

# BM25 first-stage retrieval shared by both re-ranking pipelines.
BM25_br = pt.BatchRetrieve(indexRef, controls={"wmodel" : "BM25"}, verbose=True)

#cedr
cedrpipe = BM25_br >> CEDRPipeline(max_valid_rank=20)

# Train on the train split, validate on the dev split.
cedrpipe.fit(topics_train, qrels_train, topics_dev, qrels_dev)

df_baseline_retrieval_2020_cedr = cedrpipe.transform(topics_eval)
df_baseline_retrieval_2020_cedr.to_csv(
    os.path.join(results_path, 'eval20_baseline_retrieval__bm25_cedr.csv'), \
    index=False)
print('done cedr 20')

# NOTE(review): the variables and output files are named "eval19", but the
# experiment runs on the dev topics/qrels; topics_eval19 and qrels_eval19 are
# not used here. Confirm which split is intended.
df_result_eval19_cedr = pt.pipelines.Experiment([cedrpipe],
                        topics_dev,
                        qrels_dev,
                        ['map','ndcg'],  
                        names=["BM25 + cedr"])

# Write the CEDR result computed just above. The original referenced
# df_result_eval19, which is never assigned.
df_result_eval19_cedr.to_csv(
    os.path.join(results_path, 'eval19_baseline_retrieval__bm25_cedr.csv'), \
    index=False)
print('Finished cedr 19')

# bert4ir

bertpipe = BM25_br >> BERTPipeline(max_valid_rank=20)

bertpipe.fit(topics_train, qrels_train, topics_dev, qrels_dev)

df_baseline_retrieval_2020 = bertpipe.transform(topics_eval)
df_baseline_retrieval_2020.to_csv(
    os.path.join(results_path, 'eval20_baseline_retrieval__bm25_bert4ir.csv'), \
    index=False)
print('done bert4ir 20')

# NOTE(review): same as for CEDR above, this "eval19" result is computed on
# the dev topics/qrels.
df_result_eval19_bert = pt.pipelines.Experiment([bertpipe],
                        topics_dev,
                        qrels_dev,
                        ['map','ndcg'],  
                        names=["BM25 + bert4ir"])

df_result_eval19_bert.to_csv(
    os.path.join(results_path, 'eval19_baseline_retrieval__bm25_bert.csv'), \
    index=False)
print('Finished bert4ir 19')

  0%|          | 732/808731 [01:17<23:38:56,  9.49q/s]


KeyboardInterrupt: 

In [None]:
from pyterrier_bert.pyt_cedr import CEDRPipeline

# DPH_br is not assigned in any of the cells above, so build the DPH
# first-stage retriever here, the same way BM25_br is built elsewhere.
DPH_br = pt.BatchRetrieve(indexRef, controls={"wmodel" : "DPH"}, verbose=True)
cedrpipe = DPH_br >> CEDRPipeline(max_valid_rank=20)