In [1]:
#!/usr/bin/env python
# coding: utf-8
import torch
import os, sys
from models.dpr import VanillaDPR
from arguments import get_rerank_parser
from tqdm import tqdm
from transformers import AutoConfig, AutoTokenizer, AutoModel

from util.dataset import get_rerank_dataset
from util.util import MixedPrecisionManager, query_tokenizer, doc_tokenizer, hugface_models, set_seed, trec_eval, query_tokenizer_paragraph, document_tokenizer_paragraph
os.environ["TOKENIZERS_PARALLELISM"] = "true"
from collections import defaultdict
import torch.nn.functional as F

In [2]:
def score_agg(score_matrix, query_reps, doc_reps, args):
    """Collapse a chunk-level score matrix into one query-document score.

    Args:
        score_matrix: 2-D tensor of scores. Columns must line up with the rows
            of ``doc_reps`` (the per-column weights are multiplied with
            ``doc_reps``); presumably rows correspond to query chunks --
            TODO confirm against ``model.score``.
        query_reps: 2-D tensor of query vectors, one row per chunk.
        doc_reps: 2-D tensor of document vectors, one row per chunk.
        args: namespace whose ``score_agg`` attribute selects the strategy
            (``"colbert"`` or ``"vector_rrf"``).

    Returns:
        The aggregated score as a Python float.

    Raises:
        NotImplementedError: if ``args.score_agg`` names an unknown strategy.
    """
    if args.score_agg == "colbert":
        # Best-scoring column for every row, then the average of those maxima.
        # Uses the ``score_matrix`` argument, not a same-named notebook global.
        best_per_row, _ = torch.max(score_matrix, dim=1)
        return torch.mean(best_per_row).item()

    elif args.score_agg == "vector_rrf":
        final_query_rep = torch.mean(query_reps, dim=0, keepdim=True)

        # ``order[r]`` lists column indices by descending score; the argsort of
        # that permutation is its inverse, i.e. the 0-based rank of each column.
        order = torch.argsort(score_matrix, descending=True)
        rank_matrix = torch.argsort(order) + 1

        # Reciprocal-rank weight with the constant 60 the original hard-coded
        # (deliberately not tied to ``args.rrf_k``, which the fusion cell uses).
        rrf = (rank_matrix.to(torch.float32) + 60).pow(-1)
        rrf = torch.mean(rrf, dim=0, keepdim=True)
        # Match doc_reps so the matrix product is valid on any device/dtype.
        rrf = rrf.to(device=doc_reps.device, dtype=doc_reps.dtype)
        final_doc_rep = rrf.mm(doc_reps)
        return F.cosine_similarity(final_query_rep, final_doc_rep).item()

    else:
        raise NotImplementedError

In [3]:
def get_datasets(args):
    """Wrap the rerank datasets in data loaders.

    Calls ``get_rerank_dataset(args)``, which yields a query dataset, a
    document dataset and a run object, and returns
    ``(query_loader, document_loader, run)``. Both loaders use
    ``args.batch_size`` and keep the final partial batch.
    """
    queries, rerank_docs, run = get_rerank_dataset(args)

    # Settings shared by the two loaders.
    shared = dict(batch_size=args.batch_size, drop_last=False)

    query_loader = torch.utils.data.DataLoader(queries, shuffle=False, **shared)
    document_loader = torch.utils.data.DataLoader(rerank_docs, **shared)

    return query_loader, document_loader, run

In [8]:
BASE_PATH = "/work/snaseri_umass_edu/"

# BASE_PATH already ends in "/", so every derived path contains "//".
_DATA_DIR = BASE_PATH + "/better_P3/data/BETTER_PHASE3_COMBO_ARABIC_FARSI_RUSSIAN"
_RUNS_DIR = BASE_PATH + "/better_P3/experiments/BetterRerankerS1/runs"

# Command-line style argument string, one flag/value pair per row,
# joined with single spaces.
params = " ".join([
    "--mode", "AUTO",
    "--seed", "42",
    "--model_name", "xlm-roberta-base",
    "--output_dir", _RUNS_DIR,
    "--query_maxlen", "32",
    "--doc_maxlen", "180",
    "--rerank_topK", "1000",
    "--collection_file", _DATA_DIR + "/combined-corpus.jl",
    "--task_file", _DATA_DIR + "/62e171cedebb2f0d2619f2f6.analytic_tasks.json",
    "--test_qrels", _DATA_DIR + "/IR-relevance-assessments.qrels.GALAGO",
    "--test_run", _RUNS_DIR + "/62e171cedebb2f0d2619f2f6.FINAL.out",
])

In [9]:
parser = get_rerank_parser()
# Split on any run of whitespace: split(" ") would hand argparse empty-string
# tokens as soon as ``params`` contains a double space or a newline.
args = parser.parse_args(params.split())

In [10]:
# Load the ANCE checkpoint as a bare encoder for inspection. The warning printed
# below shows the checkpoint's embeddingHead/norm weights are not used by
# RobertaModel. `model` is rebound to the VanillaDPR wrapper in a later cell.
model = AutoModel.from_pretrained("castorini/ance-msmarco-passage")

Downloading config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at castorini/ance-msmarco-passage were not used when initializing RobertaModel: ['embeddingHead.weight', 'embeddingHead.bias', 'norm.weight', 'norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
# Tokenizer matching the ANCE checkpoint; the setup cell further down loads it again.
tokenizer = AutoTokenizer.from_pretrained("castorini/ance-msmarco-passage")

Downloading tokenizer_config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [12]:
# Display the module tree; a bare trailing `model.` is a SyntaxError.
model

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Drop

In [14]:
# Run-time overrides applied on top of the parsed command-line arguments.
args.score_agg = "colbert"
args.batch_size = 1
args.stride = 10
args.rrf_k = 60
set_seed(args.seed)
# torch.cuda.current_device() raises when no GPU is usable, so fall back to CPU.
args.device = torch.cuda.current_device() if torch.cuda.is_available() else "cpu"
os.makedirs(args.output_dir, exist_ok=True)


# _, tokenizer_class, model_class = hugface_models(args.model_name)
# The checkpoint is fixed here; args.model_name is not used for loading.
ENCODER_CHECKPOINT = "castorini/ance-msmarco-passage"
# Two separately loaded encoder instances: one for queries, one for documents.
query_encoder = AutoModel.from_pretrained(ENCODER_CHECKPOINT)
doc_encoder = AutoModel.from_pretrained(ENCODER_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(ENCODER_CHECKPOINT)
model = VanillaDPR(query_encoder, doc_encoder, args)
model = model.to(args.device)

# # load checkpoint
# if args.checkpoint:
#     state_dict = torch.load(os.path.join(args.checkpoint))["model_state_dict"]
#     model.load_state_dict(state_dict)

Some weights of the model checkpoint at castorini/ance-msmarco-passage were not used when initializing RobertaModel: ['embeddingHead.weight', 'embeddingHead.bias', 'norm.weight', 'norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at castorini/ance-msmarco-passage were not used when initializing RobertaModel: ['embeddingHead.weight', 'embeddingHead.bias', 'norm.weight', 'norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializin

tensor([0], device='cuda:0')


In [15]:
# Build the query/document loaders and keep the run returned by
# get_rerank_dataset (via get_datasets) as the baseline ranking.
query_loader, document_loader, baseline_run = get_datasets(args)  

loading collection: 100%|███████████████████████████████████████████████████████████████████| 864971/864971 [01:11<00:00, 12099.87it/s]
loading qrel data: 100%|███████████████████████████████████████████████████████████████████████| 3903/3903 [00:00<00:00, 577397.31it/s]
loading run data: 100%|██████████████████████████████████████████████████████████████████████| 54000/54000 [00:00<00:00, 928369.36it/s]


In [16]:
# Put the model in evaluation mode before encoding (presumably an nn.Module,
# since it is moved with .to(...) earlier -- TODO confirm for VanillaDPR).
model.eval()
# qid -> {docid: score}; the scoring cell below creates this mapping again.
rerank_run = defaultdict(dict)
# Metric name; no other cell visible in this notebook reads it.
METRIC = 'map'

In [17]:
# qid -> query representation, kept on the CPU until scoring.
qreps = {}
with torch.no_grad():
    for batch in tqdm(query_loader, desc='encode rerank queries'):
        # Only the first element of each batch is read.
        query_id, query_text = batch['qid'][0], batch['qtext'][0]
        chunk_ids, chunk_mask = query_tokenizer_paragraph(query_text, args, tokenizer)
        qreps[query_id] = model.query(chunk_ids, chunk_mask).detach().cpu()

encode rerank queries: 100%|███████████████████████████████████████████████████████████████████████████| 54/54 [00:02<00:00, 18.44it/s]


In [18]:
# docid -> document representation, kept on the CPU until scoring.
dreps = {}
with torch.no_grad():
    for batch in tqdm(document_loader, desc='encode rerank documents'):
        # Each batch is an (ids, texts) pair; only its first element is read.
        doc_ids, doc_texts = batch
        doc_id, doc_text = doc_ids[0], doc_texts[0]
        chunk_ids, chunk_mask = document_tokenizer_paragraph(doc_text, args, tokenizer)
        dreps[doc_id] = model.doc(chunk_ids, chunk_mask).detach().cpu()

encode rerank documents: 100%|█████████████████████████████████████████████████████████████████████| 7859/7859 [06:08<00:00, 21.31it/s]


In [19]:
# qid -> {docid: aggregated score} for every document in the baseline ranking.
rerank_run = defaultdict(dict)

with torch.no_grad():
    for qid in tqdm(qreps):
        rank_list = baseline_run[qid]
        query_rep = qreps[qid].to(args.device)
        for docid in rank_list:
            document_rep = dreps[docid].to(args.device)
            # Score the pair once and hand that matrix to the aggregator
            # (previously model.score ran twice per pair).
            scores = model.score(query_rep, document_rep)
            rerank_run[qid][docid] = score_agg(scores, query_rep, document_rep, args)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 54/54 [00:06<00:00,  7.98it/s]


In [20]:
runf = os.path.join(args.output_dir, 'DPR-Reranked_with_eng2eng_model.run')

# One line per document: "qid 0 docid rank score run", best score first
# (ties broken by docid, also descending).
with open(runf, 'wt') as runfile:
    for qid in tqdm(rerank_run):
        ranked = sorted(rerank_run[qid].items(), key=lambda pair: (pair[1], pair[0]), reverse=True)
        runfile.writelines(
            f'{qid} 0 {did} {rank} {score} run\n'
            for rank, (did, score) in enumerate(ranked, start=1)
        )

100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 54/54 [00:00<00:00, 431.43it/s]


In [21]:
# qid -> docids of the baseline run, ordered by descending (score, docid).
baseline_list = {}
for query_id, doc_scores in baseline_run.items():
    ordered = sorted(doc_scores.items(), key=lambda pair: (pair[1], pair[0]), reverse=True)
    baseline_list[query_id] = [doc_id for doc_id, _ in ordered]

In [22]:
# Reciprocal-rank fusion of the reranker ordering with the baseline ordering:
# fused = 1/(k + rerank_rank) + 1/(k + baseline_rank), with k = args.rrf_k.
fusion_run = defaultdict(dict)
for qid in tqdm(rerank_run):
    rerank_list = sorted(rerank_run[qid].items(), key=lambda x: (x[1], x[0]), reverse=True)

    # Build docid -> 1-based baseline rank once per query instead of calling
    # list.index() for every document. setdefault keeps the first position,
    # matching what list.index() would return.
    baseline_rank = {}
    for position, did in enumerate(baseline_list[qid], start=1):
        baseline_rank.setdefault(did, position)

    for rerank_at, (did, _) in enumerate(rerank_list, start=1):
        baseline_at = baseline_rank[did]
        fused_score = (1/(args.rrf_k + rerank_at)) + (1/(args.rrf_k + baseline_at))
        fusion_run[qid][did] = fused_score

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 54/54 [00:00<00:00, 99.54it/s]


In [23]:
fusion_file = os.path.join(args.output_dir, f'Fusion_DPR_with_eng2eng_model-Reranker_Baseline_rrf-k_{args.rrf_k}.run')
with open(fusion_file, 'wt') as runfile:
    for qid, doc_scores in fusion_run.items():
        ordered = sorted(doc_scores.items(), key=lambda pair: (pair[1], pair[0]), reverse=True)
        for idx, (did, _) in enumerate(ordered):
            # The fused value is not written; a placeholder that starts at
            # 100000 and drops by one per rank is written instead.
            runfile.write(f'{qid} 0 {did} {idx + 1} {100000 - idx} run\n')

# RoundRobin Fusion of Arabic and Russian

In [None]:
# NOTE(review): read_run is not imported in the import cell visible in this
# notebook, and this cell has no execution count -- TODO confirm where it is defined.
# Later cells call arabic_run[q].items(), so a mapping of mappings is expected.
arabic_run = read_run("/work/snaseri_umass_edu/better_P3/data/BETTER_PHASE3_COMBO_ARABIC_FARSI_RUSSIAN/runfiles/62e171cedebb2f0d2619f2f6.arabic.Request.RERANKED.out")

In [None]:
# NOTE(review): read_run is not imported in the import cell visible in this
# notebook -- TODO confirm where it is defined before running this cell.
russian_run = read_run("/work/snaseri_umass_edu/better_P3/data/BETTER_PHASE3_COMBO_ARABIC_FARSI_RUSSIAN/runfiles/62e171cedebb2f0d2619f2f6.russian.Request.RERANKED.out")

In [None]:
def roundrobin(*iterables):
    """Yield items from each iterable in turn until all are exhausted.

    roundrobin('ABC', 'D', 'EF') --> A D E B F C

    Exhausted iterables are dropped from the rotation, so the tail of a
    longer iterable is still produced. Recipe credited to George Sakkis.
    """
    # Local import: the notebook's import cell does not bring these in.
    from itertools import cycle, islice

    pending = len(iterables)
    # Python 3 iterators expose ``__next__`` (the old ``.next`` no longer exists).
    nexts = cycle(iter(it).__next__ for it in iterables)
    while pending:
        try:
            for fetch in nexts:
                yield fetch()
        except StopIteration:
            # The iterator that just ran dry is the last one consumed from the
            # cycle; keep only the remaining ``pending`` ones.
            pending -= 1
            nexts = cycle(islice(nexts, pending))

In [None]:
from itertools import zip_longest

# qid -> docids interleaved Arabic-first (ar1, ru1, ar2, ru2, ...).
round_robin_run = {}

# Cover queries that appear in only one of the two runs as well.
all_queries = list(arabic_run) + [q for q in russian_run if q not in arabic_run]

for q in tqdm(all_queries):
    ar_rank_list = sorted(arabic_run.get(q, {}).items(), key=lambda x: x[1], reverse=True)
    ru_rank_list = sorted(russian_run.get(q, {}).items(), key=lambda x: x[1], reverse=True)

    merged = []
    seen = set()
    # zip_longest (unlike zip) keeps the tail of the longer ranking; the
    # shorter side is padded with None, which is skipped.
    for pair in zip_longest(ar_rank_list, ru_rank_list):
        for entry in pair:
            if entry is None:
                continue
            did = entry[0]
            # A docid present in both runs is emitted once, at its first position.
            if did not in seen:
                seen.add(did)
                merged.append(did)
    round_robin_run[q] = merged

In [None]:
topk = 1000
round_robin_file = os.path.join(args.output_dir, 'round_robin_ar_ru_baseline.run')
with open(round_robin_file, 'wt') as runfile:
    for qid, merged_docs in round_robin_run.items():
        # Keep the first `topk` docids; rank counts up from 1 while the
        # placeholder score counts down from 100000.
        for offset, did in enumerate(merged_docs[:topk]):
            runfile.write(f'{qid} 0 {did} {offset + 1} {100000 - offset} run\n')