In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]= "6"

In [2]:
from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader

  from tqdm.autonotebook import tqdm


In [3]:
tokenizer_max_len = 512

In [4]:
dataset = "scifact"
data_path = f"../beir/datasets/{dataset}"

# Loading test set
corpus, queries, qrels = GenericDataLoader(data_path).load(split="test")

  0%|          | 0/5183 [00:00<?, ?it/s]

100%|██████████| 5183/5183 [00:00<00:00, 106151.99it/s]


In [5]:
qrels['179']

{'16322674': 1, '27123743': 1, '23557241': 1, '17450673': 1}

In [6]:
import torch
from transformers import AutoModel, AutoTokenizer
from peft import PeftModel, PeftConfig



In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def get_model(peft_model_name):
    config = PeftConfig.from_pretrained(peft_model_name)
    base_model = AutoModel.from_pretrained(config.base_model_name_or_path)
    model = PeftModel.from_pretrained(base_model, peft_model_name)
    model = model.merge_and_unload()
    model.eval()
    return model

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-hf')
model = get_model('castorini/repllama-v1-7b-lora-passage')

model = model.to(device) # Moving model to GPU

# Define query and passage inputs
query = "What is llama?"
title = "Llama"
passage = "The llama is a domesticated South American camelid, widely used as a meat and pack animal by Andean cultures since the pre-Columbian era."
query_input = tokenizer(f'query: {query}</s>', return_tensors='pt')
passage_input = tokenizer(f'passage: {title} {passage}</s>', return_tensors='pt')


Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.80s/it]


In [8]:
# from datasets import Dataset
# import pandas as pd

# queries = pd.DataFrame({'qid': queries.keys(), 'text': queries.values()})
# # queries.head()
# query_dataset = Dataset.from_pandas(queries)
# query_dataset

In [9]:
# if tokenizer.pad_token is None:
#     tokenizer.add_special_tokens({'pad_token': '[PAD]'})
#     model.resize_token_embeddings(len(tokenizer)) # Update models token embedding size so it knows about new token

In [10]:
# input_txt = [query_dataset[0]['text'], query_dataset[1]['text']]

# tokenized_inputs = tokenizer(input_txt, return_tensors='pt', padding=True, truncation=True, max_length=tokenizer_max_len)
# tokenized_inputs.to(device)
# with torch.no_grad():
#     # compute query embedding
#     outputs = model(**tokenized_inputs)
#     embedding = outputs.last_hidden_state[0][-1]
#     embedding = torch.nn.functional.normalize(embedding, p=2, dim=0)

# embedding

In [11]:
# Can't batch tokenize because we need embeddings of last token and last token of different models maybe different

def get_embed_dataset(input_lst):

    input_txt = [f'{input}</s>' for input in input_lst['text']]

    tokenized_inputs = tokenizer(input_txt, return_tensors='pt', padding="max_length", truncation=True, max_length=tokenizer_max_len)
    tokenized_inputs = tokenized_inputs.to(device)
    with torch.no_grad():
        # compute query embedding
        outputs = model(**tokenized_inputs)
        embedding = outputs.last_hidden_state[:,-1,:]   #outputs.last_hidden_state[0][-1] # Get embedding of last token i.e. <s>
        embedding = torch.nn.functional.normalize(embedding, p=2, dim=0)
    return embedding

def get_embed(input):

    tokenized_inputs = tokenizer(f'{input}</s>', return_tensors='pt')
    tokenized_inputs = tokenized_inputs.to(device)
    with torch.no_grad():
        # compute query embedding
        outputs = model(**tokenized_inputs)
        embedding = outputs.last_hidden_state[0][-1] #outputs.last_hidden_state[:,-1,:]  # Get embedding of last token i.e. </s>
        embedding = torch.nn.functional.normalize(embedding, p=2, dim=0)
    return embedding

# query_dataset = query_dataset.map(get_embed, batched=True, batch_size=8)
# query_dataset

In [12]:
from tqdm import tqdm

query_embeddings = {}
doc_embeddings = {}

print("Encoding queries ...")
for k,q in tqdm(queries.items()):
    query_embed = get_embed(q)
    query_embeddings[k] = query_embed

print("Encoding passages ...")
for k,q in tqdm(corpus.items()):
    doc_embed = get_embed(q['text'])
    doc_embeddings[k] = doc_embed


Encoding queries ...


100%|██████████| 300/300 [00:14<00:00, 20.86it/s]


Encoding passages ...


100%|██████████| 5183/5183 [29:58<00:00,  2.88it/s]


In [15]:
results = {}
for q_id, q_embed in tqdm(query_embeddings.items()):
    results[q_id] = {}
    for d_id, d_embed in doc_embeddings.items():
        # compute similarity score
        score = torch.dot(q_embed, d_embed)
        results[q_id][d_id] = score.item() #.item() to get value out of tensor


100%|██████████| 300/300 [00:36<00:00,  8.25it/s]


In [19]:
import pickle

with open(f"{data_path}/{dataset}_score_repLlama.pickle", 'wb') as f:
    pickle.dump(results, f, protocol=pickle.HIGHEST_PROTOCOL)

In [18]:
import pytrec_eval

metric = 'ndcg_cut_10'
evaluator = pytrec_eval.RelevanceEvaluator(qrels, {metric})
results_metric = evaluator.evaluate(results)
print(f'nDCG@10: {sum(item[metric] for item in results_metric.values()) / len(results_metric):0.4f}')

nDCG@10: 0.7599


In [61]:
metric = 'map_cut_10'
evaluator = pytrec_eval.RelevanceEvaluator(qrels, {metric})
results_metric = evaluator.evaluate(results)
print(f'MAP@10: {sum(item[metric] for item in results_metric.values()) / len(results_metric):0.4f}')

MAP@10: 0.7110


In [73]:
from beir.retrieval.search.lexical import BM25Search as BM25
from beir.retrieval.evaluation import EvaluateRetrieval

hostname = "localhost"
index_name = "scifact"
initialize = True

model = BM25(index_name=index_name, hostname=hostname, initialize=initialize)
retriever = EvaluateRetrieval(model)
# model doesn't do anything and results for ndcg and map are same as pytrec_eval

retriever.evaluate(qrels, results, retriever.k_values)

ERROR:root:Unable to create Index in Elastic Search. Reason: ConnectionError(<urllib3.connection.HTTPConnection object at 0x7f2a61db2ea0>: Failed to establish a new connection: [Errno 111] Connection refused) caused by: NewConnectionError(<urllib3.connection.HTTPConnection object at 0x7f2a61db2ea0>: Failed to establish a new connection: [Errno 111] Connection refused)
ERROR:root:Unable to create Index in Elastic Search. Reason: ConnectionError(<urllib3.connection.HTTPConnection object at 0x7f2a59772420>: Failed to establish a new connection: [Errno 111] Connection refused) caused by: NewConnectionError(<urllib3.connection.HTTPConnection object at 0x7f2a59772420>: Failed to establish a new connection: [Errno 111] Connection refused)


({'NDCG@1': 0.63333,
  'NDCG@3': 0.71401,
  'NDCG@5': 0.73013,
  'NDCG@10': 0.75995,
  'NDCG@100': 0.77343,
  'NDCG@1000': 0.77925},
 {'MAP@1': 0.60261,
  'MAP@3': 0.68315,
  'MAP@5': 0.69664,
  'MAP@10': 0.71105,
  'MAP@100': 0.71456,
  'MAP@1000': 0.71482},
 {'Recall@1': 0.60261,
  'Recall@3': 0.77139,
  'Recall@5': 0.81272,
  'Recall@10': 0.89867,
  'Recall@100': 0.95667,
  'Recall@1000': 1.0},
 {'P@1': 0.63333,
  'P@3': 0.28111,
  'P@5': 0.182,
  'P@10': 0.10167,
  'P@100': 0.01087,
  'P@1000': 0.00113})

In [68]:
metric = 'recall.10'
evaluator = pytrec_eval.RelevanceEvaluator(qrels, {metric})
results_metric = evaluator.evaluate(results)
# print(f'recall@10: {sum(item[metric] for item in results_metric.values()) / len(results_metric):0.4f}')
results_metric

{'1': {'recall_10': 0.0},
 '3': {'recall_10': 1.0},
 '5': {'recall_10': 1.0},
 '13': {'recall_10': 0.0},
 '36': {'recall_10': 0.5},
 '42': {'recall_10': 1.0},
 '48': {'recall_10': 1.0},
 '49': {'recall_10': 1.0},
 '50': {'recall_10': 1.0},
 '51': {'recall_10': 1.0},
 '53': {'recall_10': 1.0},
 '54': {'recall_10': 1.0},
 '56': {'recall_10': 1.0},
 '57': {'recall_10': 1.0},
 '70': {'recall_10': 1.0},
 '72': {'recall_10': 1.0},
 '75': {'recall_10': 1.0},
 '94': {'recall_10': 1.0},
 '99': {'recall_10': 1.0},
 '100': {'recall_10': 1.0},
 '113': {'recall_10': 1.0},
 '115': {'recall_10': 1.0},
 '118': {'recall_10': 1.0},
 '124': {'recall_10': 1.0},
 '127': {'recall_10': 1.0},
 '128': {'recall_10': 0.0},
 '129': {'recall_10': 1.0},
 '130': {'recall_10': 1.0},
 '132': {'recall_10': 0.0},
 '133': {'recall_10': 0.8},
 '137': {'recall_10': 1.0},
 '141': {'recall_10': 1.0},
 '142': {'recall_10': 1.0},
 '143': {'recall_10': 1.0},
 '146': {'recall_10': 1.0},
 '148': {'recall_10': 1.0},
 '163': {'reca

In [69]:
metric = 'P.10'
evaluator = pytrec_eval.RelevanceEvaluator(qrels, {metric})
results_metric = evaluator.evaluate(results)
# print(f'P@10: {sum(item[metric] for item in results_metric.values()) / len(results_metric):0.4f}')
results_metric

{'1': {'P_10': 0.0},
 '3': {'P_10': 0.1},
 '5': {'P_10': 0.1},
 '13': {'P_10': 0.0},
 '36': {'P_10': 0.1},
 '42': {'P_10': 0.1},
 '48': {'P_10': 0.1},
 '49': {'P_10': 0.1},
 '50': {'P_10': 0.1},
 '51': {'P_10': 0.1},
 '53': {'P_10': 0.1},
 '54': {'P_10': 0.1},
 '56': {'P_10': 0.1},
 '57': {'P_10': 0.1},
 '70': {'P_10': 0.2},
 '72': {'P_10': 0.1},
 '75': {'P_10': 0.1},
 '94': {'P_10': 0.1},
 '99': {'P_10': 0.1},
 '100': {'P_10': 0.1},
 '113': {'P_10': 0.1},
 '115': {'P_10': 0.1},
 '118': {'P_10': 0.1},
 '124': {'P_10': 0.1},
 '127': {'P_10': 0.1},
 '128': {'P_10': 0.0},
 '129': {'P_10': 0.1},
 '130': {'P_10': 0.1},
 '132': {'P_10': 0.0},
 '133': {'P_10': 0.4},
 '137': {'P_10': 0.1},
 '141': {'P_10': 0.2},
 '142': {'P_10': 0.1},
 '143': {'P_10': 0.1},
 '146': {'P_10': 0.1},
 '148': {'P_10': 0.1},
 '163': {'P_10': 0.1},
 '171': {'P_10': 0.1},
 '179': {'P_10': 0.4},
 '180': {'P_10': 0.1},
 '183': {'P_10': 0.1},
 '185': {'P_10': 0.1},
 '198': {'P_10': 0.1},
 '208': {'P_10': 0.1},
 '212': {'