In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]= "6"

In [2]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
logging.set_verbosity_info()


In [4]:
model_name = "NousResearch/Llama-2-7b-chat-hf"
new_model = "llama-2-7b-IR2"
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1
use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False
output_dir = "./results"
num_train_epochs = 1
fp16 = False
bf16 = False
per_device_train_batch_size = 16
per_device_eval_batch_size = 16
gradient_accumulation_steps = 1
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "cosine"
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
save_steps = 0
logging_steps = 25
max_seq_length = None
packing = False
device_map = {"": 0}


In [5]:
import torch
from transformers import AutoModel, AutoTokenizer
from peft import PeftModel, PeftConfig

In [6]:
# import torch
# from peft import PeftModel
# from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# def get_model(peft_model_name):
#     config = PeftConfig.from_pretrained(peft_model_name)
#     base_model = AutoModel.from_pretrained(config.base_model_name_or_path, device_map={"": 0})
#     model = PeftModel.from_pretrained(base_model, peft_model_name)
#     model = model.merge_and_unload()
#     model.eval()
#     return model

# # Load the tokenizer and model
# tokenizer = AutoTokenizer.from_pretrained('NousResearch/Llama-2-7b-chat-hf')
# model = get_model("../models/Llama_new")

# model = model.to(device)

In [7]:
# model

In [8]:
import os
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from beir import util, LoggingHandler
from beir.retrieval import models
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
from beir.retrieval.evaluation import EvaluateRetrieval
import logging
import collections
import pytrec_eval

logging.basicConfig(
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
    handlers=[LoggingHandler()]
)

# Define file paths
corpus_file = "tiny_collection.json"
queries_file = "topics.dl20.txt"
qrels_test_file = "qrels.dl20-passage.txt"
dataset_path = "../beir/datasets/msmarco_tiny/"

# Load queries
def load_queries(path):
    queries = {}
    with open(path) as f:
        for line in f:
            query_id, query_text = line.strip().split('\t')
            queries[query_id] = query_text
    return queries

# Load qrels
def load_qrels(path):
    with open(path, 'r') as f_qrel:
        qrels = pytrec_eval.parse_qrel(f_qrel)
    return qrels

# Load corpus
def load_corpus_json(path):
    with open(path, 'r') as corpus_f:
        corpus_json = json.load(corpus_f)
    return corpus_json

qrels = load_qrels(os.path.join(dataset_path, qrels_test_file))
queries = load_queries(os.path.join(dataset_path, queries_file))
corpus = load_corpus_json(os.path.join(dataset_path, corpus_file))


In [9]:
# def get_embed_dataset(input_lst):

#     input_txt = [f'{input}</s>' for input in input_lst['text']]

#     tokenized_inputs = tokenizer(input_txt, return_tensors='pt', padding="max_length", truncation=True, max_length=tokenizer_max_len)
#     tokenized_inputs = tokenized_inputs.to(device)
#     with torch.no_grad():
#         # compute query embedding
#         outputs = model(**tokenized_inputs)
#         embedding = outputs.last_hidden_state[:,-1,:]   #outputs.last_hidden_state[0][-1] # Get embedding of last token i.e. <s>
#         embedding = torch.nn.functional.normalize(embedding, p=2, dim=0)
#     return embedding

# def get_embed(input):

#     tokenized_inputs = tokenizer(f'{input}</s>', return_tensors='pt')
#     tokenized_inputs = tokenized_inputs.to(device)
#     with torch.no_grad():
#         # compute query embedding
#         outputs = model(**tokenized_inputs)
#         embedding = outputs.last_hidden_state[0][-1] #outputs.last_hidden_state[:,-1,:]  # Get embedding of last token i.e. </s>
#         embedding = torch.nn.functional.normalize(embedding, p=2, dim=0)
#     return embedding

In [10]:
# from tqdm import tqdm

# query_embeddings = {}
# doc_embeddings = {}

# print("Encoding queries ...")
# for k,q in tqdm(queries.items()):
#     query_embed = get_embed(q)
#     query_embeddings[k] = query_embed

# import pickle

# # Can't save tensors to json
# with open(f"{dataset_path}queries_newllamaEmbed.pickle", 'wb') as f:
#     pickle.dump(query_embeddings, f, protocol=pickle.HIGHEST_PROTOCOL)

# print("Encoding passages ...")
# i = 0
# for k,q in tqdm(corpus.items()):
#     i+=1
#     doc_embed = get_embed(q['text'])
#     doc_embeddings[k] = doc_embed

#     if i%50_000==0:
#         with open(f"{dataset_path}corpus_newllamaEmbed_{i}.pickle", 'wb') as f:
#             pickle.dump(doc_embeddings, f, protocol=pickle.HIGHEST_PROTOCOL)


# with open(f"{dataset_path}corpus_newllamaEmbed.pickle", 'wb') as f:
#     pickle.dump(doc_embeddings, f, protocol=pickle.HIGHEST_PROTOCOL)

# Evaluating

In [11]:
import pickle

with open(f"{dataset_path}corpus_newllamaEmbed.pickle", 'rb') as f:
    doc_embeddings = pickle.load(f)

In [12]:
with open(f"{dataset_path}queries_newllamaEmbed.pickle", 'rb') as f:
    query_embeddings = pickle.load(f)

In [13]:
from tqdm import tqdm

results = {}
for q_id, q_embed in tqdm(query_embeddings.items()):
    results[q_id] = {}
    for d_id, d_embed in doc_embeddings.items():
        # compute similarity score
        score = torch.dot(q_embed, d_embed)
        results[q_id][d_id] = score.item() #.item() to get value out of tensor


100%|██████████| 54/54 [10:07<00:00, 11.26s/it]


In [16]:
import pytrec_eval

metric = 'ndcg_cut_10'
evaluator = pytrec_eval.RelevanceEvaluator(qrels, {metric})
results_metric = evaluator.evaluate(results)
print(f'nDCG@10: {sum(item[metric] for item in results_metric.values()) / len(results_metric):0.4f}')

nDCG@10: 0.0283


In [15]:
from beir.retrieval.search.lexical import BM25Search as BM25
from beir.retrieval.evaluation import EvaluateRetrieval

hostname = "localhost"
index_name = "msmarco_abd"
initialize = True

model = BM25(index_name=index_name, hostname=hostname, initialize=initialize)
retriever = EvaluateRetrieval(model)
# model doesn't do anything and results for ndcg and map are same as pytrec_eval

retriever.evaluate(qrels, results, retriever.k_values)

2024-06-02 08:22:47 - Activating Elasticsearch....
2024-06-02 08:22:47 - Elastic Search Credentials: {'hostname': 'localhost', 'index_name': 'msmarco_abd', 'keys': {'title': 'title', 'body': 'txt'}, 'timeout': 100, 'retry_on_timeout': True, 'maxsize': 24, 'number_of_shards': 'default', 'language': 'english'}
2024-06-02 08:22:47 - Deleting previous Elasticsearch-Index named - msmarco_abd
2024-06-02 08:22:47 - Unable to create Index in Elastic Search. Reason: ConnectionError(('Connection aborted.', BadStatusLine('ÿ\x00\x00\x00\x00\x00\x00\x00\x01\x7fi\x00entity\r\n'))) caused by: ProtocolError(('Connection aborted.', BadStatusLine('ÿ\x00\x00\x00\x00\x00\x00\x00\x01\x7fi\x00entity\r\n')))
2024-06-02 08:22:49 - Creating fresh Elasticsearch-Index named - msmarco_abd
2024-06-02 08:22:49 - Unable to create Index in Elastic Search. Reason: ConnectionError(('Connection aborted.', BadStatusLine('ÿ\x00\x00\x00\x00\x00\x00\x00\x01\x7fn\x00-Length: 117\r\n'))) caused by: ProtocolError(('Connection 

({'NDCG@1': 0.01852,
  'NDCG@3': 0.03071,
  'NDCG@5': 0.02854,
  'NDCG@10': 0.02833,
  'NDCG@100': 0.03862,
  'NDCG@1000': 0.06866},
 {'MAP@1': 0.00088,
  'MAP@3': 0.00293,
  'MAP@5': 0.00313,
  'MAP@10': 0.00392,
  'MAP@100': 0.00652,
  'MAP@1000': 0.00828},
 {'Recall@1': 0.00088,
  'Recall@3': 0.00421,
  'Recall@5': 0.00499,
  'Recall@10': 0.00886,
  'Recall@100': 0.03895,
  'Recall@1000': 0.12443},
 {'P@1': 0.01852,
  'P@3': 0.05556,
  'P@5': 0.04444,
  'P@10': 0.03704,
  'P@100': 0.02241,
  'P@1000': 0.00696})