In [1]:
import os
import time
from dotenv import load_dotenv
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import torch.nn.functional as F
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings, HuggingFaceBgeEmbeddings
from langchain.retrievers import BM25Retriever, TFIDFRetriever
from langchain.schema import Document
import pinecone
from datasets import load_from_disk
from peft import LoraConfig, get_peft_model, PeftModel, PeftConfig
from transformers import (AutoModelForCausalLM,
                          BitsAndBytesConfig,
                          AutoTokenizer,
                          GenerationConfig)


# Load variables from a local .env file into the process environment; the
# Pinecone cell later reads PINECONE_API_KEY / PINECONE_ENV via os.getenv.
# The trailing semicolon suppresses the cell's return-value display.
load_dotenv();


  from tqdm.autonotebook import tqdm


bin C:\ProgramData\Miniconda\envs\qlora\lib\site-packages\bitsandbytes\libbitsandbytes_cuda118.dll


In [2]:
class CustomEmbedding:
    """Text embedder that mean-pools the last hidden layer of a language model.

    Provides the ``embed_documents`` / ``embed_query`` pair, so an instance can
    be handed to a vector store as its embedding function.

    Args:
        model: Callable model that accepts the tokenizer output as keyword
            arguments plus ``output_hidden_states=True`` and returns an object
            exposing ``hidden_states``.
        tokenizer: Tokenizer providing ``batch_encode_plus``.
    """

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def _input_device(self):
        """Return the device the tokenized inputs should be moved to."""
        device = getattr(self.model, "device", None)
        if device is None:
            # The model does not advertise a device: keep the previous default
            # of CUDA, but fall back to CPU when no GPU is present.
            device = "cuda" if torch.cuda.is_available() else "cpu"
        return device

    @torch.inference_mode()
    def get_embeddings(self, text):
        """Embed a batch of strings.

        Args:
            text: Sequence of strings to embed.

        Returns:
            A list with one L2-normalised embedding (list of floats) per input
            string; an empty list when ``text`` is empty.
        """
        if len(text) == 0:
            return []

        inputs = self.tokenizer.batch_encode_plus(
            text, padding=True, return_tensors="pt"
        ).to(self._input_device())

        outputs = self.model(**inputs, output_hidden_states=True)
        last_layer_hidden = outputs.hidden_states[-1]

        # (batch, seq, 1) float mask; broadcasting zeroes out padded positions.
        mask = inputs["attention_mask"].unsqueeze(-1).float()
        summed = torch.sum(last_layer_hidden * mask, dim=1)

        # Mean pooling over real tokens. The clamp keeps a row whose mask is
        # entirely zero from dividing by zero and turning into NaN.
        token_counts = torch.sum(mask, dim=1).clamp(min=1)
        embedding = summed / token_counts

        # L2 normalization to have magnitude of 1
        return F.normalize(embedding, p=2, dim=1).tolist()

    def embed_documents(self, texts):
        """Embed several documents; returns one vector per document."""
        return self.get_embeddings(texts)

    def embed_query(self, query):
        """Embed a single query string and return its vector."""
        return self.get_embeddings([query])[0]


In [3]:
class DatasetRetriever:
    """Look up documents and queries of an evaluation dataset by their ids.

    The dataset is indexed with ``'docs'`` (rows carrying ``doc_id`` and
    ``doc_str``) and ``'qrels'`` (rows carrying ``query_id``, ``query`` and
    ``rel_doc_ids``).
    """

    def __init__(self, dataset):
        self.dataset = dataset
        self.doc_ids = dataset['docs']['doc_id']
        self.query_ids = dataset['qrels']['query_id']
        self.doc_idx = self._index_rows(dataset['docs'], 'doc_id')
        self.query_idx = self._index_rows(dataset['qrels'], 'query_id')

    @staticmethod
    def _index_rows(rows, key):
        """Map each row's ``key`` value to the row position (last one wins)."""
        positions = {}
        for position, row in enumerate(rows):
            positions[row[key]] = position
        return positions

    def get_doc(self, doc_id):
        """Return the text of document ``doc_id``, or None if it is unknown."""
        position = self.doc_idx.get(doc_id)
        if position is None:
            return None
        return self.dataset['docs'][position]['doc_str']

    def get_qa(self, query_id):
        """Return ``(query, rel_doc_ids)`` for ``query_id``, or None if unknown."""
        position = self.query_idx.get(query_id)
        if position is None:
            return None
        row = self.dataset['qrels'][position]
        return row['query'], row['rel_doc_ids']


def binary_relevance_labeling(results, answers, result_len=1000):
    """Turn a ranked result list into a 0/1 relevance vector.

    Args:
        results: Ranked search hits. Each hit is either a document object with
            a ``metadata`` mapping, or a sequence such as ``(document, score)``
            whose first element is that document.
        answers: Container of relevant ``doc_id`` values for the query.
        result_len: Length of the returned vector (the evaluation cut-off).

    Returns:
        Integer NumPy array of length ``result_len``; position ``i`` is 1 when
        the i-th hit's ``doc_id`` is in ``answers``. Positions beyond the
        number of returned hits stay 0.
    """
    result_array = np.zeros(result_len, dtype=int)

    # Slice instead of indexing range(result_len): a search may return fewer
    # hits than requested (or none), which used to raise IndexError.
    for rank, hit in enumerate(results[:result_len]):
        # Hits without a ``metadata`` attribute are (document, score) pairs.
        doc = hit if hasattr(hit, 'metadata') else hit[0]
        if doc.metadata['doc_id'] in answers:
            result_array[rank] = 1
    return result_array


def average_precision(results_array, num_rel_docs):
    """Compute average precision for one ranked result list.

    Args:
        results_array: 0/1 relevance labels in rank order (array-like).
        num_rel_docs: Number of relevant documents for the query; used as the
            normaliser.

    Returns:
        Sum of precision@rank over the ranks holding a relevant hit, divided
        by ``num_rel_docs``. Returns 0.0 when ``num_rel_docs`` is not positive,
        rather than dividing by zero and yielding NaN.
    """
    if num_rel_docs <= 0:
        return 0.0

    results_array = np.asarray(results_array)
    relevant_idx = np.where(results_array == 1)[0]
    # cumsum at a relevant rank = number of relevant hits seen so far;
    # dividing by the 1-based rank gives precision at that rank.
    precisions = np.cumsum(results_array)[relevant_idx] / (relevant_idx + 1)
    AP = np.sum(precisions) / num_rel_docs
    return AP


def evaluation_mAP_embedding(dataset, database, search_func='cos_sim', k=1000):
    """Evaluate a vector store with mean average precision at cut-off ``k``.

    Args:
        dataset: Evaluation dataset exposing ``'qrels'`` (with ``query_id``)
            and whatever ``DatasetRetriever`` reads from it.
        database: Vector store. ``similarity_search_with_score`` is used for
            ``'cos_sim'`` and ``max_marginal_relevance_search`` for ``'mmr'``.
        search_func: ``'cos_sim'``, ``'mmr'``, or a callable invoked as
            ``search_func(query, k=k)``.
        k: Number of hits requested per query and length of the label vector.

    Returns:
        The mean average precision over all queries (0.0 when there are no
        queries). The score is also printed.

    Raises:
        ValueError: If ``search_func`` is neither a known name nor a callable.
    """
    # Resolve the search strategy once. Looking the method up lazily means a
    # store lacking the MMR method still works for 'cos_sim'.
    if callable(search_func):
        search = search_func
        search_kwargs = {'k': k}
    elif search_func == 'cos_sim':
        search = database.similarity_search_with_score
        search_kwargs = {'k': k}
    elif search_func == 'mmr':
        # WIP: needs some fix on mmr
        search = database.max_marginal_relevance_search
        search_kwargs = {'k': k, 'fetch_k': 5 * k}
    else:
        raise ValueError(
            f"Unknown search_func {search_func!r}; expected 'cos_sim', 'mmr' or a callable"
        )

    qlist = dataset['qrels']['query_id']
    qlen = len(qlist)
    qret = DatasetRetriever(dataset)
    ap_arr = np.zeros(qlen, dtype=float)

    for i in tqdm(range(qlen), desc="Processing queries"):
        query, answers = qret.get_qa(qlist[i])
        q_result = search(query, **search_kwargs)
        result_arr = binary_relevance_labeling(q_result, answers, result_len=k)
        ap_arr[i] = average_precision(result_arr, len(answers))

    # np.mean of an empty array is NaN (with a warning); report 0.0 instead.
    mAP = float(np.mean(ap_arr)) if qlen else 0.0
    print(f"The mAP is {mAP:.3f}")
    return mAP


def evaluation_mAP_retriever(dataset, retriever, k=1000, reorder=False):
    """Evaluate a document retriever class with mean average precision at ``k``.

    Args:
        dataset: Evaluation dataset exposing ``'docs'`` (rows with ``doc_id``
            and ``doc_str``) and ``'qrels'`` (with ``query_id``).
        retriever: Retriever class (not an instance) offering
            ``from_documents``; the built instance must honour a ``k``
            attribute and provide ``get_relevant_documents``.
        k: Number of hits requested per query and length of the label vector.
        reorder: Currently unused; kept for interface compatibility.

    Returns:
        The mean average precision over all queries (0.0 when there are no
        queries). The score is also printed.
    """
    qlist = dataset['qrels']['query_id']
    qlen = len(qlist)
    qret = DatasetRetriever(dataset)
    ap_arr = np.zeros(qlen, dtype=float)

    # Build the index under a separate name so the ``retriever`` argument
    # keeps referring to the class that was passed in.
    corpus = [
        Document(page_content=row['doc_str'], metadata={'doc_id': row['doc_id']})
        for row in dataset['docs']
    ]
    fitted_retriever = retriever.from_documents(corpus)
    fitted_retriever.k = k

    for i in tqdm(range(qlen), desc="Processing queries"):
        query, answers = qret.get_qa(qlist[i])
        q_result = fitted_retriever.get_relevant_documents(query)
        result_arr = binary_relevance_labeling(q_result, answers, result_len=k)
        ap_arr[i] = average_precision(result_arr, len(answers))

    # np.mean of an empty array is NaN (with a warning); report 0.0 instead.
    mAP = float(np.mean(ap_arr)) if qlen else 0.0
    print(f"The mAP@{k} is {mAP:.3f}")
    return mAP


In [4]:
# Evaluation dataset loaded from a local directory; the evaluation helpers
# read its 'docs' and 'qrels' entries.
nf = load_from_disk('data/eval_nf')


In [5]:
# Both values are interpolated into the adapter checkpoint path when the
# LoRA adapter is loaded.
ckpt = 60  # author's note lists 20 40 60 80 100 — presumably the saved checkpoint steps; confirm
lora_rank = 128  # selects the "llama_<rank>" checkpoint directory

In [6]:
# Base model location and the LoRA adapter checkpoint chosen by the config cell.
model_path = "llms/Llama-2-13b-chat-hf"
adapter_path = f"checkpoints/llama_{lora_rank}/checkpoint-{ckpt}"

# Request 8-bit loading through BitsAndBytesConfig: passing ``load_in_8bit``
# directly to ``from_pretrained`` is deprecated in transformers.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    padding_side="right",
    use_fast=False,
    trust_remote_code=True
)

# Attach the fine-tuned adapter weights on top of the quantized base model.
model = PeftModel.from_pretrained(model, adapter_path)
# Reuse EOS as the padding token so batched encoding with padding=True works.
tokenizer.pad_token_id = model.config.eos_token_id


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# Wrap the adapter-augmented model so the vector store can embed with it.
llama_embedding = CustomEmbedding(model=model, tokenizer=tokenizer)

# Open the persisted Chroma collection that is queried with these embeddings.
nf_llama = Chroma(
    embedding_function=llama_embedding,
    persist_directory='database/nf_llama_qlora_best_bertscore',
    collection_name='nf_eval',
)


In [None]:
# Score the LLaMA-backed store at both cut-offs, larger one first.
for top_k in (100, 10):
    evaluation_mAP_embedding(
        dataset=nf,
        database=nf_llama,
        search_func='cos_sim',
        k=top_k,
    )


Processing queries: 100%|██████████| 323/323 [01:20<00:00,  4.00it/s]


The mAP is 0.024


Processing queries:  28%|██▊       | 89/323 [00:21<00:56,  4.12it/s]

## Sentence Transformers all-MiniLM-L6-v2

In [5]:
# Embedding function shared by the Pinecone and Chroma stores in this section.
hf_embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


### 1. Pinecone

In [6]:
# Credentials come from the environment (populated by load_dotenv).
pinecone.init(
    environment=os.getenv('PINECONE_ENV'),
    api_key=os.getenv('PINECONE_API_KEY'),
)

# NOTE(review): the index name's spelling presumably matches the remote
# index — confirm before renaming it.
nf_hf_pinecone = Pinecone(
    text_key='text',
    embedding=hf_embedding,
    index=pinecone.Index('sentense-transformers'),
)

# Score the Pinecone-backed store at both cut-offs, larger one first.
for top_k in (100, 10):
    evaluation_mAP_embedding(
        dataset=nf,
        database=nf_hf_pinecone,
        search_func='cos_sim',
        k=top_k,
    )


Processing queries: 100%|██████████| 323/323 [04:11<00:00,  1.28it/s]


The mAP is 0.144


Processing queries: 100%|██████████| 323/323 [00:33<00:00,  9.55it/s]

The mAP is 0.112





### 2. Chroma

In [6]:
# Open the persisted Chroma collection queried with the MiniLM embeddings.
nf_hf = Chroma(
    embedding_function=hf_embedding,
    persist_directory='database/nf_hf',
    collection_name='nf_eval',
)

# Score the Chroma-backed store at both cut-offs, larger one first.
for top_k in (100, 10):
    evaluation_mAP_embedding(
        dataset=nf,
        database=nf_hf,
        search_func='cos_sim',
        k=top_k,
    )


Processing queries: 100%|██████████| 323/323 [00:05<00:00, 57.80it/s] 


The mAP is 0.142


Processing queries: 100%|██████████| 323/323 [00:01<00:00, 201.27it/s]

The mAP is 0.093





## BGE-base-en

In [8]:
# BGE embedder configured to run on CUDA and to emit normalised vectors.
hf_bge_norm = HuggingFaceBgeEmbeddings(
    encode_kwargs={'normalize_embeddings': True},
    model_kwargs={'device': 'cuda'},
    model_name="BAAI/bge-base-en",
)

# Open the persisted Chroma collection queried with the BGE embeddings.
nf_hf_bge = Chroma(
    embedding_function=hf_bge_norm,
    persist_directory='database/nf_hf_bge',
    collection_name='nf_eval',
)

# Score the BGE-backed store at both cut-offs, larger one first.
for top_k in (100, 10):
    evaluation_mAP_embedding(
        dataset=nf,
        database=nf_hf_bge,
        search_func='cos_sim',
        k=top_k,
    )


Processing queries: 100%|██████████| 323/323 [00:03<00:00, 92.22it/s] 


The mAP is 0.171


Processing queries: 100%|██████████| 323/323 [00:02<00:00, 118.27it/s]

The mAP is 0.107





## Word-frequency-based retrievers

In [9]:
# Score BM25 at both cut-offs, larger one first.
for top_k in (100, 10):
    evaluation_mAP_retriever(
        dataset=nf,
        retriever=BM25Retriever,
        k=top_k,
    )


Processing queries: 100%|██████████| 323/323 [00:00<00:00, 564.83it/s]


The mAP@100 is 0.088


Processing queries: 100%|██████████| 323/323 [00:00<00:00, 592.66it/s]

The mAP@10 is 0.076





In [10]:
# Score TF-IDF at both cut-offs, larger one first.
for top_k in (100, 10):
    evaluation_mAP_retriever(
        dataset=nf,
        retriever=TFIDFRetriever,
        k=top_k,
    )


Processing queries: 100%|██████████| 323/323 [00:01<00:00, 285.83it/s]


The mAP@100 is 0.129


Processing queries: 100%|██████████| 323/323 [00:01<00:00, 284.24it/s]

The mAP@10 is 0.108



