In [1]:
import os
import time
import pandas as pd
from tqdm import tqdm
import torch
import torch.nn.functional as F
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings, HuggingFaceBgeEmbeddings
from chromadb.api.types import Documents, EmbeddingFunction, Embeddings
from datasets import load_from_disk
from peft import LoraConfig, get_peft_model, PeftModel, PeftConfig
from transformers import (AutoModelForCausalLM,
                          BitsAndBytesConfig,
                          AutoTokenizer,
                          GenerationConfig)


bin C:\ProgramData\Miniconda\envs\qlora\lib\site-packages\bitsandbytes\libbitsandbytes_cuda118.dll


In [2]:
class CustomEmbedding:
    """Text embeddings obtained by masked mean pooling over a model's last hidden layer.

    Exposes ``embed_documents`` / ``embed_query`` so an instance can be handed
    to a vector store as its ``embedding_function``.

    Args:
        model: Callable that accepts the tokenizer output as keyword arguments
            together with ``output_hidden_states=True`` and returns an object
            with a ``hidden_states`` sequence.
        tokenizer: Object exposing ``batch_encode_plus``.
        device: Device the tokenized inputs are moved to. When ``None``
            (default) the device is inferred from ``model`` on every call
            instead of being hard-coded to ``"cuda"``.
    """

    def __init__(self, model, tokenizer, device=None):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device

    def _input_device(self):
        """Return the device on which the tokenized inputs should be placed."""
        if self.device is not None:
            return self.device
        model_device = getattr(self.model, "device", None)
        if model_device is not None:
            return model_device
        try:
            return next(self.model.parameters()).device
        except (AttributeError, StopIteration, TypeError):
            # Last resort when the model exposes neither `.device` nor parameters.
            return torch.device("cuda" if torch.cuda.is_available() else "cpu")

    @torch.inference_mode()
    def get_embeddings(self, text):
        """Embed a batch of texts.

        Args:
            text: Sequence of strings; each is truncated to 512 tokens.

        Returns:
            A list with one L2-normalized embedding (list of floats) per input
            text; an empty list when ``text`` is empty.
        """
        if len(text) == 0:
            # Nothing to encode; skip the tokenizer and model entirely.
            return []

        inputs = self.tokenizer.batch_encode_plus(text,
                                                  padding=True,
                                                  return_tensors="pt",
                                                  max_length=512,
                                                  truncation=True).to(self._input_device())
        attn_mask = inputs["attention_mask"]

        outputs = self.model(**inputs, output_hidden_states=True)
        last_layer_hidden = outputs.hidden_states[-1]

        # Keep the mask on the same device as the hidden states; the two can
        # differ when the model is spread over several devices.
        mask = attn_mask.to(last_layer_hidden.device)
        mask = mask.unsqueeze(-1).expand(last_layer_hidden.size()).float()

        # Mean pooling over non-padding positions only.
        summed = torch.sum(last_layer_hidden * mask, dim=1)
        # Clamp so a row without any attended token cannot divide by zero.
        token_counts = torch.sum(mask, dim=1).clamp(min=1e-9)
        embedding = summed / token_counts
        return F.normalize(embedding, p=2, dim=1).tolist()

    def embed_documents(self, texts):
        """Return one embedding per entry of ``texts``."""
        return self.get_embeddings(texts)

    def embed_query(self, query):
        """Return the embedding of a single query string."""
        return self.get_embeddings([query])[0]


In [3]:
class Retriever:
    """Id-based lookup over the ``'docs'`` and ``'qrels'`` tables of a dataset.

    Builds id -> row-position maps once so that later lookups do not scan
    the tables.
    """

    def __init__(self, dataset):
        self.dataset = dataset
        docs_table = dataset['docs']
        qrels_table = dataset['qrels']

        self.doc_ids = docs_table['doc_id']
        self.query_ids = qrels_table['query_id']

        # Later rows overwrite earlier ones when an id occurs more than once.
        self.doc_idx = {}
        for position, row in enumerate(docs_table):
            self.doc_idx[row['doc_id']] = position

        self.query_idx = {}
        for position, row in enumerate(qrels_table):
            self.query_idx[row['query_id']] = position

    def get_doc(self, doc_id):
        """Return the ``doc_str`` of ``doc_id``, or ``None`` if the id is unknown."""
        position = self.doc_idx.get(doc_id)
        if position is None:
            return None
        return self.dataset['docs'][position]['doc_str']

    def get_qa(self, query_id):
        """Return ``(query, rel_doc_ids)`` for ``query_id``, or ``None`` if the id is unknown."""
        position = self.query_idx.get(query_id)
        if position is None:
            return None
        row = self.dataset['qrels'][position]
        return row['query'], row['rel_doc_ids']


In [4]:
def store_embeddings(dataset,
                     database,
                     batch_size=4,
                     store_per_n_docs=64
                    ):
    """Add every dataset document that is not yet in ``database``.

    Documents whose ``doc_id`` is already among the stored ids are skipped, so
    an interrupted run can be resumed by calling the function again.

    Args:
        dataset: Mapping with a ``'docs'`` table providing ``doc_id`` and
            ``doc_str`` (and whatever else ``Retriever`` reads).
        database: Vector store exposing ``get()``, ``add_texts()`` and
            ``persist()``.
        batch_size: Number of documents passed to ``add_texts`` per call.
        store_per_n_docs: Persist the database once at least this many
            documents were added since the previous persist.

    Raises:
        ValueError: If ``batch_size`` or ``store_per_n_docs`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if store_per_n_docs < 1:
        raise ValueError(f"store_per_n_docs must be >= 1, got {store_per_n_docs}")

    # dict.fromkeys drops duplicate ids while keeping dataset order, so the
    # processing order is reproducible (a set difference has arbitrary order).
    doc_ids = list(dict.fromkeys(dataset['docs']['doc_id']))
    stored_ids = set(database.get()['ids'])
    unstored_docs = [doc_id for doc_id in doc_ids if doc_id not in stored_ids]
    len_unstored_docs = len(unstored_docs)

    print(f"NO. of documents in the dataset:  {len(doc_ids)}")
    print(f"NO. of unstored document:         {len(unstored_docs)}\n")

    if len_unstored_docs == 0:
        print("All documents and embeddings are stored.\n")
        return

    print("Storing documents and embeddings into database...\n")
    dret = Retriever(dataset)
    docs_since_persist = 0
    for i in tqdm(range(0, len_unstored_docs, batch_size), desc="Processing docs"):
        batch_ids = unstored_docs[i:i + batch_size]
        batch_texts = [dret.get_doc(doc_id) for doc_id in batch_ids]
        database.add_texts(texts=batch_texts,
                           metadatas=[{'doc_id': doc_id} for doc_id in batch_ids],
                           ids=batch_ids)

        # Count since the last persist instead of testing the running total
        # with `%`: the modulo form only fires when the total lands exactly on
        # a multiple, which it rarely does when batch_size does not divide
        # store_per_n_docs.
        docs_since_persist += len(batch_ids)
        if docs_since_persist >= store_per_n_docs:
            database.persist()
            torch.cuda.empty_cache()
            docs_since_persist = 0

    # Final flush for whatever was added after the last periodic persist.
    database.persist()
    torch.cuda.empty_cache()
    print(f"Process completed. NO. of documents in the database: {database._collection.count()}")


In [5]:
# Load the evaluation dataset saved under data/eval_nf; later cells read its
# 'docs' and 'qrels' entries.
nf = load_from_disk('data/eval_nf')


In [6]:
# Checkpoint number interpolated into the adapter path ("checkpoint-{ckpt}").
ckpt = 100  # values noted by the author: 20 40 60 80 100
# Interpolated into the adapter directory name ("llama_{lora_rank}");
# presumably the LoRA rank the adapter was trained with -- TODO confirm.
lora_rank = 64

In [7]:
# Base model directory and the adapter checkpoint selected by `lora_rank` / `ckpt`.
model_path = "llms/Llama-2-13b-chat-hf"
adapter_path = f"checkpoints/llama_{lora_rank}/checkpoint-{ckpt}"

# Request 8-bit loading through BitsAndBytesConfig (already imported above);
# the bare `load_in_8bit=True` keyword is the deprecated spelling.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    padding_side="right",
    use_fast=False,
    trust_remote_code=True
)

# Attach the fine-tuned adapter weights on top of the quantized base model.
model = PeftModel.from_pretrained(model, adapter_path)
# The model is only used to produce embeddings: switch to eval mode so that
# any dropout layers are inactive and the embeddings are deterministic.
model.eval()
# Pad with the EOS token so batches can be padded (presumably the tokenizer
# has no pad token of its own -- TODO confirm); CustomEmbedding masks padded
# positions out before pooling.
tokenizer.pad_token_id = model.config.eos_token_id


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
# Wrap the loaded model and tokenizer so Chroma can call
# embed_documents / embed_query on them.
llama_embedding = CustomEmbedding(model=model, tokenizer=tokenizer)

# Vector store for the Llama embeddings, persisted in its own directory.
nf_llama = Chroma(collection_name='nf_eval',
                  embedding_function=llama_embedding,
                  persist_directory='database/nf_llama_qlora_best_f1')


In [9]:
# Add every not-yet-stored document to the Llama-backed store.
store_embeddings(nf, nf_llama, batch_size=4)


NO. of documents in the dataset:  3633
NO. of unstored document:         3633

Storing documents and embeddings into database...



Processing docs: 100%|██████████| 909/909 [08:48<00:00,  1.72it/s]

Process completed. NO. of documents in the database: 3633





In [6]:
# Embedding function backed by the all-MiniLM-L6-v2 model.
hf_embedding = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
)

# Same collection name as the other stores, but a separate persist directory.
nf_hf = Chroma(collection_name='nf_eval',
               embedding_function=hf_embedding,
               persist_directory='database/nf_hf')


In [7]:
# Add every not-yet-stored document to the MiniLM-backed store.
store_embeddings(nf, nf_hf, batch_size=4)


NO. of documents in the dataset:  3633
NO. of unstored document:         3633

Storing documents and embeddings into database...



Processing docs: 100%|██████████| 909/909 [00:36<00:00, 24.98it/s]

Process completed. NO. of documents in the database: 3633





In [9]:
# BGE embedding function with normalized output vectors. Use the GPU when one
# is available and fall back to the CPU otherwise, instead of hard-coding
# 'cuda' (which fails on a machine without CUDA).
hf_bge_norm = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-base-en",
    model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)

# Same collection name as the other stores, but a separate persist directory.
nf_hf_bge = Chroma(
    collection_name='nf_eval',
    persist_directory='database/nf_hf_bge',
    embedding_function=hf_bge_norm
)


In [10]:
# Add every not-yet-stored document to the BGE-backed store.
store_embeddings(nf, nf_hf_bge, batch_size=4)


NO. of documents in the dataset:  3633
NO. of unstored document:         3633

Storing documents and embeddings into database...



Processing docs: 100%|██████████| 909/909 [00:29<00:00, 30.38it/s]

Process completed. NO. of documents in the database: 3633



