In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# import locale
# locale.getpreferredencoding = lambda: "UTF-8"

# Install necessary packages

In [2]:
!pip install bs4 datasets torch langchain langchain-community transformers faiss-gpu bitsandbytes sentence-transformers accelerate langchain-huggingface

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting langchain
  Downloading langchain-0.2.16-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-community
  Downloading langchain_community-0.2.16-py3-none-any.whl.metadata (2.7 kB)
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-0.0.3-py3-none-any.whl.metadata (1.2 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloa

# Prepare dataset

In [3]:
import torch
import pandas as pd
from bs4 import BeautifulSoup
# from huggingface_hub import InferenceClient, login
# login(token=os.environ[MY_HF_TOKEN])
from langchain.text_splitter import RecursiveCharacterTextSplitter

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [5]:
content = pd.read_csv('/content/drive/MyDrive/RAG/content.csv')

In [6]:
URLS = list()
questions = list()
visited = set()
company_names = list()

for _, r in content.iterrows():
  url, company_name = r['url'].strip(), r['company_name'].strip()
  if (url, company_name) not in visited:
    URLS.append(url)
    questions.append(f'What does the company {company_name} do?')
    company_names.append(company_name)
    visited.add((url, company_name))

print(f'Num unique urls : {len(URLS)}')
assert len(URLS) == len(questions) == len(company_names)

Num unique urls : 518


In [7]:
questions_and_context = pd.DataFrame({'questions':questions, 'company':company_names})

In [8]:
import nest_asyncio
nest_asyncio.apply()
from langchain_community.document_loaders import WebBaseLoader

dataset = WebBaseLoader(URLS)
dataset.requests_per_second = 10
dataloader = dataset.aload()

Fetching pages: 100%|##########| 518/518 [00:48<00:00, 10.67it/s]


# Split the documents and recreate chunks

In [9]:
embedding_model_name = 'BAAI/bge-large-en-v1.5'

In [10]:
#

# text_splitter = RecursiveCharacterTextSplitter()
# documents = text_splitter.split_documents(dataloader)

In [11]:
# len(documents)

# Do Chunk Length EDA

In [12]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained(embedding_model_name, device_map=device)


print(f'Max tokenizer length: {tokenizer.model_max_length}')

lengths = [len(tokenizer.encode(doc.page_content)) for doc in documents]

pd.Series(lengths).hist()
print(f'Max chunk length:{pd.Series(lengths).max()}')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

# Build the retrieval flow

In [13]:
# Adjust chunk length again as 3627 > 512 which can lead to truncation at the stage of storing in VectorStores
chunk_length = tokenizer.model_max_length
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(tokenizer, chunk_size=chunk_length, chunk_overlap = chunk_length//10)
documents = text_splitter.split_documents(dataloader)

# Redo the histogram test
lengths = [len(tokenizer.encode(doc.page_content)) for doc in documents]

pd.Series(lengths).hist()
max_len = pd.Series(lengths).max()

print(f'Max chunk length:{max_len}')
print(f'Max model length:{chunk_length}')



Token indices sequence length is longer than the specified maximum sequence length for this model (3103 > 512). Running this sequence through the model will result in indexing errors


In [14]:
len(documents)

4812

In [15]:
import os
import nest_asyncio
from transformers import AutoModelForSequenceClassification
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain.vectorstores.utils import DistanceStrategy
from langchain_community.vectorstores import FAISS
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
save_path = f'/content/drive/MyDrive/RAG/faiss_index_{embedding_model_name}'
#async def load_or_create_faiss_vectorstore():
if os.path.exists(save_path):
    faiss_vectorstore = FAISS.load_local(folder_path=save_path, allow_dangerous_deserialization=True, embeddings=HuggingFaceEmbeddings(model_name=embedding_model_name, model_kwargs={'device': 'cuda'}, encode_kwargs={'batch_size': 300, 'normalize_embeddings':True}, show_progress=True))
else:
    faiss_vectorstore = await FAISS.afrom_documents(documents, HuggingFaceEmbeddings(model_name=embedding_model_name, model_kwargs={'device': 'cuda'}, encode_kwargs={'batch_size': 300, 'normalize_embeddings':True}, show_progress=True), distance_strategy = DistanceStrategy.COSINE)
    faiss_vectorstore.save_local(save_path)

faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs={'k':6})

#Reranker
model_name = 'BAAI/bge-reranker-large'
cross_encoding_reranker = HuggingFaceCrossEncoder(model_name=model_name, model_kwargs={'device': device})

cross_encoder_wrapper = CrossEncoderReranker(model=cross_encoding_reranker, top_n=4)
faiss_retriever = ContextualCompressionRetriever(base_compressor=cross_encoder_wrapper, base_retriever=faiss_retriever)


  faiss_vectorstore = FAISS.load_local(folder_path=save_path, allow_dangerous_deserialization=True, embeddings=HuggingFaceEmbeddings(model_name=embedding_model_name, model_kwargs={'device': 'cuda'}, encode_kwargs={'batch_size': 300, 'normalize_embeddings':True}, show_progress=True))


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

# Agentic Query transformation

In [16]:
#Define the retriever tool

from transformers import Tool
from langchain_core.vectorstores.base import VectorStore

class QueryTransformandRetrieve(Tool):
  name = "Retriever"
  description = "Based on semantic similarity retrieve the documents from the knowledge base which are most sematically similar to the query using embeddings from both \
  query embeddings and document embeddings"
  inputs = {
      'query':{
          'type':'text',
          'description':'The query to be used to perform retrieval. It should be in affirmative form, not interrogative form such that it is semantically close to the\
          target documents'
      }
  }

  output_type = "text"

  def __init__(self, vectordb:VectorStore, **kwargs):
    super().__init__(**kwargs)
    self.vectordb = vectordb

  def forward(self, query: str) -> str:
    context = self.vectordb.similarity_search(query, k=4)

    return f"Retrieved documents: "+''.join([f"\n------Document {str(i)} -------: {doc.page_content}" for i, doc in enumerate(context)])


In [None]:
HfEngine?

In [19]:
#Define llm_engine and reactjsonagent
from transformers import HfEngine, ReactJsonAgent

llm_engine = HfEngine('meta-llama/Meta-Llama-3-8B-Instruct')
agent = ReactJsonAgent(tools=[QueryTransformandRetrieve(faiss_vectorstore)], llm_engine=llm_engine, max_iterations=4, verbose=2)

In [20]:
def AgenticRAGPipeline(question):
  enhanced_query = f"You are an assistant who will give extremely relevant answers for the question that is asked. You have already been given the context and the question.\
  You can access the context from the knowledge base of the tool called 'Retriever'. You must give a thorough answer taking all the necessary points into account. \
  Further cover the question completely by calling the retriever multiple times each time with semantically different queries. The query needs to be in affirmative language\
  at the same time capturing the original meaning of the interrogative form of query eg. Interrogative form: Who is Shaunak? Affirmative form: Shaunak's identity is being asked\
  This is the question: {question}"

  answer = agent.run(enhanced_query)
  return answer

In [None]:
agentic_rag_answers = [AgenticRAGPipeline(question)for question in questions]

# Non Agentic RAG

# Load, quantized the generator LLM and build a generation pipeline

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
generation_model = AutoModelForCausalLM.from_pretrained('HuggingFaceH4/zephyr-7b-beta', quantization_config=quant_config, device_map=device)
generation_tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta', device_map=device)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [None]:
from langchain import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_huggingface import HuggingFacePipeline
from transformers import pipeline

prompt = """
<|system|>

You are an assistant who will give extremely relevant answers for the questions asked. You have already been given the context and the question. The question should capture information correctly from the context given and give a thorough answer
taking all the necessary points into account. Also don't hallucinate on the answer and don't provide metadata details to the answer.

<|user|>
context: {context}
question: {question}
<|assistant|>
"""

prompt_template = PromptTemplate(template=prompt, input_variables=["context", "question"])
hf_pipeline = pipeline(
    "text-generation",
    model=generation_model,
    tokenizer=generation_tokenizer,
    return_full_text=False,
    max_new_tokens=400,
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1)
text_generation_pipeline = HuggingFacePipeline(pipeline=hf_pipeline)
generation_chain = prompt_template | text_generation_pipeline | StrOutputParser()

In [None]:
from langchain_core.runnables import RunnablePassthrough

rag_chain = {'context': faiss_retriever, 'question': RunnablePassthrough()} | generation_chain

In [None]:
print(rag_chain.invoke(question))

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Based on the provided context, it can be concluded that the company Aboitiz is involved in multiple businesses. From the given information, we know that Aboitiz Power Corporation is a vertically integrated company engaged in power generation, power distribution, and retail electricity services. They generate power from a mix of renewable and thermal sources through their Generation Business Group, which includes hydroelectric power plants developed by their subsidiary Hedcor. Additionally, the Aboitiz Group, which started as a hemp trading business, now has investments in power, banking and financial services, food, land, construction, shipbuilding, and infrastructure. This information suggests that Aboitiz is involved in various industries such as power, finance, food, real estate, construction, shipping, and infrastructure.


# Evaluation

In [None]:
import random
from datasets import Dataset
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
#Add context to the questions using retriever pipeline
if not os.path.exists('/content/drive/MyDrive/RAG/results_bge_large_reranked.csv'):
  def get_context_and_answer(question_batch):
    answer_batch = list()
    context_batch = list()
    for question in question_batch:
      context = [elem.page_content for elem in faiss_retriever.invoke(question)]
      torch.cuda.empty_cache()
      answer = rag_chain.invoke(question)
      answer_batch.append(answer)
      context_batch.append(context)
    return question_batch, context_batch, answer_batch

  batch_size = 2
  question_list = questions_and_context['questions'].tolist()[:len(questions_and_context)//10]
  random.shuffle(question_list)
  #print(len(question_list))
  questions_batches = [question_list[i:i+batch_size] for i in range(0, len(question_list), batch_size)]

  with ThreadPoolExecutor() as executor:
    results = list(tqdm(executor.map(get_context_and_answer, questions_batches), total = len(questions_batches)))

  new_results = list()
  for result in tqdm(results):
    questions, contexts, answers = result
    for question, context, answer in zip(questions, contexts, answers):
      new_results.append((question, context, answer))

  new_results_df = pd.DataFrame(new_results, columns=['question', 'context', 'answer'])
  new_results_df.to_csv('/content/drive/MyDrive/RAG/results_bge_large_reranked.csv', index=False)
else:
  new_results_df = pd.read_csv('/content/drive/MyDrive/RAG/results_bge_large_reranked.csv')

  0%|          | 0/26 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  4%|▍         | 1/26 [14:48<6:10:19, 888.77s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  8%|▊         | 2/26 [19:32<3:33:11, 532.97s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 23%|██▎       | 6/26 [20:55<46:04, 138.23s/it]  

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 35%|███▍      | 9/26 [20:57<21:31, 75.99s/it] 

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 46%|████▌     | 12/26 [24:45<17:43, 75.98s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 26/26 [36:11<00:00, 83.53s/it] 
100%|██████████| 26/26 [00:00<00:00, 24511.55it/s]


In [None]:
new_results_df

Unnamed: 0,question,context,answer
0,What does the company The Best Bees Company do?,[Get Started\n\n\n\n\n\n\n\n\n\n\n\n \n\n\n\n\...,The Best Bees Company installs and maintains h...
1,What does the company Big Media Expert do?,[E-COMMERCE\nStand out on the digital shelf.\n...,The company Big Media Expert offers a variety ...
2,What does the company Abelini do?,"[While we are committed to security, we cannot...",Abelini is a company that provides jewelry ser...
3,What does the company adabra do?,"[Adabra nasce Blendee, una Customer Data Platf...",Adabra is a Customer Data Platform (CDP) that ...
4,What does the company Action24 do?,[CONTACT OUR SALES TEAM\n\nNot sure what's rig...,Action24 is a company that provides monitored ...
5,What does the company aamartech.com do?,[Your IT journey starts here...\n\n\n\n\n\n\n\...,"AamarTech, a US-Bangladesh joint venture compa..."
6,What does the company 7EDGE Private Limited do?,[We care about your privacy\n7EDGE Internet Pr...,"Based on the provided context, it can be infer..."
7,What does the company Absolute Reg do?,[About us\nAbsolute Reg aims to provide Britis...,The company Absolute Reg specializes in provid...
8,What does the company Action24 do?,[CONTACT OUR SALES TEAM\n\nNot sure what's rig...,Action24 is a company that provides monitored ...
9,What does the company contact@acom.co.id do?,[Acom\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,"Based on the provided context, the company con..."


## Use evaluation LLM

In [None]:
EVALUATION_PROMPT = """
You will be given a user_question, system_answer and the context which was used to answer the question.
The context is in the form of a list of strings.

Your task is to provide a 'total rating' scoring based upon the following criteria:
1) Groundedness
2) Answer Relevance
3) Context Relevance

Let's dig into each one:
1) Groundedness: It means is the system_answer supported by the context. Give your answer on a scale of 1 to 4, where 1 means that the system_answer is least or not at all supported by the context,
 2 means the system_answer misses some key aspects of the context, 3 means the context supports the system_answer but still could be improved and 4 means that the system_answer is completely supported by the context.

2) Answer Relevance: It means is the system_answer relevant to the user_question. Give your answer on a scale of 1 to 4, where 1 means that the system_answer is least or not at all relevant to the user_question,
2 means the system_answer misses some key aspects of the user_question, 3 means the system_answer is relevant to the user_question but still could be improved and 4 means that the system_answer is completely relevant to the user_question.

3) Context Relevance: It means is the context relevant to the user_question. Give your answer on a scale of 1 to 4, where 1 means that the context is least or not at all relevant to the user_question,
2 means the context misses some key aspects of the user_question, 3 means the context is relevant to the user_question but still could be improved and 4 means that the context is completely relevant to the user_question.

Now here are the question and answer.

Question: {user_question}
Answer: {system_answer}
Context: {context}

Provide your feedback as follows:

Feedback for groundedness: (your rating, as a number between 1 and 4)
Feedback for answer relevance: (your rating, as a number between 1 and 4)
Feedback for context relevance: (your rating, as a number between 1 and 4)

Evaluation rationale for groundedness: (your reasoning for the rating)
Evaluation rationale for answer relevance: (your reasoning for the rating)
Evaluation rationale for context relevance: (your reasoning for the rating

Total rating: (your average rating, as a number between 1 and 4 for each of criteria and divided by 3)

Now here are the question and answer.

Question: {user_question}
Answer: {system_answer}
Context: {context}


 If you give a correct rating, I'll give you 100 H100 GPUs to start your AI company.
 """

In [None]:
repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# evaluation_model = AutoModelForCausalLM.from_pretrained(repo_id, device_map=device)
# evaluation_tokenizer = AutoTokenizer.from_pretrained(repo_id, device_map=device)

llm_client = InferenceClient(
    model=repo_id,
    timeout=120,
)



The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
new_results_df

Unnamed: 0,question,context,answer
0,What does the company The Best Bees Company do?,[Get Started\n\n\n\n\n\n\n\n\n\n\n\n \n\n\n\n\...,The Best Bees Company installs and maintains h...
1,What does the company Big Media Expert do?,[E-COMMERCE\nStand out on the digital shelf.\n...,The company Big Media Expert offers a variety ...
2,What does the company Abelini do?,"[While we are committed to security, we cannot...",Abelini is a company that provides jewelry ser...
3,What does the company adabra do?,"[Adabra nasce Blendee, una Customer Data Platf...",Adabra is a Customer Data Platform (CDP) that ...
4,What does the company Action24 do?,[CONTACT OUR SALES TEAM\n\nNot sure what's rig...,Action24 is a company that provides monitored ...
5,What does the company aamartech.com do?,[Your IT journey starts here...\n\n\n\n\n\n\n\...,"AamarTech, a US-Bangladesh joint venture compa..."
6,What does the company 7EDGE Private Limited do?,[We care about your privacy\n7EDGE Internet Pr...,"Based on the provided context, it can be infer..."
7,What does the company Absolute Reg do?,[About us\nAbsolute Reg aims to provide Britis...,The company Absolute Reg specializes in provid...
8,What does the company Action24 do?,[CONTACT OUR SALES TEAM\n\nNot sure what's rig...,Action24 is a company that provides monitored ...
9,What does the company contact@acom.co.id do?,[Acom\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,"Based on the provided context, the company con..."


In [None]:
import re
regex = r'(?<=groundedness: )\d|(?<=answer relevance: )\d|(?<=context relevance: )\d'
def get_evals(row):
  #ratings = list()
  #for question in question_batch:
  question = row['question']
  context = row['context']
  system_answer = row['answer']
  torch.cuda.empty_cache()
  evaluation = llm_client.text_generation(
      prompt=EVALUATION_PROMPT.format(user_question=question, system_answer=system_answer, context=context),
      max_new_tokens=1000)
  rating = list(map(int, re.findall(regex, evaluation)))
  return sum(rating)/len(rating)

new_results_df['eval'] = new_results_df.apply(get_evals, axis=1)


In [None]:
new_results_df.columns

Index(['question', 'context', 'answer', 'eval'], dtype='object')

In [None]:
print(f"The mean score of LLM judge is {new_results_df['eval'].mean()}")

The mean score of LLM judge is 3.8888888888888893


In [None]:
new_results_df = new_results_df._append({'question':'', 'context':'', 'answer':'','eval':new_results_df['eval'].mean()}, ignore_index=True)

In [None]:
new_results_df

Unnamed: 0,question,context,answer,eval
0,What does the company The Best Bees Company do?,[Get Started\n\n\n\n\n\n\n\n\n\n\n\n \n\n\n\n\...,The Best Bees Company installs and maintains h...,4.0
1,What does the company Big Media Expert do?,[E-COMMERCE\nStand out on the digital shelf.\n...,The company Big Media Expert offers a variety ...,4.0
2,What does the company Abelini do?,"[While we are committed to security, we cannot...",Abelini is a company that provides jewelry ser...,3.0
3,What does the company adabra do?,"[Adabra nasce Blendee, una Customer Data Platf...",Adabra is a Customer Data Platform (CDP) that ...,4.0
4,What does the company Action24 do?,[CONTACT OUR SALES TEAM\n\nNot sure what's rig...,Action24 is a company that provides monitored ...,4.0
5,What does the company aamartech.com do?,[Your IT journey starts here...\n\n\n\n\n\n\n\...,"AamarTech, a US-Bangladesh joint venture compa...",4.0
6,What does the company 7EDGE Private Limited do?,[We care about your privacy\n7EDGE Internet Pr...,"Based on the provided context, it can be infer...",4.0
7,What does the company Absolute Reg do?,[About us\nAbsolute Reg aims to provide Britis...,The company Absolute Reg specializes in provid...,4.0
8,What does the company Action24 do?,[CONTACT OUR SALES TEAM\n\nNot sure what's rig...,Action24 is a company that provides monitored ...,4.0
9,What does the company contact@acom.co.id do?,[Acom\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,"Based on the provided context, the company con...",3.666667


In [None]:
new_results_df.to_csv('/content/drive/MyDrive/RAG/results_with_evals_bge_large_reranked.csv', index=False)