In [1]:
!pip install -q pymupdf
!pip install -q bitsandbytes

In [2]:
import fitz
import os
import numpy as np
import json
import tqdm
import re

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU so
# that the later `.to(device)` calls do not fail on CPU-only runtimes.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Generator: instruction-tuned causal LM used for scoring, answering and
# evaluation. `device_map="auto"` lets the loader decide where weights live.
gen_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="unsloth/Llama-3.2-3B-Instruct")
gen_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path="unsloth/Llama-3.2-3B-Instruct",
    torch_dtype="auto",
    device_map="auto"
    )

# Encoder used to embed document chunks and queries (moved to `device` later).
embed_model = AutoModel.from_pretrained("BAAI/bge-base-en")
embed_tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-base-en")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
def extract_text_from_pdf(pdf_path):
  """
  Extract the plain text of every page of a PDF.

  Args:
      pdf_path (str): Path to the PDF file.

  Returns:
      str: Text of all pages concatenated in page order.
  """
  # The context manager closes the document (and its file handle) even if
  # text extraction raises part-way through.
  with fitz.open(pdf_path) as pdf:
    # Join once instead of growing a string page by page.
    return "".join(page.get_text("text") for page in pdf)

def chunk_text(text, n = 1000, overlap = 200):
  """
  Split text into fixed-size character windows that overlap.

  Args:
      text (str): Text to split.
      n (int): Maximum number of characters per chunk.
      overlap (int): Number of characters shared by consecutive chunks.

  Returns:
      List[str]: Chunks in document order; the last chunk(s) may be shorter than n.

  Raises:
      ValueError: If n is not positive or overlap is outside [0, n).
  """
  if n <= 0:
    raise ValueError("n must be a positive integer")
  # overlap > n would make the step negative (silently yielding no chunks) and
  # a negative overlap would make the step larger than n (silently skipping text).
  if overlap < 0 or overlap >= n:
    raise ValueError("overlap must satisfy 0 <= overlap < n")

  step = n - overlap
  return [text[i:i+n] for i in range(0, len(text), step)]

In [5]:
class SimpleVectorStore:
  """
  Minimal in-memory vector store with brute-force cosine-similarity search.

  Items are kept in three parallel lists (vectors, texts, metadata) that share
  the same index.
  """
  def __init__(self):
    self.vectors = []   # one embedding per stored item
    self.texts = []     # original text of each item
    self.metadata = []  # per-item metadata dict

  def add_item(self, text, embedding, metadata = None):
    """
    Append one item to the store.

    Args:
        text (str): Original text.
        embedding: Vector for the text (anything np.dot / np.linalg.norm accept).
        metadata (dict, optional): Extra information; stored as {} when omitted.
    """
    self.vectors.append(embedding)
    self.texts.append(text)
    self.metadata.append(metadata or {})

  def similarity_search(self, query_embedding, k = 5):
    """
    Return the k stored items most similar to the query by cosine similarity.

    Args:
        query_embedding: Query vector.
        k (int): Maximum number of results; values <= 0 yield no results.

    Returns:
        List[Dict]: Dicts with "text", "metadata" and "similarity", ordered
        from most to least similar. Empty when the store is empty.
    """
    if not self.vectors:
      return []

    query_norm = np.linalg.norm(query_embedding)

    similarities = []
    for i, vector in enumerate(self.vectors):
      # Cosine similarity divides by the PRODUCT of both norms; the product is
      # computed first so operator precedence cannot turn it into
      # (dot / |q|) * |v|.
      denominator = query_norm * np.linalg.norm(vector)
      # A zero-length vector has no direction; score it 0 instead of NaN.
      score = float(np.dot(query_embedding, vector) / denominator) if denominator > 0 else 0.0
      similarities.append((i, score))

    similarities.sort(key = lambda x:x[1], reverse = True)

    return [
        {
            "text": self.texts[idx],
            "metadata": self.metadata[idx],
            "similarity": score
        }
        for idx, score in similarities[:max(k, 0)]
    ]

In [6]:
embed_model.to(device)
def embed(text):
    """
    Embed one string or a list of strings with the embedding model.

    Args:
        text (str | List[str]): Text(s) to embed. Inputs longer than 512
            tokens are truncated by the tokenizer.

    Returns:
        numpy.ndarray: L2-normalised embedding(s); shape (dim,) for a single
        string and (n, dim) for a list of n strings.
    """
    single_input = isinstance(text, str)
    batch = [text] if single_input else text

    encoded = embed_tokenizer(
        batch,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    encoded = encoded.to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        hidden_states = embed_model(**encoded).last_hidden_state
        # Use the vector at the first sequence position of each input as its
        # sentence embedding, then scale every row to unit L2 norm.
        first_token_vectors = hidden_states[:, 0, :]
        unit_vectors = F.normalize(first_token_vectors, p=2, dim=1)

    vectors = unit_vectors.cpu().numpy()

    if single_input:
        return vectors[0]
    return vectors

In [7]:
def process_document(pdf_path, chunk_size = 1000, chunk_overlap = 200):
  """
  Build a vector store from a PDF: extract text, chunk it, embed each chunk.

  Args:
      pdf_path (str): Path to the PDF file.
      chunk_size (int): Characters per chunk, forwarded to chunk_text.
      chunk_overlap (int): Characters shared by neighbouring chunks.

  Returns:
      SimpleVectorStore: Store holding every chunk with its embedding and a
      metadata dict containing the chunk "index" and the "source" path.
  """
  print("Extracting text from PDF...")
  raw_text = extract_text_from_pdf(pdf_path)

  print("Chunking text...")
  pieces = chunk_text(raw_text, chunk_size, chunk_overlap)
  print(f"{len(pieces)} text chunks has been created")

  print("Create embeddings...")
  piece_vectors = []
  # Chunks are embedded one at a time so tqdm can show per-chunk progress.
  for piece in tqdm.tqdm(pieces):
    piece_vectors.append(embed(piece))

  print("Creating vector store...")
  document_store = SimpleVectorStore()

  for position in range(len(pieces)):
    chunk_metadata = {
        "index": position,
        "source": pdf_path
    }
    document_store.add_item(
        text = pieces[position],
        embedding = piece_vectors[position],
        metadata = chunk_metadata
    )
  return document_store

In [8]:
def gen(system_prompt, user_prompt, **generate_kwargs): # work with unsloth/Llama-3.2-3B-Instruct
    """
    Generate one assistant reply for a system + user message pair.

    Args:
        system_prompt (str): Content of the system message.
        user_prompt (str): Content of the user message.
        **generate_kwargs: Optional extra arguments forwarded to
            gen_model.generate (for example max_new_tokens). Sampling stays
            enabled unless do_sample is passed explicitly.

    Returns:
        str: Decoded text of the newly generated tokens, with surrounding
        whitespace removed.
    """
    conversation = [
        {
            "role": "system",
            "content": system_prompt
        },
        {
            "role": "user",
            "content": user_prompt
        }
    ]

    # add_generation_prompt=True ends the prompt with the assistant turn
    # header, so the model starts writing the answer right away instead of
    # having to produce the header itself.
    prompt_text = gen_tokenizer.apply_chat_template(
        conversation = conversation,
        tokenize = False,
        add_generation_prompt = True
    )

    # The rendered template already contains the special tokens it needs;
    # add_special_tokens=False stops the tokenizer from adding them again.
    # Inputs follow the model's own device, since device_map="auto" decides
    # where the weights are placed.
    model_inputs = gen_tokenizer(
        [prompt_text],
        return_tensors = "pt",
        add_special_tokens = False
    ).to(gen_model.device)

    generate_kwargs.setdefault("do_sample", True)
    output_ids = gen_model.generate(
        **model_inputs,
        **generate_kwargs
    )

    # generate() returns prompt + completion; keep only the completion.
    prompt_length = model_inputs.input_ids.shape[1]
    completion_ids = output_ids[:, prompt_length:]

    # Plain strip(): str.strip("assistant\n\n") would remove any of the
    # characters a/s/i/t/n from both ends and corrupt real answers.
    response = gen_tokenizer.batch_decode(completion_ids, skip_special_tokens = True)[0].strip()

    return response

In [9]:
def rerank_with_llm(query, results, top_n=3, model="unsloth/Llama-3.2-3B-Instruct"):
    """
    Rerank retrieved documents by asking the LLM for a 0-10 relevance score.

    Args:
        query (str): User query
        results (List[Dict]): Initial search results; each needs "text",
            "metadata" and "similarity"
        top_n (int): Number of results to return after reranking
        model (str): Accepted for interface compatibility; scoring always goes
            through gen(), so this value is not used here

    Returns:
        List[Dict]: The top_n results, highest "relevance_score" first. Each
        entry carries "text", "metadata", "similarity" and "relevance_score".
    """
    total = len(results)
    print(f"Reranking {total} documents...")

    # Instructions that constrain the model to answer with a bare integer
    system_prompt = """You are an expert at evaluating document relevance for search queries.
Your task is to rate documents on a scale from 0 to 10 based on how well they answer the given query.

Guidelines:
- Score 0-2: Document is completely irrelevant
- Score 3-5: Document has some relevant information but doesn't directly answer the query
- Score 6-8: Document is relevant and partially answers the query
- Score 9-10: Document is highly relevant and directly answers the query

You MUST respond with ONLY a single integer score between 0 and 10. Do not include ANY other text."""

    # First standalone integer in the range 0-10 found in the reply
    score_pattern = re.compile(r'\b(10|[0-9])\b')

    scored_results = []
    for position, candidate in enumerate(results, start=1):
        # Progress line for documents 1, 6, 11, ...
        if (position - 1) % 5 == 0:
            print(f"Scoring document {position}/{total}...")

        user_prompt = f"""Query: {query}

Document:
{candidate['text']}

Rate this document's relevance to the query on a scale from 0 to 10:"""

        score_text = gen(system_prompt, user_prompt)

        found = score_pattern.search(score_text)
        if found is None:
            # No usable number in the reply: fall back to the vector
            # similarity rescaled by 10
            print(f"Warning: Could not extract score from response: '{score_text}', using similarity score instead")
            relevance = candidate["similarity"] * 10
        else:
            relevance = float(found.group(1))

        scored_results.append({
            "text": candidate["text"],
            "metadata": candidate["metadata"],
            "similarity": candidate["similarity"],
            "relevance_score": relevance
        })

    # Stable in-place sort, best score first
    scored_results.sort(key=lambda entry: entry["relevance_score"], reverse=True)

    return scored_results[:top_n]


In [10]:
def rerank_with_keywords(query, results, top_n=3):
    """
    A simple alternative reranking method based on keyword matching and position.

    The score of a document is half of its vector similarity plus, for every
    distinct query keyword found in it: 0.1 for being present, 0.1 more if the
    first occurrence is in the first quarter of the text, and 0.05 per
    occurrence capped at 0.2.

    Args:
        query (str): User query
        results (List[Dict]): Initial search results; each needs "text",
            "metadata" and "similarity"
        top_n (int): Number of results to return after reranking

    Returns:
        List[Dict]: The top_n results, highest "relevance_score" first.
    """
    # Tokenise on word characters so attached punctuation ("work?") does not
    # stop a keyword from matching; keep only words longer than 3 characters.
    # dict.fromkeys de-duplicates while preserving order, so a word repeated
    # in the query is not counted twice.
    tokens = re.findall(r"\w+(?:[-']\w+)*", query.lower())
    keywords = list(dict.fromkeys(token for token in tokens if len(token) > 3))

    scored_results = []  # Initialize a list to store scored results

    for result in results:
        document_text = result["text"].lower()  # Case-insensitive matching

        # Base score starts with vector similarity
        base_score = result["similarity"] * 0.5

        keyword_score = 0.0
        for keyword in keywords:
            first_position = document_text.find(keyword)
            if first_position == -1:
                continue  # Keyword absent from this document

            # Add points for each keyword found
            keyword_score += 0.1

            # Add more points if keyword appears in the first quarter of the text
            if first_position < len(document_text) / 4:
                keyword_score += 0.1

            # Add points for keyword frequency, capped at 0.2
            keyword_score += min(0.05 * document_text.count(keyword), 0.2)

        # Append the scored result to the list
        scored_results.append({
            "text": result["text"],
            "metadata": result["metadata"],
            "similarity": result["similarity"],
            "relevance_score": base_score + keyword_score
        })

    # Sort results by final relevance score in descending order
    reranked_results = sorted(scored_results, key=lambda x: x["relevance_score"], reverse=True)

    # Return the top_n results
    return reranked_results[:top_n]


In [11]:
def generate_response(query, context, model="unsloth/Llama-3.2-3B-Instruct"):
    """
    Answer a query with the LLM, restricted to the supplied context.

    Args:
        query (str): User query
        context (str): Retrieved context to ground the answer in
        model (str): Accepted for interface compatibility; generation always
            goes through gen(), so this value is not used here

    Returns:
        str: Generated response
    """
    # System message: answer from the context only, admit when it is missing
    grounding_instructions = "You are a helpful AI assistant. Answer the user's question based only on the provided context. If you cannot find the answer in the context, state that you don't have enough information."

    # User message: the context followed by the question
    question_with_context = f"""
        Context:
        {context}

        Question: {query}

        Please provide a comprehensive answer based only on the context above.
    """

    return gen(grounding_instructions, question_with_context)


In [12]:
def rag_with_reranking(query, vector_store, reranking_method="llm", top_n=3, model="unsloth/Llama-3.2-3B-Instruct", initial_k=10):
    """
    Complete RAG pipeline incorporating reranking.

    Args:
        query (str): User query
        vector_store (SimpleVectorStore): Vector store
        reranking_method (str): 'llm' or 'keywords'; any other value skips
            reranking and keeps the order of the initial retrieval
        top_n (int): Number of results to return after reranking
        model (str): Model name forwarded to generate_response
        initial_k (int): Number of candidates fetched from the vector store
            before reranking. At least top_n candidates are always requested.

    Returns:
        Dict: "query", "reranking_method", "initial_results" (first top_n of
        the initial retrieval), "reranked_results", "context" and "response"
    """
    # Create query embedding
    query_embedding = embed(query)

    # Initial retrieval: fetch more than we need so reranking has a choice,
    # and never fewer than top_n, otherwise top_n results cannot be returned.
    candidate_count = max(initial_k, top_n)
    initial_results = vector_store.similarity_search(query_embedding, k=candidate_count)

    # Apply reranking
    if reranking_method == "llm":
        reranked_results = rerank_with_llm(query, initial_results, top_n=top_n)
    elif reranking_method == "keywords":
        reranked_results = rerank_with_keywords(query, initial_results, top_n=top_n)
    else:
        # No reranking, just use top results from initial retrieval
        reranked_results = initial_results[:top_n]

    # Combine context from reranked results
    context = "\n\n===\n\n".join([result["text"] for result in reranked_results])

    # Generate response based on context
    response = generate_response(query, context, model)

    return {
        "query": query,
        "reranking_method": reranking_method,
        "initial_results": initial_results[:top_n],
        "reranked_results": reranked_results,
        "context": context,
        "response": response
    }


In [13]:
# Read the validation examples (a JSON list of question / ideal_answer records)
with open('val.json') as f:
    data = json.load(f)

# The first validation example supplies the query and its reference answer
first_example = data[0]
query = first_example['question']
reference_answer = first_example['ideal_answer']

# PDF that serves as the knowledge source
pdf_path = "AI_Information.pdf"

In [14]:
# Build the vector store from the PDF
vector_store = process_document(pdf_path)

# Example query
# NOTE(review): this replaces the query loaded from val.json, while
# reference_answer still comes from data[0] - confirm the two belong together.
query = "Does AI have the potential to transform the way we live and work?"


def _run_and_report(title, reranking_method):
    """Run the RAG pipeline with one reranking method and print its answer."""
    print(f"\n=== {title} ===")
    outcome = rag_with_reranking(query, vector_store, reranking_method=reranking_method)
    print(f"\nQuery: {query}")
    print(f"\nResponse:\n{outcome['response']}")
    return outcome


# Compare different methods
print("Comparing retrieval methods...")

# 1. Standard retrieval (no reranking)
standard_results = _run_and_report("STANDARD RETRIEVAL", "none")

# 2. LLM-based reranking
llm_results = _run_and_report("LLM-BASED RERANKING", "llm")

# 3. Keyword-based reranking
keyword_results = _run_and_report("KEYWORD-BASED RERANKING", "keywords")

Extracting text from PDF...
Chunking text...
42 text chunks has been created
Create embeddings...


100%|██████████| 42/42 [00:00<00:00, 53.68it/s]


Creating vector store...
Comparing retrieval methods...

=== STANDARD RETRIEVAL ===

Query: Does AI have the potential to transform the way we live and work?

Response:
Yes, AI has the potential to transform the way we live and work. According to the context, AI is increasingly being used to address various aspects of our lives, including social and environmental challenges, healthcare, finance, transportation, retail, manufacturing, and the future of work.

In various industries, AI is enhancing trust and accountability, improving productivity, and augmenting human capabilities. It is used for applications such as medical diagnosis, personalized medicine, and robotic surgery, which can lead to breakthroughs in healthcare. In finance, AI is used for fraud detection, algorithmic trading, and risk management, enabling more efficient financial processes.

Moreover, AI is revolutionizing transportation with self-driving cars and autonomous vehicles, and transforming the retail industry wit

In [15]:
def evaluate_reranking(query, standard_results, reranked_results, reference_answer=None):
    """
    Ask the LLM to compare standard retrieval against reranked retrieval.

    Args:
        query (str): User query
        standard_results (Dict): Pipeline output without reranking; its
            "context" and "response" entries are used
        reranked_results (Dict): Pipeline output with reranking; its
            "context" and "response" entries are used
        reference_answer (str, optional): Reference answer appended to the
            comparison when provided

    Returns:
        str: The evaluator's free-text analysis
    """
    # Only the first 1000 characters of each context are shown to the evaluator
    standard_context = standard_results['context'][:1000]
    standard_answer = standard_results['response']
    reranked_context = reranked_results['context'][:1000]
    reranked_answer = reranked_results['response']

    # Role description for the evaluator
    system_prompt = """You are an expert evaluator of RAG systems.
    Compare the retrieved contexts and responses from two different retrieval methods.
    Assess which one provides better context and a more accurate, comprehensive answer."""

    # Side-by-side material for both retrieval methods
    comparison_text = f"""Query: {query}

Standard Retrieval Context:
{standard_context}... [truncated]

Standard Retrieval Answer:
{standard_answer}

Reranked Retrieval Context:
{reranked_context}... [truncated]

Reranked Retrieval Answer:
{reranked_answer}"""

    # Optional ground truth for the evaluator to compare against
    if reference_answer:
        comparison_text = comparison_text + f"""

Reference Answer:
{reference_answer}"""

    # Evaluation request wrapped around the comparison material
    user_prompt = f"""
{comparison_text}

Please evaluate which retrieval method provided:
1. More relevant context
2. More accurate answer
3. More comprehensive answer
4. Better overall performance

Provide a detailed analysis with specific examples.
"""

    return gen(system_prompt, user_prompt)


In [16]:
# Have the LLM judge standard retrieval against LLM-based reranking, using the
# reference answer as ground truth
evaluation = evaluate_reranking(query, standard_results, llm_results, reference_answer=reference_answer)

# Show the evaluator's verdict under a header
print("\n=== EVALUATION RESULTS ===", evaluation, sep="\n")


=== EVALUATION RESULTS ===
**Evaluation Criteria:**

1. Relevance of context: How well does the context provide information relevant to the query?
2. Accuracy of answer: How accurate is the answer in relation to the query?
3. Comprehensive answer: Does the answer provide a thorough and detailed explanation?
4. Overall performance: How well does the retrieval method perform in providing a coherent and relevant response?

**Evaluation:**

1. **Relevance of context:**
   - Standard Retrieval Context: The context provides a clear overview of various applications of AI, including its potential to transform the way we live and work. It covers topics such as AI at the edge, quantum computing and AI, human-AI collaboration, and AI for social good. The context is relevant to the query and provides a broad understanding of AI's potential impact.
   - Reranked Retrieval Context: The context is more focused on the development and deployment of AI, job roles, and ethical considerations. While it t