In [1]:
!pip install pymupdf
!pip install bitsandbytes

Collecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m102.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.3
Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64

In [5]:
import os
import fitz
import numpy as np
import json

import torch
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM

In [21]:
# Checkpoints used by the rest of the notebook.
GEN_CHECKPOINT = "unsloth/Llama-3.2-3B-Instruct"
EMBED_CHECKPOINT = "BAAI/bge-base-en-v1.5"

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Generator LLM: dtype and device placement are delegated to transformers.
gen_tokenizer = AutoTokenizer.from_pretrained(GEN_CHECKPOINT)
gen_model = AutoModelForCausalLM.from_pretrained(
    GEN_CHECKPOINT,
    torch_dtype="auto",
    device_map="auto",
)

# Embedding encoder: moved explicitly onto `device`.
embed_tokenizer = AutoTokenizer.from_pretrained(EMBED_CHECKPOINT)
embed_model = AutoModel.from_pretrained(EMBED_CHECKPOINT).to(device)

Using device: cuda


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [22]:
def create_embeddings(text, batch_size = 32):
  """
  Embed one string or a list of strings with the embedding model.

  Each text is represented by the L2-normalised hidden state of its first
  token. Texts are encoded in batches so a large document does not have to
  fit into a single forward pass, and inputs longer than the tokenizer's
  maximum length are truncated instead of failing inside the model.

  Args:
      text (str | List[str]): A single text or a list of texts.
      batch_size (int): Number of texts encoded per forward pass.

  Returns:
      List[np.ndarray] | None: One vector per input text, in input order
      (a one-element list for a single string, an empty list for an empty
      list). None when tokenisation or the forward pass fails; the error is
      printed rather than raised.
  """
  if isinstance(text, str): text = [text]
  else: text = list(text)

  # Guard against a zero/negative step in range().
  batch_size = max(1, batch_size)

  embeddings = []
  for start in range(0, len(text), batch_size):
    batch = text[start:start + batch_size]

    try:
      inputs = embed_tokenizer(
          batch,
          padding = True,
          truncation = True,
          return_tensors = "pt"
      ).to(device)
    except Exception as e:
      print(f"Tokenizer error: {e}")
      return None

    try:
      with torch.no_grad():
        output = embed_model(**inputs)
        # First-token hidden state is used as the sentence representation.
        cls = output.last_hidden_state[:, 0, :]
        embed_normalized = F.normalize(cls, p = 2, dim = 1)
      embeddings.extend(embed.cpu().numpy() for embed in embed_normalized)
    except Exception as e:
      print(f"Embedding generation error: {e}")
      return None

  return embeddings

In [23]:
def gen(system_prompt, user_prompt): # work with unsloth/Llama-3.2-3B-Instruct
    """
    Generate one sampled chat completion from the generator model.

    Args:
        system_prompt (str): Content of the system message.
        user_prompt (str): Content of the user message.

    Returns:
        str: The newly generated text only (prompt tokens removed, special
        tokens skipped, surrounding whitespace stripped).

    Notes:
        Only `do_sample = True` is passed to `generate`; every other generation
        setting (including the length limit) comes from the model's defaults.
        Output is therefore non-deterministic unless a seed is set elsewhere.
    """
    text = gen_tokenizer.apply_chat_template(
        conversation = [
            {
                "role": "system",
                "content": system_prompt
            },
            {
                "role": "user",
                "content": user_prompt
            }
        ],
        tokenize = False,
        # Open the assistant turn so the model writes the reply itself instead
        # of first having to emit its own "assistant" header.
        add_generation_prompt = True
    )

    # The templated text already carries its special tokens; do not add them twice.
    # Inputs follow the model's own device, since it was placed by device_map.
    model_inputs = gen_tokenizer(
        [text],
        return_tensors = "pt",
        add_special_tokens = False
    ).to(gen_model.device)

    generated_ids = gen_model.generate(
        **model_inputs,
        do_sample = True
    )

    # generate() returns prompt + continuation; keep the continuation only.
    prompt_length = model_inputs.input_ids.shape[1]
    new_token_ids = generated_ids[:, prompt_length:]

    response = gen_tokenizer.batch_decode(new_token_ids, skip_special_tokens = True)[0].strip()

    # Defensive: drop a leaked role header as a whole prefix. str.strip(chars)
    # must not be used for this, because it removes any of the characters
    # a/s/i/t/n from both ends and would eat real answer text.
    role_header = "assistant"
    if response.startswith(role_header + "\n"):
        response = response[len(role_header):].lstrip()

    return response

In [24]:
def extract_text_from_pdf(pdf_path):
  """
  Return the text of every page of a PDF, concatenated in page order.

  Args:
      pdf_path (str): Path to the PDF file.

  Returns:
      str: Concatenation of `page.get_text()` for each page, with no separator.
  """
  # The context manager closes the document even if reading a page fails.
  with fitz.open(pdf_path) as pdf:
    return "".join(page.get_text() for page in pdf)

def chunk_text(text, n, overlap):
  """
  Split text into character windows of length n that overlap by `overlap`.

  Windows start every `n - overlap` characters; the final windows may be
  shorter than n.

  Args:
      text (str): Text to split.
      n (int): Window length in characters; must be positive.
      overlap (int): Characters shared by consecutive windows; 0 <= overlap < n.

  Returns:
      List[str]: The windows in order (empty list for empty text).

  Raises:
      ValueError: If n is not positive or overlap is outside [0, n). Without
          this check a non-positive step either fails inside range() or
          silently yields no chunks, and a negative overlap skips text.
  """
  if n <= 0:
    raise ValueError(f"n must be positive, got {n}")
  if overlap < 0 or overlap >= n:
    raise ValueError(f"overlap must satisfy 0 <= overlap < n, got overlap={overlap}, n={n}")

  step = n - overlap
  return [text[i:i+n] for i in range(0, len(text), step)]

In [25]:
class SimpleVectorStore:
  """
  Minimal in-memory vector store with brute-force cosine-similarity search.

  Items are kept in three parallel lists (vectors, texts, metadata) that share
  the same index.
  """

  def __init__(self):
    self.vectors = []   # one embedding per stored item
    self.texts = []     # text of each item
    self.metadata = []  # metadata dict of each item

  def add_item(self, text, embedding, metadata = None):
    """
    Append one item to the store.

    Args:
        text (str): Text of the item.
        embedding (np.ndarray): Embedding vector of the text.
        metadata (dict | None): Optional metadata; stored as {} when omitted.
    """
    self.vectors.append(embedding)
    self.texts.append(text)
    self.metadata.append(metadata or {})

  def similarity_search(self, query_embedding, k = 5):
    """
    Return the k stored items most similar to the query embedding.

    Args:
        query_embedding (np.ndarray): Query vector with the same number of
            elements as the stored vectors.
        k (int): Maximum number of results.

    Returns:
        List[dict]: Up to k dicts with keys "text", "metadata" and
        "similarity" (a Python float), ordered by decreasing cosine
        similarity; ties keep insertion order. Empty list when the store is
        empty or k <= 0.
    """
    if not self.vectors or k <= 0: return []

    # One matrix product instead of one similarity call per stored vector.
    matrix = np.vstack([np.asarray(vector, dtype = float).reshape(1, -1) for vector in self.vectors])
    query = np.asarray(query_embedding, dtype = float).reshape(-1)

    norms = np.linalg.norm(matrix, axis = 1) * np.linalg.norm(query)
    # A zero vector gets similarity 0 instead of a division by zero.
    scores = (matrix @ query) / np.where(norms == 0, 1.0, norms)

    # Stable sort on negated scores: descending order, ties by insertion order.
    top_indices = np.argsort(-scores, kind = "stable")[:k]

    return [
        {
            "text": self.texts[idx],
            "metadata": self.metadata[idx],
            "similarity": float(scores[idx])
        }
        for idx in top_indices
    ]

In [26]:
def process_document(pdf_path, chunk_size = 1000, overlap = 200):
  """
  Build a vector store from a PDF: extract text, chunk it, embed the chunks.

  Args:
      pdf_path (str): Path to the PDF document.
      chunk_size (int): Chunk length in characters.
      overlap (int): Characters shared by consecutive chunks.

  Returns:
      SimpleVectorStore: Store holding every chunk with its embedding and
      metadata {"index": chunk position, "source": pdf_path}.

  Raises:
      ValueError: If no text chunks were produced from the document.
      RuntimeError: If embedding creation failed (create_embeddings reports
          failure by returning None) or returned the wrong number of vectors.
  """
  print("Extracting text...")
  extracted_text = extract_text_from_pdf(pdf_path)

  print("Chunking text...")
  text_chunks = chunk_text(extracted_text, chunk_size, overlap)
  if not text_chunks:
    raise ValueError(f"No text chunks were produced from {pdf_path}")

  print("Creating embeddings...")
  embeddings = create_embeddings(text_chunks)
  # Fail here with a clear message instead of letting zip() choke on None.
  if embeddings is None or len(embeddings) != len(text_chunks):
    raise RuntimeError(f"Failed to create embeddings for {pdf_path}")

  store = SimpleVectorStore()

  for i, (chunk, embedding) in enumerate(zip(text_chunks, embeddings)):
    store.add_item(
        text = chunk,
        embedding = embedding,
        metadata = {
            "index":i,
            "source": pdf_path
        }
    )

  print(f"Added {len(text_chunks)} chunks to the vector store")

  return store

In [27]:
def compress_chunk(chunk, query, compression_type = "selective"):
  """
  Ask the generator LLM to reduce a chunk to its query-relevant content.

  Args:
      chunk (str): Retrieved document chunk.
      query (str): User query the chunk is compressed against.
      compression_type (str): "selective" or "summary"; any other value
          selects the verbatim-extraction prompt.

  Returns:
      Tuple[str, float]: The compressed text and the length reduction in
      percent of the original character count. The ratio is negative when the
      model returns more text than it was given, and 0.0 for an empty chunk.
  """
  if compression_type == "selective":
      system_prompt = """You are an expert at information filtering.
      Your task is to analyze a document chunk and extract ONLY the sentences or paragraphs that are directly
      relevant to the user's query. Remove all irrelevant content.

      Your output should:
      1. ONLY include text that helps answer the query
      2. Preserve the exact wording of relevant sentences (do not paraphrase)
      3. Maintain the original order of the text
      4. Include ALL relevant content, even if it seems redundant
      5. EXCLUDE any text that isn't relevant to the query

      Format your response as plain text with no additional comments."""
  elif compression_type == "summary":
      system_prompt = """You are an expert at summarization.
      Your task is to create a concise summary of the provided chunk that focuses ONLY on
      information relevant to the user's query.

      Your output should:
      1. Be brief but comprehensive regarding query-relevant information
      2. Focus exclusively on information related to the query
      3. Omit irrelevant details
      4. Be written in a neutral, factual tone

      Format your response as plain text with no additional comments."""
  else:  # extraction
      system_prompt = """You are an expert at information extraction.
      Your task is to extract ONLY the exact sentences from the document chunk that contain information relevant
      to answering the user's query.

      Your output should:
      1. Include ONLY direct quotes of relevant sentences from the original text
      2. Preserve the original wording (do not modify the text)
      3. Include ONLY sentences that directly relate to the query
      4. Separate extracted sentences with newlines
      5. Do not add any commentary or additional text

      Format your response as plain text with no additional comments."""

  user_prompt = f"""
      Query: {query}

      Document Chunk:
      {chunk}

      Extract only the content relevant to answering this query.
  """
  compressed_chunk = gen(system_prompt, user_prompt) #str

  original_length = len(chunk)
  compressed_length = len(compressed_chunk)
  # An empty chunk has nothing to reduce; avoid dividing by zero.
  if original_length:
    compression_ratio = (original_length - compressed_length) / original_length * 100
  else:
    compression_ratio = 0.0

  print(f"""
  ===Chunk: {chunk[:20]}
  ===Compressed: {compressed_chunk}
  ===Compressed length: {compressed_length}
  =============================================
  """)

  return compressed_chunk, compression_ratio

In [28]:
def generate_response(query, context):
    """
    Answer a query with the generator LLM, restricted to the supplied context.

    Args:
        query (str): User query
        context (str): Context text the answer must be based on

    Returns:
        str: Generated response
    """
    # System message: stay within the context and admit when it is insufficient
    grounding_instructions = """You are a helpful AI assistant. Answer the user's question based only on the provided context.
    If you cannot find the answer in the context, state that you don't have enough information."""

    # User message: context first, then the question and the answering instruction
    grounded_question = f"""
        Context:
        {context}

        Question: {query}

        Please provide a comprehensive answer based only on the context above.
    """

    # Delegate to the local generator and hand its text straight back
    return gen(grounding_instructions, grounded_question)


In [35]:
def rag_with_compression(pdf_path, query, k=10, compression_type="selective", model="meta-llama/Llama-3.2-3B-Instruct"):
    """
    Complete RAG pipeline with contextual compression.

    Args:
        pdf_path (str): Path to PDF document
        query (str): User query
        k (int): Number of chunks to retrieve initially
        compression_type (str): Type of compression passed to compress_chunk
        model (str): Unused; kept for backward compatibility. Generation always
            goes through the notebook's local generator.

    Returns:
        dict: query, original_chunks, compressed_chunks, compression_ratios,
        context_length_reduction (mean ratio formatted as a percentage) and
        response. compressed_chunks and compression_ratios are lists describing
        exactly the chunks that were placed in the context.

    Raises:
        RuntimeError: If the query could not be embedded.
    """
    print("\n=== RAG WITH CONTEXTUAL COMPRESSION ===")
    print(f"Query: {query}")
    print(f"Compression type: {compression_type}")

    # Process the document to extract text, chunk it, and create embeddings
    vector_store = process_document(pdf_path)

    # Create an embedding for the query (create_embeddings returns None on failure)
    query_embeddings = create_embeddings(query)
    if not query_embeddings:
        raise RuntimeError("Failed to create an embedding for the query")
    query_embedding = query_embeddings[0]

    # Retrieve the top k most similar chunks based on the query embedding
    print(f"Retrieving top {k} chunks...")
    results = vector_store.similarity_search(query_embedding, k=k)
    retrieved_chunks = [result["text"] for result in results]

    # Apply compression to the retrieved chunks -> list of (text, ratio)
    compressed_results = [compress_chunk(retrieved_chunk, query, compression_type) for retrieved_chunk in retrieved_chunks]

    # Filter out any empty compressed chunks
    filtered_chunks = [(chunk, ratio) for chunk, ratio in compressed_results if chunk.strip()]

    if not filtered_chunks:
        # If all chunks are compressed to empty strings, use the original chunks
        print("Warning: All chunks were compressed to empty strings. Using original chunks.")
        filtered_chunks = [(chunk, 0.0) for chunk in retrieved_chunks]

    # Unpack after BOTH branches so the fallback really replaces the empty
    # compressed texts (and their ratios) with the original chunks.
    compressed_chunks = [chunk for chunk, _ in filtered_chunks]
    compression_ratios = [ratio for _, ratio in filtered_chunks]

    # Generate context from the compressed chunks
    context = "\n\n---\n\n".join(compressed_chunks)

    # Generate a response based on the compressed chunks
    print("Generating response based on compressed chunks...")
    response = generate_response(query, context)

    # Nothing retrieved means nothing was reduced; avoid dividing by zero
    if compression_ratios:
        average_reduction = sum(compression_ratios) / len(compression_ratios)
    else:
        average_reduction = 0.0

    # Prepare the result dictionary
    result = {
        "query": query,
        "original_chunks": retrieved_chunks,
        "compressed_chunks": compressed_chunks,
        "compression_ratios": compression_ratios,
        "context_length_reduction": f"{average_reduction:.2f}%",
        "response": response
    }

    print("\n=== RESPONSE ===")
    print(response)

    return result


In [36]:
def standard_rag(pdf_path, query, k=10):
    """
    Standard RAG without compression.

    Args:
        pdf_path (str): Path to PDF document
        query (str): User query
        k (int): Number of chunks to retrieve

    Returns:
        dict: Results with keys "query", "chunks" (retrieved texts, most
        similar first) and "response".

    Raises:
        RuntimeError: If the query could not be embedded.
    """
    print("\n=== STANDARD RAG ===")
    print(f"Query: {query}")

    # Process the document to extract text, chunk it, and create embeddings
    vector_store = process_document(pdf_path)

    # Create an embedding for the query (create_embeddings returns None on failure)
    query_embeddings = create_embeddings(query)
    if not query_embeddings:
        raise RuntimeError("Failed to create an embedding for the query")
    query_embedding = query_embeddings[0]

    # Retrieve the top k most similar chunks based on the query embedding
    print(f"Retrieving top {k} chunks...")
    results = vector_store.similarity_search(query_embedding, k=k)
    retrieved_chunks = [result["text"] for result in results]

    # Generate context from the retrieved chunks
    context = "\n\n---\n\n".join(retrieved_chunks)

    # Generate a response based on the retrieved chunks
    print("Generating response...")
    response = generate_response(query, context)

    # Prepare the result dictionary
    result = {
        "query": query,
        "chunks": retrieved_chunks,
        "response": response
    }

    print("\n=== RESPONSE ===")
    print(response)

    return result


In [37]:
def evaluate_responses(query, responses, reference_answer):
    """
    Have the generator LLM rank several responses against a reference answer.

    Args:
        query (str): User query
        responses (Dict[str, str]): Response text keyed by method name
        reference_answer (str): Reference answer

    Returns:
        str: Evaluation text
    """
    # System message: put the model in the role of an impartial judge
    judge_instructions = """You are an objective evaluator of RAG responses. Compare different responses to the same query
    and determine which is most accurate, comprehensive, and relevant to the query."""

    # Opening of the user message: the query and the reference answer
    prompt_header = f"""
    Query: {query}

    Reference Answer: {reference_answer}

    """

    # One labelled section per candidate, in the dictionary's order
    candidate_sections = [
        f"\n{method.capitalize()} Response:\n{response}\n"
        for method, response in responses.items()
    ]

    # Closing of the user message: the evaluation criteria
    criteria = """
    Please evaluate these responses based on:
    1. Factual accuracy compared to the reference
    2. Comprehensiveness - how completely they answer the query
    3. Conciseness - whether they avoid irrelevant information
    4. Overall quality

    Rank the responses from best to worst with detailed explanations.
    """

    judge_prompt = "".join([prompt_header, *candidate_sections, criteria])

    # The judge's free-text verdict is returned as-is
    return gen(judge_instructions, judge_prompt)


In [38]:
def evaluate_compression(pdf_path, query, reference_answer=None, compression_types=("selective", "summary", "extraction")):
    """
    Compare different compression techniques with standard RAG.

    Each pipeline call processes the PDF again, so the document is extracted
    and embedded once for standard RAG and once per compression type.

    Args:
        pdf_path (str): Path to PDF document
        query (str): User query
        reference_answer (str): Optional reference answer; when falsy, the LLM
            evaluation step is skipped
        compression_types (Iterable[str]): Compression types to evaluate;
            duplicates are run once

    Returns:
        dict: query, responses (by method name), evaluation (text), metrics
        (per compression type), standard_result and compression_results
    """
    # Materialise once (the sequence is walked several times) and drop
    # duplicates so no pipeline is run twice for the same type.
    compression_types = list(dict.fromkeys(compression_types))

    print("\n=== EVALUATING CONTEXTUAL COMPRESSION ===")
    print(f"Query: {query}")

    # Run standard RAG without compression
    standard_result = standard_rag(pdf_path, query)

    # Dictionary to store results of different compression techniques
    compression_results = {}

    # Run RAG with each compression technique
    for comp_type in compression_types:
        print(f"\nTesting {comp_type} compression...")
        compression_results[comp_type] = rag_with_compression(pdf_path, query, compression_type=comp_type)

    # Gather responses for evaluation
    responses = {
        "standard": standard_result["response"]
    }
    for comp_type in compression_types:
        responses[comp_type] = compression_results[comp_type]["response"]

    # Evaluate responses if a reference answer is provided
    if reference_answer:
        evaluation = evaluate_responses(query, responses, reference_answer)
        print("\n=== EVALUATION RESULTS ===")
        print(evaluation)
    else:
        evaluation = "No reference answer provided for evaluation."

    # The uncompressed baseline length is the same for every compression type
    original_context_length = len("\n\n".join(standard_result['chunks']))

    # Calculate metrics for each compression type
    metrics = {}
    for comp_type in compression_types:
        ratios = compression_results[comp_type]['compression_ratios']
        # No ratios means nothing was compressed; avoid dividing by zero
        avg_ratio = sum(ratios) / len(ratios) if len(ratios) else 0.0
        metrics[comp_type] = {
            "avg_compression_ratio": f"{avg_ratio:.2f}%",
            "total_context_length": len("\n\n".join(compression_results[comp_type]['compressed_chunks'])),
            "original_context_length": original_context_length
        }

    # Return the evaluation results, responses, and metrics
    return {
        "query": query,
        "responses": responses,
        "evaluation": evaluation,
        "metrics": metrics,
        "standard_result": standard_result,
        "compression_results": compression_results
    }


In [39]:
# PDF to query (path is relative to the working directory)
pdf_path = "AI_Information.pdf"

# Question put to every pipeline variant
query = "What are the ethical concerns surrounding the use of AI in decision-making?"

# Reference answer the evaluator compares each response against
reference_answer = """
The use of AI in decision-making raises several ethical concerns.
- Bias in AI models can lead to unfair or discriminatory outcomes, especially in critical areas like hiring, lending, and law enforcement.
- Lack of transparency and explainability in AI-driven decisions makes it difficult for individuals to challenge unfair outcomes.
- Privacy risks arise as AI systems process vast amounts of personal data, often without explicit consent.
- The potential for job displacement due to automation raises social and economic concerns.
- AI decision-making may also concentrate power in the hands of a few large tech companies, leading to accountability challenges.
- Ensuring fairness, accountability, and transparency in AI systems is essential for ethical deployment.
"""

# Compare standard RAG with each compression strategy:
#   "selective"  - keeps the relevant sentences/paragraphs in their original wording
#   "summary"    - produces a concise query-focused summary
#   "extraction" - extracts relevant sentences verbatim from the document
results = evaluate_compression(
    pdf_path,
    query,
    reference_answer=reference_answer,
    compression_types=["selective", "summary", "extraction"],
)


=== EVALUATING CONTEXTUAL COMPRESSION ===
Query: What are the ethical concerns surrounding the use of AI in decision-making?

=== STANDARD RAG ===
Query: What are the ethical concerns surrounding the use of AI in decision-making?
Extracting text...
Chunking text...
Creating embeddings...
Added 42 chunks to the vector store


  return forward_call(*args, **kwargs)


Retrieving top 10 chunks...
Generating response...

=== RESPONSE ===
Based on the provided context, the ethical concerns surrounding the use of AI in decision-making include:

1. **Bias and Fairness**: AI systems can inherit and amplify biases present in the data they are trained on, leading to unfair or discriminatory outcomes. Ensuring fairness and mitigating bias in AI systems is a critical challenge.
2. **Transparency and Explainability**: Many AI systems, particularly deep learning models, are "black boxes," making it difficult to understand how they arrive at their decisions. Enhancing transparency and explainability is crucial for building trust and accountability.
3. **Robustness and Reliability**: Ensuring that AI systems are robust and reliable is essential for building trust. This includes testing and validating AI models, monitoring their performance, and addressing potential vulnerabilities.
4. **User Control and Agency**: Empowering users with control over AI systems and 

  return forward_call(*args, **kwargs)


Added 42 chunks to the vector store
Retrieving top 10 chunks...

  ===Chunk:  experiences. AI alg 
  ===Compressed: The rapid development and deployment of AI raise significant ethical and societal concerns.
These concerns include: 
Bias and Fairness 
AI systems can inherit and amplify biases present in the data they are trained on, leading to unfair or discriminatory outcomes.
Ensuring fairness and mitigating bias in AI systems is a critical challenge.
Transparency and Explainability 
Many AI systems, particularly deep learning models, are "black boxes," making it difficult to understand how they arrive at their decisions.
Enhancing transparency and explainability is crucial.
  ===Compressed length: 569
  

  ===Chunk: to building trust in 
  ===Compressed: Ensuring that AI systems are robust and reliable is essential for building trust. This includes testing and validating AI models, monitoring their performance, and addressing potential vulnerabilities. Empowering users with control

  return forward_call(*args, **kwargs)


Added 42 chunks to the vector store
Retrieving top 10 chunks...

  ===Chunk:  experiences. AI alg 
  ===Compressed: The ethical concerns surrounding the use of AI in decision-making include:

Bias and Fairness: AI systems can inherit and amplify biases present in the data they are trained on, leading to unfair or discriminatory outcomes.
Transparency and Explainability: Many AI systems, particularly deep learning models, are "black boxes," making it difficult to understand how they arrive at their decisions.
  ===Compressed length: 397
  

  ===Chunk: to building trust in 
  ===Compressed: The ethical concerns surrounding the use of AI in decision-making include:

- Ensuring AI systems are robust and reliable
- Empowering users with control over AI systems
- Incorporating ethical considerations into the design and development of AI systems
- Conducting ethical impact assessments and engaging stakeholders
- Providing insights into AI decision-making processes to assess reliability and f

  return forward_call(*args, **kwargs)


Added 42 chunks to the vector store
Retrieving top 10 chunks...

  ===Chunk:  experiences. AI alg 
  ===Compressed: The rapid development and deployment of AI raise significant ethical and societal concerns.
Bias and Fairness
AI systems can inherit and amplify biases present in the data they are trained on, leading to unfair or discriminatory outcomes.
Ensuring fairness and mitigating bias in AI systems is a critical challenge.
Transparency and Explainability
Many AI systems, particularly deep learning models, are "black boxes," making it difficult to understand how they arrive at their decisions.
  ===Compressed length: 488
  

  ===Chunk: to building trust in 
  ===Compressed: Making AI systems understandable and providing insights into their decision-making processes helps users assess their reliability and fairness.
Ensuring that AI systems are robust and reliable is essential for building trust.
Empowering users with control over AI systems and providing them with agency in their 