## Final Integrating Pinecone with an LLM

In [None]:
!pip install langchain_pinecone
!pip install langfuse

Collecting langchain_pinecone
  Downloading langchain_pinecone-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting aiohttp<3.10,>=3.9.5 (from langchain_pinecone)
  Downloading aiohttp-3.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.5 kB)
Collecting pinecone-client<6.0.0,>=5.0.0 (from langchain_pinecone)
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client<6.0.0,>=5.0.0->langchain_pinecone)
  Downloading pinecone_plugin_inference-1.1.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client<6.0.0,>=5.0.0->langchain_pinecone)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading langchain_pinecone-0.2.0-py3-none-any.whl (11 kB)
Downloading aiohttp-3.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

In [None]:
!pip install pinecone

Collecting pinecone
  Downloading pinecone-5.4.1-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<4.0.0,>=2.0.0 (from pinecone)
  Downloading pinecone_plugin_inference-3.0.0-py3-none-any.whl.metadata (2.2 kB)
Downloading pinecone-5.4.1-py3-none-any.whl (427 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m427.3/427.3 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_inference-3.0.0-py3-none-any.whl (87 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.7/87.7 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pinecone-plugin-inference, pinecone
  Attempting uninstall: pinecone-plugin-inference
    Found existing installation: pinecone-plugin-inference 1.1.0
    Uninstalling pinecone-plugin-inference-1.1.0:
      Successfully uninstalled pinecone-plugin-inference-1.1.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are i

In [None]:
pip install langchain



Connecting Pinecone Index

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import Pinecone as LangchainPinecone
from langchain.prompts import PromptTemplate
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings.base import Embeddings
from langchain.llms.base import LLM
from typing import Optional, List

import vertexai
from vertexai.generative_models import GenerativeModel

# Initialize Vertex AI for Google Gemini model
PROJECT_ID = "ids-560-project-group-1-bosch"
vertexai.init(project=PROJECT_ID, location="us-central1")
gemini_model = GenerativeModel("gemini-1.5-flash")

# Custom wrapper for Google Gemini to make it compatible with LangChain
class RunnableGemini(LLM):
    def __init__(self, model: GenerativeModel):
        super().__init__()
        self._model = model

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        response = self._model.generate_content(prompt)
        return response.text

    @property
    def _llm_type(self) -> str:
        return "google_gemini"

# Instantiate the wrapped model
llm = RunnableGemini(gemini_model)

# Load the embedding model for the retriever function
MODEL_NAME = "avsolatorio/NoInstruct-small-Embedding-v0"
embedding_model = AutoModel.from_pretrained(MODEL_NAME)
embedding_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Custom LangChain Embedding wrapper
class HFEmbeddingWrapper(Embeddings):
    def embed_query(self, text: str) -> List[float]:
        return get_embedding(text).flatten().tolist()

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [get_embedding(text).flatten().tolist() for text in texts]

# Initialize the custom embedding wrapper
embedding = HFEmbeddingWrapper()

# Initialize Pinecone client
pinecone_api_key = os.getenv("PINECONE_API_KEY") or input("Enter your Pinecone API key: ")
pc = Pinecone(api_key=pinecone_api_key)
# Change Pinecone Index name here -
index_name = "chunks-without-metadata-final"

# Check if the index exists; if not, create it with dimension 384
if index_name not in [index_info["name"] for index_info in pc.list_indexes()]:
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
index = pc.Index(index_name)
print(f"Connected to index '{index_name}' in Pinecone.")

# Initialize LangChain Pinecone Retriever with embedding wrapper
vectorstore = LangchainPinecone(index=index, embedding=embedding)
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

# Define the QA PromptTemplate for debugging and documentation assistance
QA_PROMPT = PromptTemplate.from_template("""
You are an expert code debugger and documentation assistant. Help answer technical queries based on the script provided, offering solutions, clarifications, or steps as needed. Use precise language and avoid assumptions. Refer to the documentation context for direct responses.

Documentation Context:
{context}

Question: {question}
Answer:
""")

# Set up the conversational retrieval chain with the custom Gemini wrapper
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    condense_question_prompt=QA_PROMPT,
    combine_docs_chain_kwargs={"prompt": QA_PROMPT},
    return_source_documents=True
)

# Function to maintain conversation history in the RAG chain
def ask_question_with_history(qa_chain, question, chat_history):
    result = qa_chain.invoke({"question": question, "chat_history": chat_history})
    print("Response:", result["answer"])
    chat_history.append((question, result["answer"]))
    return chat_history


Enter your Pinecone API key: e8db3a70-ab95-49cf-9710-5db89ac159a6
Connected to index 'chunks-without-metadata-final' in Pinecone.


In [None]:
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
model = AutoModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

computing embeddings

In [None]:
import json
import torch

# Assuming `model` and `tokenizer` are already loaded (e.g., SentenceTransformer, T5, etc.)

# Function to compute embeddings
def get_embedding(text, mode="sentence"):
    model.eval()
    inp = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        output = model(**inp)
    return output.last_hidden_state[:, 0, :].numpy()  # [CLS] token representation

'''
# Function to index semantic chunks and metadata in Pinecone
def index_chunks_with_metadata(index, chunks_with_metadata):
    # Process each chunk, compute embeddings, and store it in Pinecone
    for i, chunk_data in enumerate(chunks_with_metadata):
        # Extract the content and metadata
        chunk = chunk_data["content"]
        keywords = chunk_data.get("keywords", [])
        title = chunk_data.get("title", "Unknown Title")

        # Generate embedding for the chunk
        embedding = get_embedding(chunk).flatten().tolist()

        # Prepare metadata for Pinecone
        doc_metadata = {
            "title": title,
            "keywords": keywords,
            "text": chunk  # Storing the text as part of metadata if desired
        }

        # Index each chunk in Pinecone
        index.upsert(
            [(f"chunk-{i}", embedding, doc_metadata)]
        )

        # Print progress every 100 chunks
        if (i + 1) % 100 == 0:
            print(f"Indexed {i + 1} chunks out of {len(chunks_with_metadata)}")

    print("All chunks and metadata have been successfully indexed in Pinecone.")
'''

'\n# Function to index semantic chunks and metadata in Pinecone\ndef index_chunks_with_metadata(index, chunks_with_metadata):\n    # Process each chunk, compute embeddings, and store it in Pinecone\n    for i, chunk_data in enumerate(chunks_with_metadata):\n        # Extract the content and metadata\n        chunk = chunk_data["content"]\n        keywords = chunk_data.get("keywords", [])\n        title = chunk_data.get("title", "Unknown Title")\n\n        # Generate embedding for the chunk\n        embedding = get_embedding(chunk).flatten().tolist()\n\n        # Prepare metadata for Pinecone\n        doc_metadata = {\n            "title": title,\n            "keywords": keywords,\n            "text": chunk  # Storing the text as part of metadata if desired\n        }\n\n        # Index each chunk in Pinecone\n        index.upsert(\n            [(f"chunk-{i}", embedding, doc_metadata)]\n        )\n\n        # Print progress every 100 chunks\n        if (i + 1) % 100 == 0:\n            p

In [None]:
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.memory import ConversationBufferMemory

# Define the QA PromptTemplate with the 'context' and 'question' placeholders
QA_PROMPT = PromptTemplate.from_template("""
You are an assistant helping with error debugging and code understanding. Use the context provided to answer the user's question. If there is no context, let the user know more information is needed.

Context:
{context}

User's Question: {question}
Answer:
""")

# Initialize your LLM and retriever (already defined in your environment)

# Create the RetrievalQA chain with the custom prompt and specify the input key
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    input_key="question",  # Specify 'question' as the input key
    chain_type_kwargs={
        "prompt": QA_PROMPT,
    },
    return_source_documents=True,
)

# Initialize memory to store conversation history
memory = ConversationBufferMemory(memory_key="chat_history", input_key="question")

# Function to ask a question with context management
def ask_question_with_history(qa_chain, question, memory):
    # Retrieve the conversation history
    chat_history = memory.load_memory_variables({}).get("chat_history", "")

    # Prepare the inputs
    inputs = {
        "question": question,  # Use 'question' as the key
        "chat_history": chat_history,
    }

    # Call the QA chain with the inputs
    result = qa_chain(inputs)

    # Display the result and update memory
    print("Response:", result["result"])
    memory.save_context({"question": question}, {"result": result["result"]})
    return memory


In [None]:

# Example usage
question = "What are the steps to create a bucket, access point, and endpoint for Amazon S3 on Outposts?"
memory = ask_question_with_history(qa_chain, question, memory)

question = "Can you explain how Amazon S3 on Outposts is different from regular Amazon S3?"
memory = ask_question_with_history(qa_chain, question, memory)

question = "What are the supported API operations for Amazon S3 on Outposts, and how do they differ from standard Amazon S3?"
memory = ask_question_with_history(qa_chain, question, memory)

Response: The steps to create a bucket, access point, and endpoint for Amazon S3 on Outposts are as follows:

1. **Sign in to the AWS Management Console and open the Amazon S3 console:** Navigate to https://console.aws.amazon.com/s3/.
2. **In the left navigation pane, choose Outposts buckets:** This will take you to the Outposts buckets section.
3. **Choose Create Outposts bucket:** This will open the create bucket dialog box.
4. **For Bucket name, enter a Domain Name System (DNS)-compliant name for your bucket:** The bucket name must be unique within your AWS account, the Outpost, and the AWS Region the Outpost is homed to. It should be 3-63 characters long, not contain uppercase characters, and start with a lowercase letter or number.
5. **For Outpost, choose the Outpost where you want the bucket to reside:** Select the specific Outpost where you want to create your bucket.
6. **Under Bucket Versioning, set the S3 Versioning state for your S3 on Outposts bucket to one of the followin

## Overall combined code -

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import Pinecone as LangchainPinecone
from langchain.prompts import PromptTemplate
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings.base import Embeddings
from langchain.llms.base import LLM
from typing import Optional, List

import vertexai
from vertexai.generative_models import GenerativeModel

# Initialize Vertex AI for Google Gemini model
PROJECT_ID = "ids-560-project-group-1-bosch"
vertexai.init(project=PROJECT_ID, location="us-central1")
gemini_model = GenerativeModel("gemini-1.5-flash")

# Custom wrapper for Google Gemini to make it compatible with LangChain
class RunnableGemini(LLM):
    def __init__(self, model: GenerativeModel):
        super().__init__()
        self._model = model

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        response = self._model.generate_content(prompt)
        return response.text

    @property
    def _llm_type(self) -> str:
        return "google_gemini"

# Instantiate the wrapped model
llm = RunnableGemini(gemini_model)

# Load the embedding model for the retriever function
MODEL_NAME = "avsolatorio/NoInstruct-small-Embedding-v0"
embedding_model = AutoModel.from_pretrained(MODEL_NAME)
embedding_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Custom LangChain Embedding wrapper
class HFEmbeddingWrapper(Embeddings):
    def embed_query(self, text: str) -> List[float]:
        return get_embedding(text).flatten().tolist()

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [get_embedding(text).flatten().tolist() for text in texts]

# Initialize the custom embedding wrapper
embedding = HFEmbeddingWrapper()

# Initialize Pinecone client
pinecone_api_key = os.getenv("PINECONE_API_KEY") or input("Enter your Pinecone API key: ")
pc = Pinecone(api_key=pinecone_api_key)
index_name = "chunks-without-metadata-final"

# Check if the index exists; if not, create it with dimension 384
if index_name not in [index_info["name"] for index_info in pc.list_indexes()]:
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
index = pc.Index(index_name)
print(f"Connected to index '{index_name}' in Pinecone.")

# Initialize LangChain Pinecone Retriever with embedding wrapper
vectorstore = LangchainPinecone(index=index, embedding=embedding)
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

# Define the QA PromptTemplate for debugging and documentation assistance
QA_PROMPT = PromptTemplate.from_template("""
You are an expert code debugger and documentation assistant. Help answer technical queries based on the script provided, offering solutions, clarifications, or steps as needed. Use precise language and avoid assumptions. Refer to the documentation context for direct responses.

Documentation Context:
{context}

Question: {question}
Answer:
""")

# Set up the conversational retrieval chain with the custom Gemini wrapper
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    condense_question_prompt=QA_PROMPT,
    combine_docs_chain_kwargs={"prompt": QA_PROMPT},
    return_source_documents=True
)

# Function to maintain conversation history in the RAG chain
def ask_question_with_history(qa_chain, question, chat_history):
    result = qa_chain.invoke({"question": question, "chat_history": chat_history})
    print("Response:", result["answer"])
    chat_history.append((question, result["answer"]))
    return chat_history

MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
model = AutoModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

import json
import torch

# Assuming `model` and `tokenizer` are already loaded (e.g., SentenceTransformer, T5, etc.)

# Function to compute embeddings
def get_embedding(text, mode="sentence"):
    model.eval()
    inp = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        output = model(**inp)
    return output.last_hidden_state[:, 0, :].numpy()  # [CLS] token representation

from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.memory import ConversationBufferMemory

# Define the QA PromptTemplate with the 'context' and 'question' placeholders
QA_PROMPT = PromptTemplate.from_template("""
You are an assistant helping with error debugging and code understanding. Use the context provided to answer the user's question. If there is no context, let the user know more information is needed.

Context:
{context}

User's Question: {question}
Answer:
""")

# Initialize your LLM and retriever (already defined in your environment)

# Create the RetrievalQA chain with the custom prompt and specify the input key
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    input_key="question",  # Specify 'question' as the input key
    chain_type_kwargs={
        "prompt": QA_PROMPT,
    },
    return_source_documents=True,
)

# Initialize memory to store conversation history
memory = ConversationBufferMemory(memory_key="chat_history", input_key="question")

# Function to ask a question with context management
def ask_question_with_history(qa_chain, question, memory):
    # Retrieve the conversation history
    chat_history = memory.load_memory_variables({}).get("chat_history", "")

    # Prepare the inputs
    inputs = {
        "question": question,  # Use 'question' as the key
        "chat_history": chat_history,
    }

    # Call the QA chain with the inputs
    result = qa_chain(inputs)

    # Display the result and update memory
    print("Response:", result["result"])
    memory.save_context({"question": question}, {"result": result["result"]})
    return memory


# Generate an embedding for a test question
test_query = "How does S3 on Outposts differ from standard Amazon S3?"
test_embedding = embedding.embed_query(test_query)

# Query Pinecone directly
results = index.query(vector=test_embedding, top_k=5, include_values=False)
print(f"Results: {results}")

# Example usage
#question = "What are the steps to create a bucket, access point, and endpoint for Amazon S3 on Outposts?"
#memory = ask_question_with_history(qa_chain, question, memory)

#question = "Can you explain how Amazon S3 on Outposts is different from regular Amazon S3?"
#memory = ask_question_with_history(qa_chain, question, memory)

#question = "What are the supported API operations for Amazon S3 on Outposts, and how do they differ from standard Amazon S3?"
#memory = ask_question_with_history(qa_chain, question, memory)

Enter your Pinecone API key: keyConnected to index 'chunks-without-metadata-final' in Pinecone.
Results: {'matches': [{'id': 'chunk-133', 'score': 0.953870952, 'values': []},
             {'id': 'chunk-135', 'score': 0.943853796, 'values': []},
             {'id': 'chunk-165', 'score': 0.942353845, 'values': []},
             {'id': 'chunk-166', 'score': 0.936433077, 'values': []},
             {'id': 'chunk-167', 'score': 0.932382, 'values': []}],
 'namespace': '',
 'usage': {'read_units': 5}}


In [None]:
# Generate an embedding for a test question
test_query = "How does S3 on Outposts differ from standard Amazon S3?"
test_embedding = embedding.embed_query(test_query)

# Query Pinecone directly
results = index.query(vector=test_embedding, top_k=5, include_values=False)
print(f"Results: {results}")


Results: {'matches': [], 'namespace': '', 'usage': {'read_units': 1}}


## Evaluation - Read & Write json file

In [None]:
import json
import time

# Load JSON data from file
input_file = "/questions_answers.json"
output_file = "/Ans_with_context_metadata.json"


In [None]:
import json
import time

# Open the input JSON file
with open(input_file, "r") as f:
    data = json.load(f)

# Function to process a batch of questions
def process_batch(qa_chain, retriever, batch):
    for entry in batch:
        if not entry.get("answer"):  # Only process questions without answers
            question = entry["question"]
            print(f"Processing question: {question}")
            try:
                # Retrieve context using retriever
                retrieved_docs = retriever.get_relevant_documents(question)

                # Sanity print: Check retrieved_docs content
                if not retrieved_docs:
                    print(f"No documents retrieved for question: {question}")
                else:
                    print(f"Retrieved {len(retrieved_docs)} documents for question: {question}")

                # Extract the top 3 chunks
                top_chunks = [doc.page_content for doc in retrieved_docs[:3]]  # Top 3 chunks

                # Sanity print: Check top_chunks content
                if not top_chunks:
                    print(f"No chunks available for question: {question}")
                else:
                    print(f"Top chunks for question '{question}': {top_chunks}")

                # Update the context field in the JSON
                context = "\n\n".join(top_chunks)  # Combine chunks as context
                entry["context"] = context

                # Call the QA chain with the question and context
                inputs = {"question": question, "context": context}
                result = qa_chain.invoke(inputs)

                # Update the JSON with the answer and sources
                entry["answer"] = result.get("result", "No response generated.")
                entry["sources"] = [doc.metadata for doc in retrieved_docs[:3]]  # Metadata of top 3 sources
            except Exception as e:
                print(f"Error processing question: {question}. Error: {e}")
                entry["answer"] = "Error occurred during processing."
                entry["context"] = "Error occurred during context retrieval."
                entry["sources"] = []
    return batch

# Batch size and timeout configuration
batch_size = 2  # Number of questions per batch
timeout = 5    # Time to wait (in seconds) between batches

# Process questions in batches
for i in range(0, len(data), batch_size):
    batch = data[i:i + batch_size]
    print(f"Processing batch {i // batch_size + 1} of {len(data) // batch_size + 1}")
    updated_batch = process_batch(qa_chain, retriever, batch)
    # Save intermediate results to avoid data loss
    with open(output_file, "w") as f:
        json.dump(data, f, indent=4)
    # Wait between batches
    if i + batch_size < len(data):
        print(f"Waiting {timeout} seconds before processing the next batch...")
        time.sleep(timeout)

print(f"Updated JSON saved to {output_file}")


Processing batch 1 of 21
Processing question: How does S3 on Outposts differ from standard Amazon S3, and what are the key considerations for its use in hybrid cloud environments?
No documents retrieved for question: How does S3 on Outposts differ from standard Amazon S3, and what are the key considerations for its use in hybrid cloud environments?
No chunks available for question: How does S3 on Outposts differ from standard Amazon S3, and what are the key considerations for its use in hybrid cloud environments?
Processing question: Debug the following issue: An S3 on Outposts bucket policy returns an `Access Denied` error for cross-account operations. What might be missing in the policy, and how can it be fixed?
No documents retrieved for question: Debug the following issue: An S3 on Outposts bucket policy returns an `Access Denied` error for cross-account operations. What might be missing in the policy, and how can it be fixed?
No chunks available for question: Debug the following i

KeyboardInterrupt: 

In [None]:
with open(input_file, "r") as f:
    data = json.load(f)

# Function to process a batch of questions
def process_batch(qa_chain, batch):
    for entry in batch:
        if not entry.get("answer"):  # Only process questions without answers
            question = entry["question"]
            print(f"Processing question: {question}")
            # Retrieve context if available (optional)
            context = entry.get("context", "")  # Adjust as per JSON structure
            inputs = {"question": question, "context": context}
            try:
                # Call the QA chain
                result = qa_chain.invoke(inputs)
                # Update the JSON with the answer
                entry["answer"] = result.get("result", "No response generated.")
                # Include source documents if necessary
                entry["sources"] = result.get("source_documents", [])
            except Exception as e:
                print(f"Error processing question: {question}. Error: {e}")
                entry["answer"] = "Error occurred during processing."
    return batch

# Batch size and timeout configuration
batch_size = 2  # Number of questions per batch
timeout = 5    # Time to wait (in seconds) between batches

# Process questions in batches
for i in range(0, len(data), batch_size):
    batch = data[i:i + batch_size]
    print(f"Processing batch {i // batch_size + 1} of {len(data) // batch_size + 1}")
    updated_batch = process_batch(qa_chain, batch)
    # Save intermediate results to avoid data loss
    with open(output_file, "w") as f:
        json.dump(data, f, indent=4)
    # Wait between batches
    if i + batch_size < len(data):
        print(f"Waiting {timeout} seconds before processing the next batch...")
        time.sleep(timeout)

print(f"Updated JSON saved to {output_file}")

Processing batch 1 of 41
Processing question: How does S3 on Outposts differ from standard Amazon S3, and what are the key considerations for its use in hybrid cloud environments?
Waiting 10 seconds before processing the next batch...


KeyboardInterrupt: 

Intermediator code - removing the sources

In [None]:
import json
import time

# Load JSON data from file
input_file = "/questions_answers.json"
output_file = "/Ans_without_context_metadata.json"


In [None]:
import json
import time

# Function to process a batch of questions
def process_batch(qa_chain, retriever, batch):
    for entry in batch:
        if not entry.get("answer"):  # Only process questions without answers
            question = entry["question"]
            print(f"Processing question: {question}")
            try:
                # Retrieve context using retriever
                retrieved_docs = retriever.get_relevant_documents(question)
                top_chunks = [doc.page_content for doc in retrieved_docs[:3]]  # Top 3 chunks

                # Update the context field in the JSON
                context = "\n\n".join(top_chunks)  # Combine chunks as context
                entry["context"] = context

                # Call the QA chain with the question and context
                inputs = {"question": question, "context": context}
                result = qa_chain.invoke(inputs)

                # Update the JSON with the answer
                entry["answer"] = result.get("result", "No response generated.")
            except Exception as e:
                print(f"Error processing question: {question}. Error: {e}")
                entry["answer"] = "Error occurred during processing."
                entry["context"] = "Error occurred during context retrieval."
    return batch

timeout = 10

# Process questions in batches
for i in range(0, len(data), batch_size):
    batch = data[i:i + batch_size]
    print(f"Processing batch {i // batch_size + 1} of {len(data) // batch_size + 1}")
    updated_batch = process_batch(qa_chain, retriever, batch)
    # Remove sources field (if it exists) to ensure JSON compatibility
    for entry in updated_batch:
        entry.pop("sources", None)  # Safely remove sources if present
    # Save intermediate results to avoid data loss
    with open(output_file, "w") as f:
        json.dump(data, f, indent=4)
    # Wait between batches
    if i + batch_size < len(data):
        print(f"Waiting {timeout} seconds before processing the next batch...")
        time.sleep(timeout)

print(f"Updated JSON saved to {output_file}")


Processing batch 1 of 21
Waiting 10 seconds before processing the next batch...
Processing batch 2 of 21
Waiting 10 seconds before processing the next batch...
Processing batch 3 of 21
Waiting 10 seconds before processing the next batch...
Processing batch 4 of 21
Waiting 10 seconds before processing the next batch...
Processing batch 5 of 21
Waiting 10 seconds before processing the next batch...
Processing batch 6 of 21
Waiting 10 seconds before processing the next batch...
Processing batch 7 of 21
Waiting 10 seconds before processing the next batch...
Processing batch 8 of 21
Waiting 10 seconds before processing the next batch...
Processing batch 9 of 21
Waiting 10 seconds before processing the next batch...
Processing batch 10 of 21
Waiting 10 seconds before processing the next batch...
Processing batch 11 of 21
Waiting 10 seconds before processing the next batch...
Processing batch 12 of 21
Waiting 10 seconds before processing the next batch...
Processing batch 13 of 21
Waiting 10 

In [None]:
# Function to process a batch of questions
def process_batch(qa_chain, batch):
    for entry in batch:
        if not entry.get("answer"):  # Only process questions without answers
            question = entry["question"]
            print(f"Processing question: {question}")
            # Retrieve context if available (optional)
            context = entry.get("context", "")  # Adjust as per JSON structure
            inputs = {"question": question, "context": context}
            try:
                # Call the QA chain
                result = qa_chain.invoke(inputs)
                # Update the JSON with the answer
                entry["answer"] = result.get("result", "No response generated.")
            except Exception as e:
                print(f"Error processing question: {question}. Error: {e}")
                entry["answer"] = "Error occurred during processing."
    return batch

# Process questions in batches
for i in range(0, len(data), batch_size):
    batch = data[i:i + batch_size]
    print(f"Processing batch {i // batch_size + 1} of {len(data) // batch_size + 1}")
    updated_batch = process_batch(qa_chain, batch)
    # Remove sources field (if it exists) to ensure JSON compatibility
    for entry in updated_batch:
        entry.pop("sources", None)  # Safely remove sources if present
    # Save intermediate results to avoid data loss
    with open(output_file, "w") as f:
        json.dump(data, f, indent=4)
    # Wait between batches
    if i + batch_size < len(data):
        print(f"Waiting {timeout} seconds before processing the next batch...")
        time.sleep(timeout)

print(f"Updated JSON saved to {output_file}")

Processing batch 1 of 41
Waiting 10 seconds before processing the next batch...
Processing batch 2 of 41
Processing question: Debug the following issue: An S3 on Outposts bucket policy returns an `Access Denied` error for cross-account operations. What might be missing in the policy, and how can it be fixed?
Waiting 10 seconds before processing the next batch...
Processing batch 3 of 41
Processing question: A user is unable to access an S3 on Outposts bucket from a different VPC using PrivateLink. Outline the steps to troubleshoot and resolve the issue.
Waiting 10 seconds before processing the next batch...
Processing batch 4 of 41
Processing question: Describe how to create an S3 on Outposts bucket, add tags, and configure a bucket policy using the AWS CLI.
Waiting 10 seconds before processing the next batch...
Processing batch 5 of 41
Processing question: What limitations exist when using multipart uploads with S3 on Outposts buckets?
Waiting 10 seconds before processing the next bat

In [None]:
def ask_single_question(qa_chain, question, context=""):
    """
    Ask a single question using the QA chain and return the answer.

    Args:
        qa_chain: The LangChain QA chain to use.
        question (str): The question to ask.
        context (str): Optional context to provide to the chain.

    Returns:
        str: The answer from the QA chain.
    """
    try:
        inputs = {"question": question, "context": context}
        result = qa_chain.invoke(inputs)
        answer = result.get("result", "No response generated.")
        return answer
    except Exception as e:
        print(f"Error occurred: {e}")
        return "Error occurred during processing."


In [None]:
# Example question and context
question = "How does S3 on Outposts differ from standard Amazon S3?"
context = "Answer in a format compatible with json. Contain the whole answer in strings"

# Get the answer
answer = ask_single_question(qa_chain, question, context)
print("Answer:", answer)


Answer: Amazon S3 on Outposts is a fully managed service that brings the same S3 capabilities you know and love to your on-premises environment.  Here's how it differs from standard Amazon S3:

* **Location:** Standard S3 is a cloud service hosted in Amazon's data centers. S3 on Outposts places the service on hardware that you own and manage in your own data center or co-location facility.
* **Data Residency:** Data stored in S3 on Outposts stays within your own environment, offering increased control and compliance for data subject to regulatory requirements.
* **Latency:**  S3 on Outposts provides lower latency for applications that need to access data with minimal delay.
* **Connectivity:** Since it's located on-premises, you have a direct connection to your data, simplifying network configuration and reducing reliance on public internet connections.

**Note:** While it shares many features with S3, some features like cross-region replication or server-side encryption with KMS aren'

In [None]:
import json

def ask_single_question(qa_chain, question, context=""):
    """
    Ask a single question using the QA chain and return the answer in JSON-compatible format.

    Args:
        qa_chain: The LangChain QA chain to use.
        question (str): The question to ask.
        context (str): Optional context to provide to the chain.

    Returns:
        str: The answer from the QA chain in JSON format.
    """
    try:
        inputs = {"question": question, "context": context}
        result = qa_chain.invoke(inputs)
        answer = result.get("result", "No response generated.")
        # Convert the answer to a JSON-friendly string
        json_answer = json.dumps({"answer": answer}, indent=4)
        return json_answer
    except Exception as e:
        error_message = f"Error occurred during processing: {e}"
        return json.dumps({"error": error_message}, indent=4)


In [None]:
# Example question and context
question = "How does S3 on Outposts differ from standard Amazon S3?"
context = " "

# Get the answer in JSON-compatible format
answer_json = ask_single_question(qa_chain, question, context)
print(answer_json)


{
    "answer": "S3 on Outposts and standard Amazon S3 share the same core functionality, allowing you to store and retrieve objects, but they differ in terms of **location**, **access**, and **features**.\n\n**Location:**\n\n* **Standard S3:**  Data is stored in Amazon's global data centers.\n* **S3 on Outposts:** Data is stored locally in your on-premises data center within an Outposts rack.\n\n**Access:**\n\n* **Standard S3:** You access data over the internet using a global network.\n* **S3 on Outposts:**  You access data through your local network, offering lower latency and better performance for applications that require local data access.\n\n**Features:**\n\n* **Standard S3:**  Offers a wider range of features, including replication, lifecycle management, and various storage classes. \n* **S3 on Outposts:** Currently offers a subset of the features available in standard S3, but Amazon is continuously adding new features.\n\n**Key Differences:**\n\n| Feature | Standard S3 | S3 o

## RAG Eval using Arize Pheniox

## RAG Eval Maually - using NLP and Python - naive one

1. Response Evaluation

A. QA Correctness
Measures how accurate the answers are by comparing them to the ground truth.

Step-by-Step:

Use a semantic similarity approach (e.g., cosine similarity of embeddings).
Compute the similarity score for each pair of retrieved and ground-truth answers.

Output: Scores close to 1 indicate higher correctness.

In [None]:
from sentence_transformers import SentenceTransformer, util

# Load pre-trained embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Example data
retrieved_answers = ["Amazon S3 is a storage service", "It is scalable and secure"]
ground_truth_answers = ["Amazon S3 provides scalable cloud storage", "S3 is secure for storing data"]

# Calculate cosine similarity
correctness_scores = []
for retrieved, ground_truth in zip(retrieved_answers, ground_truth_answers):
    embedding1 = model.encode(retrieved, convert_to_tensor=True)
    embedding2 = model.encode(ground_truth, convert_to_tensor=True)
    similarity = util.cos_sim(embedding1, embedding2).item()
    correctness_scores.append(similarity)

print("QA Correctness Scores:", correctness_scores)


QA Correctness Scores: [0.7918268442153931, 0.4494016170501709]


B. Hallucination Detection
Detects factual inconsistencies by comparing retrieved answers to a trusted knowledge base.

Step-by-Step:

Retrieve facts from a knowledge base (e.g., Wikipedia or your corpus).
Check the factual alignment using semantic similarity.

Output: Scores below a threshold (e.g., 0.7) indicate potential hallucinations.



In [None]:
# Example knowledge base
knowledge_base = ["Amazon S3 is a cloud storage service by AWS"]

# Detect hallucinations
for retrieved in retrieved_answers:
    scores = [util.cos_sim(model.encode(retrieved, convert_to_tensor=True),
                           model.encode(fact, convert_to_tensor=True)).item()
              for fact in knowledge_base]
    max_score = max(scores)
    print(f"Hallucination Score for '{retrieved}': {max_score}")

Hallucination Score for 'Amazon S3 is a storage service': 0.9121593236923218
Hallucination Score for 'It is scalable and secure': 0.3454829156398773


C. Toxicity Detection
Checks for biased or inappropriate content in the responses.

Step-by-Step:

Use a toxicity detection model (e.g., Detoxify).
Evaluate toxicity probability for each response.

Output: Higher scores indicate higher toxicity.

In [None]:
pip install detoxify

Collecting detoxify
  Downloading detoxify-0.5.2-py3-none-any.whl.metadata (13 kB)
Downloading detoxify-0.5.2-py3-none-any.whl (12 kB)
Installing collected packages: detoxify
Successfully installed detoxify-0.5.2


In [None]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

Looking in indexes: https://download.pytorch.org/whl/cpu


In [None]:
from detoxify import Detoxify

# Example responses
responses = ["This service is only for advanced users", "All users can use it easily"]

# Detect toxicity
for response in responses:
    toxicity_score = Detoxify('original').predict(response)['toxicity']
    print(f"Toxicity Score for '{response}': {toxicity_score}")

Toxicity Score for 'This service is only for advanced users': 0.0007110816077329218
Toxicity Score for 'All users can use it easily': 0.0006345395813696086


*D*. Faithfulness
Measures factual consistency with the provided context.

Step-by-Step:

Compare retrieved answers to the context provided.
Use cosine similarity to evaluate alignment.

Output: Higher scores indicate higher factual alignment.

In [None]:
# Example contexts
contexts = ["Amazon S3 offers scalable storage and data management capabilities"]

# Faithfulness evaluation
for retrieved, context in zip(retrieved_answers, contexts):
    context_embedding = model.encode(context, convert_to_tensor=True)
    retrieved_embedding = model.encode(retrieved, convert_to_tensor=True)
    faithfulness_score = util.cos_sim(context_embedding, retrieved_embedding).item()
    print(f"Faithfulness Score for '{retrieved}': {faithfulness_score}")

Faithfulness Score for 'Amazon S3 is a storage service': 0.8088440895080566


E. Answer Relevancy
Measures how relevant the response is to the query.

Step-by-Step:

Compare the query and the retrieved answer.
Use cosine similarity to assess relevance.

Output: Higher scores indicate more relevant answers.

In [None]:
# Example queries
queries = ["What is Amazon S3?", "How secure is S3?"]

# Answer relevancy evaluation
for query, retrieved in zip(queries, retrieved_answers):
    query_embedding = model.encode(query, convert_to_tensor=True)
    retrieved_embedding = model.encode(retrieved, convert_to_tensor=True)
    relevancy_score = util.cos_sim(query_embedding, retrieved_embedding).item()
    print(f"Relevancy Score for '{retrieved}': {relevancy_score}")

Relevancy Score for 'Amazon S3 is a storage service': 0.7893038988113403
Relevancy Score for 'It is scalable and secure': 0.5125440359115601


2. Retrieval Evaluation


A. Context Relevance
Assesses how well the retrieved context supports the query.

Retrieve contexts and compare them to the query.

Use cosine similarity to evaluate support.

In [None]:
# Example retrieved contexts
retrieved_contexts = ["Amazon S3 is a highly scalable and durable service"]

# Context relevance evaluation
for query, context in zip(queries, retrieved_contexts):
    query_embedding = model.encode(query, convert_to_tensor=True)
    context_embedding = model.encode(context, convert_to_tensor=True)
    relevance_score = util.cos_sim(query_embedding, context_embedding).item()
    print(f"Context Relevance Score for '{context}': {relevance_score}")

Context Relevance Score for 'Amazon S3 is a highly scalable and durable service': 0.8107646703720093


B. Context Precision
Evaluates how well the retrieved items are ranked by relevance.

Assign relevance scores to retrieved items.

Rank them and compute precision (e.g., top-k).


In [None]:
from sklearn.metrics import precision_score

# Ground truth relevance
true_relevance = [1, 0, 1]  # 1 = relevant, 0 = not relevant
retrieved_relevance = [1, 1, 1]  # Model's relevance predictions

# Precision score
precision = precision_score(true_relevance, retrieved_relevance, average='binary')
print("Context Precision:", precision)

Context Precision: 0.6666666666666666


C. Hit Rate
Measures the percentage of queries where relevant contexts are retrieved.

Calculate the number of queries with at least one relevant context.

Divide by the total number of queries.

In [None]:
# Example data
relevant_retrieved = [1, 0, 1, 1]  # 1 = at least one relevant context, 0 = none

# Hit rate
hit_rate = sum(relevant_retrieved) / len(relevant_retrieved) * 100
print("Hit Rate (%):", hit_rate)

Hit Rate (%): 75.0


Visualization

In [None]:
pip install umap-learn matplotlib seaborn pandas scikit-learn umap

Collecting umap-learn
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting umap
  Downloading umap-0.1.1.tar.gz (3.2 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading umap_learn-0.5.7-py3-none-any.whl (88 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.8/88.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pynndescent-0.5.13-py3-none-any.whl (56 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.9/56.9 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: umap
  Building wheel for umap (setup.py) ... [?25l[?25hdone
  Created wheel for umap: filename=umap-0.1.1-py3-none-any.whl size=3543 sha256=4f53a2ef7750fc51ccbed0568b0134ab419a25e1ed1f34e86cad049dd61d2517
  Stored in directory: /root/.cache/pip/wheels/15/f1/28/53dcf7a309118ed35d810a5f9cb

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from umap import UMAP
from sklearn.metrics.pairwise import cosine_similarity

# Example data: Replace these with your actual embeddings
# Naive and metadata-enriched embeddings
#naive_embeddings = np.random.rand(100, 768)  # Replace with actual embeddings
#metadata_enriched_embeddings = np.random.rand(100, 768)  # Replace with actual embeddings
#queries = np.random.rand(5, 768)  # Example query embeddings

import os
import json
import numpy as np
from typing import List
from transformers import AutoModel, AutoTokenizer
from langchain.embeddings.base import Embeddings
from langchain_pinecone import Pinecone as LangchainPinecone
import pinecone

# Load questions from Qs.json
with open("/questions_answers_populated(To_Eval_GPT).json", "r") as f:
    questions_data = json.load(f)

# Extract questions from the list
if not isinstance(questions_data, list):
    raise ValueError("The file 'Qs.json' does not contain a valid list of question-answer pairs.")

questions = [item["question"] for item in questions_data if "question" in item]

# Sanity check: Ensure questions are non-empty strings
if not questions:
    raise ValueError("No valid questions found in the 'Qs.json' file.")
if not all(isinstance(q, str) and q.strip() for q in questions):
    raise ValueError("Some questions in the 'questions' list are invalid or empty.")

print(f"Loaded {len(questions)} questions from Qs.json.")


# Sanity check: Ensure questions are non-empty strings
if not all(isinstance(q, str) and q.strip() for q in questions):
    raise ValueError("Some questions in the 'questions' list are invalid or empty.")

print("All questions passed the sanity check.")

# Load the embedding model
MODEL_NAME = "avsolatorio/NoInstruct-small-Embedding-v0"
embedding_model = AutoModel.from_pretrained(MODEL_NAME)
embedding_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Function to generate embeddings
def get_embedding(text: str) -> np.ndarray:
    inputs = embedding_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = embedding_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

# Custom LangChain Embedding wrapper
class HFEmbeddingWrapper(Embeddings):
    def embed_query(self, text: str) -> List[float]:
        return get_embedding(text).flatten().tolist()

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [get_embedding(text).flatten().tolist() for text in texts]

# Initialize the custom embedding wrapper
embedding = HFEmbeddingWrapper()

# Generate query embeddings
query_embeddings = [get_embedding(question).flatten() for question in questions]

# Sanity check: Ensure embeddings are valid
if not all(isinstance(embedding, np.ndarray) and embedding.size > 0 for embedding in query_embeddings):
    raise ValueError("Failed to generate valid embeddings for some questions.")

print(f"Generated embeddings for {len(query_embeddings)} queries.")

# e8db3a70-ab95-49cf-9710-5db89ac159a6
# Initialize Pinecone client
pinecone_api_key = os.getenv("PINECONE_API_KEY") or input("Enter your Pinecone API key: ")
pc = Pinecone(api_key=pinecone_api_key)

index_name_enriched = "chunks-with-metadata-final"
index_name_naive = "chunks-without-metadata-final"


# Check if the index exists; if not, create it with dimension 384
if index_name_enriched not in [index_info["name"] for index_info in pc.list_indexes()]:
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    print(pc.list_indexes())
# Check if the index exists; if not, create it with dimension 384
if index_name_naive not in [index_info["name"] for index_info in pc.list_indexes()]:
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Initialize Pinecone indexes
index_enriched = pc.Index(index_name_enriched)
index_naive = pc.Index(index_name_naive)
print(f"Indexes '{index_name_enriched}' and '{index_name_naive}' are ready.")

# Initialize LangChain Pinecone Retriever with embedding wrapper
vectorstore_enriched = LangchainPinecone(index=index_enriched, embedding=embedding)
vectorstore_naive = LangchainPinecone(index=index_naive, embedding=embedding)

retriever_enriched = vectorstore_enriched.as_retriever(search_type="similarity", search_kwargs={"k": 5})
retriever_naive = vectorstore_naive.as_retriever(search_type="similarity", search_kwargs={"k": 5})

print("Retrievers initialized successfully.")

Loaded 40 questions from Qs.json.
All questions passed the sanity check.
Generated embeddings for 40 queries.
Enter your Pinecone API key: e8db3a70-ab95-49cf-9710-5db89ac159a6
Indexes 'chunks-with-metadata-final' and 'chunks-without-metadata-final' are ready.
Retrievers initialized successfully.


In [None]:
# Step 1: Describe the index to get metadata
naive_index_stats = index_naive.describe_index_stats()
metadata_index_stats = index_enriched.describe_index_stats()
# Function to fetch embeddings for all vectors using the initialized retriever
def get_all_embeddings(index, retriever, embedding_wrapper, texts, top_k=1000):
    """
    Retrieve embeddings for all vectors in an index using the initialized retriever and embedding wrapper.
    """
    all_embeddings = []
    for text in texts:
        query_embedding = np.array(embedding_wrapper.embed_query(text)).reshape(1, -1)
        # Perform the query to retrieve top_k relevant embeddings
        results = index.query(vector=query_embedding.tolist()[0], top_k=top_k, include_values=True)
        embeddings = [match["values"] for match in results["matches"]]
        all_embeddings.extend(embeddings)
    return np.array(all_embeddings)

# Step 1: Retrieve embeddings for naive and metadata-enriched indexes
print("Fetching embeddings from the Naive index...")
naive_embeddings = get_all_embeddings(index_naive, retriever_naive, embedding, questions, top_k=100)



Fetching embeddings from the Naive index...
Fetching embeddings from the Metadata-Enriched index...
Fetched 4000 naive embeddings and 0 metadata-enriched embeddings.


In [None]:
# Check if the index exists; if not, create it with dimension 384
if index_name_enriched not in [index_info["name"] for index_info in pc.list_indexes()]:
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )


print(pc.list_indexes())
index_enriched = pc.Index(index_name_enriched)

print("Fetching embeddings from the Metadata-Enriched index...")
metadata_enriched_embeddings = get_all_embeddings(index_enriched, retriever_enriched, embedding, questions, top_k=100)

# Step 2: Sanity check - Ensure embeddings are valid
if not naive_embeddings.any():
    raise ValueError("Failed to fetch embeddings from the Naive index.")
if metadata_enriched_embeddings.any() and naive_embeddings.shape[1] != metadata_enriched_embeddings.shape[1]:
    raise ValueError("Dimension mismatch between naive and metadata-enriched embeddings.")

print(f"Fetched {naive_embeddings.shape[0]} naive embeddings and {metadata_enriched_embeddings.shape[0]} metadata-enriched embeddings.")


[{
    "name": "chunks-with-metadata-final",
    "dimension": 384,
    "metric": "cosine",
    "host": "chunks-with-metadata-final-y0p5d25.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "deletion_protection": "disabled"
}, {
    "name": "chunks-without-metadata-final",
    "dimension": 384,
    "metric": "cosine",
    "host": "chunks-without-metadata-final-y0p5d25.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "deletion_protection": "disabled"
}, {
    "name": "llm-chatbot-project",
    "dimension": 384,
    "metric": "cosine",
    "host": "llm-chatbot-project-y0p5d25.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": 

## Eval Again (Latest)

In [9]:
import json

# Load your JSON files
with open("questions.json", "r") as file:
    questions = json.load(file)

with open("naive_answers.json", "r") as file:
    naive_answers = json.load(file)

with open("metadata_answers.json", "r") as file:
    metadata_answers = json.load(file)

with open("ground_truth.json", "r") as file:
    ground_truth = json.load(file)

In [10]:
# Ensure consistency in structure
assert len(questions) == len(naive_answers) == len(metadata_answers) == len(ground_truth), "Mismatch in file lengths!"


2. Evaluate QA Responses
2.1. Correctness (Exact Match and F1 Score)

In [11]:
import numpy as np

def compute_exact_match(predictions, references):
    return np.mean([1 if pred.strip().lower() == ref.strip().lower() else 0
                    for pred, ref in zip(predictions, references)])

def compute_f1(predictions, references):
    f1_scores = []
    for pred, ref in zip(predictions, references):
        pred_tokens = set(pred.strip().lower().split())
        ref_tokens = set(ref.strip().lower().split())
        common_tokens = pred_tokens & ref_tokens
        precision = len(common_tokens) / len(pred_tokens) if pred_tokens else 0
        recall = len(common_tokens) / len(ref_tokens) if ref_tokens else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0
        f1_scores.append(f1)
    return np.mean(f1_scores)

# Extract answers
naive_predictions = [item["answer"] for item in naive_answers]
metadata_predictions = [item["answer"] for item in metadata_answers]
ground_truth_answers = [item["answer"] for item in ground_truth]

# Calculate metrics
em_naive = compute_exact_match(naive_predictions, ground_truth_answers)
f1_naive = compute_f1(naive_predictions, ground_truth_answers)

em_metadata = compute_exact_match(metadata_predictions, ground_truth_answers)
f1_metadata = compute_f1(metadata_predictions, ground_truth_answers)

print(f"Exact Match (Naive): {em_naive:.2f}, F1 Score (Naive): {f1_naive:.2f}")
print(f"Exact Match (Metadata): {em_metadata:.2f}, F1 Score (Metadata): {f1_metadata:.2f}")

Exact Match (Naive): 0.00, F1 Score (Naive): 0.03
Exact Match (Metadata): 0.00, F1 Score (Metadata): 0.03


2.2 Hallucination Detection

In [12]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

def detect_hallucination(answers, contexts):
    hallucination_scores = []
    for answer, context in zip(answers, contexts):
        answer_embedding = model.encode(answer)
        context_embedding = model.encode(context)
        similarity = cosine_similarity([answer_embedding], [context_embedding])[0][0]
        hallucination_scores.append(similarity)
    return np.mean(hallucination_scores)

# Simulated contexts (replace with actual retrieved contexts from Pinecone if available)
retrieved_contexts_naive = [item.get("context", "") for item in naive_answers]
retrieved_contexts_metadata = [item.get("context", "") for item in metadata_answers]

hallucination_naive = detect_hallucination(naive_predictions, retrieved_contexts_naive)
hallucination_metadata = detect_hallucination(metadata_predictions, retrieved_contexts_metadata)

print(f"Hallucination Score (Naive): {hallucination_naive:.2f}")
print(f"Hallucination Score (Metadata): {hallucination_metadata:.2f}")

  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Hallucination Score (Naive): 0.01
Hallucination Score (Metadata): 0.01


2.3. Toxicity Detection

In [14]:
def detect_toxicity(responses, max_length=512):
    toxicity_scores = []
    for response in responses:
        # Truncate response to the maximum length
        truncated_response = response[:max_length]
        result = toxicity_detector(truncated_response)
        toxicity_scores.append(result[0]["score"])
    return np.mean(toxicity_scores)

toxicity_naive = detect_toxicity(naive_predictions)
toxicity_metadata = detect_toxicity(metadata_predictions)

print(f"Toxicity Score (Naive): {toxicity_naive:.2f}")
print(f"Toxicity Score (Metadata): {toxicity_metadata:.2f}")

Toxicity Score (Naive): 0.00
Toxicity Score (Metadata): 0.00


In [15]:
def detect_toxicity_in_chunks(responses, max_length=512):
    toxicity_scores = []
    for response in responses:
        chunks = [response[i:i+max_length] for i in range(0, len(response), max_length)]
        chunk_scores = []
        for chunk in chunks:
            result = toxicity_detector(chunk)
            chunk_scores.append(result[0]["score"])
        # Average toxicity score for all chunks of a response
        toxicity_scores.append(np.mean(chunk_scores))
    return np.mean(toxicity_scores)

toxicity_naive = detect_toxicity_in_chunks(naive_predictions)
toxicity_metadata = detect_toxicity_in_chunks(metadata_predictions)

print(f"Toxicity Score (Naive): {toxicity_naive:.2f}")
print(f"Toxicity Score (Metadata): {toxicity_metadata:.2f}")

Toxicity Score (Naive): 0.00
Toxicity Score (Metadata): 0.00


In [16]:
from transformers import pipeline

# Use a model that supports longer sequences
toxicity_detector = pipeline("text-classification", model="facebook/bart-large-mnli")

toxicity_naive = detect_toxicity(naive_predictions)
toxicity_metadata = detect_toxicity(metadata_predictions)

print(f"Toxicity Score (Naive): {toxicity_naive:.2f}")
print(f"Toxicity Score (Metadata): {toxicity_metadata:.2f}")

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Toxicity Score (Naive): 0.87
Toxicity Score (Metadata): 0.84


Step 3: Evaluate Retrieval
3.1. Context Relevance (Recall@K)

In [17]:
def compute_recall_at_k(retrieved_contexts, relevant_contexts, k=5):
    recalls = []
    for retrieved, relevant in zip(retrieved_contexts, relevant_contexts):
        retrieved_set = set(retrieved[:k])
        relevant_set = set(relevant)
        recall = len(retrieved_set & relevant_set) / len(relevant_set) if relevant_set else 0
        recalls.append(recall)
    return np.mean(recalls)

# Replace with actual top-K retrieved contexts
top_k_contexts_naive = [["context1", "context2", "context3"]] * len(questions)
top_k_contexts_metadata = [["context1", "context2", "context3"]] * len(questions)

recall_naive = compute_recall_at_k(top_k_contexts_naive, retrieved_contexts_naive, k=5)
recall_metadata = compute_recall_at_k(top_k_contexts_metadata, retrieved_contexts_metadata, k=5)

print(f"Recall@5 (Naive): {recall_naive:.2f}")
print(f"Recall@5 (Metadata): {recall_metadata:.2f}")


Recall@5 (Naive): 0.00
Recall@5 (Metadata): 0.00


In [18]:
import pandas as pd

metrics = {
    "Metric": ["Exact Match", "F1 Score", "Hallucination Score", "Toxicity Score", "Recall@5"],
    "Naive": [em_naive, f1_naive, hallucination_naive, toxicity_naive, recall_naive],
    "Metadata Enriched": [em_metadata, f1_metadata, hallucination_metadata, toxicity_metadata, recall_metadata]
}

df_metrics = pd.DataFrame(metrics)
print(df_metrics)

                Metric     Naive  Metadata Enriched
0          Exact Match  0.000000           0.000000
1             F1 Score  0.028869           0.026139
2  Hallucination Score  0.013555           0.011674
3       Toxicity Score  0.873632           0.838705
4             Recall@5  0.000000           0.000000
