### Import Libraries and constant values

In [242]:
from dotenv import load_dotenv
load_dotenv()
import json

from langchain_openai import ChatOpenAI
model_name = "gpt-3.5-turbo"

from typing import Any, Dict, List, Optional
from langchain import hub
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.history_aware_retriever import (
    create_history_aware_retriever,
)
from langchain.chains.retrieval import create_retrieval_chain
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore

### Configuration (change these settings to improve the LLM pipeline)

##### Keep exactly one of the two configuration blocks below active and comment out the other

In [243]:
# Active configuration.
# index_name: name of the Pinecone index used for both upload and retrieval.
# chunk_size / chunk_overlap: passed to RecursiveCharacterTextSplitter.
# evaluation_file_name: JSON file the evaluation results are written to.
index_name = "metrics"
chunk_size=1000
chunk_overlap=200
llm = ChatOpenAI(model_name=model_name)
evaluation_file_name = "evaluation.json"

# Alternative configuration (inactive): a separate index, smaller chunks and
# its own results file. To use it, uncomment these lines and comment out the
# block above so that only one set of assignments runs.
# index_name = "metricsimproved"
# chunk_size=80
# chunk_overlap=15
# llm = ChatOpenAI(model_name=model_name)
# evaluation_file_name = "evaluation-improved.json"


### Read documents

In [244]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import ReadTheDocsLoader

# Build the splitter from the configured chunk settings before touching disk.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

# Load the scraped documentation pages from the local directory.
loader = ReadTheDocsLoader("mongodb-docs/www.mongodb.com")
raw_documents = loader.load()
print(f"loaded {len(raw_documents)} documents")

# Split every loaded page into chunks for embedding.
docs = text_splitter.split_documents(raw_documents)

loaded 1608 documents


### Embeddings

In [245]:
from langchain_openai import OpenAIEmbeddings
# Embedding model used when uploading the chunks to the vector store.
# run_llm builds its own OpenAIEmbeddings with the same model name, so keep
# the two in sync if this is changed.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

### Pass to Pinecone

In [246]:
from langchain_pinecone import PineconeVectorStore
# Embed the chunks in `docs` and store them in the Pinecone index `index_name`.
# NOTE(review): this upload happens every time the cell is executed;
# presumably re-running it adds the same chunks to the index again — confirm
# whether the index should be cleared first or this cell skipped on re-runs.
docsearch = PineconeVectorStore.from_documents(docs, embeddings, index_name=index_name)

### Queries

In [247]:
# Evaluation questions, each passed to generate_evaluation_data.
# Order matters: the generated answers are later paired position-by-position
# with `reference_answers`, so keep both lists aligned when editing.
# NOTE(review): the last question looks out of domain for the MongoDB docs —
# presumably a noise-robustness probe; confirm.
queries = [
    "What is MongoDB?",
    "How do I create an index in mongodb?",
    "What can mongodb do for my app?",
    "What is Atlas?",
    "What is database sharding?",
    "What is useState?",
]

### LLM

In [248]:
def run_llm(query: str, chat_history: Optional[List[Dict[str, Any]]] = None):
    """Answer `query` with a history-aware retrieval chain over the Pinecone index.

    The chain is rebuilt on every call from the notebook-level `llm` and
    `index_name`, plus two prompts pulled from the LangChain hub.

    Args:
        query: The user question.
        chat_history: Earlier conversation turns handed to the chain under the
            "chat_history" key. Defaults to an empty history.

    Returns:
        The result of invoking the retrieval chain; callers in this notebook
        read its "answer" entry.
    """
    # A literal `[]` default would be a single list shared by every call, so
    # any mutation of it would leak into later calls. Create a fresh one here.
    if chat_history is None:
        chat_history = []

    # Local names are distinct from the notebook-level `embeddings` and
    # `docsearch` so this function does not shadow them.
    query_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
    vector_store = PineconeVectorStore(
        index_name=index_name, embedding=query_embeddings
    )

    rephrase_prompt = hub.pull("langchain-ai/chat-langchain-rephrase")
    retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")

    # Chain that feeds the retrieved documents and the question to the LLM.
    stuff_documents_chain = create_stuff_documents_chain(
        llm, retrieval_qa_chat_prompt
    )

    # Retriever that is given the chat history together with the rephrase prompt.
    history_aware_retriever = create_history_aware_retriever(
        llm=llm, retriever=vector_store.as_retriever(), prompt=rephrase_prompt
    )
    qa = create_retrieval_chain(
        retriever=history_aware_retriever,
        combine_docs_chain=stuff_documents_chain,
    )

    return qa.invoke(input={"input": query, "chat_history": chat_history})

In [249]:
def generate_evaluation_data(query):
    """Run the RAG chain for one query and collect what is needed to score it.

    Args:
        query: The question to ask.

    Returns:
        A dict with three keys:
        "query" - the question as given,
        "contexts" - text of each document the chain retrieved for the question,
        "answer" - the answer produced by the chain.
    """
    result = run_llm(query, [])

    # The retrieval chain's output carries the retrieved documents under
    # "context". Reading them from there records exactly what the answer was
    # based on, avoids a second vector-store query, and removes the dependency
    # on the notebook-level `docsearch` created by the upload cell.
    contexts = [document.page_content for document in result["context"]]

    return {
        "query": query,
        "contexts": contexts,
        "answer": result["answer"]
    }

### Evaluate

In [250]:
# One evaluation record (query, contexts, answer) per question, in query order.
evaluation = list(map(generate_evaluation_data, queries))

In [251]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def evaluate_context_relevance(questions, retrieved_contexts):
    """Score how similar each retrieved context is to its question.

    Contexts and questions are vectorized together with TF-IDF, then the cosine
    similarity between `retrieved_contexts[i]` and `questions[i]` is computed
    for every index `i`.

    Args:
        questions: Sequence of question strings.
        retrieved_contexts: Sequence of context strings, one per question and
            in the same order.

    Returns:
        A list with one cosine-similarity score per question; empty when both
        inputs are empty.

    Raises:
        ValueError: If the two sequences have different lengths.
    """
    if len(questions) != len(retrieved_contexts):
        raise ValueError(
            "questions and retrieved_contexts must have the same length, got "
            f"{len(questions)} and {len(retrieved_contexts)}"
        )
    # Nothing to score; also avoids fitting the vectorizer on an empty corpus.
    if not questions:
        return []

    # Row layout of the fitted matrix: all contexts first, then all questions.
    corpus = list(retrieved_contexts) + list(questions)
    vectors = TfidfVectorizer().fit_transform(corpus).toarray()

    # Questions start right after the context rows, so the offset is the number
    # of contexts (not the number of questions).
    question_offset = len(retrieved_contexts)

    relevance_scores = []
    for i in range(len(questions)):
        context_vector = vectors[i]
        question_vector = vectors[question_offset + i]
        cosine_sim = cosine_similarity([context_vector], [question_vector])[0][0]
        relevance_scores.append(cosine_sim)

    return relevance_scores

In [252]:
from rouge_score import rouge_scorer

def evaluate_answer_relevance(answers_from_llm, reference_answers):
    """Compute ROUGE F-measures of generated answers against reference answers.

    Answers are paired position-by-position with `zip`, so surplus items in the
    longer list are ignored.

    Returns:
        A dict mapping 'rouge1', 'rouge2' and 'rougeL' to a list holding one
        F-measure per answer pair.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)

    # The reference is passed first and the generated answer second.
    pair_scores = [
        scorer.score(reference, generated)
        for generated, reference in zip(answers_from_llm, reference_answers)
    ]

    return {
        metric: [scores[metric].fmeasure for scores in pair_scores]
        for metric in metric_names
    }

### Reference answers (generated by GPT-4o)

In [253]:
# Reference answers used as the ROUGE targets.
# They are paired position-by-position with the generated answers, so entry i
# must correspond to the i-th question in `queries`.
reference_answers = [
    "MongoDB is a NoSQL, document-oriented database that stores data in flexible, JSON-like documents. It is designed for scalability, high performance, and ease of development.",
    "Use the createIndex method on a collection, specifying the field to index, like db.collection.createIndex({ fieldName: 1 }). This improves query performance by allowing the database to quickly locate data.",
    "MongoDB offers flexible schema design, high scalability, and robust performance, making it ideal for handling large datasets, real-time applications, and rapidly evolving data models. It also supports advanced features like geospatial queries and full-text search.",
    "MongoDB Atlas is a fully managed cloud database service that automates database provisioning, setup, and maintenance. It provides features like automated backups, scaling, and high availability.",
    "Database sharding is the process of distributing a single database across multiple servers to improve performance and scalability. Each shard contains a subset of the database's data, enabling the system to handle larger datasets and higher traffic loads.",
    "useState is a Hook in React that allows you to add state management to functional components. Introduced in React 16.8, Hooks provide a way to use state and other React features without writing class components. useState is used to declare state variables in functional components, enabling them to maintain and update their own state.",
]

In [254]:
# Only the first retrieved context of each record is used for scoring.
retrieved_contexts = [record['contexts'][0] for record in evaluation]

# Generated answers, in the same order as the evaluation records.
answers_from_llm = [record['answer'] for record in evaluation]

### Context relevance, answer relevance and noise robustness

In [255]:
context_relevance_scores = evaluate_context_relevance(queries, retrieved_contexts)
answer_relevance_scores = evaluate_answer_relevance(answers_from_llm, reference_answers)

# Only the last expression of a cell is echoed, so a bare
# `context_relevance_scores` in the middle of the cell shows nothing.
# Print it explicitly and leave the ROUGE scores as the cell's final value.
print(context_relevance_scores)
answer_relevance_scores

{'rouge1': [0.48000000000000004,
  0.27027027027027023,
  0.37500000000000006,
  0.31249999999999994,
  0.3125,
  0.2162162162162162],
 'rouge2': [0.12499999999999997,
  0.10958904109589043,
  0.1090909090909091,
  0.0425531914893617,
  0.06451612903225808,
  0.0],
 'rougeL': [0.32,
  0.21621621621621623,
  0.23214285714285718,
  0.18749999999999997,
  0.25,
  0.12612612612612611]}

## Save results

In [256]:
# Bundle the generated answers with both score sets for serialization.
evaluation_results = dict(
    answers_from_llm=answers_from_llm,
    context_relevance_scores=context_relevance_scores,
    answer_relevance_scores=answer_relevance_scores,
)

# Write the bundle to the file chosen in the configuration cell.
with open(evaluation_file_name, 'w') as results_file:
    json.dump(evaluation_results, results_file, indent=4)