In [None]:
!pip install -qU ragas==0.2.10
!pip install -qU langchain-community==0.3.14 langchain-openai==0.2.14 unstructured==0.16.12 langgraph==0.2.61 langchain-qdrant==0.2.0 nltk

### Keys

In [None]:
import os
from getpass import getpass
# Ask for each secret interactively so that no key is ever written into the notebook.
for _env_name, _prompt_text in (
    ("OPENAI_API_KEY", "Please enter your OpenAI API key!"),
    ("RAGAS_APP_TOKEN", "Please enter your Ragas API key!"),
):
  os.environ[_env_name] = getpass(_prompt_text)

Please enter your OpenAI API key!··········
Please enter your Ragas API key!··········


### SDG: Generating synthetic data

### Data Preparation

Download the webpages which we'll be using for our data today.

These webpages are from [Simon Willison's](https://simonwillison.net/) yearly "AI learnings".

- [2023 Blog](https://simonwillison.net/2023/Dec/31/ai-in-2023/)
- [2024 Blog](https://simonwillison.net/2024/Dec/31/llms-in-2024/)


In [None]:
from langchain_community.document_loaders import DirectoryLoader
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from ragas.testset import TestsetGenerator

def dataPreparation():
  !mkdir data
  !curl https://simonwillison.net/2023/Dec/31/ai-in-2023/ -o data/2023_llms.html
  !curl https://simonwillison.net/2024/Dec/31/llms-in-2024/ -o data/2024_llms.html

  #Load the data into a familiar LangChain format using the DirectoryLoader
  path = "data/"
  loader = DirectoryLoader(path, glob="*.html")
  docs = loader.load()

  return docs

def dataGeneration(docs, testset_size=30):
  """Generate a synthetic test set from `docs` with Ragas and upload it.

  Args:
    docs: LangChain documents the test set is generated from.
    testset_size: Number of samples requested from the generator. Defaults to 30,
      the value this notebook has always used.

  Returns:
    The test set returned by ``TestsetGenerator.generate_with_langchain_docs``.
  """
  generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
  generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

  generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
  dataset = generator.generate_with_langchain_docs(docs, testset_size=testset_size)
  # Print whatever upload() returns (in the recorded run: the dashboard URL of the test set).
  print(dataset.upload())
  return dataset

In [None]:
# Fetch and load the source documents, then generate the synthetic test set from them.
# Both names are module-level on purpose: later cells read `docs` and `dataset`.
docs = dataPreparation()
dataset = dataGeneration(docs)

mkdir: cannot create directory ‘data’: File exists
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 31427    0 31427    0     0   140k      0 --:--:-- --:--:-- --:--:--  140k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 70286    0 70286    0     0   489k      0 --:--:-- --:--:-- --:--:--  490k


Applying HeadlinesExtractor:   0%|          | 0/2 [00:00<?, ?it/s]

Applying HeadlineSplitter:   0%|          | 0/2 [00:00<?, ?it/s]

Applying SummaryExtractor:   0%|          | 0/2 [00:00<?, ?it/s]

Applying CustomNodeFilter:   0%|          | 0/12 [00:00<?, ?it/s]

Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/26 [00:00<?, ?it/s]

Applying [CosineSimilarityBuilder, OverlapScoreBuilder]:   0%|          | 0/2 [00:00<?, ?it/s]

Generating personas:   0%|          | 0/2 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/30 [00:00<?, ?it/s]

Testset uploaded! View at https://app.ragas.io/dashboard/alignment/testset/5d76caaa-ee59-4c47-88d2-910fc4c62b88
https://app.ragas.io/dashboard/alignment/testset/5d76caaa-ee59-4c47-88d2-910fc4c62b88


## HW:


1.   LangGraph RAG with naive retrieval
2. Baseline Evaluation using RAGAS METRICS
3. Implement semantic chunking
4. Update LangGraph RAG to use semantic chunking with naive retrieval
5. Run evaluations to compare and contrast the results



### LangGraph RAG with naive retrieval

#### Chunking and embedding

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings

from langchain_openai import OpenAIEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
import numpy as np
from nltk.tokenize import sent_tokenize
from langchain_core.documents import Document


class EmbeddingRetriever():
  """Chunks LangChain documents and serves them through an in-memory Qdrant retriever.

  The chunking algorithm is selected by the ``"algorithm"`` key of the strategy dict
  passed to ``getRetriever``:

  * ``"recursiveCharacterTextSplitter"`` - also needs ``"chunk_size"`` and ``"chunk_overlap"``.
  * ``"semanticChunking"`` - also needs ``"min_chunk_size"``, ``"max_chunk_size"``
    (both counted in sentences) and ``"threshold"`` (cosine similarity).

  The chunks produced by the most recent call are kept in ``self.split_documents``.
  """

  def __init__(self, docs):
    """Store the documents to chunk and create the embedding model used for both
    semantic chunking and indexing."""
    self.docs = docs
    self.embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
    self.split_documents = []

  def __createEmbeddingsAndRetriever(self, split_documents):
    """Index `split_documents` in a fresh in-memory Qdrant collection.

    Returns:
      A retriever over that collection that returns the 5 best matches per query.
    """
    client = QdrantClient(":memory:")

    client.create_collection(
        collection_name="ai_across_years",
        # 1536 is the output dimension of text-embedding-3-small; the collection's
        # vector size has to match the embedding model configured in __init__.
        vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
    )

    vector_store = QdrantVectorStore(
        client=client,
        collection_name="ai_across_years",
        embedding=self.embeddings,
    )

    vector_store.add_documents(documents=split_documents)
    retriever = vector_store.as_retriever(search_kwargs={"k": 5})

    return retriever

  def __groupSentences(self, sentences, min_chunk_size: int, max_chunk_size: int, threshold: float):
    """Group consecutive sentences into chunk strings by embedding similarity.

    A chunk is seeded with sentences until it holds `min_chunk_size` of them. After
    that, each new sentence is compared (cosine similarity of embeddings) with the
    chunk built so far:

    * similarity >= `threshold`: the sentence joins the chunk; the chunk is closed
      once it holds `max_chunk_size` sentences.
    * similarity < `threshold`: the chunk is closed and the sentence starts the next
      chunk, so no sentence is ever discarded.

    Args:
      sentences: Sentences of a single document, in reading order.
      min_chunk_size: Sentences added to a chunk before similarity is checked.
      max_chunk_size: Sentence count at which a growing chunk is closed.
      threshold: Minimum cosine similarity for a sentence to join the current chunk.

    Returns:
      A list of chunk strings; the sentences of a chunk are joined with a single space.
    """
    chunks = []
    current = []
    for sentence in sentences:
      # An empty chunk is always seeded: there is nothing to compare against yet.
      if not current or len(current) < min_chunk_size:
        current.append(sentence)
        continue

      chunk_embedding = self.embeddings.embed_query(" ".join(current))
      sentence_embedding = self.embeddings.embed_query(sentence)
      similarity_score = self.__cosine_similarity(chunk_embedding, sentence_embedding)

      if similarity_score >= threshold:
        current.append(sentence)
        if len(current) >= max_chunk_size:
          chunks.append(" ".join(current))
          current = []
      else:
        # Topic shift: close the running chunk and let this sentence open the next one.
        chunks.append(" ".join(current))
        current = [sentence]

    # Flush whatever is left so the tail of the document is not lost.
    if current:
      chunks.append(" ".join(current))
    return chunks

  # Perform semantic chunking.
  # For current version, chunk size is represented in number of sentences.
  def __semanticChunking(self, min_chunk_size: int, max_chunk_size: int, threshold: float):
    """Split every document in `self.docs` into semantically coherent chunks.

    Documents are processed independently, so a chunk never spans two documents and
    always carries the metadata of the document its text came from.

    Returns:
      A list of `Document` chunks, in document order.
    """
    split_documents = []
    for doc in self.docs:
      sentences = sent_tokenize(doc.page_content)
      for chunk_text in self.__groupSentences(sentences, min_chunk_size, max_chunk_size, threshold):
        split_documents.append(Document(page_content=chunk_text, metadata=doc.metadata))
        #print(f"Processing chunk:{len(split_documents)}")

    return split_documents

  def __cosine_similarity(self, v1, v2):
    """Return the cosine similarity of two vectors, or 0.0 if either has zero length."""
    dot_product = np.dot(v1, v2)
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    # Without this guard a zero vector yields NaN, which compares False against any threshold.
    if norm_product == 0:
      return 0.0
    return dot_product / norm_product


  def __chunking(self, stratergy):
    """Chunk `self.docs` with the algorithm named in `stratergy["algorithm"]`.

    The result is stored in `self.split_documents` and returned.

    Raises:
      ValueError: If the algorithm name is not one of the supported values.
      KeyError: If a key required by the selected algorithm is missing.
    """
    algorithm = stratergy["algorithm"]
    if algorithm == "recursiveCharacterTextSplitter":
      #chunk_overlap parameter specifically controls how much overlap there is between consecutive chunks
      text_splitter = RecursiveCharacterTextSplitter(chunk_size=stratergy["chunk_size"],
                                                     chunk_overlap=stratergy["chunk_overlap"])
      self.split_documents = text_splitter.split_documents(self.docs)
    elif algorithm == "semanticChunking":
      self.split_documents = self.__semanticChunking(stratergy["min_chunk_size"],
                                                     stratergy["max_chunk_size"],
                                                     stratergy["threshold"])
    else:
      # Fail loudly instead of silently indexing stale or empty chunks.
      raise ValueError(f"Unknown chunking algorithm: {algorithm!r}")

    return self.split_documents

  def getRetriever(self, stratergy):
    """Chunk the documents according to `stratergy`, index the chunks and return a retriever.

    Args:
      stratergy: Dict with an ``"algorithm"`` key plus that algorithm's parameters
        (see the class docstring).

    Returns:
      A retriever over the freshly indexed chunks.
    """
    split_documents = self.__chunking(stratergy)
    print(f"split_documents length:{len(split_documents)}")
    return self.__createEmbeddingsAndRetriever(split_documents)


#### R - Retrieval

In [None]:
# Baseline retriever: documents are chunked with the recursive character text splitter.
embedding_retriever_util = EmbeddingRetriever(docs)
retriever = embedding_retriever_util.getRetriever(
    dict(algorithm="recursiveCharacterTextSplitter", chunk_size=1000, chunk_overlap=200)
)

def retrieve(state):
  """Graph node: fetch documents for ``state["question"]`` with the baseline retriever.

  Returns a partial state update with the retrieved documents under ``"context"``.
  """
  question = state["question"]
  return {"context": retriever.invoke(question)}

split_documents length:74


#### A - Augment

In [None]:
from langchain.prompts import ChatPromptTemplate

# Prompt template for the generation step. {question} and {context} are placeholders
# that are filled in when the prompt is formatted.
RAG_PROMPT = """\
You are a helpful assistant who answers questions based on provided context. You must only use the provided context, and cannot use your own knowledge.

### Question
{question}

### Context
{context}
"""

# Wrap the raw template string in a LangChain chat prompt.
rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)

#### G - Generation

Using gpt-4o-mini

In [None]:
from langchain_openai import ChatOpenAI

# Chat model that writes the final answers (invoked by `generate`).
llm = ChatOpenAI(model="gpt-4o-mini")

def generate(state):
  """Graph node: answer ``state["question"]`` using the documents in ``state["context"]``.

  The page contents are joined with blank lines, inserted into the RAG prompt and sent
  to the chat model. Returns a partial state update with the answer text under ``"response"``.
  """
  context_text = "\n\n".join([doc.page_content for doc in state["context"]])
  prompt_messages = rag_prompt.format_messages(question=state["question"], context=context_text)
  return {"response": llm.invoke(prompt_messages).content}

#### LangGraph

In [None]:
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict
from langchain_core.documents import Document

class State(TypedDict):
  """State passed between the nodes of the RAG graph."""
  # The user's question (supplied when the graph is invoked).
  question: str
  # Documents placed here by the retrieval node.
  context: List[Document]
  # Answer text placed here by the generation node.
  response: str

# Wire the two nodes in order (retrieve -> generate) and enter the graph at "retrieve".
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()


In [None]:
# Smoke-test the baseline graph with a single question and show the generated answer.
response = graph.invoke({"question" : "How are LLM agents useful?"})
response["response"]

'LLM agents, or Large Language Model agents, are useful in several ways:\n\n1. **Ease of Building**: They are relatively easy to construct, requiring only a few hundred lines of Python code, provided that adequate training data is available. This accessibility allows more people to experiment with and create LLMs, broadening their potential applications.\n\n2. **Running on Personal Devices**: Recent advancements have made it possible to run LLMs on personal computers, making the technology more accessible to individuals and smaller organizations without the need for expensive servers.\n\n3. **Code Generation**: LLMs have shown particular effectiveness in generating code. Their ability to write and execute code, particularly with tools like the ChatGPT Code Interpreter, allows users to verify the correctness of generated code. This capability is less susceptible to the issue of hallucination compared to other applications of LLMs.\n\n4. **Potential for AI Agents**: There is excitement a

### Evaluation

In [None]:
# Run every synthetic question through the baseline graph and record, on the sample
# itself, the answer and the text of the retrieved documents.
for test_row in dataset:
  sample = test_row.eval_sample
  graph_output = graph.invoke({"question": sample.user_input})
  sample.response = graph_output["response"]
  sample.retrieved_contexts = [doc.page_content for doc in graph_output["context"]]

In [None]:
# Inspect the test set now that responses and retrieved contexts have been filled in.
dataset.to_pandas()

Unnamed: 0,user_input,retrieved_contexts,reference_contexts,response,reference,synthesizer_name
0,Wht did OpenAI kno that the rest of us didnt a...,"[If you can gather the right data, and afford ...",[Prompt driven app generation is a commodity a...,OpenAI had a unique understanding of the criti...,"OpenAI's best model, GPT-4, was almost a year ...",single_hop_specifc_query_synthesizer
1,What is the description of the butterfly photo...,[How good are those descriptions? Here’s what ...,"[gets you OpenAI’s most expensive model, o1. G...",The photo taken at the California Academy of S...,"A shallow dish, likely a hummingbird or butter...",single_hop_specifc_query_synthesizer
2,What advancements were made with GPT-4 in 2023?,[This is a huge advantage for open over closed...,[feed with the model and talk about what you c...,"In 2023, several advancements were made with G...","In 2023, GPT-4 enabled the creation of full in...",single_hop_specifc_query_synthesizer
3,Wht iz the role of AGI in the context of evals...,[A lot of people are excited about AI agents—a...,[dependent on AGI itself. A model that’s robus...,In the context of evals and LLMs (Large Langua...,The context does not provide specific informat...,single_hop_specifc_query_synthesizer
4,Wht is the role of OpenAI in the context of en...,"[I think this means that, as individual users,...",[that. DeepSeek v3 is a huge 685B parameter mo...,OpenAI's role in the context of energy usage a...,OpenAI has significantly reduced the cost of r...,single_hop_specifc_query_synthesizer
5,Wht role does Meta play in the development of ...,[So training an LLM still isn’t something a ho...,[Another common technique is to use larger mod...,Meta plays a significant role in the developme...,Meta's Llama 3.3 70B fine-tuning used over 25M...,single_hop_specifc_query_synthesizer
6,What stuff we learned about LLMs in 2023?,[This is Things we learned about LLMs in 2024 ...,[Simon Willison’s Weblog Subscribe Things we l...,"In 2023, we learned several key things about L...","The article is a sequel to a review of 2023, b...",single_hop_specifc_query_synthesizer
7,What Llama stuff you got?,[Meta’s Llama 3.2 models deserve a special men...,[Those of us who understand this stuff have a ...,The context mentions the Llama 3.2 models from...,Options for accessing Llama 3 from the termina...,single_hop_specifc_query_synthesizer
8,What role does Stability AI play in the develo...,[I wrote about how Large language models are h...,[Code may be the best application The ethics o...,Stability AI is one of the organizations that ...,Stability AI is one of the organizations that ...,single_hop_specifc_query_synthesizer
9,How does the use of JavaScript relate to the c...,[It’s still astonishing to me how effective th...,[Based Development As a computer scientist and...,The use of JavaScript relates to the capabilit...,Large Language Models are particularly effecti...,single_hop_specifc_query_synthesizer


In [None]:
from ragas import EvaluationDataset

# Build the Ragas evaluation dataset from the DataFrame export of `dataset`.
# NOTE(review): the export is taken at this point in the notebook, so this object
# has to be rebuilt if the samples in `dataset` are modified afterwards.
evaluation_dataset = EvaluationDataset.from_pandas(dataset.to_pandas())


In [None]:
# Judge model: gpt-4o wrapped so that it can be passed to Ragas as the evaluator LLM.
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper

evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))

In [None]:
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity
from ragas import evaluate, RunConfig

# Shared run configuration; it is reused by the later evaluation run as well.
custom_run_config = RunConfig(timeout=360)

# Metrics scored for the baseline (recursive character splitter) pipeline.
baseline_metrics = [
    LLMContextRecall(),
    Faithfulness(),
    FactualCorrectness(),
    ResponseRelevancy(),
    ContextEntityRecall(),
    NoiseSensitivity(),
]

result = evaluate(
    dataset=evaluation_dataset,
    metrics=baseline_metrics,
    llm=evaluator_llm,
    run_config=custom_run_config,
)
result

Evaluating:   0%|          | 0/180 [00:00<?, ?it/s]

ERROR:ragas.executor:Exception raised in Job[65]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[71]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[101]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[107]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[125]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[131]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[137]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[149]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[167]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[173]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[179]: TimeoutError()


{'context_recall': 0.7118, 'faithfulness': 0.8062, 'factual_correctness': 0.4040, 'answer_relevancy': 0.9505, 'context_entity_recall': 0.4061, 'noise_sensitivity_relevant': 0.2766}

### Update chunking strategy and re-evaluate

In [None]:
# Second retriever: same documents, chunked with the semantic (sentence-similarity) strategy.
semantic_chunking_retriever_util = EmbeddingRetriever(docs)
semantic_chunking_retriever = semantic_chunking_retriever_util.getRetriever(
    dict(algorithm="semanticChunking", min_chunk_size=1, max_chunk_size=100, threshold=0.70)
)

def semantic_chunking_retrieve(state):
  """Graph node: fetch documents for ``state["question"]`` with the semantic-chunking retriever.

  Returns a partial state update with the retrieved documents under ``"context"``.
  """
  question = state["question"]
  return {"context": semantic_chunking_retriever.invoke(question)}

split_documents length:211


In [None]:
# Peek at the first five semantic chunks.
semantic_chunking_retriever_util.split_documents[:5]

[Document(metadata={'source': 'data/2024_llms.html'}, page_content='Simon Willison’s Weblog\n\nSubscribe\n\nThings we learned about LLMs in 2024\n\n31st December 2024\n\nA lot has happened in the world of Large Language Models over the course of 2024.'),
 Document(metadata={'source': 'data/2024_llms.html'}, page_content='This is a sequel to my review of 2023.'),
 Document(metadata={'source': 'data/2024_llms.html'}, page_content='The environmental impact got better\n\nThe environmental impact got much, much worse\n\nThe year of slop\n\nSynthetic training data works great\n\nLLMs somehow got even harder to use\n\nKnowledge is incredibly unevenly distributed\n\nLLMs need better criticism\n\nEverything tagged “llms” on my blog in 2024\n\nThe GPT-4 barrier was comprehensively broken\n\nIn my December 2023 review I wrote about how We don’t yet know how to build GPT-4—OpenAI’s best model was almost a year old at that point, yet no other AI lab had produced anything better.'),
 Document(metada

In [None]:
# The `State` schema defined for the baseline graph is reused here: this graph uses the
# same three fields, and a second `class State` would only shadow the identical first one.
new_graph_builder = StateGraph(State).add_sequence([semantic_chunking_retrieve, generate])
new_graph_builder.add_edge(START, "semantic_chunking_retrieve")
new_graph = new_graph_builder.compile()

In [None]:
# Smoke-test the semantic-chunking graph with the same question used for the baseline.
semantic_chunking_response = new_graph.invoke({"question" : "How are LLM agents useful?"})
semantic_chunking_response["response"]

"LLM agents are useful because they can perform a wide range of tasks, often surprising both users and the people who trained them with their capabilities. They have the potential to accomplish things that may not have been anticipated. However, it's important to recognize that LLMs can also exhibit significant limitations, as they may believe any information provided to them. Therefore, the key to effectively utilizing LLMs lies in understanding how to navigate their inherent unreliability while harnessing their powerful features."

In [None]:
import time

# Re-answer every synthetic question with the semantic-chunking pipeline and overwrite
# the response / retrieved contexts stored on each sample.
for test_row in dataset:
  # `new_graph` is the graph wired to the semantic-chunking retriever; invoking `graph`
  # here would simply re-run the baseline pipeline.
  semantic_chunking_response = new_graph.invoke({"question" : test_row.eval_sample.user_input})
  test_row.eval_sample.response = semantic_chunking_response["response"]
  test_row.eval_sample.retrieved_contexts = [context.page_content for context in semantic_chunking_response["context"]]
  time.sleep(2) # To try to avoid rate limiting.

In [None]:
# Rebuild the evaluation dataset from the current contents of `dataset`:
# `evaluation_dataset` was created before the responses were regenerated, so
# evaluating it again would score the baseline answers a second time.
semantic_chunking_evaluation_dataset = EvaluationDataset.from_pandas(dataset.to_pandas())

semantic_chunking_result = evaluate(
    dataset=semantic_chunking_evaluation_dataset,
    metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity()],
    llm=evaluator_llm,
    run_config=custom_run_config
)
semantic_chunking_result

Evaluating:   0%|          | 0/180 [00:00<?, ?it/s]

ERROR:ragas.executor:Exception raised in Job[17]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[65]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[71]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[83]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[101]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[107]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[119]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[125]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[131]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[137]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[143]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[149]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[155]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[161]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[167]: TimeoutError()
ERROR:ragas.ex

{'context_recall': 0.6990, 'faithfulness': 0.8419, 'factual_correctness': 0.4120, 'answer_relevancy': 0.9521, 'context_entity_recall': 0.4267, 'noise_sensitivity_relevant': 0.3027}

In [None]:
# Scores of the semantic-chunking run, shown again for side-by-side comparison.
semantic_chunking_result

{'context_recall': 0.6990, 'faithfulness': 0.8419, 'factual_correctness': 0.4120, 'answer_relevancy': 0.9521, 'context_entity_recall': 0.4267, 'noise_sensitivity_relevant': 0.3027}

In [None]:
# Scores of the baseline (recursive character splitter) run.
result

{'context_recall': 0.7118, 'faithfulness': 0.8062, 'factual_correctness': 0.4040, 'answer_relevancy': 0.9505, 'context_entity_recall': 0.4061, 'noise_sensitivity_relevant': 0.2766}

### Compare and Contrast the results:

| Metric | Semantic Chunking | Recursive Character Text Splitter | Comparison |
|---|---|---|---|
| `context_recall` | 0.6990 | 0.7118 | `Recursive Character Text Splitter` slightly higher (+1.28%) - indicates slightly better retrieval of relevant context. |
| `faithfulness` | 0.8419 | 0.8062 | `Semantic Chunking` significantly higher (+3.57%) - suggests responses are more grounded in the provided context. |
| `factual_correctness` | 0.4120 | 0.4040 | `Semantic Chunking` slightly higher (+0.8%) - indicates slightly more factually accurate answers. |
| `answer_relevancy` | 0.9521 | 0.9505 | `Semantic Chunking` slightly higher (+0.16%) - suggests slightly more relevant answers to the questions. |
| `context_entity_recall` | 0.4267 | 0.4061 | `Semantic Chunking` higher (+2.06%) - indicates better recall of important entities from the context. |
| `noise_sensitivity_relevant` | 0.3027 | 0.2766 | `Recursive Character Text Splitter` lower (-2.61%) - for this metric a lower score is better, so the baseline is slightly more robust to noisy context. |

**Summary**:

Recursive Character Text Splitter performs negligibly better at retrieving relevant context (context_recall) and is slightly less sensitive to noise (noise_sensitivity_relevant, where a lower score is better).
Semantic Chunking performs better on the remaining metrics, most notably faithfulness, suggesting responses that are better grounded in the provided context. It also shows marginal improvements in factual_correctness, answer_relevancy, and context_entity_recall. <br/>

**Overall:**

Based on these results, Semantic Chunking appears to produce slightly more reliable and accurate responses compared to Recursive Character Text Splitter for this specific dataset and RAG pipeline configuration. Although Recursive Character Text Splitter retrieves marginally more relevant context, Semantic Chunking demonstrates a better ability to utilize that context effectively and provide faithful and factually sound answers.

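**Note**: The results are specific to the parameters selected for this test run. Several evaluation jobs also timed out in both runs, so some per-sample scores are missing from the averages. Ideally, we should run multiple evaluations with different settings to identify the better option.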