<a href="https://colab.research.google.com/github/santhoshml/semantic_chunking/blob/main/Semantic_chunking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -U -q langchain langchain-openai langchain_core langchain-community langchainhub openai langchain-qdrant
!pip install -qU ragas
!pip install -qU qdrant-client pymupdf pandas
!pip install -qU langchain_experimental

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.7/49.7 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.4/404.4 kB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m62.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.7/383.7 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.0/78.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
from google.colab import userdata

# Read the OpenAI key from the Colab secret store and export it as an
# environment variable for the rest of the session.
openai_api_key = userdata.get('OPENAI_API_KEY')
os.environ["OPENAI_API_KEY"] = openai_api_key

In [4]:
from langchain_community.document_loaders import PyMuPDFLoader

# Load the source PDF; `complete_document` is the input to both chunking
# strategies compared later in the notebook.
pdf_loader = PyMuPDFLoader("./data/The-lord-of-the-rings.pdf")
complete_document = pdf_loader.load()



In [5]:
from langchain_openai import OpenAIEmbeddings

# One embedding model is shared by the semantic chunker and by both vector
# stores, so the two retrieval pipelines are embedded the same way.
embedding_model_name = "text-embedding-3-small"
embeddings = OpenAIEmbeddings(model=embedding_model_name)

In [6]:
# NOTE(review): newer LangChain releases may expose this class from a
# different module path; confirm this import against the installed version.
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Baseline chunking strategy: fixed-size recursive splitting with overlap.
splitter_settings = {"chunk_size": 200, "chunk_overlap": 50}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# "rcr" = recursive character splitting; these chunks feed the baseline index.
rcr_documents = text_splitter.split_documents(complete_document)

In [8]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient, models
from qdrant_client.http.models import Distance, VectorParams

# Qdrant Cloud cluster that hosts the baseline (recursive-splitter) index.
QDRANT_URL = "https://f39c0bd4-5b54-4053-b2e0-e52d2f151123.europe-west3-0.gcp.cloud.qdrant.io:6333"
COLLECTION_NAME = "The_Lord_of_the_Rings_rcr"
# Must match the dimensionality of the vectors produced by `embeddings`.
VECTOR_SIZE = 1536

qdrant_client = QdrantClient(
    url=QDRANT_URL,
    api_key=userdata.get('QDRANT_API_KEY'),
)

# `create_collection` fails when the collection already exists, which made this
# cell crash on every re-run. Create it only when missing, and remember whether
# we did so ingestion below is not repeated against an existing index.
collection_is_new = not qdrant_client.collection_exists(COLLECTION_NAME)
if collection_is_new:
    qdrant_client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE),
    )

qdrant_vector_store = QdrantVectorStore(
    client=qdrant_client,
    collection_name=COLLECTION_NAME,
    embedding=embeddings,
)

# Only embed and upload the chunks for a freshly created collection; re-adding
# them on a re-run would store duplicate points and skew retrieval.
# To force a re-index (e.g. after changing the splitter settings), delete the
# collection and run this cell again.
if collection_is_new:
    qdrant_vector_store.add_documents(rcr_documents)

# Default retriever settings (no search_kwargs) are used for the baseline.
rcr_retriever = qdrant_vector_store.as_retriever()

In [None]:
# Smoke test: confirm the baseline retriever returns documents for a query.
# (Query typo "he Ring ?" corrected so the embedded text is the intended question.)
retrieved_documents = rcr_retriever.invoke("Who took the Ring?")
print(len(retrieved_documents))

4


In [11]:
# Legacy vector-store class; no longer used in this cell. The import is kept
# only in case other cells still reference `Qdrant`.
from langchain_community.vectorstores import Qdrant
from langchain_experimental.text_splitter import SemanticChunker
from langchain_qdrant import QdrantVectorStore

# Second chunking strategy: split where the embedding distance between
# neighbouring text exceeds a percentile-based breakpoint.
semantic_chunker = SemanticChunker(
    embeddings,
    breakpoint_threshold_type="percentile",
)
semantic_documents = semantic_chunker.split_documents(complete_document)

# Use the same `QdrantVectorStore` class as the baseline index instead of the
# deprecated `langchain_community.vectorstores.Qdrant`, so both pipelines go
# through one vector-store implementation.
# NOTE(review): re-running this cell adds the chunks to the existing collection
# again; delete the collection first if a clean re-index is needed.
semantic_vectorstore = QdrantVectorStore.from_documents(
    semantic_documents,
    embeddings,
    url="https://f39c0bd4-5b54-4053-b2e0-e52d2f151123.europe-west3-0.gcp.cloud.qdrant.io:6333",
    api_key=userdata.get('QDRANT_API_KEY'),
    collection_name="The_Lord_of_the_Rings_semantic",
)

# NOTE(review): this retriever asks for k=10 chunks, while the baseline
# retriever is built without search_kwargs; confirm the asymmetry is intended.
semantic_retriever = semantic_vectorstore.as_retriever(search_kwargs={"k" : 10})

In [12]:
# Import from `langchain_core` (installed explicitly by this notebook and
# already used by the chain cell) rather than the legacy `langchain.prompts`
# re-export.
from langchain_core.prompts import ChatPromptTemplate

# Grounded-QA prompt: the model must answer from the retrieved context only and
# reply "I don't know" otherwise. Expects `question` and `context` variables.
template = """Answer the question based only on the following context. If you cannot answer the question with the context, please respond with "I don't know":

Question:
{question}

Context:
{context}
"""

prompt = ChatPromptTemplate.from_template(template)

In [13]:
from operator import itemgetter

from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Single LLM instance shared by both chains; temperature=0 keeps the two
# pipelines comparable.
primary_qa_llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)


def build_qa_chain(retriever):
    """Build a retrieval-augmented QA chain around ``retriever``.

    The chain takes ``{"question": <str>}`` and returns a dict with:
      * ``"response"``: the LLM message produced from ``prompt``
      * ``"context"``: the documents the retriever returned for the question

    Both pipelines are built by this one function so they differ only in the
    retriever they use. The former ``RunnablePassthrough.assign(context=
    itemgetter("context"))`` step is omitted because it re-assigned ``context``
    to its own value and had no effect on the output.
    """
    return (
        # Step 1: fetch context for the question and carry the question along.
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        # Step 2: answer from the context, and also surface the context itself.
        | {"response": prompt | primary_qa_llm, "context": itemgetter("context")}
    )


rcr_qa_chain = build_qa_chain(rcr_retriever)
semantic_qa_chain = build_qa_chain(semantic_retriever)

In [18]:
import pandas as pd

# Evaluation questions put to both QA chains.
question_list = [
    "What is the core part of the story?",
    "Why does Aragorn use the name 'Strider'?",
    "What is the meaning of the line 'The Crownless again shall be king'?",
    "Why was the Fellowship of the Ring formed?",
    "What is the symbolism of the One Ring?",
    "What is the main message of The Fellowship of the Ring?",
    "What is the lesson of The Fellowship of the Ring?",
    "What is the Watcher in the Water?"
]

# Parallel lists: index i of each holds the question and the two answers to it.
question_arr, rcr_response_arr, semantic_response_arr = [], [], []

for question in question_list:
    question_arr.append(question)

    # Baseline pipeline (recursive character splitting).
    rcr_answer = rcr_qa_chain.invoke({"question" : question})["response"]
    rcr_response_arr.append(rcr_answer.content)

    # Semantic-chunking pipeline, asked the identical question.
    semantic_answer = semantic_qa_chain.invoke({"question" : question})["response"]
    semantic_response_arr.append(semantic_answer.content)

In [19]:
# Side-by-side comparison table: one row per question, one column per strategy.
comparison_columns = {
    "Question" : question_arr,
    "Recursive Character Text Split" : rcr_response_arr,
    "Semantic Chunking" : semantic_response_arr,
}
df = pd.DataFrame(data=comparison_columns)
df


Unnamed: 0,Question,Recursive Character Text Split,Semantic Chunking
0,What is the core part of the story?,I don't know.,The core part of the story is the journey and ...
1,Why does Aragorn use the name 'Strider'?,I don't know.,Aragorn uses the name 'Strider' as a nickname ...
2,What is the meaning of the line 'The Crownless...,The line 'The Crownless again shall be king' s...,"The line ""The crownless again shall be king"" s..."
3,Why was the Fellowship of the Ring formed?,I don't know.,I don't know.
4,What is the symbolism of the One Ring?,I don't know.,The One Ring symbolizes the corrupting nature ...
5,What is the main message of The Fellowship of ...,I don't know.,The main message of The Fellowship of the Ring...
6,What is the lesson of The Fellowship of the Ring?,I don't know.,I don't know.
7,What is the Watcher in the Water?,The Watcher in the Water is a creature mention...,The Watcher in the Water is described as a mon...
