<a href="https://colab.research.google.com/github/rodiwaa/learnings-pocs/blob/main/notebooks/resume_rag_system_basic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install langchain langchain.openai \
  openai langchain-community langsmith chromadb \
  python-dotenv sentence-transformers pypdf \
  langchain_community langchain_experimental \
  qdrant-client langchain-qdrant qdrant-client

Collecting langchain.openai
  Downloading langchain_openai-0.3.34-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.30-py3-none-any.whl.metadata (3.0 kB)
Collecting chromadb
  Downloading chromadb-1.1.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting pypdf
  Downloading pypdf-6.1.1-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain_experimental
  Downloading langchain_experimental-0.3.4-py3-none-any.whl.metadata (1.7 kB)
Collecting qdrant-client
  Downloading qdrant_client-1.15.1-py3-none-any.whl.metadata (11 kB)
Collecting langchain-qdrant
  Downloading langchain_qdrant-0.2.1-py3-none-any.whl.metadata (1.4 kB)
Collecting requests<3,>=2 (from langchain)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pybase64>=1.4.1 (from chr

## API keys from .env

In [None]:
import os
from google.colab import drive
from dotenv import load_dotenv

# Mount Google Drive so the .env file stored there can be read from /content/drive.
drive.mount("/content/drive", force_remount=True)

# Load the API keys from the .env file on Drive.
# Kept as the cell's last expression so load_dotenv's return value is displayed.
load_dotenv(dotenv_path="/content/drive/MyDrive/Projects/.env/.env")

Mounted at /content/drive


True

## Read and upload pdf from drive to Qdrant Cloud.
On hold: will work with in-memory docs for now and import the PDF later.

### Get PDF content from gdrive

# Pretty Print Docs Util

In [None]:
def pretty_print_docs(docs):
  """Print each document's text on a single line.

  Line breaks inside ``page_content`` are replaced by single spaces, which keeps
  multi-line content readable in Colab output.

  Args:
    docs: Iterable of objects exposing a ``page_content`` string.
  """
  # Lazy generator: each document is flattened right before it is printed.
  flattened = (" ".join(item.page_content.splitlines()) for item in docs)
  for single_line in flattened:
    print(single_line)

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from google.colab import drive

# Mount Google Drive so the PDF below can be read from /content/drive.
drive.mount("/content/drive", force_remount=True)
file_path = "/content/drive/MyDrive/Projects/docs/rodi.pdf"

# NOTE(review): pdf_content is not referenced anywhere else in the visible notebook —
# presumably a plain-text reference copy of the "about" text; confirm before relying on it.
pdf_content = """

Rohit is a software engineer. For work, he automates complex business workflows
and builds AI systems. He has been working for over 15 years in the tech industry.
He has taken various roles, his latest role being of a Cloud Architect. He loves
to watch drama movies and listens to music. He loves to take a drive and do
road trips with friends and family, especially in the monsoons.

"""

# Loader bound to the PDF on Drive.
about_rodi_loader = PyPDFLoader(file_path)

# Load the PDF into a list of documents and report how many were produced.
about_pdf_docs = about_rodi_loader.load()
print(f"docs loaded = {len(about_pdf_docs)}")
# print(docs[0].page_content)

# Dump the raw text of every loaded document.
for doc in about_pdf_docs:
  print(doc.page_content)

# clean, split, chunk, embed and upload to vector store
- Let's use Qdrant for its public cloud URL

## Create docs for "about me"

### Retrieve docs from VS based on query sim searches

# Search Strategies
- similarity
- MMR
- context compression
- semantic chunker

In [None]:
# queries
# Sample questions reused by the search-strategy cells below.
QUERY1 = "who are rohit's friends?"
QUERY2 = "what are rohit's hobbies?"
QUERY3 = "what does rohit work on?"

## Basic similarity search

In [None]:
# Basic similarity search: ask the vector store directly for the two closest documents.
# NOTE(review): `vectorstore` is created in the Qdrant setup cell further down, so on a
# fresh kernel that cell has to run before this one.
basic_search = vectorstore.similarity_search(QUERY2, k=2)
for doc in basic_search:
  print(doc.page_content)

# Author's observation: vectorstore.similarity_search honours k, whereas passing k when
# invoking a retriever runnable did not work.
#
# Recorded output:
#   Jack loves to watch movies and listen to music.
#   Jack loves to trek on weekends.



## MMR

In [None]:
# MMR retrieval through the retriever interface; k is supplied via search_kwargs.
base_retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 2})
res = base_retriever.invoke(QUERY2)
for doc in res:
  print(doc.page_content)

# Author's note: a bare "k" argument is ignored here; it has to go inside search_kwargs.
#
# Recorded output:
#   Jack loves to watch movies and listen to music.
#   Jack likes to build workflows and AI systems



# Setup qdrant cloud vector store first

In [None]:
!pip install -U langchain-qdrant qdrant-client


In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_qdrant import Qdrant, QdrantVectorStore
from langchain_community.embeddings import SentenceTransformerEmbeddings
# from langchain_community.vectorstores import Chroma
# from langchain_openai import OpenAIEmbeddings
# from sentence_transformers import SentenceTransformer # does not work well w langchain/chroma; use SentenceTransformerEmbeddings instead

# Hand-written sample documents from earlier experiments (the recorded "Jack ..." outputs
# in this notebook match these strings). Disabled: the PDF documents are indexed instead.
# docs = [
#     Document(page_content="Jack likes to build workflows and AI systems"),
#     Document(page_content="Jack has worked on following technology stacks - Langchain, Langgraph, Langsmith"),
#     Document(page_content="Jack is friends with Tom and Sally."),
#     Document(page_content="Jack loves to trek on weekends."),
#     Document(page_content="Jack loves to watch movies and listen to music.")
# ]

# Show what is about to be indexed, then use the PDF documents as the corpus.
print(f"about_pdf_docs \n {about_pdf_docs}")
docs = about_pdf_docs

# Embedding model used for the uploaded documents.
embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Earlier Chroma-based alternative, kept for reference:
# vectorstore = Chroma.from_documents(
#     embedding = embedding_model,
#     documents = docs,
#     collection_name = "random_db_2",
#     persist_directory = "random_db_2"
# )

# Qdrant Cloud connection settings are read from the environment.
qdrant_url = os.environ["QDRANT_URL"]
qdrant_api_key = os.environ["QDRANT_API_KEY"]

# Embed the documents and upload them to the "about_rodi_rag" collection.
# NOTE(review): force_recreate=True presumably rebuilds the collection on every run —
# confirm against the langchain-qdrant documentation.
vectorstore = QdrantVectorStore.from_documents(
    documents=docs,
    embedding=embedding_model,
    url=qdrant_url,
    api_key=qdrant_api_key,
    collection_name="about_rodi_rag",
    force_recreate=True,
    # persist_directory = "about_rodi_rag"
)

print("docs upload sucessfly to qdrant cloud collection about_rodi_rag")

# Idea for verifying the upload (disabled):
# added_docs = vectorstore.get()


## contextual compression thingie
I will be using this one because it gives better results.

In [None]:
# needs llm, embedding, compression mod, base retr, LLMChainExtractor
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.retrievers import ContextualCompressionRetriever
from langchain_openai import OpenAI, ChatOpenAI
from langchain_core.output_parsers import StrOutputParser

# Chat model handed to the compressor.
model = ChatOpenAI(model="gpt-3.5-turbo")

# Stage 1: MMR retriever over the vector store, two documents per query.
base_retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 2})

# Stage 2: LLM-backed extractor applied to what stage 1 returns.
base_compressor = LLMChainExtractor.from_llm(llm=model)

# Combine both stages and run one query through the pipeline.
compressor_retriever = ContextualCompressionRetriever(
    base_compressor=base_compressor,
    base_retriever=base_retriever,
)
result_docs = compressor_retriever.invoke(QUERY3)

# Recorded outputs from earlier runs (author's verdict: impressive):
#
#   QUERY1
#   Jack is friends with Tom and Sally.
#
#   QUERY2
#   Jack loves to watch movies and listen to music.
#   Jack likes to build workflows and AI systems
#
#   QUERY3
#   Jack has worked on following technology stacks - Langchain, Langgraph, Langsmith

for doc in result_docs:
  print(doc.page_content)

### more testing needed here for invokes

In [None]:
### test invoke with all 3 queries
# The three invocations below are disabled; only QUERY4 is exercised in this cell.
# print("\n")
# print(QUERY1)
# result_docs1 = compressor_retriever.invoke(QUERY1)
# result_docs1[0].page_content

# print("\n")
# print(QUERY2)
# result_docs2 = compressor_retriever.invoke(QUERY2)
# result_docs2[0].page_content

# print("\n")
# print(QUERY3)
# result_docs3 = compressor_retriever.invoke(QUERY3)
# result_docs3[0].page_content

# Extra question, run through the compression retriever defined in the cell above.
QUERY4 = "what is rohit's tech stack?"
print(QUERY4)
result_docs4 = compressor_retriever.invoke(QUERY4)
# result_docs4[0].page_content

# Print the whole result list rather than indexing into it.
print(result_docs4)

## Setup qdrant cloud

### init qdrant cloud and save chunks to VS

In [None]:
import os

# Confirm the Qdrant settings are present WITHOUT echoing their values: anything printed
# here is stored in the notebook's saved output and leaks when the notebook is shared.
for name in ("QDRANT_API_KEY", "QDRANT_URL"):
  print(f"{name} is {'set' if os.environ.get(name) else 'MISSING'}")


## Load pdf from drive

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from google.colab import drive

# Mount Google Drive and point at the PDF to load.
drive.mount("/content/drive", force_remount=True)
file_path = "/content/drive/MyDrive/Projects/docs/rodi.pdf"

about_rodi_loader = PyPDFLoader(file_path)

# Report on the documents loaded by THIS cell. Reading from `about_pdf_docs` (instead of
# the `docs` alias created in the vector-store cell) keeps the cell working on a fresh
# kernel and guarantees the printed text is the one that was just loaded.
about_pdf_docs = about_rodi_loader.load()
print(f"docs loaded = {len(about_pdf_docs)}")
print(about_pdf_docs[0].page_content)

# Text of the first loaded document; this is the input of the text-splitter cell below.
doc = about_pdf_docs[0].page_content

## Simple recursive text splitter

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded text with chunk_size=100 and chunk_overlap=10.
splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10)

chunks = splitter.split_text(doc)
print(chunks)
# The loop variable is deliberately NOT called `doc`: reusing that name would overwrite
# the source text with the last chunk, so re-running this cell would split the wrong input.
for chunk in chunks:
  print(chunk)
len(chunks)



## Semantic text splitter/ experiment
to create semantic aware chunks

In [None]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

import os
from langchain_qdrant import QdrantVectorStore

# OpenAI embeddings client; it drives the semantic breakpoints and embeds the chunks.
# (The name `llm` is kept for compatibility, but this is an embeddings model, not a chat model.)
llm = OpenAIEmbeddings(model='text-embedding-3-small')

# about_rodi_loader is defined in a PDF-loading cell above (PyPDFLoader on the Drive PDF).
# about_loader = PyPDFLoader(file_path)

about_rodi_docs = about_rodi_loader.load()
print(len(about_rodi_docs))
print(about_rodi_docs)

splitter = SemanticChunker(
    embeddings = llm,
    breakpoint_threshold_type="percentile",
    breakpoint_threshold_amount=90
)

# Chunk the documents loaded just above, so this cell does not depend on another cell's list.
# NOTE(review): only the first loaded document is chunked — confirm that this is intended
# if the PDF yields more than one document.
docs1 = splitter.create_documents([about_rodi_docs[0].page_content])

# Build the new collection on Qdrant Cloud explicitly. Calling from_documents on the
# existing `vectorstore` instance does not reuse its connection, so url and api_key must
# be passed again; without them the client is not pointed at the cloud cluster.
about_rodi_vs = QdrantVectorStore.from_documents(
    docs1,
    embedding=llm,
    url=os.environ["QDRANT_URL"],
    api_key=os.environ["QDRANT_API_KEY"],
    collection_name="about_rodi1",
)

print(docs1)


# Setup qdrant cloud retriever (OG)

In [None]:
# test queries
QUERY1 = "who is rohit?"
QUERY2 = "what are rohit's hobbies?"
QUERY3 = "what does rohit work on?"
QUERY4 = "what projects has rohit worked on?"


# Alternative (MMR) retriever, kept for reference:
# retriever = vectorstore.as_retriever(
#     search_type="mmr",
#     search_kwargs = { "k": 2 }
# )

# Plain similarity retriever returning two documents per query.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs = { "k": 2 }
)

result_docs_q1 = retriever.invoke(QUERY1)
result_docs_q2 = retriever.invoke(QUERY2)
result_docs_q3 = retriever.invoke(QUERY3)
result_docs_q4 = retriever.invoke(QUERY4)

# Print every query followed by its hits. A single loop keeps label, query and results
# in sync (QUERY4 is labelled q4) and prints the query text itself, not a set literal.
labelled_results = [
    ("q1", QUERY1, result_docs_q1),
    ("q2", QUERY2, result_docs_q2),
    ("q3", QUERY3, result_docs_q3),
    ("q4", QUERY4, result_docs_q4),
]
for position, (label, query, hits) in enumerate(labelled_results):
  if position:
    print()  # blank line between result groups
  print(f"{label} {query}")
  for doc in hits:
    print(doc.page_content)


# outputs
# (recorded from an earlier run; the "Jack ..." lines presumably come from the
# hand-written sample documents used at that time)

# q1 who is rohit?
# Jack likes to build workflows and AI systems
# Jack has worked on following technology stacks - Langchain, Langgraph, Langsmith

# q2 what are rohit's hobbies?
# Jack loves to watch movies and listen to music.
# Jack likes to build workflows and AI systems

# q3 what does rohit work on?
# Jack has worked on following technology stacks - Langchain, Langgraph, Langsmith
# Jack likes to build workflows and AI systems

# q4 what projects has rohit worked on?
# Jack likes to build workflows and AI systems
# Jack has worked on following technology stacks - Langchain, Langgraph, Langsmith

# Setup qdrant retriever TEST

In [None]:
import os
from langchain_qdrant import Qdrant, QdrantVectorStore
from langchain_community.embeddings import SentenceTransformerEmbeddings

# test queries
QUERY1 = "who is rohit?"
QUERY2 = "what are rohit's hobbies?"
QUERY3 = "what does rohit like to do?"
QUERY4 = "what is rohit's favourite drink?"


# Alternative (MMR) retriever, kept for reference:
# retriever = vectorstore.as_retriever(
#     search_type="mmr",
#     search_kwargs = { "k": 2 }
# )

# A separate collection name keeps this experiment apart from the OG store above.

# Name of the sentence-transformers embedding model (not an LLM, despite the variable name).
llm_model = "all-MiniLM-L6-v2"
embedding_model = SentenceTransformerEmbeddings(model_name=llm_model)

# Connect to an already-populated collection and expose it as a similarity retriever
# that returns two documents per query.
retriever = QdrantVectorStore.from_existing_collection(
    embedding = embedding_model,
    # documents = docs,
    url=os.environ["QDRANT_URL"],
    api_key=os.environ["QDRANT_API_KEY"],
    # collection_name = "about_rodi_rag",
    collection_name="test_coll_name" # new rag testing collection

).as_retriever(
    search_type="similarity",
    search_kwargs = { "k": 2 }
)

result_docs_q1 = retriever.invoke(QUERY1)
result_docs_q2 = retriever.invoke(QUERY2)
result_docs_q3 = retriever.invoke(QUERY3)
result_docs_q4 = retriever.invoke(QUERY4)

# Print every query followed by its hits, each hit flattened onto one line. A single loop
# keeps label, query and results in sync (QUERY4 is labelled q4) and prints the query text
# itself, not a set literal.
labelled_results = [
    ("q1", QUERY1, result_docs_q1),
    ("q2", QUERY2, result_docs_q2),
    ("q3", QUERY3, result_docs_q3),
    ("q4", QUERY4, result_docs_q4),
]
for position, (label, query, hits) in enumerate(labelled_results):
  if position:
    print()  # blank line between result groups
  print(f"{label} {query}")
  for doc in hits:
    print(" ".join(doc.page_content.splitlines()))


# outputs
# NOTE(review): the recorded lines below do not correspond to this cell's queries (their
# third and fourth questions differ from QUERY3/QUERY4 above) — they look carried over
# from an earlier experiment. The authoritative output is what the cell displays when run.

# q1 {'who is rohit?'}
# Jack likes to build workflows and AI systems
# Jack has worked on following technology stacks - Langchain, Langgraph, Langsmith

# q2 {"what are rohit's hobbies?"}
# Jack loves to watch movies and listen to music.
# Jack likes to build workflows and AI systems

# q3 {'what does rohit work on?'}
# Jack has worked on following technology stacks - Langchain, Langgraph, Langsmith
# Jack likes to build workflows and AI systems

# q3 {'what projects has rohit worked on?'}
# Jack likes to build workflows and AI systems
# Jack has worked on following technology stacks - Langchain, Langgraph, Langsmith

q1 {'who is rohit?'}
Rohit likes to drink beer
Rohit likes to drink coffee

q2 {"what are rohit's hobbies?"}
Rohit likes to drink beer
Rohit likes to drink coffee

q3 {'what does rohit like to do?'}
Rohit likes to drink beer
Rohit likes to drink coffee

q3 {"what is rohit's favourite drink?"}
Rohit likes to drink beer
Rohit likes to drink coffee


# Setup qdrant TEST 2

In [None]:
from langchain_core.documents import Document
from langchain_qdrant import Qdrant, QdrantVectorStore
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.retrievers import ContextualCompressionRetriever
from langchain_openai import OpenAI, ChatOpenAI

# Questions for this experiment; only QUERY3 is used by the function below.
QUERY1 = "what does rohit eat?"
QUERY2 = "what are rohit's hobbies?"
QUERY3 = "what does rohit drink?"


def create_vector_store_retriever(query=QUERY3, k=2):
  """Run a similarity search against the existing Qdrant test collection and print the hits.

  Despite its name, this function does not build or return a retriever yet: it connects
  to the ``test_coll_name`` collection, fetches the ``k`` closest documents for ``query``
  and prints them with ``pretty_print_docs``.

  Args:
    query: Question to search for. Defaults to the value ``QUERY3`` has when this
      function is defined.
    k: Number of documents to fetch. Defaults to 2.

  Returns:
    None. The matching documents are only printed.

  Raises:
    KeyError: If ``QDRANT_URL`` or ``QDRANT_API_KEY`` is missing from the environment.
  """
  embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

  # Connect to the already-populated collection; nothing is uploaded here.
  vector_store = QdrantVectorStore.from_existing_collection(
    embedding=embedding_model,
    url=os.environ["QDRANT_URL"],
    api_key=os.environ["QDRANT_API_KEY"],
    collection_name="test_coll_name",
  )

  # Direct similarity search; the result is a list of documents, not a retriever.
  matched_docs = vector_store.similarity_search(query=query, k=k)
  pretty_print_docs(matched_docs)

  # TODO (planned, not enabled yet): turn the store into an MMR retriever
  # (search_type="mmr", search_kwargs={"k": 2}), wrap it in a
  # ContextualCompressionRetriever whose compressor is
  # LLMChainExtractor.from_llm(llm=ChatOpenAI(model="gpt-3.5-turbo")) — the extractor
  # needs the chat model, not the embedding model — then invoke it and return the docs.

# Marker line so the start of the call is visible in the cell output, then run the search
# with the function's default arguments.
print('calling...')
create_vector_store_retriever()

calling...
Rohit likes to drink beer
Rohit likes to drink coffee


# Next steps -
host n8n on aws ec2
- run cron workflows to test availability
- add to website
- test latency after observability/ LangSmith

- setup chainlit for chat UI
- modularise cells into node blocks and rebuild the whole thing with LangGraph
- setup fastapi/ cl handlers to interact
- observability
  - traces
  - latency
  - bottlenecks

# Optimisations - Future
- iterate and improve semantic chunking and retrieval
- multi modal? nope.
- audio input - via chainlit cl.on_audio_chunk

Questions
- choosing dimensions for embedding
- chunking size
- try sim search/ default, compress context search, MMR, compare results
- compare perf: which text splitting types give better perf?