<a href="https://colab.research.google.com/github/rodiwaa/learnings-pocs/blob/main/notebooks/resume_rag_system_basic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install langchain langchain.openai openai langchain-community langsmith chromadb python-dotenv sentence-transformers pypdf langchain_community langchain_experimental

## API keys from .env

In [None]:
import os
from google.colab import drive
from dotenv import load_dotenv

# Mount Google Drive so the .env file stored there is reachable from Colab.
drive.mount("/content/drive")

# Location of the .env file that holds the API keys.
ENV_FILE = "/content/drive/MyDrive/Projects/.env/.env"

# load_dotenv returns False when nothing was loaded (e.g. the file is missing).
# Report that here instead of letting a later OpenAI call fail with a less
# obvious authentication error.
if not load_dotenv(dotenv_path=ENV_FILE):
    print(f"WARNING: no environment variables were loaded from {ENV_FILE}")

# The OpenAI-backed cells further down read their key from the environment.
if not os.environ.get("OPENAI_API_KEY"):
    print("WARNING: OPENAI_API_KEY is not set; the OpenAI-backed cells will fail")

## Read pdf from drive
On hold: the notebook works with hand-written documents first; the PDF is imported later.

## Create docs for "about me"

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_community.vectorstores import Chroma
# from langchain_openai import OpenAIEmbeddings
# from sentence_transformers import SentenceTransformer # does not work well w langchain/chroma; use SentenceTransformerEmbeddings instead
from langchain_community.embeddings import SentenceTransformerEmbeddings

# Hand-written "about me" facts used as a tiny corpus for the retrieval experiments.
docs = [
    Document(page_content="Jack likes to build workflows and AI systems"),
    Document(page_content="Jack has worked on following technology stacks - Langchain, Langgraph, Langsmith"),
    Document(page_content="Jack is friends with Tom and Sally."),
    Document(page_content="Jack loves to trek on weekends."),
    Document(page_content="Jack loves to watch movies and listen to music."),
]

# Embedding model loaded through the sentence-transformers wrapper.
embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Stable ids keep this cell idempotent: the collection is persisted to disk, so
# without explicit ids every re-run would append another copy of each document
# under a fresh random id.
doc_ids = [f"about-me-{index}" for index in range(len(docs))]

vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embedding_model,
    ids=doc_ids,
    collection_name="random_db_2",
    persist_directory="random_db_2",
)

# check if docs are added
added_docs = vectorstore.get()

# get() returns a dict of parallel lists (ids, documents, metadatas, ...), so
# len(added_docs) would count the dict keys; count the stored ids instead.
print(f"added {len(added_docs['ids'])} docs")
print(added_docs)

### Retrieve docs from the vector store (VS) via query similarity searches

In [None]:
# Uncomment the next line to delete the Chroma collection and start from scratch.
# NOTE(review): presumably the cell that builds `vectorstore` has to be re-run
# afterwards before any search works again - confirm.
# vectorstore.delete_collection()

# Fetch everything currently stored in the collection. The bare `added` on the
# last line makes the notebook render it as the cell output.
added = vectorstore.get()
added

# Search Strategies
- similarity
- MMR
- context compression
- semantic chunker

In [None]:
# Sample questions about Jack that the search cells below run against the vector store.
QUERY1, QUERY2, QUERY3 = (
    "who are Jack's friends?",
    "what are jack's hobbies?",
    "what does jack work on?",
)

## Basic similarity search

In [None]:
# Plain top-k similarity search, called directly on the vector store.
basic_search = vectorstore.similarity_search(QUERY2, k=2)

for hit in basic_search:
    print(hit.page_content)

# Observation: vectorstore.similarity_search adheres to k, whereas passing k
# when invoking the retriever runnable does not work.
#
# Output:
#   Jack loves to watch movies and listen to music.
#   Jack loves to trek on weekends.


## MMR

In [None]:
# Retrieval with search_type="mmr" through the retriever interface, asking for two documents.
base_retriever = vectorstore.as_retriever(search_kwargs={"k": 2}, search_type="mmr")

res = base_retriever.invoke(QUERY2)
for hit in res:
    print(hit.page_content)

# Note: a bare "k" argument is ignored by the retriever (why?); k has to be
# supplied through search_kwargs instead.
#
# Output:
#   Jack loves to watch movies and listen to music.
#   Jack likes to build workflows and AI systems


## Contextual compression retriever

In [None]:
# needs llm, embedding, compression mod, base retr, LLMChainExtractor
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.retrievers import ContextualCompressionRetriever
from langchain_openai import OpenAI, ChatOpenAI
from langchain_core.output_parsers import StrOutputParser

# Chat model handed to the extractor below.
model = ChatOpenAI(model="gpt-3.5-turbo")

# MMR retriever that supplies the candidate documents (two per query).
base_retriever = vectorstore.as_retriever(search_kwargs={"k": 2}, search_type="mmr")

# LLMChainExtractor built from the chat model; used as the compressor of the
# ContextualCompressionRetriever.
base_compressor = LLMChainExtractor.from_llm(llm=model)

compressor_retriever = ContextualCompressionRetriever(
    base_compressor=base_compressor,
    base_retriever=base_retriever,
)

# Run the compressed retrieval for QUERY3 and print what comes back.
result_docs = compressor_retriever.invoke(QUERY3)
for hit in result_docs:
    print(hit.page_content)

# OUTPUTS - IMPRESSIVE!!
#
# QUERY1
#   Jack is friends with Tom and Sally.
#
# QUERY2
#   Jack loves to watch movies and listen to music.
#   Jack likes to build workflows and AI systems
#
# QUERY3
#   Jack has worked on following technology stacks - Langchain, Langgraph, Langsmith

## Load pdf from drive

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from google.colab import drive

# Remount Drive; force_remount=True is passed so a mount left over from an
# earlier cell does not block this one.
drive.mount("/content/drive", force_remount=True)
file_path = "/content/drive/MyDrive/Projects/docs/rodi.pdf"

about_rodi_loader = PyPDFLoader(file_path)

# Load the PDF into a list of Document objects.
about_pdf_docs = about_rodi_loader.load()
if not about_pdf_docs:
    # Fail with a clear message instead of an IndexError on the lines below.
    raise ValueError(f"no pages could be loaded from {file_path}")

# Report on the PDF that was just loaded. Use `about_pdf_docs` here, not `docs`:
# `docs` is the hand-written Jack list from the earlier cell and has nothing to
# do with the PDF.
print(f"docs loaded = {len(about_pdf_docs)}")
print(about_pdf_docs[0].page_content)

# Text of the first loaded document; the text-splitter cell consumes `doc`.
doc = about_pdf_docs[0].page_content

## Simple recursive text splitter

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Character-based splitter configured with chunk_size=100 and chunk_overlap=10.
splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10)

# `doc` is the text prepared by the PDF-loading cell.
chunks = splitter.split_text(doc)
print(chunks)

# The loop variable is deliberately not named `doc`: reusing that name would
# overwrite the source text, and re-running this cell would then split the
# last chunk instead of the original text.
for chunk in chunks:
    print(chunk)

# Last expression: rendered by the notebook as the number of chunks produced.
len(chunks)



## Semantic text splitter (experiment)
Creates semantically aware chunks.

In [None]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

# NOTE: despite its name, `llm` is an embeddings client, not a chat/completion model.
llm = OpenAIEmbeddings(model='text-embedding-3-small')

# Reuse the documents already loaded by the PDF cell (`about_pdf_docs`) instead
# of reading the file a second time; the name `about_rodi_docs` is kept as an alias.
about_rodi_docs = about_pdf_docs
print(f"pdf documents available = {len(about_rodi_docs)}")

# Semantic chunker driven by the embeddings client, using a percentile
# breakpoint threshold of 90.
splitter = SemanticChunker(
    embeddings=llm,
    breakpoint_threshold_type="percentile",
    breakpoint_threshold_amount=90,
)

# Chunk every loaded document, not only the first one, so that a multi-page
# PDF is indexed in full.
docs1 = splitter.create_documents([page.page_content for page in about_rodi_docs])

# Build a separate collection for the PDF chunks. from_documents is a
# classmethod that creates a new store, so it is called on Chroma itself rather
# than on the unrelated `vectorstore` instance holding the Jack documents.
about_rodi_vs = Chroma.from_documents(docs1, embedding=llm, collection_name="about_rodi1")

# Show a short preview instead of dumping every chunk into the cell output.
print(f"semantic chunks = {len(docs1)}")
for chunk in docs1[:3]:
    print(chunk.page_content[:200])


In [None]:
# test queries
QUERY1 = "who is rohit?"
QUERY2 = "what are rohit's hobbies?"
QUERY3 = "what does rohit work on?"
QUERY4 = "what projects has rohit worked on?"

# Query the PDF-backed store (`about_rodi_vs`). `vectorstore` only holds the
# hand-written Jack documents, so retrieving from it answers every question
# with Jack sentences (see the recorded outputs at the bottom of this cell).
# To compare strategies, switch search_type to "mmr".
retriever = about_rodi_vs.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 2},
)

# Run every query once and keep the hits keyed by a short label.
labelled_queries = {"q1": QUERY1, "q2": QUERY2, "q3": QUERY3, "q4": QUERY4}
results = {label: retriever.invoke(query) for label, query in labelled_queries.items()}

# Per-query names kept for any code that still refers to them.
result_docs_q1, result_docs_q2, result_docs_q3, result_docs_q4 = results.values()

for label, query in labelled_queries.items():
    print(f"{label}: {query}")
    for hit in results[label]:
        print(hit.page_content)
    print()

# Outputs recorded while this cell still queried `vectorstore` (the Jack
# documents) - kept for reference; re-run the cell to record PDF-based results.
#
# q1 {'who is rohit?'}
#   Jack likes to build workflows and AI systems
#   Jack has worked on following technology stacks - Langchain, Langgraph, Langsmith
#
# q2 {"what are rohit's hobbies?"}
#   Jack loves to watch movies and listen to music.
#   Jack likes to build workflows and AI systems
#
# q3 {'what does rohit work on?'}
#   Jack has worked on following technology stacks - Langchain, Langgraph, Langsmith
#   Jack likes to build workflows and AI systems
#
# q4 {'what projects has rohit worked on?'} (originally mislabelled as q3)
#   Jack likes to build workflows and AI systems
#   Jack has worked on following technology stacks - Langchain, Langgraph, Langsmith

Questions
- choosing dimensions for embedding
- chunking size
- try similarity search (default), contextual compression search, and MMR; compare results
- compare performance across text-splitting types: which gives better results?