<a href="https://colab.research.google.com/github/rodiwaa/learnings-pocs/blob/main/notebooks/resume_rag_system_basic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install langchain langchain.openai openai langchain-community langsmith chromadb python-dotenv sentence-transformers

## API keys from .env

In [None]:
import os
from google.colab import drive
from dotenv import load_dotenv

# Mount Google Drive so files kept there are reachable from the Colab runtime.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

# Load the API keys from the .env file stored on Drive into the process environment.
# Left as the last expression so the cell displays load_dotenv's return value.
ENV_FILE = "/content/drive/MyDrive/Projects/.env/.env"
load_dotenv(dotenv_path=ENV_FILE)

## Read pdf from drive
On hold: working with hand-written documents for now; PDF import will be added later.

## Create docs for "about me"

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_community.vectorstores import Chroma
# from langchain_openai import OpenAIEmbeddings
# from sentence_transformers import SentenceTransformer # does not work well w langchain/chroma; use SentenceTransformerEmbeddings instead
from langchain_community.embeddings import SentenceTransformerEmbeddings

# Small hand-written "about me" corpus; each fact is its own Document so it can
# be retrieved independently.
docs = [
    Document(page_content="Jack likes to build workflows and AI systems"),
    Document(page_content="Jack has worked on following technology stacks - Langchain, Langgraph, Langsmith"),
    Document(page_content="Jack is friends with Tom and Sally."),
    Document(page_content="Jack loves to trek on weekends."),
    Document(page_content="Jack loves to watch movies and listen to music.")
]

embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Stable ids: the collection is persisted to disk, so without fixed ids every
# re-run of this cell would store the same five documents again under new ids.
# NOTE(review): relies on the Chroma wrapper upserting on matching ids - confirm
# against the installed langchain-community version.
doc_ids = [f"about-me-{i}" for i in range(len(docs))]

vectorstore = Chroma.from_documents(
    embedding = embedding_model,
    documents = docs,
    ids = doc_ids,
    collection_name = "random_db_2",
    persist_directory = "random_db_2"
)

# check if docs are added
added_docs = vectorstore.get()

# get() returns a dict of parallel lists (ids, documents, metadatas, ...), so
# len(added_docs) would count the dict's keys; count the ids instead.
print(f"added {len(added_docs['ids'])} docs")
print(added_docs)

### Retrieve docs from the vector store using query similarity searches

In [None]:
# To start from scratch, uncomment the next line to delete the Chroma collection
# (the vector store then has to be rebuilt by re-running the cell that creates it).
# vectorstore.delete_collection()

# Fetch what is currently stored in the vector store; the bare name on the last
# line makes the notebook display it as the cell output.
added = vectorstore.get()
added

In [None]:
# Scratch cell kept for reference only; the same steps are broken out into the
# individual cells below.

# vector store sim search
QUERY1 = "who are Jack's friends?"
QUERY2 = "what are jack's hobbies?"
QUERY3 = "what does jack work on?"

# basic sim search, called directly on the vector store (k is honoured here)
basic_search = vectorstore.similarity_search(
    query = QUERY2,
    k = 2
)
for doc in basic_search:
  print(doc.page_content)

# output
# Jack loves to watch movies and listen to music.
# Jack loves to trek on weekends.


# MMR retriever. The result count has to go through search_kwargs: a bare k=2
# keyword passed to as_retriever is ignored and the default number of docs
# comes back instead.
base_retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs = { "k": 2 }
)
# print the hits (previously the invoke() result was computed and thrown away)
for doc in base_retriever.invoke(QUERY1):
  print(doc.page_content)
# output recorded earlier, when k was passed as a bare keyword and ignored:
# [Document(metadata={}, page_content='Jack is friends with Tom and Sally.'),
#  Document(metadata={}, page_content='Jack loves to watch movies and listen to music.'),
#  Document(metadata={}, page_content='Jack loves to trek on weekends.'),
#  Document(metadata={}, page_content='Jack likes to build workflows and AI systems')]

# plain similarity retriever, same k handling as above
similarityR = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs = { "k": 2 }
)
print("x"*12)
res = similarityR.invoke(QUERY1)
for doc in res:
  print(doc.page_content)

# TODO: try sim search/ default, compress context search, MMR, compare results

# TODO: compare perf, text splitting types for better perf?

# Search Strategies

In [None]:
# Sample questions reused by the search cells below.
QUERY1, QUERY2, QUERY3 = (
    "who are Jack's friends?",
    "what are jack's hobbies?",
    "what does jack work on?",
)

## Basic similarity search

In [None]:
# Top-2 similarity search, called directly on the vector store.
basic_search = vectorstore.similarity_search(QUERY2, k=2)
for hit in basic_search:
    print(hit.page_content)

# Notes
# vectorstore.similarity_search adheres to k when it is passed directly;
# passing k the same way when building/invoking a retriever runnable does not work.

# Output
# Jack loves to watch movies and listen to music.
# Jack loves to trek on weekends.


## MMR

In [None]:
# MMR retriever capped at two results.
base_retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 2})
res = base_retriever.invoke(QUERY2)
for match in res:
    print(match.page_content)

# Notes
# A bare k argument is ignored by as_retriever; the limit has to be supplied
# through search_kwargs instead.

# Output
# Jack loves to watch movies and listen to music.
# Jack likes to build workflows and AI systems


## Contextual compression retriever

In [None]:
# needs llm, embedding, compression mod, base retr, LLMChainExtractor
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.retrievers import ContextualCompressionRetriever
from langchain_openai import OpenAI, ChatOpenAI
from langchain_core.output_parsers import StrOutputParser

# Chat model used by the compressor to extract the relevant part of each doc.
# temperature=0 keeps the extraction as repeatable as possible, so the outputs
# recorded below can be reproduced on a re-run.
model = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0)

# Base retriever: MMR limited to two docs (k must go through search_kwargs).
base_retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs = { "k": 2 }
)

# Compressor that asks the LLM to keep only the query-relevant text of each doc.
base_compressor = LLMChainExtractor.from_llm(
    llm = model,
)

# Retrieve with the base retriever, then pass the hits through the compressor.
compressor_retriever = ContextualCompressionRetriever(
    base_retriever = base_retriever,
    base_compressor = base_compressor
)
# Swap in QUERY1 / QUERY2 here to reproduce the other recorded outputs.
result_docs = compressor_retriever.invoke(QUERY3)

# OUTPUTS - IMPRESSIVE!!

# QUERY1
# Jack is friends with Tom and Sally.

# QUERY2
# Jack loves to watch movies and listen to music.
# Jack likes to build workflows and AI systems

# QUERY3
# Jack has worked on following technology stacks - Langchain, Langgraph, Langsmith

for doc in result_docs:
  print(doc.page_content)

Questions
- choosing dimensions for embedding
- chunking size
- try similarity search (default), contextual compression search, and MMR; compare results
- compare performance; do different text-splitting types give better performance?