In [27]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
# with open('/content/drive/MyDrive/Colab Notebooks/langchain_rag/<>', 'r', encoding='utf-8') as f:
#   text = f.read()

Mounted at /content/drive


In [3]:
# !pip install langchain_community
# !pip install langchain_openai
# !pip install chromadb

In [4]:
import os

from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

In [5]:
current_dir = os.getcwd()
file_path = os.path.join(current_dir, "drive/MyDrive/Colab Notebooks/langchain_rag", "odyssey.txt")
persistent_directory = os.path.join(current_dir, "drive/MyDrive/Colab Notebooks/db", "chroma_db")

In [17]:
from google.colab import userdata
import warnings

# Filter out all warnings
warnings.simplefilter('ignore', category=Warning)

os.environ["OPENAI_API_KEY"] = userdata.get("openai_api_key")
loader = TextLoader(file_path)
documents = loader.load()

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

print(f"Number of chunks: {len(docs)}")
print(f"Sample chunk:\n{docs[0].page_content}\n")

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small"
)
if not os.path.exists(persistent_directory):
  db = Chroma.from_documents(docs, embeddings, persist_directory=persistent_directory)
else:
  db = Chroma(persist_directory=persistent_directory,
            embedding_function=embeddings)



Number of chunks: 826
Sample chunk:
﻿The Project Gutenberg eBook of The Odyssey
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: The Odyssey

Author: Homer

Translator: Samuel Butler

Release date: April 1, 1999 [eBook #1727]
                Most recently updated: December 2, 2023

Language: English

Credits: Jim Tinsley and David Widger


*** START OF THE PROJECT GUTENBERG EBOOK THE ODYSSEY ***


[Illustration]


The Odyssey

by Homer

rendered into English prose for the use of those who cannot read the
original

Contents



In [23]:
query = "Who is Odysseus' wife?"

retriever = db.as_retriever(
    search_type="similarity", # Changed from similarity_score_threshold
    search_kwargs={"k":3}, # Changed k to 5 and removed score_threshold
)
relevant_docs = retriever.invoke(query)

for i,doc in enumerate(relevant_docs, 1):
  print(f"Document {i}:\n{doc.page_content}\n")
  if doc.metadata:
    print(f"Source: {doc.metadata.get('source', 'Unknown')}\n")

Document 1:
“Happy Ulysses, son of Laertes,” replied the ghost of Agamemnon, “you
are indeed blessed in the possession of a wife endowed with such rare
excellence of understanding, and so faithful to her wedded lord as
Penelope the daughter of Icarius. The fame, therefore, of her virtue
shall never die, and the immortals shall compose a song that shall be
welcome to all mankind in honour of the constancy of Penelope. How far
otherwise was the wickedness of the daughter of Tyndareus who killed
her lawful husband; her song shall be hateful among men, for she has
brought disgrace on all womankind even on the good ones.”

Source: odyssey.txt

Document 2:
Then Ulysses answered, “Madam, wife of Ulysses, you need not defer your
tournament, for Ulysses will return ere ever they can string the bow,
handle it how they will, and send their arrows through the iron.”

To this Penelope said, “As long, sir, as you will sit here and talk to
me, I can have no desire to go to bed. Still, people cannot d

In [24]:
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA

# Initialize the LLM
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Create a RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=retriever,
    return_source_documents=True
)

# Get the answer from the RAG chain
response = qa_chain.invoke({"query": query})

print("Answer:", response["result"])
print("\nSource Documents:")
for i, doc in enumerate(response["source_documents"], 1):
    print(f"Document {i}:\n{doc.page_content}\n")
    if doc.metadata:
        print(f"Source: {doc.metadata.get('source', 'Unknown')}\n")

Answer: Odysseus' wife is Penelope.

Source Documents:
Document 1:
“Happy Ulysses, son of Laertes,” replied the ghost of Agamemnon, “you
are indeed blessed in the possession of a wife endowed with such rare
excellence of understanding, and so faithful to her wedded lord as
Penelope the daughter of Icarius. The fame, therefore, of her virtue
shall never die, and the immortals shall compose a song that shall be
welcome to all mankind in honour of the constancy of Penelope. How far
otherwise was the wickedness of the daughter of Tyndareus who killed
her lawful husband; her song shall be hateful among men, for she has
brought disgrace on all womankind even on the good ones.”

Source: odyssey.txt

Document 2:
Then Ulysses answered, “Madam, wife of Ulysses, you need not defer your
tournament, for Ulysses will return ere ever they can string the bow,
handle it how they will, and send their arrows through the iron.”

To this Penelope said, “As long, sir, as you will sit here and talk to
me, I 

In [28]:
books_dir = os.path.join(current_dir, "drive/MyDrive/Colab Notebooks/langchain_rag")
persistent_directory = os.path.join(current_dir, "drive/MyDrive/Colab Notebooks/db_2", "chroma_db")

if not os.path.exists(persistent_directory):
  if not os.path.exists(books_dir):
    raise FileNotFoundError(f"{books_dir} does not exist")
  files = [f for f in os.listdir(books_dir) if f.endswith(".txt")]
  documents = []
  for file in files:
    file_path = os.path.join(books_dir, file)
    loader = TextLoader(file_path)
    book_docs = loader.load()
    for doc in book_docs:
      doc.metadata = {"source": file}
      documents.append(doc)

  text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
  docs = text_splitter.split_documents(documents)

  print((f"Number of document chunks: {len(docs)}"))

  embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
  db = Chroma.from_documents(
      docs, embeddings, persist_directory=persistent_directory
  )
else:
  print("Vector store already exists.")



Number of document chunks: 13347


InternalError: Database error: error returned from database: (code: 266) disk I/O error

In [20]:
query = "How did Juliet die?"

retriever = db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k":3, "score_threshold": 0.7} # Added score_threshold
  )
relevant_docs = retriever.invoke(query)
for i, doc in enumerate(relevant_docs):
  print(f"Document {i}: \n{doc.page_content}\n")
  print(f"Source: {doc.metadata['source']}\n")



In [30]:
#text_splitting_deep_dive
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    SentenceTransformersTokenTextSplitter,
    TextSplitter,
    TokenTextSplitter,
)

file_path = os.path.join(current_dir, "drive/MyDrive/Colab Notebooks/langchain_rag", "romeo_and_juliet.txt")
db_dir = os.path.join(current_dir, "drive/MyDrive/Colab Notebooks/db")

def create_vector_store(docs, store_name):
  persistent_directory = os.path.join(db_dir, store_name)
  if not os.path.exists(persistent_directory):
    db = Chroma.from_documents(
        docs, embeddings, persist_directory=persistent_directory
    )
  else:
    print(f"Vector store already exists.")

char_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
char_docs = char_splitter.split_documents(documents)
create_vector_store(char_docs, "chroma_db_char")

sent_splitter = SentenceTransformersTokenTextSplitter(chunk_size=1000, chunk_overlap=100)
sent_docs = sent_splitter.split_documents(documents)
create_vector_store(sent_docs, "chroma_db_sent")

token_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=100)
token_docs = token_splitter.split_documents(documents)
create_vector_store(token_docs, "chroma_db_token")

recur_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
recur_docs = recur_splitter.split_documents(documents)
create_vector_store(recur_docs, "chroma_db_recur")

class CustomTextSplitter(TextSplitter):
  def split_text(self, text):
    return text.split("\n\n")

custom_splitter = CustomTextSplitter()
custom_docs = custom_splitter.split_text(documents)
create_vector_store(custom_docs, "chroma_db_custom")



modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

BadRequestError: Error code: 400 - {'error': {'message': 'Requested 386727 tokens, max 300000 tokens per request', 'type': 'max_tokens_per_request', 'param': None, 'code': 'max_tokens_per_request'}}

In [33]:
def query_vector_store(store_name, query):
  persist_dir = os.path.join(db_dir, store_name)
  if os.path.exists(persist_dir):
    db = Chroma(
        persist_directory=persist_dir, embedding_function=embeddings
    )
    retriever = db.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"k":3, "score_threshold":0.1},
    )

    relevant_docs = retriever.invoke(query)

    for i, doc in enumerate(relevant_docs):
      print(f"Document {i}:\n{doc.page_content}\n")
      if doc.metadata:
        print(f"Source: {doc.metadata.get('source', 'Unkown')}\n")

query = "How did Juliet die?"

query_vector_store("chroma_db_char", query)
query_vector_store("chroma_db_sent", query)
query_vector_store("chroma_db_token", query)
query_vector_store("chroma_db_recur", query)
query_vector_store("chroma_db_custom", query)



Document 0:
JULIET.
Shall I speak ill of him that is my husband?
Ah, poor my lord, what tongue shall smooth thy name,
When I thy three-hours’ wife have mangled it?
But wherefore, villain, didst thou kill my cousin?
That villain cousin would have kill’d my husband.
Back, foolish tears, back to your native spring,
Your tributary drops belong to woe,
Which you mistaking offer up to joy.
My husband lives, that Tybalt would have slain,
And Tybalt’s dead, that would have slain my husband.
All this is comfort; wherefore weep I then?
Some word there was, worser than Tybalt’s death,
That murder’d me. I would forget it fain,
But O, it presses to my memory
Like damned guilty deeds to sinners’ minds.
Tybalt is dead, and Romeo banished.
That ‘banished,’ that one word ‘banished,’
Hath slain ten thousand Tybalts. Tybalt’s death
Was woe enough, if it had ended there.
Or if sour woe delights in fellowship,
And needly will be rank’d with other griefs,
Why follow’d not, when she said Tybalt’s dead,
Thy f

In [34]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings

file_path = os.path.join(current_dir, "drive/MyDrive/Colab Notebooks/langchain_rag", "odyssey.txt")
db_dir = os.path.join(current_dir, "drive/MyDrive/Colab Notebooks/db")

loader = TextLoader(file_path)
documents = loader.load()

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)



In [38]:
def create_vector_store(docs, embeddings, store_name):
  persistent_directory = os.path.join(db_dir, store_name)
  if not os.path.exists(persistent_directory):
    db = Chroma.from_documents(
        docs, embeddings, persist_directory=persistent_directory
    )
  else:
    print(f"Vector store already exists.")

def query_vector_store(store_name, query, embeddings):
  persist_dir = os.path.join(db_dir, store_name)
  if os.path.exists(persist_dir):
    db = Chroma(
        persist_directory=persist_dir, embedding_function=embeddings
    )
    retriever = db.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"k":3, "score_threshold":0.1},
    )

    relevant_docs = retriever.invoke(query)

    for i, doc in enumerate(relevant_docs):
      print(f"Document {i}:\n{doc.page_content}\n")
      if doc.metadata:
        print(f"Source: {doc.metadata.get('source', 'Unkown')}\n")

openai_embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
create_vector_store(docs, openai_embeddings, "chroma_db_openai")

query = "Who is Odysseus' wife?"

query_vector_store("chroma_db_openai", query, openai_embeddings)

Vector store already exists.
Document 0:
Now all the rest, as many as fled from sheer destruction, were at
    home, and had escaped both war and sea, but Odysseus only, craving
    for his wife and for his homeward path, the lady nymph Calypso
    held, that fair goddess, in her hollow caves, longing to have him
    for her lord. But when now the year had come in the courses of the
    seasons, wherein the gods had ordained that he should return home
    to Ithaca, not even there was he quit of labours, not even among
    his own; but all the gods had pity on him save Poseidon, who raged
    continually against godlike Odysseus, till he came to his own
    country. Howbeit Poseidon had now departed for the distant
    Ethiopians, the Ethiopians that are sundered in twain, the
    uttermost of men, abiding some where Hyperion sinks and some where
    he rises. There he looked to receive his hecatomb of bulls and
    rams, there he made merry sitting at the feast, but the other gods
   

In [41]:
def query_vector_store(store_name, query, embeddings, search_type, search_kwargs):
  persist_dir = os.path.join(db_dir, store_name)
  if os.path.exists(persist_dir):
    db = Chroma(
        persist_directory=persist_dir, embedding_function=embeddings
    )
    retriever = db.as_retriever(
        search_type=search_type,
        search_kwargs=search_kwargs,
    )

    relevant_docs = retriever.invoke(query)

    for i, doc in enumerate(relevant_docs):
      print(f"Document {i}:\n{doc.page_content}\n")
      if doc.metadata:
        print(f"Source: {doc.metadata.get('source', 'Unkown')}\n")

query_vector_store(
    "chroma_db_openai", query, openai_embeddings,
    "mmr",
     {"k":3, "fetch_k":20,"lambda_mult":0.5},
    )



Document 0:
Now all the rest, as many as fled from sheer destruction, were at
    home, and had escaped both war and sea, but Odysseus only, craving
    for his wife and for his homeward path, the lady nymph Calypso
    held, that fair goddess, in her hollow caves, longing to have him
    for her lord. But when now the year had come in the courses of the
    seasons, wherein the gods had ordained that he should return home
    to Ithaca, not even there was he quit of labours, not even among
    his own; but all the gods had pity on him save Poseidon, who raged
    continually against godlike Odysseus, till he came to his own
    country. Howbeit Poseidon had now departed for the distant
    Ethiopians, the Ethiopians that are sundered in twain, the
    uttermost of men, abiding some where Hyperion sinks and some where
    he rises. There he looked to receive his hecatomb of bulls and
    rams, there he made merry sitting at the feast, but the other gods
    were gathered in the halls o

In [43]:
from langchain_core.messages import HumanMessage, SystemMessage
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

persistent_directory = os.path.join(current_dir, "drive/MyDrive/Colab Notebooks/db", "chroma_db")

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

db = Chroma(persist_directory=persistent_directory, embedding_function=embeddings)

query = "How can I learn about LangChain?"

retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={"k":3},
)
#relevant_docs=retriever.invoke(query)

llm = ChatOpenAI(model="gpt-4o")

contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, just "
    "reformulate it if needed and otherwise return it as is."
)

contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)

qa_system_prompt = (
    "You are an assistant for question-answering tasks. Use "
    "the following pieces of retrieved context to answer the "
    "question. If you don't know the answer, just say that you "
    "don't know. Use three sentences maximum and keep the answer "
    "concise."
    "\n\n"
    "{context}"
)

qa_prompt = ChatPromptTemplate(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

def continual_chat():
  chat_history=[]
  while True:
    query = input("You: ")
    if query.lower() == "exit":
      break
    result = rag_chain.invoke({"input":query, "chat_history": chat_history})
    print(f"AI: {result['answer']}")

    chat_history.append(HumanMessage(content=query))
    chat_history.append(SystemMessage(content=result["answer"]))

continual_chat()

You: did odysseus reach home finally?
AI: Yes, Odysseus eventually reached his home in Ithaca, after overcoming numerous challenges and receiving help from the Phaeacians, who honored him and provided him the means to finally return.
You: when did he die?
AI: The details of Odysseus's death are not specified in the main Homeric texts. Later traditions and authors suggest various fates for Odysseus, including being killed by Telegonus, his son with the enchantress Circe, but these are not part of the original Odyssey.
You: is his character historically accurate? or is he a myhthological character?
AI: Odysseus is a mythological character and not historically accurate. He is a legendary hero from ancient Greek mythology, prominently featured in Homer's epics, the "Iliad" and the "Odyssey." While he may have been based on a historical figure or inspired by real events, his stories and adventures are mythological in nature.
You: exit


In [49]:
from langchain_community.document_loaders import WebBaseLoader
os.environ['USER_AGENT'] = 'myagent'
persistent_directory = os.path.join(db_dir, "chroma_db_apple")

urls = ["https://www.apple.com"]
loader = WebBaseLoader(urls)
documents = loader.load()

docs = text_splitter.split_documents(documents)

print(f"Number of chunks: {len(docs)}")



Number of chunks: 7


In [52]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
if not os.path.exists(persistent_directory):
  db = Chroma.from_documents(docs, embeddings, persist_directory=persistent_directory)
else:
  db = Chroma(persist_directory=persistent_directory, embedding_function=embeddings)

retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={"k":3},
)

query = "What new products are announced on Apple.com?"

relevant_docs = retriever.invoke(query)

for i, doc in enumerate(relevant_docs):
  print(f"Document {i}:\n{doc.page_content}\n")
  if doc.metadata:
    print(f"Source: {doc.metadata.get('source', 'Unknown')}\n")