# Setting Environment

In [1]:
import os

# Keep real credentials out of the notebook: export OPENAI_API_KEY in the shell
# before starting Jupyter. setdefault() only writes the empty placeholder when
# the variable is missing, so a key that is already present in the environment
# is no longer overwritten with "".
os.environ.setdefault("OPENAI_API_KEY", "")

The os.environ line sets the OPENAI_API_KEY environment variable, which the OpenAI-backed classes used below read in order to authenticate their API requests.

# Installing Chromadb

In [2]:
# One-time setup: uncomment the next line to install the chromadb package.
#!pip install chromadb

# Importing Necessary Libraries

In [3]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

OpenAIEmbeddings : This is used to generate embeddings from text using OpenAI's models.

CharacterTextSplitter : This class splits a long document into smaller chunks of text.

Chroma : This is a vector database that allows for storing and retrieving text embeddings for similarity search.

# Loading and Splitting the Text File

In [4]:
# Read the whole book into memory. The context manager closes the file handle
# as soon as the read finishes (a bare open(...).read() leaves that to the
# garbage collector), and the explicit encoding stops the decoded text from
# depending on the platform's default code page.
with open(r"D:\Generative AI\Alice's Adventures in Wonderland.txt", encoding="utf-8") as book_file:
    full_text = book_file.read()

# Split the book into chunks (chunk_size=2000, chunk_overlap=100) so each piece
# can be embedded and retrieved on its own.
text_splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
texts = text_splitter.split_text(full_text)

open("").read() : It opens the file and reads its entire content into a single string.

CharacterTextSplitter(chunk_size=2000, chunk_overlap=100) : The CharacterTextSplitter splits the full text into chunks of up to 2000 characters each, with up to 100 characters of overlap between consecutive chunks.

text_splitter.split_text(full_text) : It splits the loaded full_text into multiple smaller text chunks based on the specified chunk size and overlap.

# Embedding the Text and Storing in Chroma

In [5]:
# Embedding model used to turn each text chunk into a vector.
embeddings = OpenAIEmbeddings()

# Build the Chroma store from the chunks in `texts`, embedding each one.
db = Chroma.from_texts(texts=texts, embedding=embeddings)

# Retriever view over the store; passed to the multi-query retriever later on.
retriever = db.as_retriever()

  embeddings = OpenAIEmbeddings()


OpenAIEmbeddings() : It is used to convert text into numerical vectors using OpenAI's API.

Chroma.from_texts(texts, embeddings) : It creates a Chroma database where each chunk of text is converted into embeddings and stored for later retrieval.

retriever = db.as_retriever() : It converts the Chroma database into a retriever object, which will allow you to search through the stored embeddings based on similarity to a query.

# Initializing the Chat Model and MultiQuery Retriever and Outputting the Number of Unique Documents

In [6]:
from langchain.chat_models import ChatOpenAI
from langchain.retrievers.multi_query import MultiQueryRetriever

In [7]:
# Chat model with temperature=0; MultiQueryRetriever uses it to generate
# several rewordings of the query before searching the base retriever.
llm = ChatOpenAI(temperature=0)
retriever_from_llm = MultiQueryRetriever.from_llm(llm=llm, retriever=retriever)

# Query answered in the next cell.
question = "What is the summary of Alice's Adventures in Wonderland?"

  llm = ChatOpenAI(temperature=0)


In [8]:
# Retrievers implement the Runnable interface: invoke() replaces the deprecated
# get_relevant_documents() call and returns the same list of Document objects.
unique_docs = retriever_from_llm.invoke(question)
print("Number of unique documents:", len(unique_docs))

  unique_docs = retriever_from_llm.get_relevant_documents(query=question)


Number of unique documents: 5


In [9]:
# Bare expression: the notebook displays the repr of the retrieved Document list.
unique_docs

[Document(metadata={}, page_content="Alice's Adventures in Wonderland\n\n                ALICE'S ADVENTURES IN WONDERLAND\n\n                          Lewis Carroll\n\n               THE MILLENNIUM FULCRUM EDITION 3.0\n\n\n                            CHAPTER I\n\n                      Down the Rabbit-Hole\n\n\n  Alice was beginning to get very tired of sitting by her sister\non the bank, and of having nothing to do:  once or twice she had\npeeped into the book her sister was reading, but it had no\npictures or conversations in it, `and what is the use of a book,'\nthought Alice `without pictures or conversation?'\n\n  So she was considering in her own mind (as well as she could,\nfor the hot day made her feel very sleepy and stupid), whether\nthe pleasure of making a daisy-chain would be worth the trouble\nof getting up and picking the daisies, when suddenly a White\nRabbit with pink eyes ran close by her.\n\n  There was nothing so VERY remarkable in that; nor did Alice\nthink it so VE