### ChromaDB

In [None]:
pip install chromadb

In [None]:
import chromadb

# Chroma client whose data is stored on disk under ./chroma_db
client = chromadb.PersistentClient(
    path="./chroma_db",
)

In [None]:
# Open "collection1", creating it only when it does not exist yet. The client
# is persistent, so the collection survives between runs and a plain
# create_collection() would fail when this cell is executed a second time.
collection = client.get_or_create_collection(name="collection1")

# Insert sample data (ID, embeddings, metadata)
collection.add(
    ids=["1", "2", "3"],
    embeddings=[[0.1, 0.2, 0.3], [0.2, 0.3, 0.4], [0.3, 0.4, 0.5]],
    metadatas=[{"name": "Keerti"}, {"name": "Amit"}, {"name": "Educosys"}],
)

In [None]:
# Show every collection the client currently reports
available_collections = client.list_collections()
print("Available Collections:", available_collections)

In [None]:
# Build the label from the ID that is actually fetched so the two cannot drift apart
record_id = "2"
print(f"Fetching data with ID {record_id}:", collection.get(ids=[record_id]))

In [None]:
# Same lookup, this time asking explicitly for embeddings and metadata.
# The label is built from the ID that is actually fetched.
record_id = "2"
print(
    f"Fetching data with ID {record_id}:",
    collection.get(ids=[record_id], include=["embeddings", "metadatas"]),
)

In [None]:
# Add one more record, this time carrying a document text alongside its embedding
engineer_doc = "Someone is a software engineer with 5 years of experience."
collection.add(ids=["4"], embeddings=[[0.1, 0.2, 0.3]], documents=[engineer_doc])

In [None]:
# Ask for the two stored entries nearest to the probe vector, returning only documents
probe_vector = [0.3, 0.4, 0.5]
collection.query(query_embeddings=[probe_vector], n_results=2, include=["documents"])

In [None]:
# Replace the embedding and metadata stored under ID "2", then read the entry back
target_ids = ["2"]
collection.update(
    ids=target_ids,
    embeddings=[[0.5, 0.5, 0.5]],
    metadatas=[{"name": "Bob Updated"}],
)
updated_entry = collection.get(ids=target_ids)
print("Updated Entry:", updated_entry)

In [None]:
# Build the label from the ID that is actually fetched so the two cannot drift apart
record_id = "2"
print(f"Fetching data with ID {record_id}:", collection.get(ids=[record_id]))

In [None]:
# Delete ID "3" and fetch it again to show what the collection returns afterwards
removed_ids = ["3"]
collection.delete(ids=removed_ids)
after_deletion = collection.get(ids=removed_ids)
print("After Deletion:", after_deletion)

In [None]:
# Drop every collection held by the client.
# - A dedicated loop variable keeps the notebook-level `collection` from being
#   rebound by the loop.
# - Depending on the chromadb release, list_collections() yields either
#   Collection objects or plain name strings; getattr() accepts both.
for existing in client.list_collections():
    client.delete_collection(getattr(existing, "name", existing))

In [None]:
# Show what the client reports after the clean-up
remaining_collections = client.list_collections()
print("Available Collections:", remaining_collections)

### **_Groq Embeddings_**

In [None]:
!pip install groq

In [None]:
# Persistent client: data is kept under ./chroma_db
client = chromadb.PersistentClient(
    path="./chroma_db",
)
# Reuse "collection2" when it already exists, otherwise create it
collection = client.get_or_create_collection(
    name="collection2",
)

In [None]:
import os

from dotenv import load_dotenv

# Load environment variables from the .env file
load_dotenv()

# Retrieve the Groq key (None when the variable is not set)
api_key = os.environ.get("GROQ_KEY")

In [None]:
from groq import Groq
import os

def get_groq_embedding(text):
    """Return the embedding Groq produces for ``text``.

    A Groq client is built from the ``GROQ_KEY`` environment variable, the
    text is submitted as a one-item batch to the ``nomic-embed-text-v1.5``
    model, and the embedding of that single item is returned.
    """
    groq_client = Groq(api_key=os.getenv("GROQ_KEY"))
    embedding_response = groq_client.embeddings.create(
        model="nomic-embed-text-v1.5",
        input=[text],
    )
    # Only one input was sent, so the first result is the one we want
    first_item = embedding_response.data[0]
    return first_item.embedding

In [None]:
documents = [
    "The Eiffel Tower is located in Paris.",
    "The Colosseum is in Rome, Italy.",
    "The Taj Mahal is a famous monument in India.",
    "Mount Everest is the highest mountain in the world.",
    "Python is a popular programming language.",
]

# One embedding per document, in the same order as `documents`
embeddings = list(map(get_groq_embedding, documents))

# Unique IDs: the position of each document as a string ("0", "1", ...)
document_ids = [str(position) for position, _ in enumerate(documents)]

# Insert into ChromaDB
collection.add(ids=document_ids, documents=documents, embeddings=embeddings)

print("Data added successfully!")

In [None]:
query_text = "Where is the Eiffel Tower?"
# Embed the question with get_groq_embedding, the helper that also embedded
# the stored documents, so query and documents come from the same model.
query_embedding = get_groq_embedding(query_text)

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=2,  # Get top 2 matches
    include=["documents", "distances"],
)

# Only one query embedding was sent, so index 0 holds this query's matches
print("Query:", query_text)
print("Most Similar Result:", results["documents"][0])
print("Distance:", results["distances"][0])

In [None]:
updated_text = "The Eiffel Tower is one of the most visited landmarks in the world."
# Re-embed the new text with get_groq_embedding so the stored vector matches
# the stored document.
updated_embedding = get_groq_embedding(updated_text)

collection.update(
    ids=["0"],  # ID of the document to update
    documents=[updated_text],
    embeddings=[updated_embedding],
)

print("Data updated successfully!")

In [None]:
query_text = "Where is the Eiffel Tower?"
# Embed the question with get_groq_embedding, the helper that also embedded
# the stored documents, so query and documents come from the same model.
query_embedding = get_groq_embedding(query_text)

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=2,  # Get top 2 matches
    include=["documents", "distances"],
)

# Only one query embedding was sent, so index 0 holds this query's matches
print("Query:", query_text)
print("Most Similar Result:", results["documents"][0])
print("Distance:", results["distances"][0])

In [None]:
tower_ht_text = "Eiffel Tower is 330 tall."

# Every argument is a list with one entry per record, matching the other
# add() calls in this notebook; the embedding comes from get_groq_embedding.
collection.add(
    ids=["6"],
    embeddings=[get_groq_embedding(tower_ht_text)],
    documents=[tower_ht_text],
)

In [None]:
query_text = "Where is the Eiffel Tower?"
# Embed the question with get_groq_embedding, the helper that also embedded
# the stored documents, so query and documents come from the same model.
query_embedding = get_groq_embedding(query_text)

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=2,  # Get top 2 matches
    include=["documents", "distances"],
)

# Only one query embedding was sent, so index 0 holds this query's matches
print("Query:", query_text)
print("Most Similar Result:", results["documents"][0])
print("Distance:", results["distances"][0])

In [None]:
# Delete document with ID "0"
collection.delete(
    ids=["0"],
)
print("Data deleted successfully! ❌")

In [None]:
query_text = "Where is the Eiffel Tower?"
# Embed the question with get_groq_embedding, the helper that also embedded
# the stored documents, so query and documents come from the same model.
query_embedding = get_groq_embedding(query_text)

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=2,  # Get top 2 matches
    include=["documents", "distances"],
)

# Only one query embedding was sent, so index 0 holds this query's matches
print("Query:", query_text)
print("Most Similar Result:", results["documents"][0])
print("Distance:", results["distances"][0])

### **RAG**

In [None]:
import os

from dotenv import load_dotenv

load_dotenv()  # Load environment variables from .env file

# Settings this section looks up. Report the ones that are absent by name
# only, so no key value is ever echoed into the notebook output.
expected_env_vars = (
    "OPENAI_API_KEY",
    "LANGCHAIN_TRACING_V2",
    "LANGCHAIN_API_KEY",
    "LANGSMITH_PROJECT",
)
missing_env_vars = [name for name in expected_env_vars if not os.getenv(name)]
if missing_env_vars:
    print("Missing environment variables:", ", ".join(missing_env_vars))

In [None]:
!pip install langchain_community langchain-openai langchainhub chromadb langchain

In [None]:
from langchain_community.document_loaders import WebBaseLoader

# Load the course page through WebBaseLoader and show what came back
course_url = "https://www.educosys.com/course/genai"
loader = WebBaseLoader(web_paths=[course_url])
docs = loader.load()
print(docs)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split
# chunk_size=1000 with an overlap of 200 characters, so context carries
# over from one chunk to the next
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

In [None]:
# Preview up to the first three chunks; slicing keeps the cell working when
# the page produced fewer than three.
for chunk in splits[:3]:
    print(chunk)

In [None]:
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Build a Chroma vector store from the chunks, using OpenAI embeddings
embedding_model = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)

In [None]:
# Check total stored chunks
stored_chunk_count = vectorstore._collection.count()
print(stored_chunk_count)

In [None]:
# Dump everything the underlying Chroma collection returns by default
stored_records = vectorstore._collection.get()
print(stored_records)

In [None]:
# Show up to four stored chunks together with their embeddings.
# The IDs are read from the collection itself rather than typed in, so the
# cell works with whatever IDs the store currently holds.
chroma_collection = vectorstore._collection
sample_ids = chroma_collection.get(limit=4)["ids"]
for position, chunk_id in enumerate(sample_ids, start=1):
    print(
        f'\nCollection {position} - ',
        chroma_collection.get(ids=[chunk_id], include=["embeddings", "documents"]),
    )

In [None]:
# Wrap the vector store as a retriever (default settings) for use in the RAG chain
retriever = vectorstore.as_retriever()

In [None]:
from langchain import hub

# Prompt: pull the predefined RAG prompt template from LangChain Hub
rag_prompt_id = "rlm/rag-prompt"
prompt = hub.pull(rag_prompt_id)

In [None]:
# You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
# Question: {question}
# Context: {context}
# Answer:

In [None]:
from langchain_openai import ChatOpenAI

# Chat model used as the generation step of the chain
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
)

In [None]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [None]:
def format_docs(docs):
    """Join the ``page_content`` of each document into one string.

    Consecutive documents are separated by a blank line; an empty input
    yields an empty string.
    """
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)

In [None]:
# Inputs for the prompt: "context" is the retriever output passed through
# format_docs, "question" is the chain's input passed along unchanged.
prompt_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}
# prompt inputs -> prompt -> chat model -> plain string
rag_chain = prompt_inputs | prompt | llm | StrOutputParser()


In [None]:
# Run the chain on a question about course timings
question = "What are the timings of the genai course?"
rag_chain.invoke(question)

In [None]:
# Run the chain on a question about the week 1 curriculum
question = "Give me the curriculum for week 1 for genai course"
rag_chain.invoke(question)

In [None]:
# Run the chain on a question about recordings
question = "Are the recordings for the course available?"
rag_chain.invoke(question)

In [None]:
from langchain_core.runnables import RunnableLambda

In [None]:
def print_prompt(prompt_text):
    """Print ``prompt_text`` behind a "Prompt - " label and return it unchanged.

    Returning the same object lets the function sit in the middle of a chain
    as a pass-through step.
    """
    label = "\nPrompt - "
    print(label, prompt_text)
    return prompt_text

# Same pipeline as before, with print_prompt inserted between the prompt and
# the model so the prompt value is printed before it is passed on.
prompt_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}
show_prompt = RunnableLambda(print_prompt)
rag_chain = prompt_inputs | prompt | show_prompt | llm | StrOutputParser()

In [None]:
# Ask the timing question again; the chain now prints its prompt as well
question = "What are the timings of the genai course?"
rag_chain.invoke(question)