In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma

# OpenAI embedding model handed to the vector store as its embedding function.
embeddings_model = OpenAIEmbeddings(model="text-embedding-ada-002")

# Chroma collection "docs_store", configured with persist_directory "docs-db".
docs_vectorstore = Chroma(
    persist_directory="docs-db",
    collection_name="docs_store",
    embedding_function=embeddings_model,
)

In [None]:
!pip install langchain langchain-openai chromadb renumics-spotlight 

In [None]:
from langchain_community.document_loaders import BSHTMLLoader, DirectoryLoader
# Load every *.html file under "docs" (recursive=True) with BSHTMLLoader,
# opening each file with the utf-8 encoding.
loader = DirectoryLoader(
    path="docs",
    glob="*.html",
    recursive=True,
    loader_cls=BSHTMLLoader,
    loader_kwargs=dict(open_encoding="utf-8"),
    show_progress=True,
)
docs = loader.load()

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Split the loaded documents into chunks of up to 1000 characters with a
# 200-character overlap; add_start_index=True is requested so each chunk
# records where it starts in its source document.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)
splits = text_splitter.split_documents(docs)

In [None]:
# Sanity check: how many chunks the splitter produced.
print(len(splits))

In [None]:
import hashlib
import json
import uuid
from langchain_core.documents import Document

def stable_hash(doc: Document) -> str:
    """
    Return a SHA-1 hex digest computed from the document's metadata only.

    The metadata is serialized as JSON with sorted keys, so dictionaries with
    the same entries give the same digest regardless of key order. The page
    content does not take part in the hash.
    """
    canonical_metadata = json.dumps(doc.metadata, sort_keys=True)
    digest = hashlib.sha1(canonical_metadata.encode())
    return digest.hexdigest()

def generate_id(doc: Document) -> str:
    """Return a new random id of the form "id_<uuid4>"; `doc` itself is not consulted."""
    random_part = uuid.uuid4()
    return "id_" + str(random_part)

def batch_process(documents_arr, batch_size, process_function, start=None):
    """
    Feed `documents_arr` to `process_function` in consecutive slices.

    Args:
        documents_arr: Sequence supporting len() and slicing.
        batch_size: Maximum number of items per slice; must be positive.
        process_function: Callable invoked once per slice, in order.
        start: Index of the first item to process. When omitted, a
            module-level variable named `start` is used if one is defined
            (the variable this function used to read implicitly), otherwise 0.

    Raises:
        ValueError: If `batch_size` is not positive.
    """
    if start is None:
        # Backward compatibility: earlier versions read the notebook-level
        # `start` directly and failed with NameError when it was not defined.
        start = globals().get("start", 0)
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    for i in range(start, len(documents_arr), batch_size):
        batch = documents_arr[i:i + batch_size]
        print(f"batch:{i}, batch_size:{len(batch)}")
        process_function(batch)

def add_to_chroma_database(splits):
    """
    Add one batch of chunks to `docs_vectorstore`, then call its persist().

    Each chunk gets its own id from `generate_id`; the ids are passed to
    `add_documents` alongside the chunks.
    """
    split_ids = [generate_id(split) for split in splits]
    docs_vectorstore.add_documents(splits, ids=split_ids)
    docs_vectorstore.persist()

# Index of the first chunk to ingest; batch_process reads this module-level name.
# NOTE(review): the ranges in the trailing comment look like offsets used for
# earlier partial re-runs — confirm before relying on them.
start = 0 # 4000-5000, 22000-23000
# Number of chunks handed to add_to_chroma_database per call.
batch_size = 1000
# Ingest all chunks from `start` onwards, one batch at a time.
batch_process(splits, batch_size, add_to_chroma_database) 

In [None]:
#docs_vectorstore.persist()

In [None]:
import json
# Smoke-test retrieval against the populated store.
query = "Who built the Silverstone"

# `_collection` is the underlying Chroma collection; count() reports how many
# entries it currently holds.
print("count before", docs_vectorstore._collection.count())

# Keep the hits under their own name: rebinding `docs` here would replace the
# loaded corpus, so re-running the splitting cell afterwards would split only
# these search hits.
matching_docs = docs_vectorstore.similarity_search(query=query)
matching_docs

In [None]:
import chromadb
from chromadb.utils import embedding_functions
# Open the same on-disk Chroma database directly through the chromadb client.
client = chromadb.PersistentClient(path="docs-db")
print("chromadb version:", client.get_version())

# Deleting "docs_store" is destructive: the retriever built from
# `docs_vectorstore` in the following cells queries that collection, so an
# unconditional delete breaks a top-to-bottom run. Flip the flag only when the
# store really should be rebuilt from scratch.
RESET_DOCS_STORE = False
if RESET_DOCS_STORE:
    client.delete_collection(name="docs_store")

# Bind the listing before measuring it (the name was previously undefined).
collections = client.list_collections()
print(len(collections))
#collections = client.get()
#collection = client.get_collection(name="docs_store")

# query_results = collection.query(
#     query_texts=[query]
# )


In [None]:
from langchain_openai import ChatOpenAI

# Chat model that writes the final answer (temperature 0.0).
llm = ChatOpenAI(
    temperature=0.0,
    model="gpt-4",
)

# Retriever view over the vector store, configured with search_kwargs k=20.
retriever = docs_vectorstore.as_retriever(search_kwargs=dict(k=20))

In [None]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt text with two placeholders, {question} and {source_documents}. It
# instructs the model to answer from the supplied excerpts, to admit when it
# does not know, and to always include a "SOURCES" part.
template = """
You are an assistant for question-answering tasks.
Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES").
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.

QUESTION: {question}
=========
{source_documents}
=========
FINAL ANSWER: """
# Build the chat prompt template from the string above.
prompt = ChatPromptTemplate.from_template(template)

In [None]:
from typing import List

from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser


def format_docs(docs: List["Document"]) -> str:
    """
    Render documents as a single text block for the prompt.

    Each document contributes a "Content: ..." line followed by a
    "Source: ..." line, and consecutive documents are separated by one blank
    line. An empty input yields an empty string.

    Args:
        docs: Objects exposing `page_content` and a `metadata` mapping.

    Returns:
        The concatenated text. A document whose metadata has no "source"
        entry is rendered with the source "unknown" instead of raising
        KeyError.
    """
    # The annotation is a forward reference so that defining this function
    # does not require `Document` to have been imported by an earlier cell.
    return "\n\n".join(
        f"Content: {doc.page_content}\nSource: {doc.metadata.get('source', 'unknown')}"
        for doc in docs
    )


def _render_source_documents(inputs):
    """Return the text rendering of the documents stored under "source_documents"."""
    return format_docs(inputs["source_documents"])


# Answer step: swap the document list for its text rendering, fill the
# prompt, call the chat model and parse its output to a plain string.
_prompt_inputs = RunnablePassthrough.assign(source_documents=_render_source_documents)
rag_chain_from_docs = _prompt_inputs | prompt | llm | StrOutputParser()

# Retrieval step: the incoming question goes to the retriever and is also
# passed through unchanged under "question".
_retrieval_step = RunnableParallel(
    {
        "source_documents": retriever,
        "question": RunnablePassthrough(),
    }
)

# Full pipeline: retrieval output plus an "answer" key produced by the answer step.
rag_chain = _retrieval_step.assign(answer=rag_chain_from_docs)

In [None]:
# Run the full RAG pipeline on a sample question.
question = "which Circuits are from UK"
response = rag_chain.invoke(question)
# Display only the generated answer; `response` is built with the keys
# "question" and "source_documents" as well.
response["answer"]