In [1]:
from langchain_community.document_loaders import BSHTMLLoader, DirectoryLoader

class HTMLDirectoryLoader:
    def __init__(self, directory):
        self.directory = directory

    def load(self):
        loader = DirectoryLoader(path=self.directory, glob="**/*.html", loader_cls=BSHTMLLoader)
        documents = loader.load()
        return documents

In [2]:
import os

input_dir = os.path.join(os.getcwd(), 'data')
db_dir = os.path.join(os.getcwd(), 'database')
loader = HTMLDirectoryLoader(input_dir)
documents = loader.load()

len(documents)

17

In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=20000,
    chunk_overlap=200,
    length_function=len,
)

chunks = text_splitter.split_documents(documents)

chunks[1].to_json()

{'lc': 1,
 'type': 'constructor',
 'id': ['langchain', 'schema', 'document', 'Document'],
 'kwargs': {'metadata': {'source': '/home/metapercept/ChatbotTest/data/use_lazy_loading.html',
   'title': 'Use Lazy Loading'},
  'page_content': "Use Lazy LoadingIntroductionUse Angular CLIMaintain proper folder structureFollow consistent Angular coding stylesUse ES6 featuresUse trackBy along with ngForBreak down into small reusable componentsUse Lazy LoadingUse Index.tsAvoid logic in templates Cache API callsUse async pipe in templatesDeclare safe stringsAvoid any type when declaring constants and variablesState managementUse CDK Virtual ScrollUse Lazy LoadingTry to lazy load the modules in an Angular application whenever possible. Lazy loading loads something only when it is used. This reduces the size of the application load initial time and improves the application boot time by not loading unused modules.Without lazy loading// app.routing.tsimport { WithoutLazyLoadedComponent } from './withou

In [4]:
from langchain_community.embeddings.ollama import OllamaEmbeddings

embedding_func = OllamaEmbeddings(show_progress=True, model='mistral')
embedding_func

OllamaEmbeddings(base_url='http://localhost:11434', model='mistral', embed_instruction='passage: ', query_instruction='query: ', mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None, show_progress=True, headers=None, model_kwargs=None)

In [5]:
import chromadb
from chromadb.config import DEFAULT_TENANT, DEFAULT_DATABASE, Settings
import ollama

chroma_client = chromadb.PersistentClient(
    path='chroma',
    settings=Settings(),
    tenant=DEFAULT_TENANT,
    database=DEFAULT_DATABASE,
)

collection = chroma_client.create_collection(name='docs')

idx = [str(i) for i in range(len(chunks))]
documents = [chunk.page_content for chunk in chunks]
metadatas = [chunk.metadata for chunk in chunks]
embeddings = [ollama.embeddings(model='mistral', prompt=doc.page_content) for doc in chunks]

collection.add(
    documents = documents,
    metadatas = metadatas,
    ids = idx
)
collection

ResponseError: model requires more system memory (5.5 GiB) than is available (5.3 GiB)

In [6]:
col1 = chroma_client.count_collections()
col1

NameError: name 'chroma_client' is not defined

In [8]:
from langchain.prompts import PromptTemplate

PROMPT_TEMPLATE = """
Answer the question based on the context below. If you can't answer the question, reply "I don't know".
Ensure your responses are clear, concise, and helpful.

Context: {context}

Question: {question}

"""

pt = PromptTemplate(
            template=PROMPT_TEMPLATE, input_variables=["context", "question"]
    )

pt

PromptTemplate(input_variables=['context', 'question'], template='\nAnswer the question based on the context below. If you can\'t answer the question, reply "I don\'t know".\nEnsure your responses are clear, concise, and helpful.\n\nContext: {context}\n\nQuestion: {question}\n\n')

In [9]:
from langchain.chains.retrieval_qa.base import RetrievalQA
from langchain.memory import ConversationSummaryMemory
from langchain_ollama.llms import OllamaLLM
from langchain_community.vectorstores import Chroma

model = OllamaLLM(model='mistral')

database = Chroma(embedding_function=embedding_func, client=chroma_client, persist_directory='chroma2')

rag = RetrievalQA.from_chain_type(
            llm=model,
            retriever=database.as_retriever(),
            memory=ConversationSummaryMemory(llm = model),
            chain_type_kwargs={"prompt": pt, "verbose": True},
        )

rag.invoke("What are features of angular?")

NameError: name 'chroma_client' is not defined