In [None]:
from langchain.document_loaders import DirectoryLoader

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
import chromadb
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama

In [None]:
# Build a loader over the 'Data' directory that matches *.txt files at any
# depth (glob "**/*.txt"), then load the matched files into `docs`.
loader = DirectoryLoader('Data', glob="**/*.txt")
docs = loader.load()

In [None]:
# Embedding client that uses the Ollama "llama3.1" model; reused below for
# both the stored chunks and the user question.
embeddings = OllamaEmbeddings(model="llama3.1")

In [None]:
# Splitter configuration: a chunk size of 200 and an overlap of 20 between
# neighbouring chunks, both measured with `len` (i.e. in characters).
# Separators are treated as literal strings, not regular expressions.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,)

In [None]:
# Split the loaded documents into chunks; `document` is the chunk list that
# every later cell iterates over.
document = text_splitter.split_documents(docs)

In [None]:
# Display the chunk list for inspection.
document

In [None]:
# Embed the *text* of every chunk. The embedding API takes strings, so the
# chunk's `page_content` is passed rather than the chunk object itself, and
# the whole batch goes through `embed_documents`, the call intended for
# content that will be stored. The result has one vector per chunk, in the
# same order as `document`.
global_embeddings = embeddings.embed_documents(
    [chunk.page_content for chunk in document]
)

In [None]:
# running chromadb locally
# HTTP client for a Chroma server expected at localhost:8000; the server
# must already be running before this cell is executed.
client = chromadb.HttpClient(host= "localhost", port= 8000)

In [None]:
# Collection configured for cosine distance ("hnsw:space": "cosine").
# get_or_create_collection keeps this cell re-runnable: create_collection
# fails once a collection with this name already exists on the server.
collection = client.get_or_create_collection(
    name="rag_dataset_cosine",
    metadata={"hnsw:space": "cosine"},
)

In [None]:
# Show the collections currently known to the Chroma server.
client.list_collections()

In [None]:
# Look up the collection by the name this notebook creates
# ("rag_dataset_cosine"); the previous name 'rag_dataset' is never created
# here, so the lookup failed on a fresh server.
client.get_collection('rag_dataset_cosine')

In [None]:
# for existing database
# Re-attach to the collection this notebook creates ("rag_dataset_cosine").
# The previous name 'rag_dataset' is not created anywhere in the notebook,
# so this cell either failed or silently switched to a different collection.
collection = client.get_collection('rag_dataset_cosine')

In [None]:
def creating_unique_chunk_ids(documents=None):
    """Assign a unique ``id`` to the metadata of every chunk.

    Chunks are grouped by their ``source`` / ``page`` metadata and numbered
    from 0 within each group. The id is ``"<source>:<index>"`` when the chunk
    has no ``page`` metadata and ``"<source>:<page>:<index>"`` otherwise, so
    chunks from different pages of the same source never share an id.

    Args:
        documents: Iterable of chunk objects exposing a ``metadata`` dict.
            Defaults to the notebook-level ``document`` list.

    Returns:
        The id assigned to the last chunk, or ``None`` when there are no
        chunks.
    """
    if documents is None:
        # Fall back to the chunk list produced earlier in the notebook.
        documents = document

    # Next free index per (source, page) group. A per-group counter keeps
    # ids unique even if chunks of the same group are not adjacent.
    next_index = {}
    chunk_id = None  # stays None for empty input instead of being unbound
    for doc in documents:
        source = doc.metadata.get('source')
        page = doc.metadata.get('page')
        page_key = f"{source}:{page}"

        index = next_index.get(page_key, 0)
        next_index[page_key] = index + 1

        if page is None:
            # No page metadata: keep the short "<source>:<index>" form.
            chunk_id = f"{source}:{index}"
        else:
            chunk_id = f"{source}:{page}:{index}"
        doc.metadata["id"] = chunk_id
    return chunk_id

In [None]:
# Stamp an "id" into the metadata of every chunk in `document`; the cell
# output is the id of the last chunk.
creating_unique_chunk_ids()

In [None]:
# Collect the ids written into each chunk's metadata, in chunk order.
new_chunk_id = [chunk.metadata['id'] for chunk in document]

In [None]:
# Text of every chunk, in the same order as `document`; shown for inspection.
page_content = [chunk.page_content for chunk in document]
page_content

In [None]:
# Store the chunks: the three lists are parallel, so entry i of each one
# describes the same chunk (its id, its text and its embedding vector).
collection.add(
    ids=new_chunk_id,
    documents=page_content,
    embeddings=global_embeddings,
)

In [None]:
# Number of records now held by the collection.
collection.count()

In [None]:
# Show a sample of the stored records as a sanity check.
collection.peek()

In [None]:
# Keep the question in a single variable so that the text that is embedded
# for retrieval and the text placed in the prompt cannot drift apart.
querry_text = "who crowned with plump shrimp and tender calamari"
embedded_querry = embeddings.embed_query(querry_text)

In [None]:
# Question text that is substituted into the prompt template below.
querry_text = "who crowned with plump shrimp and tender calamari"

In [None]:
# Ask the collection for the 5 best matches to the embedded question and
# request distances, embeddings, documents and metadatas in the result.
res = collection.query(
    query_embeddings=embedded_querry,
    n_results=5,
    include=['distances', 'embeddings', 'documents', 'metadatas'],
)

In [None]:
# Prompt skeleton with two placeholders: {context} receives the retrieved
# chunk text and {question} receives the user question.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

In [None]:
# res['documents'][0] holds the documents returned for the first (and only)
# query; join them directly with a visible "---" separator between chunks.
context_text = "\n\n---\n\n".join(res['documents'][0])

In [None]:
# Build the template object from the skeleton, then fill in the retrieved
# context and the question to obtain the final prompt.
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, question=querry_text)

In [None]:
# Print the fully formatted prompt to check it before it goes to the model.
print(prompt)

In [None]:
# Create an Ollama LLM wrapper for the "llama3.1" model and invoke it with
# the formatted prompt; the model output is kept in `response_text`.
model = Ollama(model="llama3.1")
response_text = model.invoke(prompt)

In [None]:
# Display the model's answer.
response_text