In [1]:
import os
from typing import List

import chromadb
import ollama
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Vector store used by the rest of the notebook.
client = chromadb.Client()
# get_or_create_collection (instead of create_collection) keeps this cell
# re-runnable: create_collection raises if a collection named "docs" already
# exists in the running kernel, which is the case after the first execution.
collection = client.get_or_create_collection(name="docs")

In [2]:
def load_documents(folder_path: str) -> List[Document]:
    """Load every supported file found directly inside ``folder_path``.

    Supported types are ``.pdf`` (via ``PyPDFLoader``) and ``.docx`` (via
    ``Docx2txtLoader``); the extension check is case-insensitive. Anything
    else, including sub-directories, is reported on stdout and skipped.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        All documents produced by the loaders, in sorted-filename order.

    Raises:
        OSError: If ``folder_path`` cannot be listed (propagated from
            ``os.listdir``).
    """
    documents: List[Document] = []
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # (and any positional ids derived from it downstream) reproducible.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        lowered = filename.lower()
        if not os.path.isfile(file_path):
            # A directory must never be handed to a loader, whatever its name.
            loader = None
        elif lowered.endswith('.pdf'):
            loader = PyPDFLoader(file_path)
        elif lowered.endswith('.docx'):
            loader = Docx2txtLoader(file_path)
        else:
            loader = None

        if loader is None:
            print(f"unsupported file type: {filename}")
            continue
        documents.extend(loader.load())
    return documents

#chunking
def split_documents(
    documents: List[Document],
    chunk_size: int = 150,
    chunk_overlap: int = 50,
) -> List[Document]:
    """Split documents into overlapping character-based chunks.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
            Defaults to 150, the value previously hard-coded here.
        chunk_overlap: Number of characters shared between consecutive
            chunks. Defaults to 50, the value previously hard-coded here.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_documents(documents)

def embednstore(splits, collection, model="mxbai-embed-large", batch_size=32):
    """Embed each chunk with Ollama and store it in ``collection``.

    Chunks are embedded and written in batches instead of one request and one
    write per chunk. Ids are the chunk's position in ``splits`` as a string
    ("0", "1", ...), exactly as before, and the same per-chunk preview lines
    are printed.

    Args:
        splits: Iterable of objects exposing ``page_content`` (the text).
        collection: Target collection; must provide
            ``add(ids=..., embeddings=..., documents=...)``.
        model: Name of the Ollama embedding model.
        batch_size: Number of chunks sent per embedding request / write.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        RuntimeError: If the embedding response does not contain exactly one
            vector per submitted text.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Materialise once so any iterable (not only lists) can be sliced.
    chunks = list(splits)

    for start in range(0, len(chunks), batch_size):
        batch = chunks[start:start + batch_size]
        texts = [doc.page_content for doc in batch]

        # ollama.embed accepts a list of inputs and returns one vector per
        # input, in order, under the "embeddings" key.
        response = ollama.embed(model=model, input=texts)
        embeddings = response["embeddings"]
        if len(embeddings) != len(texts):
            raise RuntimeError(
                f"expected {len(texts)} embeddings, got {len(embeddings)}"
            )

        ids = [str(start + offset) for offset in range(len(batch))]
        collection.add(
            ids=ids,
            embeddings=embeddings,
            documents=texts
        )

        for chunk_id, text, embedding in zip(ids, texts, embeddings):
            print(f"Document {chunk_id}:", text[:200])  # Preview first 200 chars
            print(f"Embedding {chunk_id} length: {len(embedding)} | Preview: {embedding[:5]}")
    

    
# Folder holding the source files. Set the RAG_DOCS_DIR environment variable
# to run the notebook on another machine; the fallback is the original path.
docs_dir = os.environ.get("RAG_DOCS_DIR", "/home/ruta/irishep/test_files")

documents = load_documents(docs_dir)
print(f"loaded {len(documents)} documents from the folder")

splits = split_documents(documents)
print(f"split the documents into {len(splits)} chunks")

embednstore(splits, collection)
print(f"Stored {len(splits)} embedded chunks in ChromaDB.")


loaded 4 documents from the folder
split the documents into 38 chunks
Document 0: A famous fish market was opened in Šilutė almost 500 years ago, when Georg Tallat purchased the inn together with the land and fishing rights in 1511.
Embedding 0 length: 1024 | Preview: [-0.0049178577, 0.031262547, -0.009629161, 0.031176839, -0.016225727]
Document 1: with the land and fishing rights in 1511. The town was a gathering place for peasants from nearby Samogitia and Curonian and Prussian fishermen from
Embedding 1 length: 1024 | Preview: [0.02204336, 0.034912307, -0.0025095344, 0.0031121387, -0.03504274]
Document 2: and Curonian and Prussian fishermen from Rusnė, Karklė [lt], Nida, and Lesnoye [de]. Next to the inn a church of Werden (Verdainė) was built in 1550.
Embedding 2 length: 1024 | Preview: [-0.017402694, 0.048267562, -0.00036907513, 0.021875199, -0.023511147]
Document 3: a church of Werden (Verdainė) was built in 1550. It was a part of the Polish–Lithuanian Commonwealth, as a fief of 

In [4]:
# Named `question` rather than `input` so the builtin input() is not shadowed
# for the rest of the kernel session.
question = "when was the famous fish market opened?"

# The query is embedded with the same model the stored chunks were embedded
# with, so the vectors are comparable.
resp = ollama.embed(model="mxbai-embed-large", input=question)
query_embedding = resp["embeddings"][0]

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=3
)

# results['documents'] holds one list of hits per query embedding; exactly one
# embedding was sent, so only index 0 is relevant.
retrieved_docs = results['documents'][0] if results['documents'] else []

if not retrieved_docs:
    # Happens when the collection is empty, e.g. this cell was already run once
    # (it resets the collection below) and the ingestion cell was not re-run.
    print("No documents retrieved: the collection is empty. Re-run the ingestion cell before querying.")
else:
    print("Top retrieved document:", retrieved_docs[0])

    # Give the model every retrieved chunk, not only the best one: chunks are
    # short, so the answer may straddle neighbouring chunks.
    data = "\n".join(retrieved_docs)

    output = ollama.generate(
        model="llama3",
        prompt = f"""You are a helpful assistant with access to this data: {data}
            Only use the above data to answer the following question, without hallucinating or making up your own statements: {question}
            If the answer is not in the provided data, say "I don't know based on the available information"
        """,
    )

    print(output['response'])

# Reset the store so the next ingestion starts from a clean collection.
# NOTE: after this point `collection` is empty; re-run the ingestion cell
# before issuing another query.
client.delete_collection("docs")
collection = client.create_collection(name="docs")



Top retrieved document: A famous fish market was opened in Šilutė almost 500 years ago, when Georg Tallat purchased the inn together with the land and fishing rights in 1511.
According to the data, the famous fish market was opened almost 500 years ago, specifically when Georg Tallat purchased the inn together with the land and fishing rights in 1511. Therefore, my answer is:

The famous fish market was opened in 1511.
