In [None]:
import os
from typing import List

import chromadb
import ollama
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Open an on-disk Chroma database stored under the relative path "chroma_storage",
# so the location depends on the kernel's current working directory.
client = chromadb.PersistentClient(path="chroma_storage")
# Reuse the "docs" collection when it already exists; otherwise create it.
collection = client.get_or_create_collection(name="docs")


In [None]:
def load_documents(folder_path: str) -> List[Document]:
    """Load every supported file found directly inside ``folder_path``.

    Supported extensions (matched case-insensitively) are ``.pdf``, ``.docx``
    and ``.md``. Anything else, including sub-directories, is reported on
    stdout and skipped.

    Args:
        folder_path: Directory to scan. Only its immediate entries are
            considered; sub-directories are not descended into.

    Returns:
        The documents produced by each loader's ``load()``, concatenated in
        sorted filename order.

    Raises:
        OSError: If ``folder_path`` cannot be listed (propagated from
            ``os.listdir``).
    """
    documents = []
    # os.listdir returns entries in arbitrary order; sorting keeps the document
    # order stable between runs.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        lowered = filename.lower()

        if not os.path.isfile(file_path):
            # A directory (even one named like "notes.md") cannot be loaded.
            print(f"unsupported file type: {filename}")
            continue

        if lowered.endswith('.pdf'):
            loader = PyPDFLoader(file_path)
        elif lowered.endswith('.docx'):
            loader = Docx2txtLoader(file_path)
        elif lowered.endswith('.md'):
            loader = UnstructuredMarkdownLoader(file_path)
        else:
            print(f"unsupported file type: {filename}")
            continue
        documents.extend(loader.load())
    return documents

# Documents whose text is at most this many characters are kept whole.
MAX_LENGTH = 1000

def split_documents(documents: List[Document]) -> List[Document]:
    """Chunk only the documents whose text is longer than ``MAX_LENGTH``.

    Documents at or below the threshold are passed through untouched. Longer
    ones are handed to a ``RecursiveCharacterTextSplitter`` configured with
    ``chunk_size=200`` and ``chunk_overlap=75`` (lengths measured with ``len``),
    and a one-line summary of each split is printed.

    Args:
        documents: Documents to inspect, in order.

    Returns:
        A new list holding the untouched short documents and the chunks of the
        long ones, in the same relative order as the input.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=200, chunk_overlap=75, length_function=len
    )

    result = []
    for document in documents:
        size = len(document.page_content)

        # Short enough: keep the original document object as-is.
        if size <= MAX_LENGTH:
            result.append(document)
            continue

        pieces = splitter.split_documents([document])
        print(f"Splitting document: {size} chars → {len(pieces)} chunks")
        result.extend(pieces)

    return result


def embednstore(splits, collection):
    """Embed every chunk with Ollama and write it to ``collection``.

    Each chunk is embedded individually with the ``mxbai-embed-large`` model
    and stored under the id ``str(i)``, where ``i`` is its position in
    ``splits``. A preview of the text and of the embedding is printed for
    every chunk.

    Records are written with ``upsert`` rather than ``add``: the ids are plain
    positions, so indexing again into a persistent collection reuses ids that
    already exist. ``upsert`` overwrites those records instead of leaving the
    old content in place.

    Args:
        splits: Sequence of objects exposing a ``page_content`` string.
        collection: Collection object providing
            ``upsert(ids=..., embeddings=..., documents=...)``.

    Returns:
        None.
    """
    for i, doc in enumerate(splits):
        text = doc.page_content
        print(f"Document {i}:", text[:200])  # Preview first 200 chars

        response = ollama.embed(model="mxbai-embed-large", input=text)
        # A single input string yields a single embedding; take it.
        embedding = response["embeddings"][0]

        collection.upsert(
            ids=[str(i)],
            embeddings=[embedding],
            documents=[text]
        )
        print(f"Embedding {i} length: {len(embedding)} | Preview: {embedding[:5]}")
    

    
# Folder containing the hint files to index. Set the HINT_FILES_DIR environment
# variable to use another location; the fallback is the original hard-coded path.
HINT_FILES_DIR = os.environ.get("HINT_FILES_DIR", "/home/ruta/irishep/hint_files")

documents = load_documents(HINT_FILES_DIR)
print(f"loaded {len(documents)} documents from the folder")

splits = split_documents(documents)
print(f"split the documents into {len(splits)} chunks")

embednstore(splits, collection)
print(f"Stored {len(splits)} embedded chunks in ChromaDB.")


In [None]:
# Question to answer from the indexed hint files. Named `question` rather than
# `input` so the builtin input() is not shadowed for the rest of the kernel session.
question = "how would you plot the muon pT of the file '4AAF4AB2-171D-F54C-8FE3-0D709B049A8A.root'?"

# Embed the question with the same model that embedded the stored chunks.
resp = ollama.embed(model="mxbai-embed-large", input=question)
query_embedding = resp["embeddings"][0]

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=5
)

# results['documents'] holds one list of hits per query embedding; only one
# embedding was sent, so index 0 is the hit list for the question.
retrieved = results['documents'][0]
if not retrieved:
    raise RuntimeError(
        'No documents were retrieved: the "docs" collection is empty. '
        "Run the indexing cell before querying."
    )

# Up to five hits are fetched, but only the best match is passed to the model.
data = retrieved[0]

print("Top retrieved document:", data)

output = ollama.generate(
    model="llama3",
    prompt = f"""You are a helpful assistant with access to these CMS specific hint files with python code snippets: {data}
            Only use the above data to answer the following question, without hallucinating or making up your own statements: {question}
            The expected output is a python code snippet that can be run.
            If the answer is not in the provided data, say "I don't know based on the available information."
            PLEASE MAKE SURE TO IMPORT NECESSARY LIBRARIES.
        """,
)
# TODO: try with no chunking
# TODO: no chunking unless the hints are very large
# TODO: characterize how far the model can go with the data
# TODO: outline with goals
# TODO: provide the basic workflow in chunks, because this llama model struggles otherwise


print("RESPONSE: \n " +   output['response'])

# Reset the store: drop the "docs" collection and recreate it empty. After this
# point the documents must be indexed again before another query can succeed.
client.delete_collection("docs")
collection = client.create_collection(name="docs")
