In [None]:
import os
import re
from typing import List

import chromadb
import ollama
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_core.documents import Document
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chroma client backed by the local "chroma_storage" path.
client = chromadb.PersistentClient(path="chroma_storage")
# Reuse the "docs" collection when it already exists, otherwise create it.
collection = client.get_or_create_collection(name="docs")

# Folder that holds the hint files to ingest.
# NOTE(review): absolute, user-specific path -- adjust it when running elsewhere.
folder_path = "/home/ruta/irishep/hint_files"


In [None]:
def load_documents(folder_path: str) -> List[Document]:
    """Load every Markdown (``.md``) file found directly inside ``folder_path``.

    Each file is read in full and wrapped in a ``Document`` whose ``source``
    metadata is the bare file name (not the full path). Entries are visited in
    sorted name order because ``os.listdir`` returns them in arbitrary order;
    sorting keeps the returned list, and anything derived from positions in
    it, identical from run to run.

    Args:
        folder_path: Directory to scan. Sub-directories are not descended into.

    Returns:
        One ``Document`` per ``.md`` file; all other entries are ignored.

    Raises:
        OSError: If the directory or one of its files cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    documents = []
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.md'):
            # handle other file types if needed...
            continue
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            # A directory that merely ends in ".md" cannot be opened as a file.
            continue
        # Explicit encoding: the default used by open() depends on the locale.
        with open(file_path, encoding="utf-8") as f:
            content = f.read()
        documents.append(Document(page_content=content, metadata={"source": filename}))
    return documents
# Earlier loader-based version of load_documents. It is wrapped in a bare
# string literal, so it is never executed.
# NOTE(review): it references PyPDFLoader, which is not among the imports
# visible in this notebook -- import it before re-enabling this version.
'''
def load_documents(folder_path: str) -> List[Document]:
    documents = []
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if filename.endswith('.pdf'):
            loader = PyPDFLoader(file_path)
        elif filename.endswith('.docx'):
            loader = Docx2txtLoader(file_path)
        elif filename.endswith('.md'):
            loader = UnstructuredMarkdownLoader(file_path)
        else:
            print(f"unsupported file type: {filename}")
            continue
        documents.extend(loader.load())
        
    return documents
'''

# (header marker, metadata key) pairs handed to MarkdownHeaderTextSplitter.
headers_to_split_on = [
    # ("#", "Header 1") is deliberately left out: the hint files also contain
    # "#" comment lines, so their real headers start at "##".
    ("##", "Header 2"),
    ("###", "Header 3"),
    ("####", "Header 4")
]

def split_docs(documents: List[Document]) -> List[Document]:
    """Break Markdown documents into header-delimited chunks.

    A document counts as Markdown when its ``source`` metadata ends in ``.md``.
    Such a document is run through ``MarkdownHeaderTextSplitter`` (configured
    with the module-level ``headers_to_split_on``) and every resulting section
    becomes a new ``Document`` whose metadata is the parent's metadata updated
    with the section's own metadata (section keys win on a clash). Any other
    document is passed through unchanged, as the same object.

    Args:
        documents: Documents to process, in order.

    Returns:
        A new list holding the chunks and pass-through documents in input order.
    """
    header_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    result = []
    for document in documents:
        is_markdown = document.metadata.get("source", "").endswith(".md")
        if not is_markdown:
            result.append(document)
            continue
        sections = header_splitter.split_text(document.page_content)
        result.extend(
            Document(
                page_content=section.page_content,
                metadata={**document.metadata, **section.metadata},
            )
            for section in sections
        )
    return result


def embednstore(splits, collection):
    """Embed every chunk with Ollama and write it to the Chroma collection.

    Each chunk is embedded on its own with the ``mxbai-embed-large`` model and
    stored under an id equal to its position in ``splits`` (``"0"``, ``"1"``,
    ...). Only the text and its embedding are stored; chunk metadata is not.

    The write uses ``upsert`` rather than ``add``: ids are purely positional,
    so when the collection already holds entries from an earlier or
    interrupted run, the stored text and vector are replaced with the current
    ones instead of the stale entries being kept.

    Args:
        splits: Sequence of objects exposing a ``page_content`` string.
        collection: Chroma collection that receives the embeddings.

    Returns:
        None. Progress is printed for each chunk.
    """
    for i, doc in enumerate(splits):
        print(f"Chunk {i}:", doc.page_content[:200])  #first 200 chars

        text = doc.page_content
        response = ollama.embed(model="mxbai-embed-large", input=text)
        # One input string was sent, so the first (only) vector is the result.
        embedding = response["embeddings"][0]

        collection.upsert(
            ids=[str(i)],
            embeddings=[embedding],
            documents=[text]
        )
        print(f"embedding {i} length: {len(embedding)} | preview: {embedding[:5]} \n")
    

    
# Ingest pipeline: load the hint files, split them into chunks, then embed and
# store every chunk. The folder comes from the `folder_path` setting defined
# with the client setup, so the location is configured in one place only.
documents = load_documents(folder_path)
print(f"loaded {len(documents)} documents from the folder \n")

splits = split_docs(documents)
print(f"split the documents into {len(splits)} chunks \n")

embednstore(splits, collection)
print(f"stored {len(splits)} embedded chunks \n")


In [3]:
# NOTE(review): `input` shadows the built-in input(); the name is kept because
# later cells use the same variable name.
input = "how would you plot the leading muon eta of the file '4AAF4AB2-171D-F54C-8FE3-0D709B049A8A.root' for the first n = 5000 events?"

# Embed the question with the same model that embedded the stored chunks.
resp = ollama.embed(model="mxbai-embed-large", input=input)
query_embedding = resp["embeddings"][0]

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=3
)

chunks = results['documents'][0]  # list of top retrieved chunks
data = "\n\n".join(chunks)        # combine top chunks


def _extract_code_block(response_text):
    """Return the code of the most suitable Markdown fence, or None.

    Fences are paired in order of appearance, so prose between two separate
    code blocks is never swallowed. A language tag directly after the opening
    fence (e.g. ```python) is removed instead of being returned as the first
    line of code. A block tagged as Python is preferred; otherwise the first
    non-empty block is used.
    """
    fence = re.compile(r"```(?:[ \t]*([A-Za-z0-9_+\-]*)[ \t]*\r?\n)?(.*?)```", re.DOTALL)
    blocks = []
    for match in fence.finditer(response_text):
        body = match.group(2).strip()
        if body:
            blocks.append(((match.group(1) or "").lower(), body))
    if not blocks:
        return None
    for tag, body in blocks:
        if tag in ("python", "python3", "py"):
            return body
    return blocks[0][1]


max_tries = 5
success = False
error_message = ""
last_code = ""

for trial in range(max_tries):
    # The previous error and attempt are fed back so the model can repair them.
    prompt = f"""You are a helpful assistant with access to these CMS specific hint files with python code snippets: {data}
        Only use the above data to answer the following question, without hallucinating or making up your own statements: {input}
        The expected output is a python code snippet that can be run.
        If the answer is not in the provided data, say "I don't know based on the available information."
        If you get an error, here is the error message: {error_message}
        If you tried code previously, here is the last attempt:
        {last_code}
        Please fix the code if there was an error, otherwise try again.
        """

    output = ollama.generate(
        model="llama3",
        prompt=prompt,
    )
    response_text = output['response']
    print(f"(TRY {trial+1}):\n", response_text)

    code = _extract_code_block(response_text)
    if code is None:
        print("No code block found in response.")
        error_message = "No code block found in response."
        last_code = ""
        continue

    last_code = code  # save for next prompt
    try:
        # SECURITY: this executes model-generated code with the full privileges
        # of the kernel -- only run it in an environment you can afford to lose.
        # The snippet gets its own namespace so it cannot overwrite the loop
        # state of this cell (trial, prompt, success, ...) and sees the real
        # built-ins rather than the notebook's shadowed `input`.
        exec(code, {"__name__": "__main__"})
    except Exception as e:
        # Include the exception type: str(e) alone is often just a bare value
        # (e.g. a missing key), which gives the model too little to act on.
        error_message = f"{type(e).__name__}: {e}"
        print(f"Error running code: {error_message}")
    else:
        print("SUCCESS")
        success = True
        break
if not success:
    print("No valid code snippet ran without errors after max trials.")

KeyboardInterrupt: 

In [None]:
# Retrieval-only check: embed a question, query the collection and print what
# comes back, without calling the generation model.
# NOTE(review): `input` shadows the built-in input() function.
input = "how would you plot the muon eta of the file '4AAF4AB2-171D-F54C-8FE3-0D709B049A8A.root'?"

# The query uses the same embedding model as the stored chunks.
resp = ollama.embed(model="mxbai-embed-large", input=input)
query_embedding = resp["embeddings"][0]

# Ask the collection for its top 3 matches for the query embedding.
results = collection.query(
    query_embeddings=[query_embedding],
    n_results=3
)

# Raw query result, printed in full for inspection.
print(results)

top = results['documents'][0]  # list of the 3 top retrieved text chunks
data = "\n\n".join(top)  # chunks separated by a blank line; read by the prompt cell
print("combined top chunks:\n", data)


In [None]:
# Single-shot generation: send the retrieved chunks (`data`) and the question
# (`input`) to the llama3 model once and print the answer. Unlike the retry
# cell, the returned code is only displayed, never executed.
output = ollama.generate(
    model="llama3",
    prompt = f"""You are an assistant with access to these CMS specific files with python code snippets that are written to help you come up with code: {data}
            Only use the provided data to answer the following question, without hallucinating or making up your own statements: {input},
            The expected output is a python code snippet.
            If the answer is not in the provided data, say "I don't know based on the available information."
        """,    
)

print("RESPONSE: \n " +   output['response'])
# Reset the store: drop the "docs" collection and recreate it empty.
# NOTE(review): after this, `collection` holds no chunks, so the ingest cell
# must be run again before any further query.
client.delete_collection("docs")
collection = client.create_collection(name="docs")