In [13]:
import os
import re
from typing import List

import chromadb
import ollama
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_core.documents import Document
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter



# On-disk Chroma store; get_or_create_collection reuses the "docs" collection
# when it already exists, so contents persist between kernel sessions.
client = chromadb.PersistentClient(path="chroma_storage")
collection = client.get_or_create_collection(name="docs")

# Folder holding the hint files to index. Defaults to the original location;
# set the HINT_FILES_DIR environment variable to point elsewhere without
# editing the notebook.
folder_path = os.environ.get("HINT_FILES_DIR", "/home/ruta/irishep/hint_files")

In [14]:
def load_documents(folder_path: str) -> List[Document]:
    """Read every Markdown file directly inside ``folder_path`` into a Document.

    Only regular files whose name ends in ``.md`` are loaded; all other
    entries are skipped. Each document's ``metadata["source"]`` is the bare
    file name, not the full path.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        One Document per Markdown file, in sorted file-name order.

    Raises:
        OSError: If ``folder_path`` cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a Markdown file is not valid UTF-8.
    """
    documents = []
    # os.listdir yields entries in arbitrary order; sort so document (and
    # therefore chunk) positions are the same on every run.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.md'):
            continue  # handle other file types if needed...
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            continue  # e.g. a directory that happens to be named "*.md"
        # Explicit encoding: the default depends on the machine's locale.
        with open(file_path, encoding="utf-8") as f:
            content = f.read()
        documents.append(Document(page_content=content, metadata={"source": filename}))
    return documents
# Disabled earlier version of load_documents, kept as a bare string literal so
# it is never executed. It picked a langchain loader by file extension
# (.pdf / .docx / .md) and reported any other extension as unsupported.
# NOTE(review): PyPDFLoader is not among the imports in the first cell, so it
# would have to be imported before this version could be re-enabled.
'''
def load_documents(folder_path: str) -> List[Document]:
    documents = []
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if filename.endswith('.pdf'):
            loader = PyPDFLoader(file_path)
        elif filename.endswith('.docx'):
            loader = Docx2txtLoader(file_path)
        elif filename.endswith('.md'):
            loader = UnstructuredMarkdownLoader(file_path)
        else:
            print(f"unsupported file type: {filename}")
            continue
        documents.extend(loader.load())
        
    return documents
'''

# (marker, metadata key) pairs handed to MarkdownHeaderTextSplitter.
# Level 1 ("#", "Header 1") is left out on purpose: the hint files also contain
# "#" comment lines, so their real headers start at "##".
headers_to_split_on = list(
    zip(
        ("##", "###", "####"),
        ("Header 2", "Header 3", "Header 4"),
    )
)

def split_docs(documents: List[Document]) -> List[Document]:
    """Break Markdown documents into header-delimited chunks.

    A document whose ``metadata["source"]`` ends in ``.md`` is split with a
    MarkdownHeaderTextSplitter configured from ``headers_to_split_on``. Every
    resulting chunk becomes a new Document carrying the parent's metadata
    merged with the chunk's own metadata (the chunk's values win on a key
    clash). Any other document is passed through unchanged.

    Args:
        documents: Documents to process.

    Returns:
        The chunks and pass-through documents, in input order.
    """
    header_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    chunks = []
    for parent in documents:
        is_markdown = parent.metadata.get("source", "").endswith(".md")
        if not is_markdown:
            chunks.append(parent)
            continue
        chunks.extend(
            Document(
                page_content=section.page_content,
                metadata={**parent.metadata, **section.metadata},
            )
            for section in header_splitter.split_text(parent.page_content)
        )
    return chunks


def embednstore(splits, collection):
    """Embed each chunk with Ollama and write it to a Chroma collection.

    Chunks are embedded one at a time with the ``mxbai-embed-large`` model and
    stored under the id ``str(i)``, where ``i`` is the chunk's position in
    ``splits``. A preview of each chunk and of its embedding is printed.

    Entries are written with ``upsert`` rather than ``add``: the ids are
    positional, so on a persistent collection a second run reuses the same
    ids, and ``add`` does not replace an entry whose id already exists.
    Ids left over from an earlier run with more chunks than the current one
    are not removed.

    Args:
        splits: Iterable of objects exposing a ``page_content`` string.
        collection: Chroma collection (anything exposing
            ``upsert(ids=..., embeddings=..., documents=...)``).
    """
    for i, doc in enumerate(splits):
        text = doc.page_content
        print(f"Chunk {i}:", text[:200])  #first 200 chars

        response = ollama.embed(model="mxbai-embed-large", input=text)
        # A single input string yields a single embedding vector.
        embedding = response["embeddings"][0]

        collection.upsert(
            ids=[str(i)],
            embeddings=[embedding],
            documents=[text],
        )
        print(f"embedding {i} length: {len(embedding)} | preview: {embedding[:5]} \n")
    

    
# Ingestion pipeline: load -> split -> embed and store.
# Uses the folder_path configured in the setup cell instead of repeating the
# literal path, so the location only has to be changed in one place.
documents = load_documents(folder_path)
print(f"loaded {len(documents)} documents from the folder \n")

splits = split_docs(documents)
print(f"split the documents into {len(splits)} chunks \n")

embednstore(splits, collection)
print(f"stored {len(splits)} embedded chunks \n")


loaded 2 documents from the folder 

split the documents into 21 chunks 

Chunk 0: import uproot  # For working with ROOT files
import awkward as ak  # For handling jagged arrays
import matplotlib.pyplot as plt  # For plotting histograms
embedding 0 length: 1024 | preview: [0.05479956, -0.0035231363, 0.019213218, -0.05781571, -0.0056689666] 

Chunk 1: file = uproot.open("file.root")  # Open the ROOT file
print(file.keys())  # Print all object keys in the ROOT file
print(file.classnames())  # Print class names of objects (e.g., TTree, TH1)
print(fil
embedding 1 length: 1024 | preview: [0.0048109367, 0.012186126, 0.013609938, -0.0526254, 0.014015761] 

Chunk 2: tree = file["Events"]  # Access the TTree named "Events"
branches = tree.arrays()  # Load all branches into an Awkward Array
selected = tree.arrays(["Muon_pt", "Muon_eta"])  # Load only specified bran
embedding 2 length: 1024 | preview: [0.013386331, -0.008463236, 0.008940103, -0.034150857, -0.004883577] 

Chunk 3: muon_pt = branc

In [15]:
# Question to answer. It is embedded for retrieval here and interpolated into
# the generation prompt further down.
# NOTE(review): the name `input` shadows Python's built-in input() for the rest
# of the kernel session.
input = "how would you plot the leading muon eta of the file '4AAF4AB2-171D-F54C-8FE3-0D709B049A8A.root' for the first n = 5000 events?"

# Embed the question with the same model that was used to embed the stored chunks.
resp = ollama.embed(model="mxbai-embed-large", input=input)
query_embedding = resp["embeddings"][0]

# Ask the collection for the 3 best-matching stored chunks.
results = collection.query(
    query_embeddings=[query_embedding],
    n_results=3
)

# One query embedding was sent, so index 0 holds its result list.
chunks = results['documents'][0]  # list of top retrieved chunks
data = "\n\n".join(chunks)        # combine top chunks

max_tries = 5
success = False
error_message = ""
last_code = ""

# Patterns for pulling code out of the model's reply. Blocks explicitly tagged
# as Python are preferred; any fenced block is the fallback. Searching for a
# tagged opening fence first copes with replies that emit a stray "```" line
# right before the real "```python" fence.
_TAGGED_FENCE = re.compile(r"```(?:python3?|py)\b(.*?)```", re.DOTALL | re.IGNORECASE)
_ANY_FENCE = re.compile(r"```(.*?)```", re.DOTALL)


def _extract_code_block(response_text):
    """Return the first non-empty fenced code block in ``response_text``.

    Python-tagged blocks are searched first, then untagged/other fenced
    blocks. The fences and the language tag are removed and surrounding
    whitespace is stripped.

    Returns:
        The code as a string, or None when no non-empty fenced block exists.
    """
    for pattern in (_TAGGED_FENCE, _ANY_FENCE):
        for match in pattern.finditer(response_text):
            candidate = match.group(1).strip()
            if candidate:
                return candidate
    return None


# Ask the model for code, run it, and feed any failure back into the next
# prompt, up to max_tries attempts.
for trial in range(max_tries):
    prompt = f"""You are a helpful assistant with access to these CMS specific hint files with python code snippets: {data}
Only use the above data to answer the following question, without hallucinating or making up your own statements: {input}
The expected output is a python code snippet that only contains code between triple backticks like this: ``` [code] ```
If the answer is not in the provided data, say "I don't know based on the available information."
If you get an error, here is the error message: {error_message}
If you tried code previously, here is the last attempt:
{last_code}
Please fix the code if there was an error, otherwise try again.
"""

    output = ollama.generate(
        model="llama3",
        prompt=prompt,
    )
    response_text = output['response']
    print(f"(TRY {trial+1}):\n", response_text)

    code = _extract_code_block(response_text)
    if code is None:
        print("no code block found in response")
        error_message = "no code block found in response"
        last_code = ""
        continue

    last_code = code  # Save for next prompt

    try:
        # SECURITY: this runs model-generated code with the full privileges of
        # the kernel process. Only use it with inputs and models you trust.
        # The code gets its own namespace so it cannot rebind this cell's
        # variables (data, input, trial, ...) between retries; anything it
        # defines can be inspected afterwards in generated_namespace.
        generated_namespace = {"__name__": "__main__"}
        exec(code, generated_namespace)
    except Exception as e:
        # Keep the exception type: str(e) alone is often ambiguous
        # (a KeyError, for example, stringifies to just the missing key).
        error_message = f"{type(e).__name__}: {e}"
        print(f"Error running code: {error_message}")
    else:
        print("SUCCESS")
        success = True
        break

if not success:
    print("no valid code snippet ran without errors after max trials")


(TRY 1):
 ```python
import uproot
import awkward as ak
import matplotlib.pyplot as plt

# 1. Open the ROOT file
file = uproot.open("4AAF4AB2-171D-F54C-8FE3-0D709B049A8A.root")

# 2. Access the "Events" tree and load the Muon_eta branch as an awkward array
tree = file["Events"]
muon_eta = tree["Muon_eta"].array()  # jagged array: one entry per event, possibly multiple muons/event

# Select the first n=5000 events
muon_eta_lead = ak.flatten([event[0] for event in muon_eta[:5000]])

# Plot histogram of leading muon eta
plt.hist(muon_eta_lead, bins=50, range=(-2.5, 2.5))
plt.xlabel("Leading Muon Eta")
plt.ylabel("Counts")
plt.title("Leading Muon Eta (First 5000 Events)")
plt.show()
```
Error running code: cannot slice NumpyArray (of length 0) with 0: index 0 is out of bounds for axis 0 with size 0
(TRY 2):
 Here is the corrected Python code snippet to plot the leading muon eta of the file '4AAF4AB2-171D-F54C-8FE3-0D709B049A8A.root' for the first n = 5000 events:
```
```python
import uproot