In [1]:
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import MarkdownHeaderTextSplitter

from typing import List
from langchain_core.documents import Document
import os
import ollama
import chromadb

client = chromadb.PersistentClient(path="chroma_storage")
collection = client.get_or_create_collection(name="docs")

folder_path = "/home/ruta/irishep/hint_files"


In [2]:
def load_documents(folder_path: str) -> List[Document]:
    documents = []
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if filename.endswith('.md'):
            with open(file_path) as f:
                content = f.read()
            documents.append(Document(page_content=content, metadata={"source": filename}))
        #handle other file types if needed...
    return documents
'''
def load_documents(folder_path: str) -> List[Document]:
    documents = []
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if filename.endswith('.pdf'):
            loader = PyPDFLoader(file_path)
        elif filename.endswith('.docx'):
            loader = Docx2txtLoader(file_path)
        elif filename.endswith('.md'):
            loader = UnstructuredMarkdownLoader(file_path)
        else:
            print(f"unsupported file type: {filename}")
            continue
        documents.extend(loader.load())
        
    return documents
'''

headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
    ("####", "Header 4")
]

def split_docs(documents: List[Document]) -> List[Document]:
    final_docs = []
    splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    for doc in documents:
        if doc.metadata.get("source", "").endswith(".md"):
            md_chunks = splitter.split_text(doc.page_content)
            for chunk in md_chunks:
                final_docs.append(Document(page_content=chunk.page_content, metadata={**doc.metadata, **chunk.metadata}))
        else:
            final_docs.append(doc)
    return final_docs


def embednstore(splits, collection):
    for i, doc in enumerate(splits):
        print(f"Chunk {i}:", doc.page_content[:200])  #first 200 chars

        text = doc.page_content
        response = ollama.embed(model="mxbai-embed-large", input=text)
        embedding = response["embeddings"][0]

        collection.add(
            ids=[str(i)],
            embeddings=[embedding],
            documents=[text]
        )
        print(f"embedding {i} length: {len(embedding)} | preview: {embedding[:5]} \n")
    

    
documents = load_documents("/home/ruta/irishep/hint_files")
print(f"loaded {len(documents)} documents from the folder \n")

splits = split_docs(documents)
print(f"split the documents into {len(splits)} chunks \n")
    
embednstore(splits, collection)
print(f"stored {len(splits)} embedded chunks \n")


loaded 3 documents from the folder 

split the documents into 19 chunks 

Chunk 0: import uproot
import awkward as ak
import matplotlib.pyplot as plt
embedding 0 length: 1024 | preview: [0.052296925, -0.0044632573, 0.017374478, -0.05518427, 0.005176358] 

Chunk 1: file = uproot.open("file.root")
print(file.keys())
print(file.classnames())
print(file["Events"].num_entries)
embedding 1 length: 1024 | preview: [0.018120183, 0.008217531, 0.013562795, -0.033676367, 0.018681712] 

Chunk 2: tree = file["Events"]
branches = tree.arrays()
selected = tree.arrays(["Muon_pt", "Muon_eta"])
embedding 2 length: 1024 | preview: [0.011310098, -0.0070539922, -0.004388093, -0.02311674, -0.009288715] 

Chunk 3: muon_pt = branches["Muon_pt"]
print(muon_pt[0].tolist())
print(ak.num(muon_pt))
print(ak.flatten(muon_pt))
first_muon_pt = ak.firsts(muon_pt)
print(first_muon_pt)
embedding 3 length: 1024 | preview: [0.026371421, -0.030111363, 0.016415477, -0.015239357, -0.0027696758] 

Chunk 4: good_pt = branches[

In [None]:
input = "how would you plot the muon pT of the file '4AAF4AB2-171D-F54C-8FE3-0D709B049A8A.root'?"

resp = ollama.embed(model="mxbai-embed-large", input=input)
query_embedding = resp["embeddings"][0]

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=3
)

chunks = results['documents'][0]  # list of top retrieved chunks
data = "\n\n".join(chunks)        # combine top chunks

max_tries = 3
success = False
error_message = ""

for trial in range(max_tries):
    prompt = f"""You are a helpful assistant with access to these CMS specific hint files with python code snippets: {data}
Only use the above data to answer the following question, without hallucinating or making up your own statements: {input}
The expected output is a python code snippet that can be run.
If the answer is not in the provided data, say "I don't know based on the available information."
PLEASE MAKE SURE TO IMPORT NECESSARY LIBRARIES.
If you get an error, here is the error message: {error_message}
"""
    output = ollama.generate(
        model="llama3",
        prompt=prompt,
    )
    print(f"(TRY {trial+1}):\n", output['response'])
    # run the generated python code
    code_start = output['response'].find("```")
    code_end = output['response'].rfind("```")
    if code_start != -1 and code_end != -1 and code_end > code_start:
        code = output['response'][code_start+3:code_end].strip()
        try:
            exec(code)
            print("SUCCESS")
            success = True
            break
        except Exception as e:
            error_message = str(e)
            print(f"Error running code: {error_message}")
    else:
        print("No code block found in response.")
        error_message = "No code block found in response."
if not success:
    print("No valid code snippet ran without errors after max trials.")

LLM RESPONSE (trial 1):
 Based on the provided hint files and python code snippets, it seems that you have access to a ROOT file named '4AAF4AB2-171D-F54C-8FE3-0D709B049A8A.root' and you want to plot the muon pT of this file. 

Here is the Python code snippet that can be used to plot the muon pT:
```
import uproot
import numpy as np
import matplotlib.pyplot as plt
from awkward import ak

# Load the ROOT file
with uproot.open('4AAF4AB2-171D-F54C-8FE3-0D709B049A8A.root') as file:
    branches = file['Muon']

# Extract muon pT array from the file
pt = ak.flatten(branches['Muon_pt'])

# Plot the distribution of muon pT
plt.hist(pt, bins=50, range=(0, 100))
plt.xlabel("Muon pT [GeV]")
plt.ylabel("Counts")
plt.title("Muon Transverse Momentum")
plt.show()
```
This code snippet uses the uproot library to load the ROOT file and extract the muon pT array. It then plots the distribution of muon pT using matplotlib.
Error running code: cannot import name 'ak' from 'awkward' (/home/ruta/irishep/.ve

In [18]:
input = "how would you plot the muon pT of the file '4AAF4AB2-171D-F54C-8FE3-0D709B049A8A.root'?"

resp = ollama.embed(model="mxbai-embed-large", input=input)
query_embedding = resp["embeddings"][0] 

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=3
)

print(results)

top = results['documents'][0]  # list of 3 top retrieved text chunksS
data = "\n\n".join(top)
print("combined top chunks:\n", data)


{'ids': [['17', '5', '13']], 'embeddings': None, 'documents': [['branches = tree.arrays(["Muon_eta", "Muon_phi"])  \neta = ak.flatten(branches["Muon_eta"])\nphi = ak.flatten(branches["Muon_phi"])  \nplt.hist2d(eta, phi, bins=50, range=[[-2.5, 2.5], [-np.pi, np.pi]])\nplt.xlabel("Muon η")\nplt.ylabel("Muon φ")\nplt.title("η-φ Distribution")\nplt.colorbar(label="Counts")\nplt.show()', 'plt.hist(ak.flatten(branches["Muon_pt"]), bins=50, range=(0, 100))\nplt.xlabel("Muon pT [GeV]")\nplt.ylabel("Counts")\nplt.title("Muon Transverse Momentum")\nplt.show()', 'plt.hist(ak.flatten(selected_muons["Muon_pt"]), bins=40, range=(0, 120))\nplt.xlabel("Muon pT [GeV]")\nplt.ylabel("Entries")\nplt.title("Selected Muons pT")\nplt.show()']], 'uris': None, 'included': ['metadatas', 'documents', 'distances'], 'data': None, 'metadatas': [[None, None, None]], 'distances': [[0.4269905090332031, 0.43645238876342773, 0.43893885612487793]]}
combined top chunks:
 branches = tree.arrays(["Muon_eta", "Muon_phi"])  


In [19]:
output = ollama.generate(
    model="llama3",
    prompt = f"""You are an assistant with access to these CMS specific files with python code snippets that are written to help you come up with code: {data}
            Only use the provided data to answer the following question, without hallucinating or making up your own statements: {input},
            The expected output is a python code snippet.
            If the answer is not in the provided data, say "I don't know based on the available information."
        """,    
)

print("RESPONSE: \n " +   output['response'])
client.delete_collection("docs")
collection = client.create_collection(name="docs")

RESPONSE: 
 Based on the provided Python code snippets, it appears that you need to access the "Muon_pt" branch from the file '4AAF4AB2-171D-F54C-8FE3-0D709B049A8A.root' using the `ak.flatten()` function and then plot the result with `plt.hist()`. Here's a Python code snippet that should accomplish this:

```python
import ak
import plt

selected_muons = ak.fromroot('4AAF4AB2-171D-F54C-8FE3-0D709B049A8A.root')
branches = selected_muons.arrays(["Muon_pt"])

pt = ak.flatten(branches["Muon_pt"])
plt.hist(pt, bins=50, range=(0, 100))
plt.xlabel("Muon pT [GeV]")
plt.ylabel("Entries")
plt.title("Muons pT")
plt.show()
```

Please note that the `ak.fromroot()` function is not a standard Python library and might require additional setup or imports depending on your environment.
