In [1]:
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader

from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing import List
from langchain_core.documents import Document
import os
import ollama
import chromadb

client = chromadb.Client()
collection = client.create_collection(name="docs")

In [2]:
def load_documents(folder_path: str) -> List[Document]:
    documents = []
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if filename.endswith('.pdf'):
            loader = PyPDFLoader(file_path)
        elif filename.endswith('.docx'):
            loader = Docx2txtLoader(file_path)
        elif filename.endswith('.md'):
            loader = UnstructuredMarkdownLoader(file_path)
        else:
            print(f"unsupported file type: {filename}")
            continue
        documents.extend(loader.load())
    return documents

#chunking
def split_documents(documents: List[Document]) -> List[Document]:
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=200,
        chunk_overlap=75,
        length_function=len
    )
    return text_splitter.split_documents(documents)

def embednstore(splits, collection):
    for i, doc in enumerate(splits):
        print(f"Document {i}:", doc.page_content[:200])  # Preview first 200 chars

        text = doc.page_content
        response = ollama.embed(model="mxbai-embed-large", input=text)
        embedding = response["embeddings"][0]

        collection.add(
            ids=[str(i)],
            embeddings=[embedding],
            documents=[text]
        )
        print(f"Embedding {i} length: {len(embedding)} | Preview: {embedding[:5]}")
    

    
documents = load_documents("/home/ruta/irishep/hint_files")
print(f"loaded {len(documents)} documents from the folder")

splits = split_documents(documents)
print(f"split the documents into {len(splits)} chunks")\
    
embednstore(splits, collection)
print(f"Stored {len(splits)} embedded chunks in ChromaDB.")


loaded 2 documents from the folder
split the documents into 27 chunks
Document 0: 0. LIBRARIES

import uproot import awkward as ak import matplotlib.pyplot as plt

1. BASIC FILE OPERATIONS
Embedding 0 length: 1024 | Preview: [0.051768765, 0.0059163184, 0.030181054, -0.058384478, 0.01449202]
Document 1: 1. BASIC FILE OPERATIONS

file = uproot.open("file.root") print(file.keys()) print(file.classnames()) print(file["Events"].num_entries)

2. LOADING DATA
Embedding 1 length: 1024 | Preview: [0.023202023, 0.0084609175, 0.01455583, -0.051943798, 0.005220387]
Document 2: 2. LOADING DATA

tree = file["Events"] branches = tree.arrays() selected = tree.arrays(["Muon_pt", "Muon_eta"])

3. WORKING WITH JAGGED ARRAYS
Embedding 2 length: 1024 | Preview: [0.0010778526, -0.002176215, 0.0013118689, -0.02484658, -0.014950862]
Document 3: 3. WORKING WITH JAGGED ARRAYS

muon_pt = branches["Muon_pt"] print(muon_pt[0].tolist()) print(ak.num(muon_pt)) print(ak.flatten(muon_pt)) print(muon_pt[:, 0])

4. SELE

In [3]:
input = "how would you plot the muon pT of the file '4AAF4AB2-171D-F54C-8FE3-0D709B049A8A.root'?"

resp = ollama.embed(model="mxbai-embed-large", input=input)
query_embedding = resp["embeddings"][0] 

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=5
)

data = results['documents'][0][0]

print("Top retrieved document:", data)

output = ollama.generate(
    model="llama3",
    prompt = f"""You are a helpful assistant with access to this data: {data}
            Only use the above data to answer the following question, without hallucinating or making up your own statements: {input}
            If the answer is not in the provided data, say "I don't know based on the available information"
        """,    
)

print(output['response'])
client.delete_collection("docs")
collection = client.create_collection(name="docs")


Top retrieved document: 5. PLOTTING

plt.hist(ak.flatten(branches["Muon_pt"]), bins=50, range=(0, 100)) plt.xlabel("Muon pT [GeV]") plt.ylabel("Counts") plt.title("Muon Transverse Momentum") plt.show()

6. SAVING RESULTS
Based on the provided data, I would plot the muon pT of the file '4AAF4AB2-171D-F54C-8FE3-0D709B049A8A.root' by using the following command:

plt.hist(ak.flatten(branches["Muon_pt"]), bins=50, range=(0, 100)) 

This is because the data shows that `ak.flatten(branches["Muon_pt"])` is used to plot the muon pT distribution in a histogram with 50 bins and a range of (0, 100). The only difference would be replacing the placeholder root file name '4AAF4AB2-171D-F54C-8FE3-0D709B049A8A.root' with the actual file name.


In [None]:
# Run generated code
generated_code = output['response']

print("Running generated code:\n")
print(generated_code)
print("\n--- Output ---")

try:
    exec_namespace = {}
    exec(generated_code, globals(), exec_namespace)
except Exception as e:
    print(f"Error while executing generated code:\n{e}")


Running generated code:

Based on the provided data, I would plot the muon pT of the file '4AAF4AB2-171D-F54C-8FE3-0D709B049A8A.root' by using the following command:

plt.hist(ak.flatten(branches["Muon_pt"]), bins=50, range=(0, 100)) 

This is because the data shows that `ak.flatten(branches["Muon_pt"])` is used to plot the muon pT distribution in a histogram with 50 bins and a range of (0, 100). The only difference would be replacing the placeholder root file name '4AAF4AB2-171D-F54C-8FE3-0D709B049A8A.root' with the actual file name.

--- Output ---
Error while executing generated code:
invalid syntax (<string>, line 1)
