In [1]:
import os
from typing import List

import chromadb
import ollama
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chroma client created with default settings; the "docs" collection holds
# one entry per embedded chunk.
client = chromadb.Client()
# get_or_create_collection keeps this cell re-runnable: create_collection
# raises if a collection named "docs" already exists in the running kernel.
collection = client.get_or_create_collection(name="docs")

In [2]:
def load_documents(folder_path: str) -> List[Document]:
    """Load every supported document located directly inside ``folder_path``.

    A file is dispatched to a loader by its extension, compared
    case-insensitively: ``.pdf`` -> PyPDFLoader, ``.docx`` -> Docx2txtLoader,
    ``.md`` -> UnstructuredMarkdownLoader. Every other entry (including
    sub-directories, which are not descended into) is reported on stdout and
    skipped.

    Args:
        folder_path: Directory to scan.

    Returns:
        The documents produced by the loaders, concatenated in sorted
        filename order. Empty list when nothing in the folder is supported.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``folder_path`` does not
            exist or is not a directory.
    """
    loaders_by_extension = {
        ".pdf": PyPDFLoader,
        ".docx": Docx2txtLoader,
        ".md": UnstructuredMarkdownLoader,
    }

    documents: List[Document] = []
    # os.listdir returns entries in arbitrary order; sorting keeps the document
    # order (and anything derived from it downstream) stable between runs.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        lowered = filename.lower()
        loader_cls = next(
            (cls for ext, cls in loaders_by_extension.items() if lowered.endswith(ext)),
            None,
        )
        # A directory that merely looks like a document ("notes.md/") must not
        # be handed to a file loader.
        if loader_cls is None or not os.path.isfile(file_path):
            print(f"unsupported file type: {filename}")
            continue
        documents.extend(loader_cls(file_path).load())
    return documents

# Chunking: split the loaded documents into small, overlapping pieces.
def split_documents(
    documents: List[Document],
    chunk_size: int = 200,
    chunk_overlap: int = 75,
) -> List[Document]:
    """Split documents into overlapping character-based chunks.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
            Defaults to 200.
        chunk_overlap: Number of characters shared between neighbouring
            chunks. Defaults to 75.

    Returns:
        The chunks returned by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_documents(documents)

def embednstore(splits, collection, model="mxbai-embed-large"):
    """Embed every chunk with Ollama and store it in ``collection``.

    Each chunk is stored under the id ``str(index)`` together with its text
    and embedding. A preview of the chunk and of its embedding is printed as
    progress output.

    Args:
        splits: Iterable of chunks exposing a ``page_content`` string.
        collection: Chroma collection that receives the chunks.
        model: Name of the Ollama embedding model. Queries against the
            collection must be embedded with the same model.
    """
    for i, doc in enumerate(splits):
        text = doc.page_content
        print(f"Document {i}:", text[:200])  # Preview first 200 chars

        response = ollama.embed(model=model, input=text)
        # One input string was sent, so the first embedding is the only one.
        embedding = response["embeddings"][0]

        # upsert instead of add: ids are just positional indexes, so running
        # the ingestion again must overwrite the previous entries rather than
        # collide with them.
        collection.upsert(
            ids=[str(i)],
            embeddings=[embedding],
            documents=[text],
        )
        print(f"Embedding {i} length: {len(embedding)} | Preview: {embedding[:5]}")
    

    
# Folder holding the files to index. The default is machine-specific; set the
# HINT_FILES_DIR environment variable to point somewhere else without editing
# the notebook.
HINT_FILES_DIR = os.environ.get("HINT_FILES_DIR", "/home/ruta/irishep/hint_files")

documents = load_documents(HINT_FILES_DIR)
print(f"loaded {len(documents)} documents from the folder")

splits = split_documents(documents)
print(f"split the documents into {len(splits)} chunks")

embednstore(splits, collection)
print(f"Stored {len(splits)} embedded chunks in ChromaDB.")


loaded 1 documents from the folder
split the documents into 21 chunks

UPROOT TUTORIAL - SINGLE CLICK PASTE VERSION


0. LIBRARIES
Embedding 0 length: 1024 | Preview: [-0.0021601743, 0.013665604, 0.013330263, -0.021397904, -0.04287649]

0. LIBRARIES

most of the queries use awkward to handle arrays, so it would be safe to:

import awkward as ak

and of course:

import uproot
Embedding 1 length: 1024 | Preview: [0.011688487, 0.008747531, 0.0061701667, -0.0091666505, -0.044203404]
Document 2: import awkward as ak

and of course:

import uproot

for plotting we use:

import matplotlib.pyplot as plt

1. BASIC FILE OPERATIONS
Embedding 2 length: 1024 | Preview: [0.058728278, -3.332652e-05, 0.013014842, -0.038258165, 0.0011817323]
Document 3: import matplotlib.pyplot as plt

1. BASIC FILE OPERATIONS

import uproot file = uproot.open("file.root") # or "root://server/path/file.root" for remote

View contents
Embedding 3 length: 1024 | Preview: [0.023952238, 3.6063615e-05, 0.018082887, -0.04919

In [3]:
# Named `question` rather than `input` so the built-in input() is not shadowed
# for the rest of the kernel session.
question = "how would you plot the muon pT of the file '4AAF4AB2-171D-F54C-8FE3-0D709B049A8A.root'?"

# The question is embedded with the same model that embedded the stored chunks.
resp = ollama.embed(model="mxbai-embed-large", input=question)
query_embedding = resp["embeddings"][0]

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=5
)

# results['documents'] holds one list of hits per query embedding; exactly one
# query embedding was sent, hence the [0].
retrieved_chunks = results['documents'][0]
if not retrieved_chunks:
    raise RuntimeError(
        "The collection returned no documents; run the embedding cell before querying."
    )

print("Top retrieved document:", retrieved_chunks[0])

# Hand the model every retrieved chunk, not just the best one: chunks are
# short, so a single chunk rarely carries the imports and setup that the
# answer depends on.
data = "\n\n".join(retrieved_chunks)

output = ollama.generate(
    model="llama3",
    prompt = f"""You are a helpful assistant with access to this data: {data}
            Only use the above data to answer the following question, without hallucinating or making up your own statements: {question}
            If the answer is not in the provided data, say "I don't know based on the available information"
        """,
)

print(output['response'])
# The collection is deliberately left intact here so this cell can be re-run
# with a different question without re-embedding the documents.



Top retrieved document: import matplotlib.pyplot as plt plt.hist(ak.flatten(branches["Muon_pt"]), bins=50, range=(0, 100)) plt.xlabel("Muon pT [GeV]") plt.ylabel("Counts") plt.title("Muon Transverse Momentum") plt.show()
Based on the provided data, it appears that you can plot the muon pT of the file '4AAF4AB2-171D-F54C-8FE3-0D709B049A8A.root' using the following command:

```
import matplotlib.pyplot as plt
plt.hist(ak.flatten(branches["Muon_pt"]), bins=50, range=(0, 100))
plt.xlabel("Muon pT [GeV]")
plt.ylabel("Counts")
plt.title("Muon Transverse Momentum")
plt.show()
```

Note that this assumes the file '4AAF4AB2-171D-F54C-8FE3-0D709B049A8A.root' is a root file containing data related to muons, and that you have already imported the necessary libraries and defined the branches dictionary.


In [4]:
# After you get the output from ollama.generate
import re

# 1. Get the generated answer
generated_answer = output['response']

# 2. Extract the fenced code blocks using regex. The optional group consumes
#    the info string on the opening fence (e.g. ```python) so the language tag
#    is not captured and executed as if it were code.
code_blocks = re.findall(
    r"```(?:[ \t]*[A-Za-z0-9_+-]*[ \t]*\n)?(.*?)```",
    generated_answer,
    re.DOTALL,
)

if code_blocks:
    # Only the first block of the answer is executed.
    code_to_run = code_blocks[0].strip()
    print("Running code:\n", code_to_run)
    # 3. Execute the code block
    # SECURITY: this runs model-generated code with full access to the kernel
    # and the machine. Read the printed code before trusting the result, and
    # never run this cell on answers built from untrusted documents.
    # The code runs in the notebook namespace, so any name it relies on
    # (modules, variables) must already be defined there.
    exec(code_to_run)
else:
    print("No Python code block found in the answer.")

Running code:
 import matplotlib.pyplot as plt
plt.hist(ak.flatten(branches["Muon_pt"]), bins=50, range=(0, 100))
plt.xlabel("Muon pT [GeV]")
plt.ylabel("Counts")
plt.title("Muon Transverse Momentum")
plt.show()


NameError: name 'ak' is not defined