In [21]:
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import MarkdownHeaderTextSplitter

from typing import List
from langchain_core.documents import Document
import os
import ollama
import chromadb

import re

# Folder containing the hint files that get indexed below.
folder_path = "/home/ruta/irishep/hint_files"

# Chroma client backed by the "chroma_storage" path; the "docs" collection is
# fetched if it already exists there and created otherwise.
client = chromadb.PersistentClient(path="chroma_storage")
collection = client.get_or_create_collection(name="docs")


In [None]:
def load_documents(folder_path: str) -> List[Document]:
    """Read every Markdown file directly inside ``folder_path`` into a ``Document``.

    Only entries whose name ends in ``.md`` are loaded; everything else is
    skipped. Each returned document carries the raw file text as
    ``page_content`` and the file name under the ``"source"`` metadata key.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        One ``Document`` per ``.md`` file, ordered by file name.
    """
    documents = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # document order (and anything derived from it) stable across runs.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.md'):
            continue  # handle other file types if needed...
        file_path = os.path.join(folder_path, filename)
        # Explicit encoding so the result does not depend on the platform default.
        with open(file_path, encoding="utf-8") as f:
            content = f.read()
        documents.append(Document(page_content=content, metadata={"source": filename}))
    return documents
# Disabled earlier draft of load_documents, kept inside a bare string literal so the
# code in it is not executed.
# NOTE(review): the draft refers to PyPDFLoader, which is not among the imports visible
# at the top of this notebook; that import would be needed before re-enabling it.
'''
def load_documents(folder_path: str) -> List[Document]:
    documents = []
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if filename.endswith('.pdf'):
            loader = PyPDFLoader(file_path)
        elif filename.endswith('.docx'):
            loader = Docx2txtLoader(file_path)
        elif filename.endswith('.md'):
            loader = UnstructuredMarkdownLoader(file_path)
        else:
            print(f"unsupported file type: {filename}")
            continue
        documents.extend(loader.load())
        
    return documents
'''

# (markdown heading prefix, metadata key) pairs for heading levels 1 to 3.
headers_to_split_on = [("#" * level, f"Header {level}") for level in range(1, 4)]

def split_docs(documents: List[Document]) -> List[Document]:
    """Break Markdown documents into per-heading chunks.

    A document whose ``"source"`` metadata ends in ``.md`` is split with a
    ``MarkdownHeaderTextSplitter`` configured from the module-level
    ``headers_to_split_on``; each resulting chunk keeps the parent's metadata
    merged with the chunk's own (chunk keys win on conflict). Any other
    document is passed through unchanged.

    Args:
        documents: Documents to process.

    Returns:
        The chunks and pass-through documents, in input order.
    """
    md_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    chunked = []
    for document in documents:
        source_name = document.metadata.get("source", "")
        if not source_name.endswith(".md"):
            chunked.append(document)
            continue
        chunked.extend(
            Document(
                page_content=section.page_content,
                metadata={**document.metadata, **section.metadata},
            )
            for section in md_splitter.split_text(document.page_content)
        )
    return chunked


def embednstore(splits, collection):
    """Embed each chunk with Ollama and write it to a Chroma collection.

    For every chunk, the text is embedded with the ``mxbai-embed-large`` model
    and stored together with its embedding under the id ``str(index)``. A short
    preview of the chunk and of the embedding is printed along the way.

    Records are written with ``upsert`` rather than ``add``: the ids are plain
    positions, so on a persistent collection a re-run must replace what is
    already stored under ``"0"``, ``"1"``, ... instead of leaving old chunks in place.

    NOTE(review): because ids are positional, a re-run that yields fewer chunks
    than before still leaves the higher-numbered old records in the collection.

    Args:
        splits: Iterable of objects exposing a ``page_content`` string.
        collection: Chroma collection (anything with a compatible ``upsert``).
    """
    for i, doc in enumerate(splits):
        print(f"Chunk {i}:", doc.page_content[:200])  #first 200 chars

        text = doc.page_content
        response = ollama.embed(model="mxbai-embed-large", input=text)
        # A single input string is sent, so the first embedding is the one for this chunk.
        embedding = response["embeddings"][0]

        collection.upsert(
            ids=[str(i)],
            embeddings=[embedding],
            documents=[text]
        )
        print(f"embedding {i} length: {len(embedding)} | preview: {embedding[:5]}")
    

    
# Ingestion pipeline: load -> split -> embed and store.
# Uses the notebook-level folder_path instead of repeating the literal path here,
# so the location only has to be changed in one place.
documents = load_documents(folder_path)
print(f"loaded {len(documents)} documents from the folder \n")

splits = split_docs(documents)
print(f"split the documents into {len(splits)} chunks \n")

embednstore(splits, collection)
print(f"stored {len(splits)} embedded chunks \n")


loaded 2 documents from the folder
split the documents into 19 chunks
Document 0: import uproot
import awkward as ak
import matplotlib.pyplot as plt
Embedding 0 length: 1024 | Preview: [0.052296925, -0.0044632573, 0.017374478, -0.05518427, 0.005176358]
Document 1: file = uproot.open("file.root")
print(file.keys())
print(file.classnames())
print(file["Events"].num_entries)
Embedding 1 length: 1024 | Preview: [0.018120183, 0.008217531, 0.013562795, -0.033676367, 0.018681712]
Document 2: tree = file["Events"]
branches = tree.arrays()
selected = tree.arrays(["Muon_pt", "Muon_eta"])
Embedding 2 length: 1024 | Preview: [0.011310098, -0.0070539922, -0.004388093, -0.02311674, -0.009288715]
Document 3: muon_pt = branches["Muon_pt"]
print(muon_pt[0].tolist())
print(ak.num(muon_pt))
print(ak.flatten(muon_pt))
first_muon_pt = ak.firsts(muon_pt)
print(first_muon_pt)
Embedding 3 length: 1024 | Preview: [0.026371421, -0.030111363, 0.016415477, -0.015239357, -0.0027696758]
Document 4: good_pt = branch

In [None]:
# NOTE(review): `input` shadows the built-in of the same name for the rest of the
# kernel session; the name is kept so other cells that read it keep working.
input = "how would you plot the muon pT of the file '4AAF4AB2-171D-F54C-8FE3-0D709B049A8A.root'?"

# Embed the question with the same model that was used for the stored chunks.
resp = ollama.embed(model="mxbai-embed-large", input=input)
query_embedding = resp["embeddings"][0]

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=3
)

# One list of hits per query embedding; exactly one embedding was sent.
retrieved_docs = results['documents'][0]
if not retrieved_docs:
    # Clear failure instead of an IndexError when the collection is empty.
    raise RuntimeError("no documents were retrieved; the 'docs' collection is empty - run the ingestion cell first")

print("Top retrieved document:", retrieved_docs[0])

# Give the model every retrieved chunk (n_results of them), not only the best hit.
data = "\n\n".join(retrieved_docs)

output = ollama.generate(
    model="llama3",
    prompt = f"""You are a helpful assistant with access to these CMS specific hint files with python code snippets: {data}
            Only use the above data to answer the following question, without hallucinating or making up your own statements: {input},
            The expected output is a python code snippet that can be run.
            If the answer is not in the provided data, say "I don't know based on the available information".
            PLEASE MAKE SURE TO IMPORT NECESSARY LIBRARIES.
        """,
)

print("RESPONSE: \n " +   output['response'])

# NOTE(review): this drops and recreates the "docs" collection after every query, so
# the ingestion cell has to be run again before this cell can be re-run.
client.delete_collection("docs")
collection = client.create_collection(name="docs")


Top retrieved document: 0. LIBRARIES

import uproot import awkward as ak import matplotlib.pyplot as plt

1. BASIC FILE OPERATIONS

file = uproot.open("file.root") print(file.keys()) print(file.classnames()) print(file["Events"].num_entries)

2. LOADING DATA

tree = file["Events"] branches = tree.arrays() selected = tree.arrays(["Muon_pt", "Muon_eta"])

3. WORKING WITH JAGGED ARRAYS

muon_pt = branches["Muon_pt"] print(muon_pt[0].tolist()) print(ak.num(muon_pt)) print(ak.flatten(muon_pt)) first_muon_pt = ak.firsts(muon_pt) print(first_muon_pt)

4. SELECTIONS AND FILTERING

good_pt = branches["Muon_pt"] > 20 good_muons = branches["Muon_pt"][good_pt]

mask = (branches["Muon_pt"] > 20) & (abs(branches["Muon_eta"]) < 2.4) events_with_good_muons = ak.any(mask, axis=1) filtered_events = branches[events_with_good_muons]

5. PLOTTING

plt.hist(ak.flatten(branches["Muon_pt"]), bins=50, range=(0, 100)) plt.xlabel("Muon pT [GeV]") plt.ylabel("Counts") plt.title("Muon Transverse Momentum") plt.sho

In [19]:
from langchain_core.documents import Document
from langchain_text_splitters import MarkdownHeaderTextSplitter

# Define headers to split on
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

# Load markdown file as text (explicit encoding so the result does not depend
# on the platform default)
with open("/home/ruta/irishep/hint_files/hint1.md", encoding="utf-8") as f:
    md_text = f.read()

# Wrap as Document
doc = Document(page_content=md_text, metadata={"source": "hint1.md"})

# Chunk using MarkdownHeaderTextSplitter
splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
chunks = splitter.split_text(doc.page_content)

# Print the first 200 characters and the header metadata of every chunk
for i, chunk in enumerate(chunks):
    print(f"Chunk {i}:\n{chunk.page_content[:200]}\nMetadata: {chunk.metadata}\n{'-'*40}")

Chunk 0:
import uproot
import awkward as ak
import matplotlib.pyplot as plt
Metadata: {'Header 1': '0. LIBRARIES'}
----------------------------------------
Chunk 1:
file = uproot.open("file.root")
print(file.keys())
print(file.classnames())
print(file["Events"].num_entries)
Metadata: {'Header 1': '1. BASIC FILE OPERATIONS'}
----------------------------------------
Chunk 2:
tree = file["Events"]
branches = tree.arrays()
selected = tree.arrays(["Muon_pt", "Muon_eta"])
Metadata: {'Header 1': '2. LOADING DATA'}
----------------------------------------
Chunk 3:
muon_pt = branches["Muon_pt"]
print(muon_pt[0].tolist())
print(ak.num(muon_pt))
print(ak.flatten(muon_pt))
first_muon_pt = ak.firsts(muon_pt)
print(first_muon_pt)
Metadata: {'Header 1': '3. WORKING WITH JAGGED ARRAYS'}
----------------------------------------
Chunk 4:
good_pt = branches["Muon_pt"] > 20
good_muons = branches["Muon_pt"][good_pt]  
mask = (branches["Muon_pt"] > 20) & (abs(branches["Muon_eta"]) < 2.4)
events_with_good_mu