In [2]:
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import MarkdownHeaderTextSplitter

from typing import List
from langchain_core.documents import Document
import os
import ollama
import chromadb

import tiktoken

# Persistent Chroma store on disk; the "docs" collection is reused if it already exists.
client = chromadb.PersistentClient(path="chroma_storage")
collection = client.get_or_create_collection(name="docs")

# All hint files live under one root; derive the individual paths from it so the
# location only has to be changed in one place.
hint_root = "/home/ruta/irishep/hint_files"
folder_path = os.path.join(hint_root, "code")

# Read with an explicit encoding: open() otherwise falls back to the locale's default,
# which makes the notebook behave differently from machine to machine.
with open(os.path.join(hint_root, "variables", "variables.md"), encoding="utf-8") as f:
    variable_hints_text = f.read()
with open(os.path.join(hint_root, "errors", "errors.md"), encoding="utf-8") as f:
    error_hints_text = f.read()

In [3]:
def load_documents(folder_path: str) -> List[Document]:
    """Load every Markdown file directly inside ``folder_path`` as a Document.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        One Document per ``.md`` file, in file-name order, with the file's text as
        ``page_content`` and ``{"source": <file name>}`` as metadata.
    """
    documents = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the document order
    # (and anything derived from positions in the result) stable between runs.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        # isfile() guards against a directory whose name happens to end in ".md".
        if filename.endswith('.md') and os.path.isfile(file_path):
            # Explicit encoding so the result does not depend on the locale default.
            with open(file_path, encoding="utf-8") as f:
                content = f.read()
            documents.append(Document(page_content=content, metadata={"source": filename}))
        # handle other file types if needed...
    return documents

# Split on header levels 2-4 only. Level 1 ("#", "Header 1") is deliberately left out:
# the hint files also contain "#" comment lines, and their real headers start with "##".
headers_to_split_on = [("#" * level, f"Header {level}") for level in range(2, 5)]

def split_docs(documents: List[Document]) -> List[Document]:
    """Break Markdown documents into header-delimited chunks.

    Documents whose ``source`` metadata ends in ``.md`` are split with a
    MarkdownHeaderTextSplitter configured from ``headers_to_split_on``. Every
    resulting chunk carries the parent's metadata merged with the chunk's own
    metadata (the chunk's keys win on conflict). All other documents are passed
    through unchanged.
    """
    md_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    pieces = []
    for document in documents:
        # Anything that is not Markdown is kept whole.
        if not document.metadata.get("source", "").endswith(".md"):
            pieces.append(document)
            continue
        pieces.extend(
            Document(
                page_content=section.page_content,
                metadata={**document.metadata, **section.metadata},
            )
            for section in md_splitter.split_text(document.page_content)
        )
    return pieces


def embednstore(splits, collection):
    """Embed each chunk with Ollama and write it to the Chroma collection.

    Args:
        splits: Sequence of chunks, each exposing its text as ``page_content``.
        collection: Chroma collection that receives one record per chunk.

    Each record is keyed by the chunk's position in ``splits`` (``"0"``, ``"1"``, ...).
    Progress (a text preview and an embedding preview) is printed for every chunk.

    NOTE: because IDs are positional, records left over from an earlier run with
    *more* chunks are not removed; delete the collection first for a clean rebuild.
    """
    for i, doc in enumerate(splits):
        text = doc.page_content
        print(f"Chunk {i}:", text[:200])  #first 200 chars

        response = ollama.embed(model="mxbai-embed-large", input=text)
        embedding = response["embeddings"][0]

        # upsert rather than add: the collection is persistent, so these IDs already
        # exist on every run after the first. Depending on the Chroma version, add()
        # either ignores or rejects an existing ID, which would leave stale text and
        # embeddings in the store after the hint files change.
        collection.upsert(
            ids=[str(i)],
            embeddings=[embedding],
            documents=[text]
        )
        print(f"embedding {i} length: {len(embedding)} | preview: {embedding[:5]} \n")
    

    
# Build the vector store: load the hint files, split them by header, then embed and store.
# Uses the folder_path defined with the other paths instead of repeating the literal.
documents = load_documents(folder_path)
print(f"loaded {len(documents)} documents from the folder \n")

splits = split_docs(documents)
print(f"split the documents into {len(splits)} chunks \n")

embednstore(splits, collection)
print(f"stored {len(splits)} embedded chunks \n")


loaded 2 documents from the folder 

split the documents into 14 chunks 

Chunk 0: ```python
import uproot                                     # Import uproot to read ROOT files
import matplotlib.pyplot as plt                   # Import matplotlib for plotting
import awkward as ak  
embedding 0 length: 1024 | preview: [0.050355043, 0.017445704, 0.01795148, -0.048105292, 0.0027914813] 

Chunk 1: ```python
import uproot                                     # Import uproot for ROOT file reading
import matplotlib.pyplot as plt                   # Import matplotlib for plotting
import awkward as a
embedding 1 length: 1024 | preview: [0.027734384, -0.011742053, 0.012901092, -0.04851596, -0.013517232] 

Chunk 2: ```python
import uproot                                     # Import uproot for ROOT file access
import matplotlib.pyplot as plt                   # Import matplotlib for plotting
import awkward as ak
embedding 2 length: 1024 | preview: [0.040613133, 0.0006674163, 0.019603018, -0.04264

In [5]:
# --- CONFIGURATION ---
max_context_tokens = 7000  # token budget shared by the question and its context
max_tries = 5              # generation attempts per question
# Ollama applies a default context window that is smaller than the budget above and
# silently truncates longer prompts, so the window is requested explicitly.
# 8192 is Llama 3's native context length; adjust if the model changes.
model_context_window = 8192
# cl100k_base is not the llama3 tokenizer, so the counts below are an approximation.
encoding = tiktoken.get_encoding("cl100k_base")


def extract_first_code_block(response):
    """Return the body of the first ``` fenced block in ``response``.

    An optional language tag on the opening fence (``python``, ``py``, ...) is
    dropped. Returns None when there is no complete fenced block.
    """
    fence = "```"
    start = response.find(fence)
    if start == -1:
        return None
    start += len(fence)
    # Close on the *next* fence, not the last one, so a reply that contains several
    # blocks does not yield code with prose and fences in the middle.
    end = response.find(fence, start)
    if end == -1:
        return None
    body = response[start:end]
    first_line, newline, rest = body.partition("\n")
    if newline and first_line.strip().isalnum():
        body = rest  # first line was a language tag, not code
    return body.strip()


with open("questions.txt", encoding="utf-8") as f:
    questions = [line.strip() for line in f if line.strip()]

for question in questions:
    print(f"\n=== QUESTION: {question} ===\n")
    resp = ollama.embed(model="mxbai-embed-large", input=question)
    query_embedding = resp["embeddings"][0]

    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=5
    )

    chunks = results['documents'][0]

    print("=== TOP CHUNKS ===")
    for i, doc in enumerate(chunks):
        preview = doc[:500]
        print(f"Chunk {i} preview: {preview}\n")

    # --- BUILD CONTEXT ---
    full_context = (
        variable_hints_text + "\n\n" +
        error_hints_text + "\n\n" +
        "\n\n".join(chunks)
    )

    # --- TRUNCATE IF TOO LONG ---
    # Only the context is truncated. The question is counted against the budget but
    # kept out of the truncated text, otherwise it would end up inside full_context
    # and appear twice in the prompt.
    # NOTE: truncation drops the tail, i.e. the retrieved chunks go first.
    question_token_count = len(encoding.encode(question))
    context_budget = max(max_context_tokens - question_token_count, 0)
    context_tokens = encoding.encode(full_context)
    if len(context_tokens) > context_budget:
        context_tokens = context_tokens[:context_budget]
        full_context = encoding.decode(context_tokens)

    print(f"Final context length: {question_token_count + len(context_tokens)} tokens")

    # --- CODE GENERATION LOOP ---
    success = False
    error_message = ""
    last_code = ""

    for trial in range(max_tries):
        prompt = f"""You are a helpful assistant with access to CMS hint files containing Python code snippets, variable names, and common error messages with solutions. 
            Use only this data to answer the following question: {question}
            Context: {full_context}
            Expected output: Python code snippet only, enclosed in triple backticks ```[code]```.
            If the answer is not in the data, respond: "I don't know based on the available information."
            Last attempt: {last_code}
            Error message (if any): {error_message}
            Please fix the code if there was an error; otherwise, provide a solution.
"""

        output = ollama.generate(
            model="llama3",
            prompt=prompt,
            options={"temperature": 0, "num_ctx": model_context_window}
        )
        response_text = output['response']
        print(f"(TRY {trial+1}):\n", response_text)

        # Extract code (an empty block is treated the same as no block at all)
        code = extract_first_code_block(response_text)
        if code:
            last_code = code
            try:
                # SECURITY: this runs model-generated code with the notebook's full
                # privileges; only use it in a throwaway or sandboxed environment.
                # A fresh namespace keeps the generated code from overwriting
                # notebook state such as `question`, `trial` or `collection`.
                exec(code, {"__name__": "__main__"})
                print("SUCCESS")
                success = True
                break
            except Exception as e:
                # Include the exception type: str(KeyError("x")) alone is just "'x'",
                # which gives the model nothing to work with on the next attempt.
                error_message = f"{type(e).__name__}: {e}"
                print(f"Error running code: {error_message}")
        else:
            print("no code block found in response")
            error_message = "no code block found in response: make sure to put code in brackets like this: ``` [code] ```"
            last_code = ""

    if not success:
        print("no valid code snippet ran without errors after max trials")


=== QUESTION: Q1: Plot the missing missing transverse energy of all events in XYZ dataset (or file) ===

=== TOP CHUNKS ===
Chunk 0 preview: # Assume we have a jagged array of some physics object, e.g., "Muon_eta"
muon_eta = tree["Muon_eta"].array()  # Jagged array: one list per event  
# Limit to first N events
n = 5000
muon_eta_subset = muon_eta[:n]  
# Filter out events with no entries (e.g., no muons)
nonempty = muon_eta_subset[ak.num(muon_eta_subset) > 0]  
# Extract leading value (first element in each non-empty event)
leading_muon_eta = nonempty[:, 0]  
# Plot the distribution of the leading value
plt.hist(leading_muon_eta, bi

Chunk 1 preview: plt.hist(ak.flatten(good_muons), bins=50, range=(20, 200))  # Flatten array and plot histogram
plt.xlabel("Muon pT [GeV]")
plt.ylabel("Counts")
plt.title("Muon Transverse Momentum")
plt.show()

Chunk 2 preview: ```python
import uproot                                     # Import uproot for ROOT file access
import matplotlib.pyplot as plt

KeyboardInterrupt: 