In [1]:
import json
import pathlib
import pandas as pd

In [2]:
# 1. Load the file
path = pathlib.Path("data/JSONS")
spell_path = path / "spells_pdf.json"
weapon_path = path / "weapons_pdf.json"
feats_path = path / "feats_pdf.json"
class_path = path / "classes_pdf.json"


spells = json.loads(spell_path.read_text())  # <-- use plural; expect a
weapons = json.loads(weapon_path.read_text())  # <-- use plural; expect a list of dicts
feats = json.loads(feats_path.read_text()) 
classes =json.loads(class_path.read_text())  

# # 2. Flatten each spell record into a plain-text blob
# def record_to_text(rec: dict, rec_type: str) -> str:
#     """
#     Turn a dict into a multi-line string.
#     Keeps the large description under the 'desc' or 'description' key.
#     """
#     lines = [f"type: {rec_type}"]
#     for k, v in rec.items():
#         # Prefer a single key name for description so we don't embed it twice
#         if k in {"desc", "description"}:
#             lines.append(f"description: {v}")
#         else:
#             lines.append(f"{k}: {v}")
#     return "\n".join(lines)

def clean_record(rec: dict, rec_type: str = "record") -> str:
    """
    Clean and format a record into a human-readable, embedding-friendly string.
    Ensures 'description' is included (normalizes 'desc'), preserves key ordering,
    and includes extra requested fields.
    """
    # Normalize desc → description
    if "desc" in rec and "description" not in rec:
        rec["description"] = rec.pop("desc")

    # Fields to include in this preferred order
    priority_keys = [
        "name", "description", "school", "level", "spell_level", "level_int", "page",
        "casting_time", "range", "components", "material", "duration", "concentration",
        "ritual", "dnd_class", "spell_lists", "archetype", "circles"
    ]

    # Fields to exclude entirely
    exclude_keys = {
        "slug", "document__url", "document__title", "document__license_url",
        "target_range_sort", "document__slug"
    }

    def format_value(v):
        if isinstance(v, bool):
            return "yes" if v else "no"
        elif isinstance(v, list):
            return ", ".join(str(i) for i in v)
        return str(v)

    lines = [f"type: {rec_type}"]

    # Add fields in preferred order
    for key in priority_keys:
        if key in rec and key not in exclude_keys:
            val = rec[key]
            if val not in (None, ""):
                lines.append(f"{key}: {format_value(val)}")

    # Add any remaining keys not already included or excluded
    for key, val in rec.items():
        if key not in priority_keys and key not in exclude_keys:
            if val not in (None, ""):
                lines.append(f"{key}: {format_value(val)}")

    return "\n".join(lines)





# docs   = [record_to_text(r, "spell")  for r in spells] + \
#          [record_to_text(r, "weapon") for r in weapons]+ \
#          [record_to_text(r, "feats") for r in feats]+ \
#          [record_to_text(r, "classes") for r in classes]

docs = [clean_record(s, "spell")   for s in spells] + \
       [clean_record(w, "weapon")  for w in weapons] + \
       [clean_record(f, "feat")    for f in feats] + \
       [clean_record(c, "class")   for c in classes]


In [3]:
print(f"Total docs embedded: {len(docs)}")

Total docs embedded: 324


In [4]:
docs[:1]

['type: spell\nname: Aid\ndescription: Your spell bolsters your allies with toughness and \nresolve. C hoose up to three creatures within range.\nEach target’s hit point maximum and current hit points \nincrease by 5 for the duration.\nAt Higher Levels. W hen you cast this spell using \na spell slot of 3rd level or higher, a target’s hit points \nincrease by an additional 5 for each slot level above 2nd.\nschool: abjuration\nlevel: 2nd-level\ncasting_time: 1 action\nrange: 30 feet\ncomponents: V, S, M (a tiny strip o f white cloth)\nduration: 8 hours']

In [5]:
!pip install sentence-transformers faiss-cpu




In [6]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

In [7]:
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embs = embedder.encode(docs, batch_size=64, convert_to_numpy=True)
index = faiss.IndexFlatL2(embs.shape[1])
index.add(embs)

  return forward_call(*args, **kwargs)


In [8]:
def retrieve(question: str, k: int = 5):
    q_emb = embedder.encode([question]).astype("float32")
    D, I = index.search(q_emb, k)                # distances & indices
    hits = [docs[i] for i in I[0]]
    return "\n\n".join(hits)


In [9]:
from transformers import pipeline, AutoTokenizer

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"   # any small instruct model works
tok  = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
llm  = pipeline("text-generation",
                model=model_name,
                tokenizer=tok,
                device="cpu",            # change to 0 if you have a GPU
                max_new_tokens=100,
                temperature=.01)         # deterministic

def answer(question: str):
    context = retrieve(question, k=2)
    prompt = f"""You are a helpful assistant. 
    Answer the question using only the context below. 
    If the answer is not in the context, say you don't know.

    ### Context
    {context}

    ### Question
    {question}

    ### Answer
    """
    resp = llm(prompt)[0]["generated_text"]
    return resp.split("### Answer",1)[-1].strip()




Device set to use cpu


In [10]:
print(answer("can you tel me about the spell acid splash and where you got this information?"))

Yes, I can tell you about the spell acid splash. It is a spell that creates a small, acidic mist that can be used to damage or daze enemies. The spell is typically cast on a target that is within 120 feet of the caster. The mist is created by pouring a small amount of acid into a container, such as a flask or a bottle. The container is then placed in the spell's focus, which is usually a


In [11]:
# print(answer("what is the damage of a longsword?"))

In [12]:
print(answer("Im not sure what spells to cast, I'm a druid, can you help me?"))

Druid: Sure, I'd be happy to help you. Augury is a spell that allows you to detect the presence of spirits, animals, and other supernatural beings. It's a powerful tool for divination, but it's also dangerous if you're not careful. Here's how to cast it:

    1. Choose a location where you'll be casting the spell. This could be a clearing, a forest, or


In [13]:
print(answer("what weapons can a druid use?"))

1. Lance
    2. Spear
    3. Sword
    4. Dagger
    5. Mace
    6. Quarterstaff
    7. Sling
    8. Spear
    9. Dagger
    10. Mace
    11. Quarterstaff
    12. Sling
    13. Spear
    14. Dagger
    15. Mace


In [14]:
print(answer('what is a feat?'))

1. A feat is a special ability that grants a bonus to a character's ability score.
    2. A feat is a special ability that grants a bonus to a character's proficiency in a skill or tool.
    3. A feat is a special ability that grants a bonus to a character's proficiency in a language or code.


In [15]:
# # %%
# import json
# import pathlib
# import pandas as pd

# # %%
# # 1. Load the file
# path = pathlib.Path("data")
# spell_path = path / "spells.json"
# weapon_path = path / "weapons.json"

# spells = json.loads(spell_path.read_text())
# weapons = json.loads(weapon_path.read_text())

# # 2. Flatten each spell/weapon into a text blob
# def record_to_text(rec: dict, rec_type: str) -> str:
#     lines = [f"type: {rec_type}"]
#     for k, v in rec.items():
#         if k in {"desc", "description"}:
#             lines.append(f"description: {v}")
#         else:
#             lines.append(f"{k}: {v}")
#     return "\n".join(lines)

# docs = [record_to_text(r, "spell") for r in spells] + \
#        [record_to_text(r, "weapon") for r in weapons]

# print(f"Total docs embedded: {len(docs)}")

# # %%
# from sentence_transformers import SentenceTransformer
# import hnswlib
# import numpy as np

# # %%
# embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# embs = embedder.encode(docs, batch_size=64, convert_to_numpy=True).astype("float32")

# dim = embs.shape[1]
# index = hnswlib.Index(space="cosine", dim=dim)
# index.init_index(max_elements=len(docs), ef_construction=200, M=16)
# index.add_items(embs)

# # Save index and doc IDs
# index.save_index("data/spell_weapon.index")
# np.save("data/spell_weapon.embeddings.npy", np.arange(len(docs)))

# # %%
# def retrieve(question: str, k: int = 5):
#     q_emb = embedder.encode([question], convert_to_numpy=True).astype("float32")
#     labels, distances = index.knn_query(q_emb, k=k)
#     return "\n\n".join(docs[i] for i in labels[0])

# # %%
# from transformers import pipeline, AutoTokenizer

# model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# llm = pipeline(
#     "text-generation",
#     model=model_name,
#     tokenizer=tok,
#     device="cpu",  # change to 0 if you have GPU
#     max_new_tokens=128,
#     temperature=0.0
# )

# def answer(question: str):
#     context = retrieve(question, k=2)
#     prompt = f"""You are a helpful assistant. 
# Answer the question using only the context below. 
# If the answer is not in the context, say you don't know.

# ### Context
# {context}

# ### Question
# {question}

# ### Answer
# """
#     resp = llm(prompt)[0]["generated_text"]
#     return resp.split("### Answer", 1)[-1].strip()

