In [1]:
# Install (or upgrade) the sentence-transformers and starpoint packages via the %pip magic.
# NOTE(review): versions are unpinned; the recorded output below resolved
# sentence-transformers 2.2.2 - consider pinning for reproducible re-runs.
%pip install -U sentence-transformers starpoint

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting transformers<5.0.0,>=4.6.0 (from sentence-transformers)
  Obtaining dependency information for transformers<5.0.0,>=4.6.0 from https://files.pythonhosted.org/packages/e1/9d/4d9fe5c3b820db10773392ac5f4a0c8dab668f70b245ce2ce09785166128/transformers-4.33.0-py3-none-any.whl.metadata
  Downloading transformers-4.33.0-py3-none-any.whl.metadata (119 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.9/119.9 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting torch>=1.6.0 (from sentence-transformers)
  Downloading torch-2.0.1-cp311-cp311-manylinux1_x86_64.whl (619.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m4.8 MB/s[0m eta [36m0:00:

In [2]:
# Extract data.zip into the data/ directory; later cells read the JSON files from data/bestiary.
!unzip data.zip -d data/

Archive:  data.zip
 extracting: data/actions.json       
   creating: data/adventure/
 extracting: data/adventure/adventure-dosi.json  
 extracting: data/adventure/adventure-aitfr-avt.json  
 extracting: data/adventure/adventure-aitfr-fcd.json  
 extracting: data/adventure/adventure-efr.json  
 extracting: data/adventure/adventure-aitfr-thp.json  
 extracting: data/adventure/adventure-azfyt.json  
 extracting: data/adventure/adventure-fs.json  
 extracting: data/adventure/adventure-dd.json  
 extracting: data/adventure/adventure-aitfr-isf.json  
 extracting: data/adventure/adventure-aitfr-dn.json  
 extracting: data/adventure/adventure-dc.json  
 extracting: data/adventure/adventure-gotsf.json  
 extracting: data/adventure/adventure-dsotdq.json  
 extracting: data/adventure/adventure-dip.json  
 extracting: data/adventure/adventure-bgdia.json  
 extracting: data/adventure/adventure-hftt.json  
 extracting: data/adventure/adventure-cm.json  
 extracting: data/adventure/adventure-crcotn.

In [5]:
# we're going to write a utility function to flatten out the entries
# in any given data, this pattern is used often in 5e tools
def flatten_entries(data):
    """Recursively collect every string reachable through nested "entries".

    - A string yields a one-element list containing that string.
    - A list yields the flattened results of its items, in order.
    - A dict yields the flattened content of its "entries" key, or an empty
      list when that key is absent (all other keys are ignored).
    - Any other value yields an empty list.
    """
    if isinstance(data, str):
        return [data]

    if isinstance(data, dict):
        # Only the "entries" key is followed; dicts without it contribute nothing.
        return flatten_entries(data["entries"]) if "entries" in data else []

    if isinstance(data, list):
        return [text for item in data for text in flatten_entries(item)]

    # Numbers, None, and anything else carry no text.
    return []

In [6]:
import os
import json

# Collect the core bestiary files. Only names starting with 'bestiary' are
# kept, which skips the 'fluff-bestiary*' description files (handled in a
# later cell) and anything else in the directory.
base_bestiary_files = [
    file_name
    for file_name in os.listdir('data/bestiary')
    if file_name.startswith('bestiary')
]

# Every monster from every file goes into one dictionary keyed on
# "<name>-<source>" (names alone are sometimes duplicated across sources).
monsters = {}

for base_file in base_bestiary_files:
    # JSON is UTF-8 by specification, so do not rely on the platform default.
    with open('data/bestiary/' + base_file, 'r', encoding='utf-8') as f:
        monster_data = json.load(f)['monster']

    for monster in monster_data:
        key = monster['name'] + '-' + monster['source']
        # The membership test must be against the accumulated `monsters`
        # dictionary. Testing against `monster_data` (a list of dicts) never
        # matches a string key, which made the merge branch unreachable.
        if key in monsters:
            # Same name + source seen before: merge, letting the newer
            # record's fields win while keeping fields only the older one has.
            monsters[key] = {**monsters[key], **monster}
        else:
            monsters[key] = monster


In [7]:
# Attach description text from the fluff files to the monsters loaded above.
# Fluff records are matched on the same "<name>-<source>" key; records with
# no 'entries' or with no matching monster are skipped.
for fluff_file in (name for name in os.listdir('data/bestiary') if name.startswith('fluff-bestiary')):
    with open('data/bestiary/' + fluff_file, 'r') as f:
        fluff_data = json.load(f)['monsterFluff']

    for fluff in fluff_data:
        key = fluff['name'] + '-' + fluff['source']
        if 'entries' not in fluff or key not in monsters:
            continue
        # Store the flattened list of description strings on the monster.
        monsters[key]['descriptions'] = flatten_entries(fluff['entries'])


In [8]:
# Show the first 10 monsters as indented JSON for a quick sanity check.
first_ten_monsters = list(monsters.items())[:10]
for key, value in first_ten_monsters:
    pretty = json.dumps(value, indent=4)
    print(pretty)


{
    "name": "Ancient Deep Crow",
    "source": "AI",
    "page": 211,
    "size": [
        "H"
    ],
    "type": "monstrosity",
    "alignment": [
        "U"
    ],
    "ac": [
        {
            "ac": 18,
            "from": [
                "natural armor"
            ]
        }
    ],
    "hp": {
        "average": 187,
        "formula": "15d12 + 90"
    },
    "speed": {
        "walk": 20,
        "fly": 80
    },
    "str": 23,
    "dex": 16,
    "con": 23,
    "int": 10,
    "wis": 15,
    "cha": 19,
    "save": {
        "con": "+11",
        "wis": "+7"
    },
    "skill": {
        "perception": "+7",
        "stealth": "+13"
    },
    "senses": [
        "blindsight 60 ft.",
        "darkvision 120 ft."
    ],
    "passive": 17,
    "resist": [
        {
            "resist": [
                "bludgeoning",
                "piercing",
                "slashing"
            ],
            "note": "from nonmagical attacks",
            "cond": true
        }
    ]

In [9]:
from collections import defaultdict

# monster key -> {field name -> text to embed}
monster_texts = defaultdict(dict)

def add_text_to_embed(key, field, monster_entry):
    """Build the text for one field of a monster and store it in monster_texts.

    Args:
        key: Key under which the text is stored in ``monster_texts``.
        field: Name of the field to read from ``monster_entry``; also the
            key used inside ``monster_texts[key]``.
        monster_entry: Monster dictionary. ``monster_entry[field]`` is
            iterated; each item is either a plain string or a dict that may
            carry a "name" and an "entries" list.

    The stored text has one newline-terminated line per string item, per
    item name, and per entry (an entry's flattened strings are joined with
    newlines). A missing or ``None`` field, and items without "entries",
    contribute nothing instead of raising.
    """
    lines = []
    # `or []` covers both an absent field and an explicit None value.
    for trait in monster_entry.get(field) or []:
        if isinstance(trait, str):
            lines.append(trait)
            continue
        if trait.get("name") is not None:
            lines.append(trait["name"])
        # Not every item is guaranteed to have "entries"; treat absence as empty.
        for entry in trait.get("entries") or []:
            lines.append("\n".join(flatten_entries(entry)))
    monster_texts[key][field] = "".join(line + "\n" for line in lines)

# Build the embeddable text for every monster, one entry per text-bearing
# field that is present and not None.
for key, monster in monsters.items():
    for field in ("trait", "action", "legendary", "bonus", "descriptions"):
        if monster.get(field) is None:
            continue
        add_text_to_embed(key, field, monster)

# Preview the first 10 results.
preview_texts = list(monster_texts.items())[:10]
for key, value in preview_texts:
    print(key, value)

Ancient Deep Crow-AI {'trait': 'Magic Resistance\nThe ancient deep crow has advantage on saving throws against spells and other magical effects.\nShadow Stealth\nWhile in dim light or darkness, the ancient deep crow can take the Hide action as a bonus action.\n', 'action': "Multiattack\nThe ancient deep crow makes three attacks: one with its mandibles and two with its claws.\nMandibles\n{@atk mw} {@hit 11} to hit, reach 10 ft., one target. {@h}17 ({@damage 2d10 + 6}) piercing damage, and the target is {@condition grappled} (escape {@dc 19}). Until this grapple ends, the target is {@condition restrained}, and the ancient deep crow can't use its mandibles on another target.\nClaw\n{@atk mw} {@hit 11} to hit, reach 5 ft., one target. {@h}13 ({@damage 2d6 + 6}) slashing damage.\nShadow Caw\nThe ancient deep crow releases an ear-splitting caw. Each creature within 60 feet of the crow and able to hear it must make a {@dc 17} Constitution saving throw. On a failure, a creature takes 10 ({@dam

Dump the monster text to be separately processed

In [11]:
# Serialize the collected texts (monster key -> field -> text) to a JSON file.
with open("./monster_text.json", "w") as out_file:
    out_file.write(json.dumps(monster_texts))

In [None]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model once, then encode every text individually.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# monster key -> {field name -> embedding returned by model.encode}
embeddings_data = defaultdict(dict)

for key, texts in monster_texts.items():
    per_monster = embeddings_data[key] if texts else None
    for text_field, text in texts.items():
        per_monster[text_field] = model.encode(text)

In [None]:
import copy
import os

from starpoint.db import Client

COLLECTION_NAME = 'dnd'

# Credentials must never be committed to a notebook. Read the key from the
# environment instead; any key that was previously committed here should be
# revoked and replaced.
API_KEY = os.environ.get('STARPOINT_API_KEY')
if not API_KEY:
    raise RuntimeError(
        'Set the STARPOINT_API_KEY environment variable before running this cell.'
    )

# One document per (monster, text field) embedding. Each document carries a
# deep copy of the full monster record as metadata, plus 'embedding_source'
# naming the field the embedding was computed from.
documents_to_upload = []

for key, embeddings in embeddings_data.items():
    metadata = monsters[key]
    for text_field, embedding in embeddings.items():
        # Deep copy so tagging the metadata does not mutate `monsters`.
        new_metadata = copy.deepcopy(metadata)
        new_metadata['embedding_source'] = text_field
        documents_to_upload.append({
            'metadata': new_metadata,
            # .tolist() converts the embedding into plain Python numbers.
            'embedding': embedding.tolist()
        })

client = Client(api_key=API_KEY)
client.insert(documents=documents_to_upload, collection_name=COLLECTION_NAME)


{'collection_id': '7eda4d2c-1f99-44ff-bc0f-5f2d49ce7397',
 'documents': [{'id': 'wiyf9mubdc2l'},
  {'id': 'bomfi3kl5w5o'},
  {'id': 'reezxivjjy1t'},
  {'id': 'nvrqud37f436'},
  {'id': 'pfmihaewo0wz'},
  {'id': 'uli0yvtqb2jn'},
  {'id': 'eybti67jla5v'},
  {'id': 'znbi01h7yljm'},
  {'id': 'azl1ddcmj5lo'},
  {'id': 'my71n0iqwf56'},
  {'id': 'jwtk2u7e9vyx'},
  {'id': 'qtkr5lawrgwj'},
  {'id': 'lcgs2i4la810'},
  {'id': 'yw3kcf4y9v6v'},
  {'id': 'koekpbc0qf5o'},
  {'id': 'lxqtmfhoghny'},
  {'id': 'j4wpb4lh8bb7'},
  {'id': 'eh5veln7qmpb'},
  {'id': '300wgkyem48v'},
  {'id': 'auqzy2q8zxfo'},
  {'id': 'my1uvtp58r9v'},
  {'id': '2zx7b6h36dof'},
  {'id': 'bqy4irksnw5a'},
  {'id': 'ddfdo9nty29j'},
  {'id': '0i3vzehelq8i'},
  {'id': 'v5lob1gsn386'},
  {'id': 'snpmolx0v5rj'},
  {'id': '7xzxzblmj5yg'},
  {'id': '5lh0o7f728k3'},
  {'id': 't0kt0vkqw91c'},
  {'id': 'kpxww79feetg'},
  {'id': 'bqd1xy27sana'},
  {'id': 'eflo7m65gsgf'},
  {'id': 'teepicfr34pn'},
  {'id': 'p2addjaewx6p'},
  {'id': '8xs5j5s34