In [78]:
%pip install -U starpoint openai sentence-transformers tokenizers python-dotenv

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting starpoint
  Obtaining dependency information for starpoint from https://files.pythonhosted.org/packages/87/44/2f2e6d80a5eca7b2606fe497d6ff2539e7401e836febd0501b19ea542f9d/starpoint-0.4.2-py3-none-any.whl.metadata
  Downloading starpoint-0.4.2-py3-none-any.whl.metadata (1.5 kB)
Collecting tokenizers
  Obtaining dependency information for tokenizers from https://files.pythonhosted.org/packages/57/bd/45b5ef6b088880779f70acf60027f7043ca5fa1b98f4a4345cf3aea09044/tokenizers-0.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Using cached tokenizers-0.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading starpoint-0.4.2-py3-none-any.whl (14

In [101]:
import os
import openai
from dotenv import load_dotenv
from starpoint.db import Client

# Name of the Starpoint collection every insert/query in this notebook targets.
COLLECTION_NAME = 'dnd'

# Give python-dotenv a chance to populate os.environ before the keys are read.
load_dotenv()

# Both keys are mandatory: indexing os.environ raises KeyError right here if
# one is missing, rather than failing later inside an API call.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
openai.api_key = OPENAI_API_KEY

STARPOINT_API_KEY = os.environ["STARPOINT_API_KEY"]
starpoint_client = Client(api_key=STARPOINT_API_KEY)

In [192]:

_EMBEDDING_MODEL_NAME = 'BAAI/bge-small-en-v1.5'

# Loaded SentenceTransformer instances keyed by model name, so the model is
# constructed once per kernel session instead of on every embed() call.
# Re-running this cell resets the cache.
_embedding_model_cache = {}


def _get_embedding_model(model_name=_EMBEDDING_MODEL_NAME):
    """Return the SentenceTransformer for `model_name`, constructing it on first use."""
    if model_name not in _embedding_model_cache:
        # Imported lazily (as the original did) so that merely defining these
        # helpers does not require sentence_transformers to be importable.
        from sentence_transformers import SentenceTransformer
        _embedding_model_cache[model_name] = SentenceTransformer(model_name)
    return _embedding_model_cache[model_name]


def embed(texts):
    """Encode `texts` with the 'BAAI/bge-small-en-v1.5' sentence-transformer.

    Args:
        texts: The batch of texts to encode; passed unchanged to
            `SentenceTransformer.encode`.

    Returns:
        Whatever `encode` returns for `texts`. Callers in this notebook index
        the result per input text and call `.tolist()` on each entry.
    """
    return _get_embedding_model().encode(texts)

def prompt_openai(prompt):
    """Send `prompt` as a single user message to gpt-3.5-turbo and return the reply text.

    Args:
        prompt: The text placed in the one-and-only "user" message.

    Returns:
        The "content" of the message in the first returned choice.
    """
    user_message = {"role": "user", "content": prompt}
    completion = openai.ChatCompletion.create(
        messages=[user_message],
        model="gpt-3.5-turbo",
    )
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"]

def query_starpoint(query, use_hyde = True):
    """Run a hybrid (embedding + text search) query against the Starpoint collection.

    The query's nouns are extracted with the chat model and used as text-search
    terms. The embedding comes either from a short hypothetical answer written
    by the chat model (when `use_hyde` is true) or from the query itself.

    Args:
        query: The natural-language question to search for.
        use_hyde: When true, embed a model-written hypothetical answer instead
            of the raw query.

    Returns:
        The value stored under 'results' in the Starpoint query response.
    """
    raw_subjects = prompt_openai(f"What is the following query about? Answer with only the nouns and nothing else. Make a comma separated list. Be concise.\nQUERY: {query}\nNOUNS:")
    print("subject:")
    # The model answers "a, b, c"; strip the padding around each term and drop
    # empty fragments so the search terms carry no stray whitespace.
    subjects = [term.strip() for term in raw_subjects.split(",") if term.strip()]
    print(subjects)

    if use_hyde:
        # The hypothetical answer has to exist before it can be embedded.
        hypothetical_answer = prompt_openai(f"You are an information retrieval expert. Please write me an example response for the following query in 30 words or less. Be concise.\nQUERY: {query}\nRESPONSE:")
        print("hypothetical answer:")
        print(hypothetical_answer)
        text_to_embed = hypothetical_answer
    else:
        text_to_embed = query

    # embed() takes a batch; take the single resulting vector as a plain list.
    query_embedding = embed([text_to_embed])[0].tolist()

    relevant_monsters = starpoint_client.query(
        collection_name=COLLECTION_NAME,
        query_embedding=query_embedding,
        sql="SELECT * FROM collection LIMIT 10",
        text_search_query=subjects
    )
    return relevant_monsters['results']


In [103]:
import json

# Read the file as UTF-8 explicitly: the monster texts contain non-ASCII
# characters, and the platform default encoding is not UTF-8 everywhere.
with open('./monster_text.json', encoding='utf-8') as f:
    monster_texts = json.load(f)

# One embedding per monster, computed in a single batch from its "text" field.
embeddings = embed([monster["text"] for monster in monster_texts])
assert len(embeddings) == len(monster_texts), "expected one embedding per monster"

# Each document pairs a monster's embedding with the full record as metadata.
documents_to_upload = [
    {"embedding": embedding.tolist(), "metadata": monster}
    for monster, embedding in zip(monster_texts, embeddings)
]

print(documents_to_upload[:10])



Downloading (…)8683f/.gitattributes: 100%|██████████| 1.52k/1.52k [00:00<00:00, 13.2MB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 1.73MB/s]
Downloading (…)19c878683f/README.md: 100%|██████████| 89.1k/89.1k [00:00<00:00, 16.8MB/s]
Downloading (…)c878683f/config.json: 100%|██████████| 743/743 [00:00<00:00, 7.03MB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 124/124 [00:00<00:00, 1.16MB/s]
Downloading pytorch_model.bin: 100%|██████████| 134M/134M [00:01<00:00, 103MB/s]  
Downloading (…)nce_bert_config.json: 100%|██████████| 52.0/52.0 [00:00<00:00, 488kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 125/125 [00:00<00:00, 1.09MB/s]
Downloading (…)8683f/tokenizer.json: 100%|██████████| 711k/711k [00:00<00:00, 5.46MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 394/394 [00:00<00:00, 3.62MB/s]
Downloading (…)19c878683f/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 3.84MB/s]
Downloading (…)878683f/modules.json: 100%|██

[{'embedding': [-0.5253125429153442, -0.026173902675509453, 0.40217453241348267, -0.11282476782798767, 0.31034380197525024, 0.10116838663816452, 0.8613144755363464, 0.3772578537464142, -0.19478753209114075, -0.3582358658313751, -0.08373577147722244, -0.42025360465049744, -0.3416472375392914, 0.2081923633813858, -0.1580861210823059, 0.24088767170906067, -0.07864091545343399, -0.09935761988162994, -0.5267653465270996, 0.3976350426673889, 0.13927297294139862, -0.38473087549209595, -0.004427427425980568, 0.2284201979637146, 0.22414511442184448, 0.3580842614173889, -0.14410188794136047, -0.24781948328018188, 0.11653687804937363, -1.6242610216140747, 0.323579877614975, 0.4455656409263611, -0.09442572295665741, -0.10164912790060043, -0.5440677404403687, 0.06367896497249603, -0.007222630549222231, 0.47418496012687683, 0.15812785923480988, 0.24926821887493134, 0.3355726897716522, 0.8233051896095276, -0.16823065280914307, -0.5243478417396545, 0.056901123374700546, -0.42162981629371643, -0.184097

In [104]:
# Upload every prepared document; the call's return value is left as the
# cell's last expression so the notebook displays it.
starpoint_client.insert(
    collection_name=COLLECTION_NAME,
    documents=documents_to_upload,
)

{'collection_id': '8bc84a35-a59e-4345-9f37-cdb3ff550696',
 'documents': [{'id': 'hwoakojpw4gn'},
  {'id': '2fvrc24tcxou'},
  {'id': '8qtghgdmzyaf'},
  {'id': 'fcnfk7e5vysy'},
  {'id': '1dgm06tfwpqa'},
  {'id': '0hxq3mhh78un'},
  {'id': '4mok6sx3zmpa'},
  {'id': '42d85omhy4cw'},
  {'id': '22q7yblz8zfl'},
  {'id': 'j5jkn3ei6e6d'},
  {'id': 'z3fy2lcg4mol'},
  {'id': '0r8236nxg968'},
  {'id': 'is1011s4lcam'},
  {'id': 'phtu26hc7rr0'},
  {'id': '9uz5391haui7'},
  {'id': 'f3lqcdhasu0f'},
  {'id': 'lpayq605se7b'},
  {'id': 's5w53conrmbb'},
  {'id': 'duz0nl1qqp5d'},
  {'id': 't1iazfac7lbi'},
  {'id': '4ixkes0lsu8m'},
  {'id': 'uygcmz7sekuv'},
  {'id': '5axy1w3ak0sv'},
  {'id': 'zzmiej6u6wxi'},
  {'id': 'vwl2hh1xg2zn'},
  {'id': '5nhgmowad4pl'},
  {'id': 'r4n8wcjryhz0'},
  {'id': 'np4qriq3mfy5'},
  {'id': '6gvs2t39kh19'},
  {'id': 'h9n949xyvsa9'},
  {'id': 'xk9nelw850j7'},
  {'id': '0s8pcdch0rgp'},
  {'id': 'dpewlm9qg1wr'},
  {'id': '93z4x3wx7832'},
  {'id': 'mhbftqhkx7v4'},
  {'id': '21b621wtu

In [187]:
# Query with the raw question embedded directly (HyDE switched off).
results = query_starpoint(
    "What is the name of the monster with the trait 'Unusual Nature'?",
    use_hyde=False,
)

subject:
['name', ' monster', ' trait', ' Unusual Nature']
[-0.2607116103172302, -0.038899119943380356, 0.24471591413021088, 0.4452670216560364, 0.24072416126728058, -0.5954854488372803, 0.7518364191055298, 0.08806334435939789, -0.40530991554260254, -0.30016905069351196, 0.07619094103574753, -0.21660088002681732, 0.18659038841724396, -0.2024955004453659, 0.07255195081233978, -0.12274166941642761, 0.08794427663087845, 0.20168505609035492, -0.2220877707004547, 0.592282235622406, 0.5710530877113342, 0.20968161523342133, 0.26590368151664734, -0.08389724791049957, -0.3352871239185333, 0.004026954062283039, -0.20581048727035522, 0.03378773108124733, 0.15069222450256348, -0.8970978260040283, -0.32717496156692505, -0.04397695139050484, -0.26104018092155457, -0.3419091999530792, 0.10643505305051804, -0.011372470296919346, -0.6556589007377625, 0.10125923156738281, 0.5766210556030273, 0.9820993542671204, 0.0338529497385025, 0.6222852468490601, 0.018283957615494728, -0.21230031549930573, 0.0875285

In [190]:
import pprint 

# Pretty-print one "name - text" line per returned monster.
pp = pprint.PrettyPrinter(indent=4)
pp.pprint([
    ' - '.join((result['monster_name'], result['text']))
    for result in results
])

[   'Shoal Serpent - The oceans, bays, and swamps of Zendikar are home to a '
    'variety of aquatic creatures that are at least as deadly as those on '
    'land, including monstrosities that can face the largest Eldrazi on almost '
    'equal footing. Given the dangers of Zendikar, even mundane animals such '
    'as octopuses, frogs, turtles, crabs, and crocodiles can grow to '
    'tremendous size (using the appropriate statistics from appendix A of the '
    'Monster Manual). The crabs of Ondu, the crocodiles of Guul Draz, the '
    'tortoises of Tazeem, and the octopuses of the deep sea (of which Lorthos '
    'the Tidemaker is but one giant specimen) are examples of these aquatic '
    'monstrosities. Enormous shoal serpents—sometimes compared to "a reef that '
    'runs aground on ships"—are a persistent danger to vessels along the '
    'Onduan coast. The {@creature plesiosaurus} in the Monster Manual can '
    'represent these serpents.',
    'Manticore - A monster in every 