In [1]:
%pip install -U weaviate-client # weaviate-client[agents]

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import requests, json
import weaviate
from weaviate.classes.config import Configure, VectorDistances
import weaviate.classes as wvc

# Best practice: store your credentials in environment variables
# weaviate_url = os.environ["WEAVIATE_URL"]
# weaviate_api_key = os.environ["WEAVIATE_API_KEY"]
# openai_api_key = os.environ["OPENAI_APIKEY"]

# Open a connection with the client library's local-connection helper.
# The cells below use this `client` object to create and query collections.
client = weaviate.connect_to_local()

In [3]:
# Create the "DSM" collection and import every chunk found in ./chunks.
#
# Each *.json file is read for the keys "id", "title" and "chunks"; every
# element of "chunks" becomes one object whose "text" property is vectorised
# into the named vector "text_vector".
#
# The cell is safe to re-run: an existing collection is detected explicitly and
# left untouched, instead of relying on create() raising and a blanket
# `except Exception` that would also hide real failures (missing directory,
# malformed JSON, missing keys, server errors) behind an "already exists" note.
# To force a fresh import, delete the collection first
# (client.collections.delete("DSM")) and run the cell again.
collection_name = "DSM"
directory = "./chunks"

if client.collections.exists(collection_name):
    print(f"collection {collection_name!r} already exists; skipping import")
    dsm = client.collections.use(collection_name)
else:
    dsm = client.collections.create(
        name=collection_name,
        vector_config=Configure.Vectors.text2vec_transformers(
            name="text_vector",
            source_properties=["text"],
            vector_index_config=Configure.VectorIndex.hnsw(
                # https://docs.weaviate.io/weaviate/config-refs/distances
                distance_metric=VectorDistances.COSINE
            ),
        ),
    )

    # One batch context for the whole import: opening a new context per object
    # forces a flush after every single add_object() call and defeats batching.
    with dsm.batch.dynamic() as batch:
        # sorted() only makes the import order reproducible between runs.
        for filename in sorted(os.listdir(directory)):
            if not filename.endswith(".json"):
                continue

            # JSON is UTF-8 by specification; do not depend on the locale default.
            with open(os.path.join(directory, filename), encoding="utf-8") as f:
                doc = json.load(f)

            for i, chunk in enumerate(doc["chunks"]):
                batch.add_object(
                    {
                        # "<document id>::<chunk index>" identifies the chunk.
                        "key": f"{doc['id']}::{i}",
                        "title": doc["title"],
                        "text": chunk,
                    }
                )

    # Batch errors are collected rather than raised, so surface them here.
    failed_objects = dsm.batch.failed_objects
    if failed_objects:
        print(f"Number of failed imports: {len(failed_objects)}")
        print(f"First failed object: {failed_objects[0]}")

In [12]:
# client = weaviate.connect_to_weaviate_cloud(
#     cluster_url=weaviate_url,                                    # Replace with your Weaviate Cloud URL
#     # auth_credentials=wvc.init.Auth.api_key(weaviate_api_key),    # Replace with your Weaviate Cloud key
#     # headers={"X-OpenAI-Api-Key": openai_api_key}            # Replace with appropriate header key/value pair for the required API
# )
import pprint  # only needed by the commented-out hybrid-search example below

# Run a vector (near_text) search against the "DSM" collection and print the
# top matches with their distance, certainty and a similarity percentage.
#
# A `client` may still be open from an earlier cell or an earlier run of this
# one. Close it before rebinding the name so the old connection is not leaked.
try:
    if client.is_connected():
        client.close()
except NameError:
    pass  # no client has been created in this kernel yet

client = weaviate.connect_to_local()
try:
    collection = client.collections.use("DSM")

    # Hybrid (BM25 + vector) alternative, kept for reference:
    # results = collection.query.hybrid(
    #     # https://deepwiki.com/weaviate/weaviate-python-client/5.2-keyword-and-hybrid-search
    #     return_metadata=wvc.query.MetadataQuery(
    #         score=True,
    #         explain_score=True,
    #         distance=True,
    #         certainty=True
    #     ),
    #     query="patient has been complaining of lack of sleep and low mood",
    #     alpha=0.5,  # Equal balance between BM25 and vector search
    #     target_vector="text_vector",
    #     limit=4,
    # )
    # pprint.pprint(results)

    # response = collection.query.bm25()
    # https://deepwiki.com/weaviate/weaviate/6.2-keyword-search-(bm25)
    # https://medium.com/etoai/hybrid-search-combining-bm25-and-semantic-search-for-better-results-with-lan-1358038fe7e6

    results = collection.query.near_text(
        query="patient complains of inablity to stay asleep, low mood, and doesn't care about day to day activities",
        # Name the vector explicitly, matching the collection definition.
        target_vector="text_vector",
        # near_text is a purely vector search: it yields a distance (and the
        # derived certainty), not a keyword/hybrid score, so only those two
        # metadata fields are requested.
        return_metadata=wvc.query.MetadataQuery(distance=True, certainty=True),
        limit=4,
    )
    for r in results.objects:
        # 1 - cosine distance is the cosine similarity, shown as a percentage.
        similarity_pct = round((1 - r.metadata.distance) * 100, 1)
        print(f"distance: {r.metadata.distance}, certainty: {r.metadata.certainty}, score: {similarity_pct}%")
        print(r.properties['title'])
        # First 180 characters of the chunk, flattened onto a single line.
        print(r.properties['text'][:180].replace("\n",""))
        print("----------------------------")

finally:
    client.close()  # Close client gracefully

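# Note on the numbers above: L2-normalising embeddings (as many BERT / Sentence-BERT
# style models do) makes cosine similarity equal to the dot product, but it does NOT
# restrict it to [0, 1]. Cosine similarity lies in [-1, 1], so the cosine distance
# (1 - similarity) lies in [0, 2], and the "score" printed above (1 - distance,
# i.e. the cosine similarity as a percentage) can in principle be negative.
# Certainty is the similarity rescaled to [0, 1]: certainty = 1 - distance / 2,
# which is why it is higher than the score for the same result.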

distance: 0.14292281866073608, certainty: 0.9285385608673096, score: 85.7%
Anxiety Disorders
persistent and excessive fear or reluctance about being alone or without major attachment figures at home or in other settings. Children with separation anxiety disorder may be una
----------------------------
distance: 0.1457483172416687, certainty: 0.9271258115768433, score: 85.4%
Sleep-Wake Disorders
###### **Associated Features**  Insomnia is often associated with physiological and cognitive arousal and conditioning factors that interfere with sleep. A preoccupation with sleep
----------------------------
distance: 0.145846426486969, certainty: 0.9270768165588379, score: 85.4%
Bipolar and Related Disorders
Inflated self-esteem is typically present, ranging from uncritical selfconfidence to marked grandiosity, and may reach delusional proportions (Criterion B1). Despite lack of any pa
----------------------------
distance: 0.14879298210144043, certainty: 0.9256035089492798, score: 85.1%
Depr

In [5]:
# resp = requests.get(
#     "https://raw.githubusercontent.com/weaviate-tutorials/quickstart/main/data/jeopardy_tiny.json"
# )
# data = json.loads(resp.text)


# with questions.batch.dynamic() as batch:
#     for d in data:
#         batch.add_object(
#             {
#                 "answer": d["Answer"],
#                 "question": d["Question"],
#                 "category": d["Category"],
#             }
#         )
#         if batch.number_errors > 10:
#             print("Batch import stopped due to excessive errors.")
#             break

# failed_objects = questions.batch.failed_objects
# if failed_objects:
#     print(f"Number of failed imports: {len(failed_objects)}")
#     print(f"First failed object: {failed_objects[0]}")

# response = questions.query.near_text(query="biology", limit=2)

# for obj in response.objects:
#     print(json.dumps(obj.properties, indent=2))

# client.close()