In [15]:
%pip install -U weaviate-client # weaviate-client[agents]

Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
import requests, json
import weaviate
from weaviate.classes.config import Configure, VectorDistances
import weaviate.classes as wvc

# Best practice: store your credentials in environment variables
# weaviate_url = os.environ["WEAVIATE_URL"]
# weaviate_api_key = os.environ["WEAVIATE_API_KEY"]
# openai_api_key = os.environ["OPENAI_APIKEY"]

client = weaviate.connect_to_local()

In [None]:
try:
    dsm = client.collections.create(
        name="DSM",
        vector_config=Configure.Vectors.text2vec_transformers(
            name="text_vector",
            source_properties=["text"],
            vector_index_config=Configure.VectorIndex.hnsw(
                # https://docs.weaviate.io/weaviate/config-refs/distances
                distance_metric=VectorDistances.COSINE
            ),
        ),
    )

    directory = "./chunks"
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        if filename.endswith(".json"):
            with open(os.path.join(directory, filename)) as f:
                jsn = json.loads(f.read())

                cks = jsn["chunks"]
                for i,c in enumerate(cks):
                    with dsm.batch.dynamic() as batch:
                        batch.add_object(
                            {
                                "key": f"{jsn['id']}::{i}",
                                "title": jsn["title"],
                                "text": c,
                            }
                        )
except Exception as ex:
    print("probably already exists", ex)

probably already exists Collection may not have been created properly.! Unexpected status code: 422, with response body: {'error': [{'message': 'class name DSM already exists'}]}.


In [41]:
# client = weaviate.connect_to_weaviate_cloud(
#     cluster_url=weaviate_url,                                    # Replace with your Weaviate Cloud URL
#     # auth_credentials=wvc.init.Auth.api_key(weaviate_api_key),    # Replace with your Weaviate Cloud key
#     # headers={"X-OpenAI-Api-Key": openai_api_key}            # Replace with appropriate header key/value pair for the required API
# )
import pprint

client = weaviate.connect_to_local()
try:
    collection = client.collections.use("DSM")

    # results = collection.query.hybrid(
    #     # https://deepwiki.com/weaviate/weaviate-python-client/5.2-keyword-and-hybrid-search
    #     return_metadata=wvc.query.MetadataQuery(
    #         score=True,
    #         explain_score=True,
    #         distance=True,
    #         certainty=True
    #     ),
    #     query="patient has been complaining of lack of sleep and low mood",
    #     alpha=0.5,  # Equal balance between BM25 and vector search
    #     target_vector="text_vector",
    #     limit=4,
    # )
    # pprint.pprint(results)

    # response = collection.query.bm25()
    # https://deepwiki.com/weaviate/weaviate/6.2-keyword-search-(bm25)
    # https://medium.com/etoai/hybrid-search-combining-bm25-and-semantic-search-for-better-results-with-lan-1358038fe7e6

    results = collection.query.near_text(
       # Because near_text is a purely vector search, you get a distance instead of a score.
        return_metadata=wvc.query.MetadataQuery(
            score=True, explain_score=True, distance=True, certainty=True
        ),
        query="patient has been complaining of lack of sleep and low mood",
        limit=4
    )
    for r in results.objects:
        print(f"distance: {r.metadata.distance}, certainty: {r.metadata.certainty}, score: {round((1-r.metadata.distance)*100, 1)}%")
        print(r.properties['title'])
        print(r.properties['text'][:180].replace("\n",""))
        print("----------------------------")

finally:
    client.close()  # Close client gracefully

# Most text embeddings (e.g., from BERT, Sentence-BERT, etc.) are normalized, so cosine similarity is between 0 and 1.

distance: 0.1520099639892578, certainty: 0.9239950180053711, score: 84.8%
Sleep-Wake Disorders
morning obligations, such as arriving at work on time. Unintentionaldaytime sleep episodes can be embarrassing and even dangerous, if, forinstance, the individual is driving or o
----------------------------
distance: 0.154221773147583, certainty: 0.9228891134262085, score: 84.6%
Sleep-Wake Disorders
###### **Differential Diagnosis****Other hypersomnias.** Hypersomnolence disorder (also known as idiopathichypersomnia) and narcolepsy are similar with respect to the presence 
----------------------------
distance: 0.15458935499191284, certainty: 0.9227052927017212, score: 84.5%
Breathing-Related Sleep Disorders
asymptomatic periods. Starting with the asymptomatic period, when theindividual's sleep phase is aligned to the external environment, sleeplatency will gradually increase and the
----------------------------
distance: 0.15590494871139526, certainty: 0.92204749584198, score: 84.4%
Breathi

In [19]:
# resp = requests.get(
#     "https://raw.githubusercontent.com/weaviate-tutorials/quickstart/main/data/jeopardy_tiny.json"
# )
# data = json.loads(resp.text)


# with questions.batch.dynamic() as batch:
#     for d in data:
#         batch.add_object(
#             {
#                 "answer": d["Answer"],
#                 "question": d["Question"],
#                 "category": d["Category"],
#             }
#         )
#         if batch.number_errors > 10:
#             print("Batch import stopped due to excessive errors.")
#             break

# failed_objects = questions.batch.failed_objects
# if failed_objects:
#     print(f"Number of failed imports: {len(failed_objects)}")
#     print(f"First failed object: {failed_objects[0]}")

# response = questions.query.near_text(query="biology", limit=2)

# for obj in response.objects:
#     print(json.dumps(obj.properties, indent=2))

# client.close()