In [2]:
import requests
import json

# Download the data
resp = requests.get('https://raw.githubusercontent.com/weaviate-tutorials/quickstart/main/data/jeopardy_tiny.json')
data = json.loads(resp.text)  # Load data

# Parse the JSON and preview it
print(type(data), len(data))
print(json.dumps(data[0], indent=2))

def jprint(data):
    print(json.dumps(data, indent=2))

<class 'list'> 10
{
  "Category": "SCIENCE",
  "Question": "This organ removes excess glucose from the blood & stores it as glycogen",
  "Answer": "Liver"
}


In [25]:
import weaviate
from weaviate.classes.config import Property, DataType
from weaviate.embedded import EmbeddedOptions
import weaviate.classes.config as wvcc
import weaviate.classes as wvc
import os

In [4]:
client = weaviate.connect_to_embedded(#embedded_options=EmbeddedOptions(),
                        headers={
                            "X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]
                        })

Started /Users/sharif.shaker/.cache/weaviate-embedded: process ID 90118


{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2024-03-22T16:58:59-05:00"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2024-03-22T16:58:59-05:00"}
{"level":"info","msg":"No resource limits set, weaviate will use all available memory and CPU. To limit resources, set LIMIT_RESOURCES=true","time":"2024-03-22T16:58:59-05:00"}
{"action":"grpc_startup","level":"info","msg":"grpc server listening at [::]:50050","time":"2024-03-22T16:58:59-05:00"}
{"action":"restapi_management","level":"info","msg":"Serving weaviate at http://127.0.0.1:8079","time":"2024-03-22T16:58:59-05:00"}
{"level":"info","msg":"Completed loading shard question_zii33D9fEfYp in 25.219459ms","time":"2024-03-22T16:59:00-05:00"}
{"action":"hnsw_vector_cache_pre

In [5]:
jprint(client.get_meta())

{
  "hostname": "http://127.0.0.1:8079",
  "modules": {
    "generative-openai": {
      "documentationHref": "https://platform.openai.com/docs/api-reference/completions",
      "name": "Generative Search - OpenAI"
    },
    "qna-openai": {
      "documentationHref": "https://platform.openai.com/docs/api-reference/completions",
      "name": "OpenAI Question & Answering Module"
    },
    "ref2vec-centroid": {},
    "reranker-cohere": {
      "documentationHref": "https://txt.cohere.com/rerank/",
      "name": "Reranker - Cohere"
    },
    "text2vec-cohere": {
      "documentationHref": "https://docs.cohere.ai/embedding-wiki/",
      "name": "Cohere Module"
    },
    "text2vec-huggingface": {
      "documentationHref": "https://huggingface.co/docs/api-inference/detailed_parameters#feature-extraction-task",
      "name": "Hugging Face Module"
    },
    "text2vec-openai": {
      "documentationHref": "https://platform.openai.com/docs/guides/embeddings/what-are-embeddings",
      "nam

In [6]:
#Delete the schema if it alredy exists
if client.collections.exists("Question"):
    print("deleting Questions")
    client.collections.delete("Question")
else:
    print("no comprendo")

deleting Questions


In [7]:
try:
    # Create the class schema in Weaviate
    collection = client.collections.create(
        name="Question",
        vectorizer_config=wvcc.Configure.Vectorizer.text2vec_openai(),
        properties=[
            Property(name="answer", data_type=DataType.TEXT),
            Property(name="question", data_type=DataType.TEXT),
            Property(name="category", data_type=DataType.TEXT),
        ]
    )

    print("Class schema has been successfully created.")
except Exception as e:
    print(f"Failed to create class schema: {e}")

Class schema has been successfully created.


{"action":"hnsw_vector_cache_prefill","count":1000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-03-22T16:59:45-05:00","took":51083}
{"level":"info","msg":"Created shard question_RsEY66UJx6d7 in 18.241958ms","time":"2024-03-22T16:59:45-05:00"}


In [8]:
with collection.batch.dynamic() as batch:
    for i, d in enumerate(data):  # Batch import data
        
        print(f"importing question: {i+1}")
            
        #Specify the properties we want to import into Weviate
        
        data_obj = {
            "answer": d["Answer"],
            "question": d["Question"],
            "category": d["Category"],
        }
        
        #Add data to Weaviate
        
        batch.add_object(properties=data_obj)

importing question: 1
importing question: 2
importing question: 3
importing question: 4
importing question: 5
importing question: 6
importing question: 7
importing question: 8
importing question: 9
importing question: 10


In [9]:
#Check how many objects we've loaded into the database
response = collection.aggregate.over_all(
    total_count=True,
)

print(response.total_count)
print(response.properties)

10
{}


In [10]:
#Extract and show any 3 questions and answers

response = collection.query.fetch_objects(
        return_properties=["question", "answer"],
        limit=3
    )

for o in response.objects:
    print(o.properties)  # Inspect returned objects

{'answer': 'species', 'question': "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification"}
{'answer': 'Sound barrier', 'question': 'In 70-degree air, a plane traveling at about 1,130 feet per second breaks it'}
{'answer': 'Elephant', 'question': "It's the only living mammal in the order Proboseidea"}


----

## Lets Extract the vector that represents each question!

In [18]:
# write a query to extract the vector for a question

response = collection.query.fetch_objects(
        return_properties=["question", "answer", "category"],
        limit=1,
        include_vector=True,
    )

for o in response.objects:
    print(o.properties)
    print(o.vector)  # Inspect returned objects

result = response.objects[0]

{'question': "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification", 'category': 'SCIENCE', 'answer': 'species'}
{'default': [-0.00015833110956009477, 0.019465867429971695, -0.0054935370571911335, -0.016090359538793564, -0.0006825227756053209, 0.01315513625741005, 0.004436189774423838, -0.02316157892346382, -0.03023279830813408, -0.03148693963885307, -0.01302171777933836, 0.01857195794582367, -0.012307924218475819, -0.006130613852292299, 0.017504604533314705, 0.04266747087240219, 0.02921881340444088, 0.020946819335222244, 0.03236750513315201, -0.006324071902781725, -0.020853426307439804, 0.019078951328992844, 0.0012533069821074605, -0.01709100417792797, 0.011460712179541588, 0.021774020045995712, 0.009679565206170082, -0.015756811946630478, -0.021427128463983536, 0.030739791691303253, -0.009252623654901981, 0.026190195232629776, -0.009232611395418644, -0.018905505537986755, -0.026763899251818657, -0.018358487635850906, 0.0015

In [19]:
#This is the question corresponding to this vector

print(result.properties['question'])

2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification


In [20]:
#This is the answer to this question

print(result.properties['question'])

2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification


In [21]:
#Now display the vector representation of the above question and answer

print(result.vector['default'])

[-0.00015833110956009477, 0.019465867429971695, -0.0054935370571911335, -0.016090359538793564, -0.0006825227756053209, 0.01315513625741005, 0.004436189774423838, -0.02316157892346382, -0.03023279830813408, -0.03148693963885307, -0.01302171777933836, 0.01857195794582367, -0.012307924218475819, -0.006130613852292299, 0.017504604533314705, 0.04266747087240219, 0.02921881340444088, 0.020946819335222244, 0.03236750513315201, -0.006324071902781725, -0.020853426307439804, 0.019078951328992844, 0.0012533069821074605, -0.01709100417792797, 0.011460712179541588, 0.021774020045995712, 0.009679565206170082, -0.015756811946630478, -0.021427128463983536, 0.030739791691303253, -0.009252623654901981, 0.026190195232629776, -0.009232611395418644, -0.018905505537986755, -0.026763899251818657, -0.018358487635850906, 0.001559337368234992, -0.037010494619607925, 0.01737118512392044, -0.022707954049110413, 0.021160291507840157, 0.020960161462426186, -0.0022214301861822605, -0.00634408462792635, -0.0167040880

In [22]:
#How many numbers are there in this vector?

print(len(result.vector['default']))

1536


## We have sucessfully extracted the vector for this datapoint!

## Lets see if we can search for a relevant answer using vector search!

In [23]:
#Build a vector search query to extract questions ,answers and categories related to "biology"


response = collection.query.near_text(
    query=["biology"],
    return_properties=["question", "answer", "category"],
    limit=2,
)

for o in response.objects:
    print(o.properties)
    print(o.metadata)



{'question': 'In 1953 Watson & Crick built a model of the molecular structure of this, the gene-carrying substance', 'answer': 'DNA', 'category': 'SCIENCE'}
MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None)
{'answer': 'species', 'category': 'SCIENCE', 'question': "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification"}
MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None)


## What is the distance between the `query`: `biology` and the returned objects?

In [26]:
#Write code to extract the distance between the query and returned object vectors

response = collection.query.near_text(
    query=["biology"],
    return_properties=["question", "answer", "category"],
    limit=2,
    return_metadata=wvc.query.MetadataQuery(distance=True)
)

for o in response.objects:
    print(o.properties)
    print(o.metadata.distance)



{'category': 'SCIENCE', 'question': 'In 1953 Watson & Crick built a model of the molecular structure of this, the gene-carrying substance', 'answer': 'DNA'}
0.20132899284362793
{'answer': 'species', 'category': 'SCIENCE', 'question': "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification"}
0.21332907676696777


In [27]:
#Extract all 10 questions and analyze them based on distance/similarity to the query vector

response = collection.query.near_text(
    query=["animals"],
    return_properties=["question", "answer", "category"],
    limit=10,
    return_metadata=wvc.query.MetadataQuery(distance=True)
)

for o in response.objects:
    print(o.properties)
    print(o.metadata.distance)

{'category': 'ANIMALS', 'answer': 'Elephant', 'question': "It's the only living mammal in the order Proboseidea"}
0.19792455434799194
{'question': 'The gavial looks very much like a crocodile except for this bodily feature', 'category': 'ANIMALS', 'answer': 'the nose or snout'}
0.20364397764205933
{'question': 'Weighing around a ton, the eland is the largest species of this animal in Africa', 'category': 'ANIMALS', 'answer': 'Antelope'}
0.2089288830757141
{'answer': 'species', 'question': "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification", 'category': 'SCIENCE'}
0.2241668701171875
{'answer': 'the diamondback rattler', 'category': 'ANIMALS', 'question': 'Heaviest of all poisonous snakes is this North American rattlesnake'}
0.24009263515472412
{'category': 'SCIENCE', 'question': 'In 1953 Watson & Crick built a model of the molecular structure of this, the gene-carrying substance', 'answer': 'DNA'}
0.2463170289993286
{'answe

## Notice how as the responses get more irrelevant to the question that the distance between the `query`:`"animals"` and the response increases! - *The vectors are getting farther from each other!*

---

## We can let the vector database know to remove results after a threshold distance!

In [28]:
#Set a max distance threshold - What should the max distance be?

response = collection.query.near_text(
    query=["animals"],
    return_properties=["question", "answer", "category"],
    limit=10,
    distance=0.21,
    return_metadata=wvc.query.MetadataQuery(distance=True)
)

for o in response.objects:
    print(o.properties)
    print(o.metadata.distance)

{'question': "It's the only living mammal in the order Proboseidea", 'category': 'ANIMALS', 'answer': 'Elephant'}
0.19792455434799194
{'answer': 'the nose or snout', 'question': 'The gavial looks very much like a crocodile except for this bodily feature', 'category': 'ANIMALS'}
0.20364397764205933
{'category': 'ANIMALS', 'answer': 'Antelope', 'question': 'Weighing around a ton, the eland is the largest species of this animal in Africa'}
0.2089288830757141


## Now we prevented irrelevant results by removing vectors further then `max_distance` away!

In [29]:
client.close()

{"action":"restapi_management","level":"info","msg":"Shutting down... ","time":"2024-03-22T17:32:34-05:00"}
{"action":"restapi_management","level":"info","msg":"Stopped serving weaviate at http://127.0.0.1:8079","time":"2024-03-22T17:32:34-05:00"}
