# Sparse, Dense, and Hybrid Search

## Remove old Weaviate DB files

In [10]:
# Delete the embedded Weaviate data directory so no DB files from earlier runs remain.
# NOTE(review): the logged "failed to read disk usage" error suggests an embedded
# instance was still running when this executed; confirm this is run on a fresh kernel.
!rm -rf ~/.local/share/weaviate

{"action":"read_disk_use","level":"error","msg":"failed to read disk usage: no such file or directory","path":"/home/jovyan/.local/share/weaviate","time":"2025-12-05T06:32:33Z"}


## Recreate the example
This section reloads the same data that was used in the previous lesson.

In [11]:
import requests
import json

# Download the data
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting an error
# page reach the JSON parser.
resp = requests.get(
    'https://raw.githubusercontent.com/weaviate-tutorials/quickstart/main/data/jeopardy_tiny.json',
    timeout=30,
)
resp.raise_for_status()
data = resp.json()  # Load data

# Preview the parsed JSON: its container type and number of records
print(type(data), len(data))

def json_print(data):
    """Print ``data`` as JSON text indented by two spaces."""
    formatted = json.dumps(data, indent=2)
    print(formatted)

<class 'list'> 10


{"action":"read_disk_use","level":"error","msg":"failed to read disk usage: no such file or directory","path":"/home/jovyan/.local/share/weaviate","time":"2025-12-05T06:32:34Z"}


In [12]:
import weaviate, os
from weaviate import EmbeddedOptions
import openai

from dotenv import load_dotenv, find_dotenv
# Load variables from a local .env file (if one is found) into the environment
_ = load_dotenv(find_dotenv())
openai.api_key = os.environ['OPENAI_API_KEY']

# Both header values are read from the environment, so no key is hard-coded here.
embedded = EmbeddedOptions()
openai_headers = {
    "X-OpenAI-Api-BaseURL": os.environ['OPENAI_API_BASE'],
    "X-OpenAI-Api-Key": openai.api_key,
}
client = weaviate.Client(
    embedded_options=embedded,
    additional_headers=openai_headers,
)

print(f"Client created? {client.is_ready()}")

embedded weaviate is already listening on port 8079
Client created? True


In [13]:
# Drop any existing "Question" class first so this cell can be run more than once.
if client.schema.exists("Question"):
    client.schema.delete_class("Question")

# Settings for the text2vec-openai module; the base URL is read from OPENAI_API_BASE.
openai_vectorizer_config = {
    "model": "ada",
    "modelVersion": "002",
    "type": "text",
    "baseURL": os.environ["OPENAI_API_BASE"],
}

class_obj = {
    "class": "Question",
    "vectorizer": "text2vec-openai",  # Use OpenAI as the vectorizer
    "moduleConfig": {"text2vec-openai": openai_vectorizer_config},
}

client.schema.create_class(class_obj)

time="2025-12-05T06:32:34Z" level=error msg="stop lsmkv store: shutdown bucket \"objects\": delete commit log file: remove /home/jovyan/.local/share/weaviate/question_W7WlMSd8ILQ9_lsm/objects/segment-1764916134413783146.wal: no such file or directory" action=drop_shard class=Question id=question_W7WlMSd8ILQ9
{"action":"hnsw_vector_cache_prefill","count":1000,"index_id":"question_JKA2TKfeHS3q","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2025-12-05T06:32:34Z","took":54812}


In [14]:
# Import every record into the "Question" class through a batch configured
# with batch_size=5, printing a 1-based progress line per record.
with client.batch.configure(batch_size=5) as batch:
    for i, record in enumerate(data):
        print(f"importing question: {i+1}")
        batch.add_data_object(
            class_name="Question",
            data_object={
                "answer": record["Answer"],
                "question": record["Question"],
                "category": record["Category"],
            },
        )

{"action":"restapi_management","level":"info","msg":"Shutting down... ","time":"2025-12-05T06:32:34Z"}
{"action":"restapi_management","level":"info","msg":"Stopped serving weaviate at http://127.0.0.1:8079","time":"2025-12-05T06:32:34Z"}
Exception in thread batchSizeRefresh:
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/site-packages/urllib3/connection.py", line 174, in _new_conn
    conn = connection.create_connection(
  File "/usr/local/lib/python3.9/site-packages/urllib3/util/connection.py", line 95, in create_connection
    raise err
  File "/usr/local/lib/python3.9/site-packages/urllib3/util/connection.py", line 85, in create_connection
    sock.connect(sa)
ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.9/site-packages/urllib3/connectionpool.py", line 715, in urlopen
    httplib_response = self._make_request(
  File "

importing question: 1
importing question: 2
importing question: 3
importing question: 4
importing question: 5
Embedded weaviate wasn't listening on port 8079, so starting embedded weaviate again
Started /home/jovyan/.cache/weaviate-embedded: process ID 196
importing question: 6
importing question: 7
importing question: 8
importing question: 9
importing question: 10


    _threading_Thread_run(self)
  File "/usr/local/lib/python3.9/threading.py", line 917, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.9/site-packages/weaviate/batch/crud_batch.py", line 472, in periodic_check
    status = cluster.get_nodes_status()
  File "/usr/local/lib/python3.9/site-packages/weaviate/cluster/cluster.py", line 62, in get_nodes_status
    raise RequestsConnectionError(
requests.exceptions.ConnectionError: Get nodes status failed due to connection error


## Queries

### Dense Search

In [15]:
# Dense search: request up to 3 questions via nearText on the concept "animal"
query = client.query.get("Question", ["question", "answer"])
query = query.with_near_text({"concepts":["animal"]})
query = query.with_limit(3)
response = query.do()

json_print(response)

{
  "errors": [
    {
      "locations": [
        {
          "column": 24,
          "line": 1
        }
      ],
      "message": "Unknown argument \"nearText\" on field \"Question\" of type \"GetObjectsObj\". Did you mean \"nearObject\" or \"nearVector\"?",
      "path": null
    }
  ]
}


{"action":"requests_total","api":"graphql","class_name":"","error":"Unknown argument \"nearText\" on field \"Question\" of type \"GetObjectsObj\". Did you mean \"nearObject\" or \"nearVector\"?","level":"error","msg":"unexpected error","query_type":"","time":"2025-12-05T06:32:35Z"}


### Sparse Search - BM25

In [16]:
# Sparse search: request up to 3 questions via a BM25 query for "animal"
query = client.query.get("Question", ["question", "answer"])
query = query.with_bm25(query="animal")
query = query.with_limit(3)
response = query.do()

json_print(response)

{
  "data": {
    "Get": {
      "Question": [
        {
          "answer": "Antelope",
          "question": "Weighing around a ton, the eland is the largest species of this animal in Africa"
        }
      ]
    }
  }
}


### Hybrid Search

In [17]:
# Hybrid search with alpha=0.5: keyword and vector scores are weighted equally
query = client.query.get("Question", ["question", "answer"])
query = query.with_hybrid(query="animal", alpha=0.5)
query = query.with_limit(3)
response = query.do()

json_print(response)

{
  "data": {
    "Get": {
      "Question": null
    }
  },
  "errors": [
    {
      "locations": [
        {
          "column": 6,
          "line": 1
        }
      ],
      "message": "get vector input from modules provider: VectorFromInput was called without vectorizer",
      "path": [
        "Get",
        "Question"
      ]
    }
  ]
}


In [18]:
# Hybrid search with alpha=0: keyword-only, so it matches the BM25 result above
query = client.query.get("Question", ["question", "answer"])
query = query.with_hybrid(query="animal", alpha=0)
query = query.with_limit(3)
response = query.do()

json_print(response)

{
  "data": {
    "Get": {
      "Question": [
        {
          "answer": "Antelope",
          "question": "Weighing around a ton, the eland is the largest species of this animal in Africa"
        }
      ]
    }
  }
}


In [19]:
# Hybrid search with alpha=1: vector-only, so it needs a working vectorizer
query = client.query.get("Question", ["question", "answer"])
query = query.with_hybrid(query="animal", alpha=1)
query = query.with_limit(3)
response = query.do()

json_print(response)

{
  "data": {
    "Get": {
      "Question": null
    }
  },
  "errors": [
    {
      "locations": [
        {
          "column": 6,
          "line": 1
        }
      ],
      "message": "get vector input from modules provider: VectorFromInput was called without vectorizer",
      "path": [
        "Get",
        "Question"
      ]
    }
  ]
}
