source: https://app.datacamp.com/workspace/w/3d7a27c0-db15-4dab-8bc9-f137e4571857/edit

In [None]:
! pip install -U weaviate-client

### Get the data

We'll use a subset of the Jeopardy! quiz dataset:
> https://www.kaggle.com/datasets/tunguz/200000-jeopardy-questions

Pre-processed version:
> https://raw.githubusercontent.com/databyjp/wv_demo_uploader/main/weaviate_datasets/data/jeopardy_1k.json

### Load (or download) the data, and preview it

In [1]:
import requests
import json

def load_data():
    with open("jeopardy_1k.json", "r") as f:
        raw_data = f.read()
    return raw_data

In [2]:
response = requests.get('https://raw.githubusercontent.com/databyjp/wv_demo_uploader/main/weaviate_datasets/data/jeopardy_1k.json')
raw_data = response.text

In [3]:
# Parse the JSON and preview it
data = json.loads(raw_data)


In [5]:
with open('jeopardy_1k.json', 'w') as file:
    json.dump(data, file, indent=2)

### Step 1: Create a Weaviate instance

We'll use Embedded Weaviate - this is a quick way to create a Weaviate database.

In [6]:
import openai
import os

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

openai_key = os.getenv('OPENAI_API_KEY')

In [7]:
import weaviate

client = weaviate.Client(
    embedded_options=weaviate.EmbeddedOptions(),
    additional_headers={
        "X-OpenAI-Api-Key": openai_key
    }
)

Started /Users/r337555/.cache/weaviate-embedded: process ID 64490


{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2023-12-30T10:23:51-03:00"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2023-12-30T10:23:51-03:00"}
{"level":"info","msg":"No resource limits set, weaviate will use all available memory and CPU. To limit resources, set LIMIT_RESOURCES=true","time":"2023-12-30T10:23:51-03:00"}
{"action":"grpc_startup","level":"info","msg":"grpc server listening at [::]:50060","time":"2023-12-30T10:23:52-03:00"}
adding route GET /v1/.well-known/live "weaviate.wellknown.liveness"
operation: spec.Operation{VendorExtensible:spec.VendorExtensible{Extensions:spec.Extensions(nil)}, OperationProps:spec.OperationProps{Description:"Determines whether the application is alive. Can be used for kubernete

In [8]:
def jprint(data_in):
    print(json.dumps(data_in, indent=2))

In [9]:
jprint(client.get_meta())

{
  "hostname": "http://127.0.0.1:8079",
  "modules": {
    "generative-openai": {
      "documentationHref": "https://platform.openai.com/docs/api-reference/completions",
      "name": "Generative Search - OpenAI"
    },
    "qna-openai": {
      "documentationHref": "https://platform.openai.com/docs/api-reference/completions",
      "name": "OpenAI Question & Answering Module"
    },
    "ref2vec-centroid": {},
    "reranker-cohere": {
      "documentationHref": "https://txt.cohere.com/rerank/",
      "name": "Reranker - Cohere"
    },
    "text2vec-cohere": {
      "documentationHref": "https://docs.cohere.ai/embedding-wiki/",
      "name": "Cohere Module"
    },
    "text2vec-huggingface": {
      "documentationHref": "https://huggingface.co/docs/api-inference/detailed_parameters#feature-extraction-task",
      "name": "Hugging Face Module"
    },
    "text2vec-openai": {
      "documentationHref": "https://platform.openai.com/docs/guides/embeddings/what-are-embeddings",
      "nam

looking up route for GET /v1/meta
got a router for PATCH
got a router for PUT
got a router for POST
got a router for GET
got a router for HEAD
got a router for DELETE
found a route for GET /v1/meta with 0 parameters
responding to GET /v1/meta with produces: [application/json]
offers: [application/json]
[GET /v1/meta] set response format "application/json" in context
[GET /v1/meta] negotiated response format "application/json"


In [10]:
if client.schema.exists("Question"):
    client.schema.delete_class("Question")

looking up route for GET /v1/schema/Question
got a router for GET
got a router for HEAD
got a router for DELETE
got a router for PATCH
got a router for PUT
got a router for POST
found a route for GET /v1/schema/Question with 1 parameters
responding to GET /v1/schema/Question with produces: [application/json]
offers: [application/json]
[GET /v1/schema/Question] set response format "application/json" in context
[GET /v1/schema/Question] negotiated response format "application/json"


In [11]:
class_definition = {
    "class": "Question",
    "vectorizer": "text2vec-openai",
    "vectorIndexConfig": {
        "distance": "cosine",
    },
    "moduleConfig": {
        "generative-cohere": {}
    },
    "properties": [
        {
            "name": "question",
            "dataType": ["text"]
        },
        {
            "name": "answer",
            "dataType": ["text"]
        },
    ],
}

client.schema.create_class(class_definition)

looking up route for POST /v1/schema
got a router for HEAD
got a router for DELETE
got a router for PATCH
got a router for PUT
got a router for POST
got a router for GET
found a route for POST /v1/schema with 1 parameters
validating content type for "application/json" against [application/json, application/yaml]
{"level":"info","msg":"Created shard question_1tAIoNqfSg0d in 14.057743ms","time":"2023-12-30T10:30:37-03:00"}
responding to POST /v1/schema with produces: [application/json]
offers: [application/json]
[POST /v1/schema] set response format "application/json" in context
[POST /v1/schema] negotiated response format "application/json"
{"action":"hnsw_vector_cache_prefill","count":1000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2023-12-30T10:30:37-03:00","took":463640}


Was our class created successfully? Let's take a look

In [12]:
jprint(client.schema.get("Question"))

{
  "class": "Question",
  "invertedIndexConfig": {
    "bm25": {
      "b": 0.75,
      "k1": 1.2
    },
    "cleanupIntervalSeconds": 60,
    "stopwords": {
      "additions": null,
      "preset": "en",
      "removals": null
    }
  },
  "moduleConfig": {
    "generative-cohere": {},
    "text2vec-openai": {
      "baseURL": "https://api.openai.com",
      "model": "ada",
      "modelVersion": "002",
      "type": "text",
      "vectorizeClassName": true
    }
  },
  "multiTenancyConfig": {
    "enabled": false
  },
  "properties": [
    {
      "dataType": [
        "text"
      ],
      "indexFilterable": true,
      "indexSearchable": true,
      "moduleConfig": {
        "text2vec-openai": {
          "skip": false,
          "vectorizePropertyName": false
        }
      },
      "name": "question",
      "tokenization": "word"
    },
    {
      "dataType": [
        "text"
      ],
      "indexFilterable": true,
      "indexSearchable": true,
      "moduleConfig": {
        

looking up route for GET /v1/schema/Question
got a router for PATCH
got a router for PUT
got a router for POST
got a router for GET
got a router for HEAD
got a router for DELETE
found a route for GET /v1/schema/Question with 1 parameters
responding to GET /v1/schema/Question with produces: [application/json]
offers: [application/json]
[GET /v1/schema/Question] set response format "application/json" in context
[GET /v1/schema/Question] negotiated response format "application/json"


### Add Data

We'll add actual objects (SQL rows) to our data.

First, let's build objects to add - and take a look at a couple.

In [13]:
for row in data[:2]:
    data_obj = {
        "question": row["Question"],
        "answer": row["Answer"]
    }
    print(data_obj)

{'question': 'Abraham Lincoln died across the street from this theatre on April 15, 1865', 'answer': "Ford's Theatre (the Ford Theatre accepted)"}
{'question': 'Any pigment on the wall so faded you can barely see it', 'answer': 'faint paint'}


If it all looks fine - let's add objects:

https://weaviate.io/developers/weaviate/manage-data/import

In [14]:
with client.batch() as batch:
    for row in data:
        data_obj = {
            "question": row["Question"],
            "answer": row["Answer"]
        }
        batch.add_data_object(
            data_object=data_obj,
            class_name="Question"
        )  

            Please instead use the `client.batch.configure()` method to configure your batch and `client.batch` to enter the context manager.
            See https://weaviate.io/developers/weaviate/client-libraries/python for details.
looking up route for POST /v1/batch/objects
got a router for PATCH
got a router for PUT
got a router for POST
got a router for GET
got a router for HEAD
got a router for DELETE
looking up route for GET /v1/nodes
got a router for DELETE
got a router for PATCH
got a router for PUT
got a router for POST
got a router for GET
got a router for HEAD
found a route for POST /v1/batch/objects with 2 parameters
found a route for GET /v1/nodes with 1 parameters
validating content type for "application/json" against [application/json, application/yaml]
responding to GET /v1/nodes with produces: [application/json]
offers: [application/json]
[GET /v1/nodes] set response format "application/json" in context
[GET /v1/nodes] negotiated response format "application/json"
re

In [15]:
len(data)

1000

Confirm data load

Do we have data?

Let's get an object count

In [16]:
client.query.aggregate("Question").with_meta_count().do()

looking up route for POST /v1/graphql
got a router for DELETE
got a router for PATCH
got a router for PUT
got a router for POST
got a router for GET
got a router for HEAD
found a route for POST /v1/graphql with 1 parameters
validating content type for "application/json" against [application/json, application/yaml]
responding to POST /v1/graphql with produces: [application/json]
offers: [application/json]
[POST /v1/graphql] set response format "application/json" in context
[POST /v1/graphql] negotiated response format "application/json"


{'data': {'Aggregate': {'Question': [{'meta': {'count': 1000}}]}}}

Does the data look right?

Let's grab a few objects from Weaviate!

In [17]:
response = (
    client.query
    .get("Question", ["question", "answer"])
    .with_limit(3)
    .do()
)

jprint(response)

{
  "data": {
    "Get": {
      "Question": [
        {
          "answer": "Liberia",
          "question": "At the beginning of World War I, this was the only independent nation in West Africa"
        },
        {
          "answer": "icebergs",
          "question": "Tracking these, coast guard rates them as bergy bits, growlers, small, medium, & large"
        },
        {
          "answer": "Moors",
          "question": "The Visigoth Empire went out of business in 711 when it was defeated by this north African group in Spain"
        }
      ]
    }
  }
}


looking up route for POST /v1/graphql
got a router for DELETE
got a router for PATCH
got a router for PUT
got a router for POST
got a router for GET
got a router for HEAD
found a route for POST /v1/graphql with 1 parameters
validating content type for "application/json" against [application/json, application/yaml]
responding to POST /v1/graphql with produces: [application/json]
offers: [application/json]
[POST /v1/graphql] set response format "application/json" in context
[POST /v1/graphql] negotiated response format "application/json"


### Step 3: Work with the data
Let's try a few more involved queries

Filtering (similar to WHERE filter in SQL)

Let's find objects that meet a particular condition.

In [18]:
where_filter = {
    "path": ["answer"],
    "operator": "Like",
    "valueText": "*history*"
}

response = (
    client.query
    .get("Question", ["question", "answer"])
    .with_limit(3)
    .with_where(where_filter)
    .do()
)

jprint(response)

{
  "data": {
    "Get": {
      "Question": [
        {
          "answer": "\"A Brief History Of Time In A Bottle\"",
          "question": "Stephen Hawking's 1988 bio of the universe that was a No. 1 hit for Jim Croce"
        }
      ]
    }
  }
}


looking up route for POST /v1/graphql
got a router for POST
got a router for GET
got a router for HEAD
got a router for DELETE
got a router for PATCH
got a router for PUT
found a route for POST /v1/graphql with 1 parameters
validating content type for "application/json" against [application/json, application/yaml]
responding to POST /v1/graphql with produces: [application/json]
offers: [application/json]
[POST /v1/graphql] set response format "application/json" in context
[POST /v1/graphql] negotiated response format "application/json"


We can also use multiple filters

In [19]:
where_filter = {
    "operator": "Or",
    "operands": [
        {
            "path": ["answer"],
            "operator": "Like",
            "valueText": "*history*"
        },
        {
            "path": ["question"],
            "operator": "Like",
            "valueText": "*history*"
        },        
    ]
}

response = (
    client.query
    .get("Question", ["question", "answer"])
    .with_limit(3)
    .with_where(where_filter)
    .do()
)

jprint(response)

{
  "data": {
    "Get": {
      "Question": [
        {
          "answer": "the Field Museum",
          "question": "What was once the Chicago Natural History Museum is now called this, after its founder"
        },
        {
          "answer": "Greyhound",
          "question": "A Hibbing, Minn. museum traces the history of this bus company founded there in 1914 using Hupmobiles"
        },
        {
          "answer": "\"A Brief History Of Time In A Bottle\"",
          "question": "Stephen Hawking's 1988 bio of the universe that was a No. 1 hit for Jim Croce"
        }
      ]
    }
  }
}


looking up route for POST /v1/graphql
got a router for POST
got a router for GET
got a router for HEAD
got a router for DELETE
got a router for PATCH
got a router for PUT
found a route for POST /v1/graphql with 1 parameters
validating content type for "application/json" against [application/json, application/yaml]
responding to POST /v1/graphql with produces: [application/json]
offers: [application/json]
[POST /v1/graphql] set response format "application/json" in context
[POST /v1/graphql] negotiated response format "application/json"


But this does not rank the result in any meaningful way.

For that, we need a keyword search (as opposed to a keyword filter).

Keyword search
Unlike a keyword filter, a keyword search will search for, and rank results based on the frequency of the keyword.

In [20]:
response = (
    client.query
    .get("Question", ["question", "answer"])
    .with_limit(3)
    .with_bm25(query="history")
    .do()
)

jprint(response)

{
  "data": {
    "Get": {
      "Question": [
        {
          "answer": "\"A Brief History Of Time In A Bottle\"",
          "question": "Stephen Hawking's 1988 bio of the universe that was a No. 1 hit for Jim Croce"
        },
        {
          "answer": "Oil",
          "question": "The Drake Well Museum in Titusville, Penn. is dedicated to the history of this industry"
        },
        {
          "answer": "the Field Museum",
          "question": "What was once the Chicago Natural History Museum is now called this, after its founder"
        }
      ]
    }
  }
}


looking up route for POST /v1/graphql
got a router for GET
got a router for HEAD
got a router for DELETE
got a router for PATCH
got a router for PUT
got a router for POST
found a route for POST /v1/graphql with 1 parameters
validating content type for "application/json" against [application/json, application/yaml]
responding to POST /v1/graphql with produces: [application/json]
offers: [application/json]
[POST /v1/graphql] set response format "application/json" in context
[POST /v1/graphql] negotiated response format "application/json"


Semantic search

A semantic search, on the other hand, searches objects based on similarity

In [21]:
response = (
    client.query
    .get("Question", ["question", "answer"])
    .with_limit(3)
    .with_near_text({"concepts": ["history"]})
    .do()
)

jprint(response)

looking up route for POST /v1/graphql
got a router for PATCH
got a router for PUT
got a router for POST
got a router for GET
got a router for HEAD
got a router for DELETE
found a route for POST /v1/graphql with 1 parameters
validating content type for "application/json" against [application/json, application/yaml]


{
  "data": {
    "Get": {
      "Question": [
        {
          "answer": "Greyhound",
          "question": "A Hibbing, Minn. museum traces the history of this bus company founded there in 1914 using Hupmobiles"
        },
        {
          "answer": "The Rijksmuseum",
          "question": "This Dutch national art museum had its origins in one founded by Louis Bonaparte in 1808"
        },
        {
          "answer": "Shinto",
          "question": "Compiled in 712, the Kojiki, \"Records of Ancient Matters\", is one of this religion's oldest texts"
        }
      ]
    }
  }
}


responding to POST /v1/graphql with produces: [application/json]
offers: [application/json]
[POST /v1/graphql] set response format "application/json" in context
[POST /v1/graphql] negotiated response format "application/json"


How does this work?

- Under the hood, this uses a vector search. It looks for objects which are the most similar to a text input.
- We can inspect the similarity along with the results.


In [22]:
response = (
    client.query
    .get("Question", ["question", "answer"])
    .with_limit(3)
    .with_near_text({"concepts": ["history"]})
    .with_additional("distance")
    .do()
)

jprint(response)

looking up route for POST /v1/graphql
got a router for PUT
got a router for POST
got a router for GET
got a router for HEAD
got a router for DELETE
got a router for PATCH
found a route for POST /v1/graphql with 1 parameters
validating content type for "application/json" against [application/json, application/yaml]


{
  "data": {
    "Get": {
      "Question": [
        {
          "_additional": {
            "distance": 0.19917047
          },
          "answer": "Greyhound",
          "question": "A Hibbing, Minn. museum traces the history of this bus company founded there in 1914 using Hupmobiles"
        },
        {
          "_additional": {
            "distance": 0.20580983
          },
          "answer": "The Rijksmuseum",
          "question": "This Dutch national art museum had its origins in one founded by Louis Bonaparte in 1808"
        },
        {
          "_additional": {
            "distance": 0.20851183
          },
          "answer": "Shinto",
          "question": "Compiled in 712, the Kojiki, \"Records of Ancient Matters\", is one of this religion's oldest texts"
        }
      ]
    }
  }
}


responding to POST /v1/graphql with produces: [application/json]
offers: [application/json]
[POST /v1/graphql] set response format "application/json" in context
[POST /v1/graphql] negotiated response format "application/json"


This is where "vectors" come in.

Each object in Weaviate includes a vector - like so:

In [23]:
response = (
    client.query
    .get("Question", ["question", "answer"])
    .with_limit(3)
    .with_near_text({"concepts": ["history"]})
    .with_additional("vector")
    .do()
)

jprint(response)

looking up route for POST /v1/graphql
got a router for PATCH
got a router for PUT
got a router for POST
got a router for GET
got a router for HEAD
got a router for DELETE
found a route for POST /v1/graphql with 1 parameters
validating content type for "application/json" against [application/json, application/yaml]


{
  "data": {
    "Get": {
      "Question": [
        {
          "_additional": {
            "vector": [
              -0.01900998,
              -0.011876199,
              -0.0006049775,
              -0.034365743,
              -0.022086507,
              0.011291794,
              0.0020403822,
              -0.013918261,
              -0.009175842,
              -0.031087698,
              -0.028293299,
              0.01688731,
              -0.0029069148,
              -0.023618054,
              -0.030765269,
              0.0055417786,
              0.013165923,
              0.013065163,
              0.012977838,
              0.010573042,
              -0.008389917,
              0.0015852846,
              0.010835017,
              -0.0084033515,
              0.0010646932,
              0.0012309466,
              0.026627406,
              -0.0087728035,
              -0.00096057495,
              0.015920019,
              -0.0046987566,
              -0.0071539325,

responding to POST /v1/graphql with produces: [application/json]
offers: [application/json]
[POST /v1/graphql] set response format "application/json" in context
[POST /v1/graphql] negotiated response format "application/json"
