In [7]:
#Loading environment variables

import os

WEAVIATE_CLUSTER_URL = os.getenv("WEAVIATE_CLUSTER_URL")
WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY")

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
COHERE_API_KEY = os.getenv("COHERE_API_KEY")

## Connecting to Weaviate client

* Create a sandbox cluster on [Weaviate Cloud Services](https://console.weaviate.cloud/dashboard) and get the cluster_url and api key for authentication.
* Create OpenAI/Cohere API Keys to include them into your headers.

In [11]:
import weaviate, os

#Connecting to WCS instance

client = weaviate.connect_to_wcs(
    cluster_url=WEAVIATE_CLUSTER_URL,
    auth_credentials=weaviate.auth.AuthApiKey(WEAVIATE_API_KEY),
    headers={
        "X-OpenAI-Api-Key": OPENAI_API_KEY,
        "X-Cohere-Api-Key": COHERE_API_KEY,
    }
)

client.is_ready()

True

## Create a collection

In [14]:
import weaviate.classes.config as wc

if client.collections.exists("Questions"):
    client.collections.delete("Questions")

client.collections.create(
    name="Questions",
    vectorizer_config=wc.Configure.Vectorizer.text2vec_openai()
)

<weaviate.collections.collection.Collection at 0x11075a510>

## Import Data

#### Sample Data

In [15]:
import json

data_10 = json.load(open("./data/jeopardy_tiny.json"))
print(json.dumps(data_10, indent=2))

[
  {
    "Category": "SCIENCE",
    "Question": "This organ removes excess glucose from the blood & stores it as glycogen",
    "Answer": "Liver"
  },
  {
    "Category": "ANIMALS",
    "Question": "It's the only living mammal in the order Proboseidea",
    "Answer": "Elephant"
  },
  {
    "Category": "ANIMALS",
    "Question": "The gavial looks very much like a crocodile except for this bodily feature",
    "Answer": "the nose or snout"
  },
  {
    "Category": "ANIMALS",
    "Question": "Weighing around a ton, the eland is the largest species of this animal in Africa",
    "Answer": "Antelope"
  },
  {
    "Category": "ANIMALS",
    "Question": "Heaviest of all poisonous snakes is this North American rattlesnake",
    "Answer": "the diamondback rattler"
  },
  {
    "Category": "SCIENCE",
    "Question": "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification",
    "Answer": "species"
  },
  {
    "Category": "SCIENCE",
   

  data_10 = json.load(open("./data/jeopardy_tiny.json"))


#### Insert Many

In [16]:
collection = client.collections.get("Questions")
collection.data.insert_many(data_10)

BatchObjectReturn(all_responses=[UUID('ea018278-0d1c-4bdc-b4ef-acc2fd6bda40'), UUID('8edaa000-a548-4a5e-86c8-3c05aedec1c8'), UUID('7337ee25-a090-4c53-9931-0bd86ea967c2'), UUID('0bbaf7ab-4106-49ee-be34-193f4cb25d41'), UUID('55ac5456-3580-407a-8e73-14513056d853'), UUID('8f1c5bd4-d479-4d4f-ad28-fbce6c1a899a'), UUID('d1ca1c44-0444-41c3-ad79-8ca163bcad7d'), UUID('9ea0af91-4281-4542-aafe-769caa5ea081'), UUID('047010e2-de6d-4429-9183-97cf5582f9af'), UUID('94f26695-794c-4e16-a9b1-f60e6cb7880e')], elapsed_seconds=1.4283568859100342, errors={}, uuids={0: UUID('ea018278-0d1c-4bdc-b4ef-acc2fd6bda40'), 1: UUID('8edaa000-a548-4a5e-86c8-3c05aedec1c8'), 2: UUID('7337ee25-a090-4c53-9931-0bd86ea967c2'), 3: UUID('0bbaf7ab-4106-49ee-be34-193f4cb25d41'), 4: UUID('55ac5456-3580-407a-8e73-14513056d853'), 5: UUID('8f1c5bd4-d479-4d4f-ad28-fbce6c1a899a'), 6: UUID('d1ca1c44-0444-41c3-ad79-8ca163bcad7d'), 7: UUID('9ea0af91-4281-4542-aafe-769caa5ea081'), 8: UUID('047010e2-de6d-4429-9183-97cf5582f9af'), 9: UUID('94

#### Data preview

In [18]:
#Fetch data from vector store
response = collection.query.fetch_objects(limit=5)

for item in response.objects:
    print(item.properties)

{'answer': 'the atmosphere', 'question': 'Changes in the tropospheric layer of this are what gives us weather', 'category': 'SCIENCE'}
{'answer': 'Antelope', 'question': 'Weighing around a ton, the eland is the largest species of this animal in Africa', 'category': 'ANIMALS'}
{'answer': 'the diamondback rattler', 'question': 'Heaviest of all poisonous snakes is this North American rattlesnake', 'category': 'ANIMALS'}
{'answer': 'the nose or snout', 'question': 'The gavial looks very much like a crocodile except for this bodily feature', 'category': 'ANIMALS'}
{'answer': 'Elephant', 'question': "It's the only living mammal in the order Proboseidea", 'category': 'ANIMALS'}


In [21]:
#Fetch data from vector store alongwith its vector
response = collection.query.fetch_objects(limit=5, include_vector=True)

for item in response.objects:
    print(item.properties)
    print(item.vector)
    #print(len(item.vector["default"])) ##1536 for OpenAI

{'answer': 'the atmosphere', 'question': 'Changes in the tropospheric layer of this are what gives us weather', 'category': 'SCIENCE'}
{'default': [0.019746119156479836, 0.015562730841338634, 0.002706704894080758, -0.04217592254281044, 0.0045418706722557545, 0.015273313969373703, -0.0017677446594461799, -0.015194381587207317, -0.013549968600273132, -0.024955620989203453, -0.00842597521841526, 0.005512074567377567, -0.023995283991098404, 0.005748869851231575, 0.009294225834310055, 0.0021212936844676733, 0.023613780736923218, 0.022324560210108757, -0.019140975549817085, -0.012273903004825115, -0.011346453800797462, 0.0009751372854225338, -0.001521904836408794, 0.010780775919556618, 0.017614958807826042, -0.01686510629951954, 0.0314675010740757, -0.040307868272066116, -0.011747690849006176, 0.014326131902635098, 0.015694282948970795, 0.0009619820048101246, -0.03428273648023605, -0.026126444339752197, 0.0013418415328487754, -0.03904495760798454, -0.0026359951589256525, -0.0141682680696249,

#### Query Example

In [22]:
questions = client.collections.get("Questions")
response = questions.query.near_text(
    "Afrikan animals",
    # "Zwierzęta afrykańskie", #African animals in Polish
    # "アフリカの動物", #African animals in Japanese
    limit=2
)

for item in response.objects:
    print(item.properties)

{'answer': 'Antelope', 'question': 'Weighing around a ton, the eland is the largest species of this animal in Africa', 'category': 'ANIMALS'}
{'answer': 'Elephant', 'question': "It's the only living mammal in the order Proboseidea", 'category': 'ANIMALS'}


## Creating a custom collection with LLM and Generative Model

In [23]:
#Creating custom collection using OpenAI vectorizer and Generative model with required properties

# Skip vectorization is usually applied on similar/repeating fields.
# There is a possibility that one object can be stored as multiple vectors capturing different information. (screen in "name", fridge screen category)

if client.collections.exists("Questions"):
    client.collections.delete("Questions")

client.collections.create(
    name="Questions",
    vectorizer_config=wc.Configure.Vectorizer.text2vec_openai(),
    generative_config=wc.Configure.Generative.openai("gpt-4"),
    properties=[
        wc.Property(name="Question", data_type=wc.DataType.TEXT),
        wc.Property(name="Round", data_type=wc.DataType.TEXT, skip_vectorization=True),
        wc.Property(name="Category", data_type=wc.DataType.TEXT, skip_vectorization=True),
        wc.Property(name="Answer", data_type=wc.DataType.TEXT)
    ]
)

<weaviate.collections.collection.Collection at 0x10f29d450>

#### Import data - 1k objects

In [26]:
data_1k = json.load(open("./data/jeopardy_1k.json"))

print(json.dumps(data_1k, indent=2))

[
  {
    "AirDate": "2006-11-08T00:00:00+00:00",
    "Round": "Double Jeopardy!",
    "Points": 800,
    "Category": "AMERICAN HISTORY",
    "Question": "Abraham Lincoln died across the street from this theatre on April 15, 1865",
    "Answer": "Ford's Theatre (the Ford Theatre accepted)"
  },
  {
    "AirDate": "2005-11-18T00:00:00+00:00",
    "Round": "Jeopardy!",
    "Points": 200,
    "Category": "RHYME TIME",
    "Question": "Any pigment on the wall so faded you can barely see it",
    "Answer": "faint paint"
  },
  {
    "AirDate": "1987-06-23T00:00:00+00:00",
    "Round": "Double Jeopardy!",
    "Points": 600,
    "Category": "AMERICAN HISTORY",
    "Question": "After the original 13, this was the 1st state admitted to the union",
    "Answer": "Vermont"
  },
  {
    "AirDate": "2011-01-13T00:00:00+00:00",
    "Round": "Jeopardy!",
    "Points": 400,
    "Category": "TRANSPORTATION",
    "Question": "In 1922 Warren Harding said that this \"gauges the speed of our present-day li

  data_1k = json.load(open("./data/jeopardy_1k.json"))


In [None]:
# Insert data
questions = client.collections.get("Questions")


with questions.batch.fixed_size(batch_size=100, concurrent_requests=2) as batch:
    for item in data_1k:
        batch.add_object({
            "Round": item["Round"],
            "Points": item["Points"],
            "Category": item["Category"],
            "Question": item["Question"],
            "Answer": item["Answer"]
        })

In [27]:
if(len(questions.batch.failed_objects)>0):
    print("Import complete with errors")
    for err in questions.batch.failed_objects:
        print(err)
else:
    print("Import complete with no errors")

Import complete with no errors
