# Collection setup and data load

## Get keys and urls

In [1]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Connection settings for the Weaviate Cloud cluster used by the rest of the notebook.
WEAVIATE_URL = os.getenv("WEAVIATE_URL")
WEAVIATE_KEY = os.getenv("WEAVIATE_KEY")

print(WEAVIATE_URL)
# Never echo the API key itself: cell outputs are saved with the notebook and
# travel with every shared or committed copy. Only report whether it was found.
print("WEAVIATE_KEY loaded:", bool(WEAVIATE_KEY))

https://rwxzavyuspepzg2fkhjag.c0.us-west3.gcp.weaviate.cloud
[REDACTED - API key removed from saved output]


## Connect to Weaviate

You need to pass in your Weaviate Cloud URL and KEY.

In [2]:
import weaviate
from weaviate.classes.init import Auth
# from weaviate.classes.init import AdditionalConfig, Timeout

# Build the API-key credentials from the key read from the environment.
cloud_auth = Auth.api_key(WEAVIATE_KEY)

client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=cloud_auth,

    # Optional timeout overrides (also needs the commented-out import above):
    # additional_config=AdditionalConfig(
    #     timeout=Timeout(init=2, query=45, insert=120),  # Values in seconds
    # )
)

# Last expression of the cell, so the readiness flag is displayed as the output.
client.is_ready()

True

## Create a collection with a vectorizer

* [Weaviate Docs - collection creation and configuration](https://weaviate.io/developers/weaviate/manage-data/collections)
* [Weaviate integrated embedding models](https://weaviate.io/developers/weaviate/model-providers/weaviate/embeddings)

Examples of other embedding models:
* [Cohere](https://weaviate.io/developers/weaviate/model-providers/cohere/embeddings)
* [HuggingFace 🤗](https://weaviate.io/developers/weaviate/model-providers/huggingface/embeddings)
* [Ollama (self-hosted)](https://weaviate.io/developers/weaviate/model-providers/ollama/embeddings)
* [OpenAI](https://weaviate.io/developers/weaviate/model-providers/openai/embeddings)

In [3]:
from weaviate.classes.config import Configure

# Remove a previous "Jeopardy" collection so the cell can be re-run from scratch.
if client.collections.exists("Jeopardy"):
    client.collections.delete("Jeopardy")

# Vectorizer settings for the collection.
jeopardy_vectors = Configure.Vectors.text2vec_weaviate(
    model="Snowflake/snowflake-arctic-embed-l-v2.0",
    dimensions=256,  # options: 1024 (default) and 256
)

# Last expression of the cell: the created collection object is displayed.
client.collections.create(
    name="Jeopardy",
    vector_config=jeopardy_vectors,
    # Optionally, define properties here
)

<weaviate.collections.collection.sync.Collection at 0x10c5773e0>

## Import data
### Sample Data

In [5]:
import json

# Read the sample file explicitly as UTF-8 so decoding does not depend on the
# platform's default text encoding.
with open("../jeopardy_tiny.json", encoding="utf-8") as file:
    data_10 = json.load(file)

# Preview the first two records.
print(json.dumps(data_10[0:2], indent=2))

[
  {
    "Category": "SCIENCE",
    "Question": "This organ removes excess glucose from the blood & stores it as glycogen",
    "Answer": "Liver"
  },
  {
    "Category": "ANIMALS",
    "Question": "It's the only living mammal in the order Proboseidea",
    "Answer": "Elephant"
  }
]


### Insert Many

> `insert_many` sends all objects in a single request, so use it only for small batches of data – the request must complete within the timeout.

[Weaviate Docs - insert many](https://weaviate.io/developers/weaviate/manage-data/import)

In [6]:
# Add all sample records to the "Jeopardy" collection with one insert_many call.
# The returned result object is the cell's output.
jeopardy = client.collections.get("Jeopardy")
jeopardy.data.insert_many(objects=data_10)

BatchObjectReturn(_all_responses=[UUID('66e1f0e6-8db1-422d-a8f1-cbe57af5fd77'), UUID('2d772480-6aa5-4e23-a8cd-10275a18868d'), UUID('032c263b-863a-4b68-af1b-260befdaeea4'), UUID('44f9933e-06ec-4eb0-ac94-678ee4d8af44'), UUID('bc3a4b3b-ecb0-41b7-9a12-10b8f779f026'), UUID('4bfb93e2-3c28-438d-8217-5a6688153942'), UUID('1927d265-d0f7-4116-8968-6cf8880963ff'), UUID('a5736c13-3ceb-4f3f-b571-ee9db6c9470c'), UUID('d520f6d8-7ca7-4eef-9fc7-8897bace4d6d'), UUID('1e8729bb-c68a-44de-a246-5c550c19e8a1')], elapsed_seconds=0.45962095260620117, errors={}, uuids={0: UUID('66e1f0e6-8db1-422d-a8f1-cbe57af5fd77'), 1: UUID('2d772480-6aa5-4e23-a8cd-10275a18868d'), 2: UUID('032c263b-863a-4b68-af1b-260befdaeea4'), 3: UUID('44f9933e-06ec-4eb0-ac94-678ee4d8af44'), 4: UUID('bc3a4b3b-ecb0-41b7-9a12-10b8f779f026'), 5: UUID('4bfb93e2-3c28-438d-8217-5a6688153942'), 6: UUID('1927d265-d0f7-4116-8968-6cf8880963ff'), 7: UUID('a5736c13-3ceb-4f3f-b571-ee9db6c9470c'), 8: UUID('d520f6d8-7ca7-4eef-9fc7-8897bace4d6d'), 9: UUID('

### Data preview

In [7]:
# Peek at up to four stored objects: UUID plus properties
jeopardy = client.collections.get("Jeopardy")
response = jeopardy.query.fetch_objects(limit=4)

for obj in response.objects:
    print(obj.uuid, obj.properties)

032c263b-863a-4b68-af1b-260befdaeea4 {'answer': 'the nose or snout', 'question': 'The gavial looks very much like a crocodile except for this bodily feature', 'category': 'ANIMALS'}
1927d265-d0f7-4116-8968-6cf8880963ff {'answer': 'wire', 'question': 'A metal that is ductile can be pulled into this while cold & under pressure', 'category': 'SCIENCE'}
1e8729bb-c68a-44de-a246-5c550c19e8a1 {'answer': 'Sound barrier', 'question': 'In 70-degree air, a plane traveling at about 1,130 feet per second breaks it', 'category': 'SCIENCE'}
2d772480-6aa5-4e23-a8cd-10275a18868d {'answer': 'Elephant', 'question': "It's the only living mammal in the order Proboseidea", 'category': 'ANIMALS'}


In [8]:
# Same preview, this time asking Weaviate to return the stored vectors too
jeopardy = client.collections.get("Jeopardy")
response = jeopardy.query.fetch_objects(limit=4, include_vector=True)

for obj in response.objects:
    print(obj.properties)
    print(obj.vector, '\n')

{'answer': 'the nose or snout', 'question': 'The gavial looks very much like a crocodile except for this bodily feature', 'category': 'ANIMALS'}
{'default': [-0.08220794051885605, -0.02602316252887249, 0.05796641856431961, -0.11761627346277237, 0.013390353880822659, -0.006968736182898283, -0.05319667235016823, 0.03321986645460129, -0.06918934732675552, 0.04811830073595047, -0.019584007561206818, -0.06632749736309052, -0.07165838778018951, 0.04654708877205849, -0.1061127707362175, 0.02396094985306263, 0.029460184276103973, 0.048960018903017044, -0.04278741031885147, -0.05459953844547272, -0.09387978166341782, 0.02316131442785263, -0.04340466856956482, -0.011987488716840744, -0.03206951171159744, -0.030077442526817322, -0.06060380861163139, 0.025041155517101288, 0.08434029668569565, 0.08636041730642319, 0.09466539323329926, -0.011833174154162407, -0.02821163274347782, -0.017676109448075294, 0.0593692846596241, -0.0073580313473939896, 0.026247620582580566, 0.05538514629006386, 0.064419597

### Super quick query example

In [9]:
# Run a near_text search and keep the two best matches.
# Alternative query to try: query="weather"
response = jeopardy.query.near_text(query="African animals", limit=2)

for match in response.objects:
    print(match.properties)

{'answer': 'Antelope', 'question': 'Weighing around a ton, the eland is the largest species of this animal in Africa', 'category': 'ANIMALS'}
{'answer': 'Elephant', 'question': "It's the only living mammal in the order Proboseidea", 'category': 'ANIMALS'}


## A bit bigger example - 2k objects

### Load data

In [10]:
import json

# Read the file explicitly as UTF-8 so decoding does not depend on the
# platform's default text encoding.
with open("../wiki-2k.json", encoding="utf-8") as file:
    data_2k = json.load(file)

# Preview the first two records.
print(json.dumps(data_2k[0:2], indent=2))

[
  {
    "text": "At this point in the siege, Lee's army had strengthened the Petersburg line. They dug breastworks out of rifle pits. At night, with pick and shovel, they then turned the breastworks into  deep trenches. Pointed stakes turned outwards were designed to break up any frontal attacks. The area between the two lines became a no man's land. The summer that year was hot and dry. Streams and springs were quickly drying up causing a water shortage on both sides. The siege was quickly becoming a stalemate.",
    "title": "Siege of Petersburg",
    "url": "https://simple.wikipedia.org/wiki/Siege%20of%20Petersburg",
    "wiki_id": "20231101.simple_550339_9"
  },
  {
    "text": "1944  Holocaust: Anne Frank and her family are placed on the last transport train from the Westerbork transit camp to Auschwitz.",
    "title": "September 3",
    "url": "https://simple.wikipedia.org/wiki/September%203",
    "wiki_id": "20231101.simple_8532_17"
  }
]


### Create a collection with Named Vectors and SourceProperties

In [13]:
from weaviate.classes.config import Configure, Property, DataType

def create_wiki_collection():
    """(Re)create the "Wiki" collection with one named vector.

    An existing "Wiki" collection is deleted first, so every call starts from
    an empty collection. The named vector "main_vector" is configured with the
    text2vec_weaviate vectorizer and the `title` and `text` source properties.
    """
    wiki_exists = client.collections.exists("Wiki")
    if wiki_exists:
        client.collections.delete("Wiki")

    # Named vector definition - Weaviate embeddings built from selected properties
    main_vector = Configure.Vectors.text2vec_weaviate(
        name="main_vector",
        model="Snowflake/snowflake-arctic-embed-l-v2.0",  # default
        source_properties=['title', 'text'],  # which properties should be used to generate a vector
    )

    client.collections.create(
        name="Wiki",

        vector_config=[main_vector],

        # Example: how to define property schema (Optional)
        # properties=[
        #     Property(name="title", data_type=DataType.TEXT),
        #     Property(name="text", data_type=DataType.TEXT),
        #     Property(name="url", data_type=DataType.TEXT),
        #     Property(name="wiki_id", data_type=DataType.TEXT),
        # ],
    )

create_wiki_collection()

### Import data - 2k objects with Batch

Batching speeds up the import process by sending objects to Weaviate in groups rather than one at a time.

Batch creates an internal buffer to collect objects to be added.<br>
Each time the buffer count reaches `batch_size`, batch sends the new objects to Weaviate.

Types of batch:
* `dynamic` - let batch calculate the optimal batch_size based on detected latency
* `fixed_size` - provide a fixed batch_size
* `rate_limit` - limit the number of requests (per minute), useful for working with models with a rate limit

### Take 1 – import sample 100

In [14]:
from tqdm import tqdm

wiki = client.collections.get("Wiki")

# Only the first 100 records for this first trial run
sample_100 = data_2k[:100]

# Queue every record on a dynamically sized batch
with wiki.batch.dynamic() as batch:
    for record in tqdm(sample_100):
        batch.add_object(properties=record)

print(f"Wiki count: {len(wiki)}")

100%|██████████| 100/100 [00:00<00:00, 6237.53it/s]


Wiki count: 100


In [15]:
# Report any objects the batch import could not store
failed_objects = wiki.batch.failed_objects

if failed_objects:
    print("Import complete with errors")
    for err in failed_objects:
        print(err)
else:
    print("Import complete with no errors")

Import complete with no errors


### Take 2 – import sample 100 – with UUID

To avoid inserting duplicates, you can generate a UUID based on the whole object or a unique property.

In [16]:
from weaviate.util import generate_uuid5

separator = "===================================="

# The same input printed three times
same_text = "This UUID is always the same"
for _repeat in range(3):
    print(generate_uuid5(same_text))
print(separator)

# A second input, printed twice
other_text = "This UUID is different"
for _repeat in range(2):
    print(generate_uuid5(other_text))
print(separator)

# Whole objects can be used as input too; these two differ only in "count"
obj1 = {"title": "this is an object", "count": 1}
obj2 = {"title": "this is an object", "count": 2}
for obj in (obj1, obj2):
    print(generate_uuid5(obj))


8d3441c0-c1d1-5859-8a5e-efce9e7d3bd8
8d3441c0-c1d1-5859-8a5e-efce9e7d3bd8
8d3441c0-c1d1-5859-8a5e-efce9e7d3bd8
09f975a6-0e62-565a-982e-e6ce148eac86
09f975a6-0e62-565a-982e-e6ce148eac86
c3c3ad32-fa65-5944-a021-415f8fda02af
4d0b77d3-4862-59bc-bf9f-9fe2b9bf89f0


In [17]:
# recreate the collection to start again
# (create_wiki_collection deletes an existing "Wiki" collection before creating it,
# so previously imported objects are removed)
create_wiki_collection()

> Rerun the import script multiple times.

> Starting from the second run, the script should finish a lot faster, and the wiki count shouldn't increase.

In [18]:
from tqdm import tqdm
from weaviate.util import generate_uuid5

sample_100 = data_2k[0:100]

wiki = client.collections.get("Wiki")

with wiki.batch.fixed_size(batch_size=20, concurrent_requests=2) as batch:
    for item in tqdm(sample_100):
        # Deterministic UUID derived from the record's wiki_id, so re-running
        # the import reuses the same UUIDs instead of generating new ones.
        # Named `object_uuid` (not `id`) to avoid shadowing the builtin `id`
        # in the notebook's global namespace.
        object_uuid = generate_uuid5(item["wiki_id"])

        batch.add_object(
            item,
            uuid=object_uuid
        )

print(f"Wiki count: {len(wiki)}")

100%|██████████| 100/100 [00:00<00:00, 155.62it/s]


Wiki count: 100


### Take 3 - import the rest of the data - but break if multiple errors

In [19]:
from tqdm import tqdm
from weaviate.util import generate_uuid5

wiki = client.collections.get("Wiki")

with wiki.batch.fixed_size(batch_size=2000, concurrent_requests=2) as batch:
    for item in tqdm(data_2k):
        # Deterministic UUID derived from the record's wiki_id.
        # Named `object_uuid` (not `id`) to avoid shadowing the builtin `id`
        # in the notebook's global namespace.
        object_uuid = generate_uuid5(item["wiki_id"])
        batch.add_object(item, uuid=object_uuid)

        # Check number of errors while running; stop queueing once more than 10 are reported
        if batch.number_errors > 10:
            print("Errors during batch import")
            break

100%|██████████| 2000/2000 [00:00<00:00, 90473.45it/s]


### Check for errors

In [21]:
# Summarize the outcome of the import: list each failed object, if any
import_errors = wiki.batch.failed_objects

if not import_errors:
    print("Import complete with no errors")
else:
    print("Import complete with errors")
    for err in import_errors:
        print(err)

Import complete with no errors


## Bonus - iterate through all collection data

The client has a built-in function that allows you to iterate through all collection data.

In [22]:
from itertools import islice

wiki = client.collections.get("Wiki")

# Number of objects to print from the collection iterator.
preview_limit = 100

# islice stops the loop after exactly `preview_limit` objects, with no manual
# counter that could be off by one.
for item in islice(wiki.iterator(), preview_limit):
    print(item.properties)

{'text': 'Mission Hills is an urban residential community of the San Fernando Valley, within the city of Los Angeles, California.', 'title': 'Mission Hills, Los Angeles', 'wiki_id': '20231101.simple_512123_0', 'url': 'https://simple.wikipedia.org/wiki/Mission%20Hills%2C%20Los%20Angeles'}
{'text': 'Pavlov researched classical conditioning through the use of dogs and their natural ability to salivate, produce water in their mouths. Thorndike and Watson rejected looking at one\'s own conscious thoughts and feelings ("Introspection"). They wanted to restrict psychology to experimental methods. Skinner\'s research leant mainly on behavior shaping using positive reinforcement (rewards rather than punishments).', 'title': 'Behaviorism', 'wiki_id': '20231101.simple_260848_4', 'url': 'https://simple.wikipedia.org/wiki/Behaviorism'}
{'text': 'On 27 April 1970 he was selected as a cosmonaut and flew as Commander on Soyuz T-5. He retired on 31 October 1992 due to medical reasons after injuries he 

You can also retrieve the vector embeddings by passing `include_vector=True`.

In [23]:
from itertools import islice

# Number of objects to print from the collection iterator.
preview_limit = 10

# islice stops the loop after exactly `preview_limit` objects, with no manual
# counter that could be off by one.
for item in islice(wiki.iterator(include_vector=True), preview_limit):
    print(item.properties)
    print(item.vector)

{'text': 'Mission Hills is an urban residential community of the San Fernando Valley, within the city of Los Angeles, California.', 'title': 'Mission Hills, Los Angeles', 'wiki_id': '20231101.simple_512123_0', 'url': 'https://simple.wikipedia.org/wiki/Mission%20Hills%2C%20Los%20Angeles'}
{'main_vector': [-0.04594915360212326, 0.04193487763404846, 0.04189903289079666, -0.0003488972724881023, 0.017983609810471535, -0.021612590178847313, 0.02356596663594246, -0.025286372750997543, -0.022867051884531975, 0.008664748631417751, -0.05046521872282028, 0.009910251013934612, 0.012741751037538052, -0.01751766726374626, 0.010895899496972561, -0.10021360218524933, 0.06365498900413513, -0.06265142560005188, 0.0028359803836792707, 0.018781090155243874, -0.06383419781923294, -0.003908993676304817, 0.009731042198836803, -0.02849421091377735, 0.040644571185112, -0.05121789500117302, 0.038135647773742676, -0.023350916802883148, -0.013861806131899357, -0.07899527251720428, -0.07820675522089005, -0.0289422

## Close the client

In [24]:
# Close the client connection that was opened at the start of the notebook.
client.close()