# Load Data with Vectors

## Get keys and urls

In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

WEAVIATE_KEY = os.getenv("WEAVIATE_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_URL = os.getenv("OPENAI_URL")

print(f"Weaviate Key:{WEAVIATE_KEY}")
print(f"OpenAI API Key: {OPENAI_API_KEY[:20]}")
print(f"OpenAI URL: {OPENAI_URL}")



Weaviate Key:root-user-key
OpenAI API Key: sk-proj-iuwKF1Q94jnW
OpenAI URL: https://api.openai.com


## Connect to Weaviate

In [2]:
import weaviate
from weaviate.classes.init import Auth

# Connect to the local instance
client = weaviate.connect_to_local(
  host="127.0.0.1", # the address to the learner's instance
  port=8080,
  grpc_port=50051,
  auth_credentials=Auth.api_key(WEAVIATE_KEY),
  headers={
    "X-OpenAI-Api-Key": OPENAI_API_KEY
  }
)

print(client.is_ready())

True


In [5]:
from weaviate.classes.config import Configure

def create_wiki_collection():
    if client.collections.exists("Wiki"):
        client.collections.delete("Wiki")

    # Create a collection here - with Weaviate vectorizer and define source properties
    client.collections.create(
        name="Wiki",

        vector_config=[
            Configure.Vectors.text2vec_openai(
                name="main_vector",
                base_url=OPENAI_URL,
                model= "text-embedding-3-small",
                source_properties=['title', 'texts']

                # TODO: use model text-embedding-3-small (or your favorite model)
                # TODO: set source properties to title and text
                
            )
        ],
    )

create_wiki_collection()

## Load the data from parquet files

In [6]:
from datasets import load_dataset

def prepare_dataset():
    return load_dataset('parquet', data_files={'train': ['../wiki-data/openai/text-embedding-3-small/*.parquet']}, split="train", streaming=True)
    # return load_dataset("weaviate/wiki-sample", "openai-text-embedding-3-small", split="train", streaming=True)

### Dataset Test
<!-- The parquet files should be located in "datasets/openai". -->

In [7]:
dataset = prepare_dataset()

counter = 10
for i in dataset:
    print(i)

    counter -= 1
    if(counter == 0): break

{'title': 'Unicode', 'text': "The Unicode Standard includes more than just the base code. Alongside the character encodings, the Consortium's official publication includes a wide variety of details about the scripts and how to display them: normalization rules, decomposition, collation, rendering, and bidirectional text display order for multilingual texts, and so on.", 'wiki_id': '20231101.simple_64846_4', 'url': 'https://simple.wikipedia.org/wiki/Unicode', 'vector': [-0.011741329915821552, 0.009521514177322388, 0.05931148678064346, -0.01194709911942482, -0.02601424790918827, -0.04736438766121864, 0.0016882448690012097, -0.011442028917372227, -0.01723475009202957, 0.00785041693598032, 0.005234650336205959, -0.06953760236501694, -0.06754226237535477, 0.046690959483385086, 0.01177250687032938, 0.014129502698779106, -0.04038069769740105, -0.02858324721455574, -0.06020938977599144, -0.03723803535103798, 0.01711004227399826, -0.02741098590195179, 0.01505234744399786, 0.042475804686546326, 

### The import function

`TODO:`
* add a function to add objects to batch

In [8]:
from tqdm import tqdm
from weaviate.util import generate_uuid5

def import_wiki_data(max_rows=10_000):
    print(f"Importing {max_rows} data items")

    dataset = prepare_dataset()
    wiki = client.collections.get("Wiki")

    counter = 0

    with wiki.batch.fixed_size(batch_size=2000, concurrent_requests=2) as batch:
        for item in tqdm(dataset, total=max_rows):

            data_to_insert = {   
                "wiki_id": item["wiki_id"],
                "text": item["text"],
                "title": item["title"],
                "url": item["url"],
            }

            item_id = generate_uuid5(item["wiki_id"])

            item_vector = {
                "main_vector": item["vector"]
            }

            # TODO: add objects to batch using
            batch.add_object(
                properties=data_to_insert,
                uuid=item_id,
                vector=item_vector,
                # * data_to_insert
                # * item_id
                # * item_vector
            )

            # Check number of errors while running
            if(batch.number_errors > 10):
                print(f"Reached {batch.number_errors} errors during batch import")
                break
            
            # stop after the request number reaches = max_rows
            counter += 1
            if counter >= max_rows:
                break
    
    # check for errors at the end
    if (len(wiki.batch.failed_objects)>0):
        print("Final error check")
        print(f"Some errors {len(wiki.batch.failed_objects)}")
        print(wiki.batch.failed_objects[-1])
    
    print(f"Imported {counter} items")
    print("-----------------------------------")

In [9]:
import_wiki_data(10_000)

Importing 10000 data items


100%|█████████▉| 9999/10000 [00:16<00:00, 589.42it/s]


Imported 10000 items
-----------------------------------


## Check if data loaded correctly

In [10]:
wiki = client.collections.get("Wiki")
len(wiki)

10000

In [11]:
res = wiki.query.fetch_objects(limit=1, include_vector=True)
print(res.objects[0].properties)
print(res.objects[0].vector)

{'text': "The Unicode Standard includes more than just the base code. Alongside the character encodings, the Consortium's official publication includes a wide variety of details about the scripts and how to display them: normalization rules, decomposition, collation, rendering, and bidirectional text display order for multilingual texts, and so on.", 'title': 'Unicode', 'wiki_id': '20231101.simple_64846_4', 'url': 'https://simple.wikipedia.org/wiki/Unicode'}
{'main_vector': [-0.011741329915821552, 0.009521514177322388, 0.05931148678064346, -0.01194709911942482, -0.02601424790918827, -0.04736438766121864, 0.0016882448690012097, -0.011442028917372227, -0.01723475009202957, 0.00785041693598032, 0.005234650336205959, -0.06953760236501694, -0.06754226237535477, 0.046690959483385086, 0.01177250687032938, 0.014129502698779106, -0.04038069769740105, -0.02858324721455574, -0.06020938977599144, -0.03723803535103798, 0.01711004227399826, -0.02741098590195179, 0.01505234744399786, 0.04247580468654

In [12]:
client.close()