# Adding Product Quantization while storing

In [1]:
#Loading environment variables

import os

WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY")
WEAVIATE_CLUSTER_URL = os.getenv("WEAVIATE_CLUSTER_URL")

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
COHERE_API_KEY = os.getenv("COHERE_API_KEY")

## Connect to WCS Instance

In [2]:
import weaviate

client = weaviate.connect_to_wcs(
    cluster_url=WEAVIATE_CLUSTER_URL,
    auth_credentials=weaviate.auth.AuthApiKey(WEAVIATE_API_KEY),
    headers={
        "OpenAI-Api-Key": OPENAI_API_KEY,
        "Cohere-Api-Key": COHERE_API_KEY,
    },
)

client.is_ready()

True

## Vector Store with PQ

In [5]:
from weaviate.classes.config import Configure

if client.collections.exists("WikipediaPQ"):
    client.collections.delete("WikipediaPQ")

client.collections.create(
    name="WikipediaPQ",
    vectorizer_config=[
        Configure.NamedVectors.text2vec_cohere(
            name="text_vector",
            model="embed-multilingual-v2.0",
            source_properties=["text"],

            vector_index_config=Configure.VectorIndex.hnsw(
                quantizer=Configure.VectorIndex.Quantizer.pq(
                    segments=128, # 768/6
                )
            ),

    )],

    generative_config=Configure.Generative.openai("gpt-4")

)

<weaviate.collections.collection.Collection at 0x10dc2cf10>

In [6]:
from datasets import load_dataset
from tqdm import tqdm

def import_wiki_data(lang, lang_id, max_rows, skip_rows=0):
    print(f"Importing {max_rows} data items for {lang}")

    dataset = load_dataset(f"Cohere/wikipedia-22-12-{lang}-embeddings", split="train", streaming=True)
    dataset = dataset.skip(skip_rows)

    # counter = 0
    counter = skip_rows

    wikipedia = client.collections.get("Wikipedia")

    with wikipedia.batch.fixed_size(batch_size=1000, concurrent_requests=4) as batch:
        for item in tqdm(dataset, initial=skip_rows, total=max_rows):
            vector = item["emb"]
            data_to_insert = {   
                "text": item["text"],
                "wiki_id": item["wiki_id"],
                "title": item["title"],
                "url": item["url"],
                "views": item["views"],
                "lang": lang,
                "lang_id": lang_id,
            }

            batch.add_object(
                properties=data_to_insert,
                vector=vector
            )
            
            # stop after the request number reaches = max_rows
            counter += 1
            if counter >= max_rows:
                break
    
    # check for errors at the end
    if (len(wikipedia.batch.failed_objects)>0):
        print("Final error check")
        print(f"Some errors {len(wikipedia.batch.failed_objects)}")
        print(wikipedia.batch.failed_objects[-1])
    
    print(f"Imported {counter} items for {lang}")
    print("-----------------------------------")

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
import_per_country = 10_000

import_wiki_data("en", 0, import_per_country, 0)
import_wiki_data("de", 1, import_per_country, 0)
import_wiki_data("fr", 2, import_per_country, 0)
# import_wiki_data("es", 3, import_per_country, 0)
# import_wiki_data("it", 4, import_per_country, 0)

Importing 10000 data items for en


100%|█████████▉| 9999/10000 [01:12<00:00, 136.99it/s]


Imported 10000 items for en
-----------------------------------
Importing 10000 data items for de


100%|█████████▉| 9999/10000 [01:13<00:00, 136.51it/s]


Imported 10000 items for de
-----------------------------------
Importing 10000 data items for fr


100%|█████████▉| 9999/10000 [01:13<00:00, 136.45it/s]


Imported 10000 items for fr
-----------------------------------
