In [10]:
#Loading environment variables

import os

# Weaviate Cloud connection settings (each value is None when the variable is unset)
WEAVIATE_CLUSTER_URL = os.environ.get("WEAVIATE_CLUSTER_URL")
WEAVIATE_API_KEY = os.environ.get("WEAVIATE_API_KEY")

# API keys for the model providers
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
COHERE_API_KEY = os.environ.get("COHERE_API_KEY")

## Connect to Weaviate Instance

In [None]:
import weaviate, os

# Open an API-key-authenticated connection to the Weaviate Cloud (WCS) cluster
client = weaviate.connect_to_wcs(
    auth_credentials=weaviate.auth.AuthApiKey(WEAVIATE_API_KEY),
    cluster_url=WEAVIATE_CLUSTER_URL,
)

# Last expression of the cell, so the result of the readiness check is displayed
client.is_ready()

## Store Cohere Vector Embeddings in Weaviate

In [12]:
from weaviate.classes.config import Configure, Property, DataType

# Drop a previous "Wikipedia" collection, if any, so this cell can be re-run
if client.collections.exists("Wikipedia"):
    client.collections.delete("Wikipedia")

# Define the collection: Cohere as the vectorizer, OpenAI as the generative module
client.collections.create(
    name="Wikipedia",
    vectorizer_config=Configure.Vectorizer.text2vec_cohere(model="embed-multilingual-v2.0"),
    generative_config=Configure.Generative.openai("gpt-4"),
    properties=[
        # "text" is the only property not marked skip_vectorization
        Property(name="text", data_type=DataType.TEXT),
        # Metadata properties, all created with skip_vectorization=True
        *(
            Property(name=prop_name, data_type=prop_type, skip_vectorization=True)
            for prop_name, prop_type in (
                ("title", DataType.TEXT),
                ("wiki_id", DataType.INT),
                ("url", DataType.TEXT),
                ("lang", DataType.TEXT),
                ("lang_id", DataType.INT),
                ("views", DataType.NUMBER),
            )
        ),
    ],
)

<weaviate.collections.collection.Collection at 0x11f6fe310>

In [13]:
from datasets import load_dataset
from tqdm import tqdm

def import_wiki_data(lang, lang_id, max_rows, skip_rows=0):
    """Stream pre-embedded Wikipedia rows for one language into the "Wikipedia" collection.

    Rows are read in streaming mode from the Hugging Face dataset
    ``Cohere/wikipedia-22-12-{lang}-embeddings`` and inserted in batches, each
    object carrying the vector found in the row's ``emb`` field.

    Args:
        lang: Language code; selects the dataset and is stored in the ``lang``
            property of every inserted object.
        lang_id: Value stored in the ``lang_id`` property of every inserted object.
        max_rows: Absolute dataset row index (exclusive) at which the import
            stops, so at most ``max_rows - skip_rows`` rows are inserted.
        skip_rows: Number of leading dataset rows to skip before importing,
            e.g. to resume a previous import.
    """
    # Rows [skip_rows, max_rows) are imported. Clamp at zero so that
    # max_rows <= skip_rows imports nothing instead of one stray row.
    rows_to_import = max(max_rows - skip_rows, 0)
    print(f"Importing {rows_to_import} data items for {lang}")

    dataset = load_dataset(f"Cohere/wikipedia-22-12-{lang}-embeddings", split="train", streaming=True)
    # Bound the stream up front rather than breaking out of the loop: the
    # limit is enforced before any insert and tqdm sees every imported row.
    dataset = dataset.skip(skip_rows).take(rows_to_import)

    # Rows actually inserted in this call (excludes the skipped ones)
    imported = 0

    wikipedia = client.collections.get("Wikipedia")

    with wikipedia.batch.fixed_size(batch_size=1000, concurrent_requests=4) as batch:
        for item in tqdm(dataset, initial=skip_rows, total=skip_rows + rows_to_import):
            batch.add_object(
                properties={
                    "text": item["text"],
                    "wiki_id": item["wiki_id"],
                    "title": item["title"],
                    "url": item["url"],
                    "views": item["views"],
                    "lang": lang,
                    "lang_id": lang_id,
                },
                vector=item["emb"],
            )
            imported += 1

    # check for errors at the end
    failed_objects = wikipedia.batch.failed_objects
    if failed_objects:
        print("Final error check")
        print(f"Some errors {len(failed_objects)}")
        print(failed_objects[-1])

    print(f"Imported {imported} items for {lang}")
    print("-----------------------------------")

In [16]:
import_per_country = 10_000

# Import the same number of rows for each language, starting from row 0
import_wiki_data(lang="en", lang_id=0, max_rows=import_per_country, skip_rows=0)
import_wiki_data(lang="de", lang_id=1, max_rows=import_per_country, skip_rows=0)
import_wiki_data(lang="fr", lang_id=2, max_rows=import_per_country, skip_rows=0)
# Currently disabled: "es" (lang_id 3) and "it" (lang_id 4)

Importing 10000 data items for en


  0%|          | 0/10000 [00:00<?, ?it/s]'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 23b6209b-db81-4b06-a0c9-aa173a4210ed)')' thrown while requesting GET https://huggingface.co/datasets/Cohere/wikipedia-22-12-en-embeddings/resolve/85c2eca83d4b9dcecc043c23748cb8c1047f683f/data/train-00000-of-00253-8d3dffb4e6ef0304.parquet
Retrying in 1s [Retry 1/5].
100%|█████████▉| 9999/10000 [00:54<00:00, 182.54it/s]


Imported 10000 items for en
-----------------------------------
Importing 10000 data items for de


100%|█████████▉| 9999/10000 [00:45<00:00, 221.18it/s]


Imported 10000 items for de
-----------------------------------
Importing 10000 data items for fr


100%|█████████▉| 9999/10000 [01:11<00:00, 139.82it/s]


Imported 10000 items for fr
-----------------------------------
