In [1]:
from typing import List

import loguru
from datasets import load_dataset
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm

load_dotenv()  # take environment variables from .env.

# Alias the shared loguru logger and register an extra file sink: records at
# level INFO are written to "logs.log" using the "{time} {level} {message}"
# layout.
logger = loguru.logger
# NOTE: this call is the last expression of the cell, so its return value is
# echoed as the cell output.
logger.add("logs.log", format="{time} {level} {message}", level="INFO")

1

In [2]:
# Load the "train" split of the KShivendu/dbpedia-entities-openai-1M dataset.
dataset = load_dataset("KShivendu/dbpedia-entities-openai-1M", split="train")

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

In [3]:
# Shuffle with a fixed seed so the subset is reproducible, then keep the
# first 100000 rows of the shuffled order.
dataset = dataset.shuffle(seed=42).select(range(100000))

In [4]:
def sanitize(text: str):
    """Flatten a string onto one line and trim it.

    Every newline and tab is replaced by a single space, then leading and
    trailing whitespace is stripped. If nothing is left, a single space is
    returned instead of an empty string.
    """
    cleaned = text.replace("\n", " ").replace("\t", " ").strip()
    # Fall back to " " so the result is never the empty string.
    return cleaned if cleaned else " "

# Add a "combined_text" column built from each row's title and text. The two
# are joined with a newline, which sanitize() then turns into a space.
dataset = dataset.map(lambda x: {"combined_text": sanitize(f"{x['title']}\n{x['text']}")})
# Keep the new column under its own name; the embedding cells read from it.
combined_text = dataset["combined_text"]

In [13]:
# Embedding model passed to get_embedding(); also interpolated into the Hub
# repo name when the dataset is pushed.
MODEL_NAME = "text-embedding-3-large"
# Requested length of each embedding vector (sent as the `dimensions`
# argument); also interpolated into the Hub repo name.
DIMENSIONS = 1536

In [14]:
# Module-level API client used by get_embedding(). No credentials are passed
# explicitly; presumably the key is read from the environment populated by
# load_dotenv() -- TODO confirm.
client = OpenAI()

def get_embedding(texts: List[str], model: str, dimensions: int):
    """Request embeddings for a batch of texts in a single API call.

    Args:
        texts: Strings to embed. Newlines in each string are replaced with
            spaces before the request is sent; the caller's list is not
            modified.
        model: Name of the embedding model to use.
        dimensions: Requested length of each returned vector.

    Returns:
        The raw response object from ``client.embeddings.create``. The
        vectors are available as ``response.data[k].embedding``.
    """
    # Build a new list rather than rebinding the parameter, so it stays clear
    # which value is the caller's input and which is what gets sent.
    flattened = [text.replace("\n", " ") for text in texts]
    return client.embeddings.create(input=flattened, model=model, dimensions=dimensions)

In [15]:
# Smoke test: embed a single sentence and display the length of the returned
# vector to confirm the model honours the requested dimensions.
sentences = [
    'Entamoebidae Entamoebidae is a family of Archamoebae.It includes Entamoeba and Endolimax.',
]
embeddings_data = get_embedding(sentences, model=MODEL_NAME, dimensions=DIMENSIONS)
len(embeddings_data.data[0].embedding)

1536

In [16]:
# Embed the whole corpus in fixed-size batches, one API request per batch.
bs = 1000  # number of texts sent per request
response_objects = []  # raw API responses, appended in batch order
for i in tqdm(range(0, len(combined_text), bs)):
    # The final slice is shorter than `bs` when the corpus size is not a
    # multiple of it.
    this_batch = list(combined_text[i : i + bs])
    embeddings_batch_response = get_embedding(this_batch, model=MODEL_NAME, dimensions=DIMENSIONS)
    # Responses are only held in memory: if a request raises, the loop stops
    # and the batches collected so far remain in `response_objects`, but
    # nothing has been written to disk.
    response_objects.append(embeddings_batch_response)

100%|██████████| 100/100 [08:47<00:00,  5.28s/it]


In [17]:
# Flatten the per-batch API responses into one list of vectors, one per row.
# Each response item carries an `index` giving its position within that
# request's input list; ordering by it makes the row alignment explicit
# instead of relying on the order in which items appear in the response.
embedding_responses = [
    sorted(r.data, key=lambda item: item.index) for r in response_objects
]
embedding_objects = [item for sublist in embedding_responses for item in sublist]
embeddings = [e.embedding for e in embedding_objects]

# Fail here, before the vectors are attached to the dataset, if any batch
# came back short or a batch is missing.
assert len(embeddings) == len(combined_text), (
    f"expected {len(combined_text)} embeddings, got {len(embeddings)}"
)

In [18]:
# Drop the existing "embedding" column so the name can be reused for the
# newly computed vectors.
# NOTE(review): not idempotent -- re-running this cell after the column is
# already gone will presumably raise; confirm before re-executing.
dataset = dataset.remove_columns("embedding")

In [19]:
# Attach the new vectors as the "embedding" column. `embeddings` must be in
# the same order as the dataset rows, since values are matched by position.
dataset = dataset.add_column("embedding", embeddings)

In [20]:
# Publish the dataset. With the current constants the repo id resolves to
# Qdrant/dbpedia-entities-openai3-text-embedding-3-large-1536-100K.
# The "100K" suffix is hard-coded and has to be kept in sync by hand with the
# number of rows selected earlier (100000).
dataset.push_to_hub(f"Qdrant/dbpedia-entities-openai3-{MODEL_NAME}-{DIMENSIONS}-100K")

Uploading the dataset shards:   0%|          | 0/4 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/25 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/25 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/25 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/25 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-large-1536-100K/commit/4c5c6620ca00986409c1df5599eeeb5d1377dd02', commit_message='Upload dataset', commit_description='', oid='4c5c6620ca00986409c1df5599eeeb5d1377dd02', pr_url=None, pr_revision=None, pr_num=None)