In [1]:
import toml
from openai import OpenAI
import os
from datasets import load_dataset
import pandas as pd

# 載入 secrets.toml 文件
secrets = toml.load("secrets.toml")

# MongoDB URI
ATLAS_CONNECTION_STRING = secrets["MONGODB_URL"]
# 設置環境變數
os.environ["OPENAI_API_KEY"] = secrets["OPENAI_API_KEY"]

openai_client = OpenAI()

# Use streaming=True to load the dataset without downloading it fully
data = load_dataset(
    "MongoDB/cosmopedia-wikihow-chunked",
    split="train",
    streaming=True
)
# Get first 25k records from the dataset
data_head = data.take(25000)
df = pd.DataFrame(data_head)

# Use this if you want the full dataset
# data = load_dataset("MongoDB/cosmopedia-wikihow-chunked", split="train")
# df = pd.DataFrame(data)

In [2]:
df

Unnamed: 0,doc_id,chunk_id,text_token_length,text
0,0,0,180,Title: How to Create and Maintain a Compost Pi...
1,0,1,141,**Step 2: Gather Materials**\nGather brown (ca...
2,0,2,182,_Key guideline:_ For every volume of green mat...
3,0,3,188,_Key tip:_ Chop large items like branches and ...
4,0,4,157,**Step 7: Maturation and Use**\nAfter 3-4 mont...
...,...,...,...,...
24995,4334,2,178,Guideline: Make copies of all original documen...
24996,4334,3,183,"In Texas, the bond amount must equal either 1...."
24997,4334,4,191,Bond premium costs vary depending on factors s...
24998,4334,5,136,Or deliver it in person during business hours ...


In [3]:
# Ensuring length of dataset is what we expect i.e. 25k
len(df)

# Previewing the contents of the data
df.head()

# Only keep records where the text field is not null
df = df[df["text"].notna()]

# Number of unique documents in the dataset
df.doc_id.nunique()

4335

Step 5

In [4]:
from typing import Any, Dict, List

def get_embeddings(docs: List[str], input_type: str, model:str="voyage-lite-02-instruct") -> List[List[float]]:
    """
    Get embeddings using the Voyage AI API.

    Args:
        docs (List[str]): List of texts to embed
        input_type (str): Type of input to embed. Can be "document" or "query".
        model (str, optional): Model name. Defaults to "voyage-lite-02-instruct".

    Returns:
        List[List[float]]: Array of embedddings
    """
    response = voyage_client.embed(docs, model=model, input_type=input_type)
    return response.embeddings

Step 5 Part II

In [5]:
def get_embeddings(docs: List[str], model: str="text-embedding-3-large") -> List[List[float]]:
    """
    Generate embeddings using the OpenAI API.

    Args:
        docs (List[str]): List of texts to embed
        model (str, optional): Model name. Defaults to "text-embedding-3-large".

    Returns:
        List[float]: Array of embeddings
    """
    # replace newlines, which can negatively affect performance.
    docs = [doc.replace("\n", " ") for doc in docs]
    response = openai_client.embeddings.create(input=docs, model=model)
    response = [r.embedding for r in response.data]
    return response

In [6]:
from typing import List
from transformers import AutoModel, AutoTokenizer
import torch

# Instruction to append to user queries, to improve retrieval
RETRIEVAL_INSTRUCT = "Represent this sentence for searching relevant passages:"

# Check if CUDA (GPU support) is available, and set the device accordingly
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Load the UAE-Large-V1 model from the Hugging Face 
model = AutoModel.from_pretrained('WhereIsAI/UAE-Large-V1').to(device)
# Load the tokenizer associated with the UAE-Large-V1 model
tokenizer = AutoTokenizer.from_pretrained('WhereIsAI/UAE-Large-V1')

# Decorator to disable gradient calculations
@torch.no_grad()
def get_embeddings(docs: List[str], input_type: str) -> List[List[float]]:

    # Prepend retrieval instruction to queries
    if input_type == "query":
        docs = ["{}{}".format(RETRIEVAL_INSTRUCT, q) for q in docs]
    # Tokenize input texts
    inputs = tokenizer(docs, padding=True, truncation=True, return_tensors='pt', max_length=512).to(device)
    # Pass tokenized inputs to the model, and obtain the last hidden state
    last_hidden_state = model(**inputs, return_dict=True).last_hidden_state
    # Extract embeddings from the last hidden state
    embeddings = last_hidden_state[:, 0]
    return embeddings.cpu().numpy()

config.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Step 6: Evaluation

In [7]:
from tqdm.auto import tqdm

# Get all the texts in the dataset
texts = df["text"].tolist()

# Number of samples in a single batch
batch_size = 128

embeddings = []
# Generate embeddings in batches
for i in tqdm(range(0, len(texts), batch_size)):
    end = min(len(texts), i+batch_size)
    batch = texts[i:end]
    # Generate embeddings for current batch
    batch_embeddings = get_embeddings(batch)
    # Add to the list of embeddings
    embeddings.extend(batch_embeddings)

  0%|          | 0/196 [00:00<?, ?it/s]

TypeError: get_embeddings() missing 1 required positional argument: 'input_type'