### Method 1. Sentence Transformers to generate embeddings using Multi-GPUs

Importing sentence transformers library

In [3]:
from sentence_transformers import SentenceTransformer
from tqdm.autonotebook import tqdm, trange

In [4]:
model = SentenceTransformer("BAAI/bge-m3", device="cuda")

Using one-liner function from sentnece transformers to use multi-gpu

In [5]:
pool = model.start_multi_process_pool()

In [6]:
# The sentences to encode
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
]

In [7]:
emb = model.encode_multi_process(sentences, pool)

In [8]:
print("Embeddings computed. Shape:", emb.shape)

Embeddings computed. Shape: (3, 1024)


In [1]:
import pyarrow.parquet as pq
import pyarrow as pa
from sentence_transformers import SentenceTransformer
import time
from tqdm.autonotebook import tqdm

start_time = time.time()

input_file = 'bnwiki_all_abstract.parquet'
output_file = 'hf_embed.parquet'

table = pq.read_table(input_file)
df_subset = table.slice(0, 500).to_pandas()

sentences = df_subset['Abstract'].tolist()
version_control = df_subset['Version Control'].tolist()

model = SentenceTransformer("BAAI/bge-m3", device="cuda")
pool = model.start_multi_process_pool()

with tqdm(total=len(sentences), desc="Processing sentences") as pbar:
    embeddings = model.encode_multi_process(sentences, pool)
    pbar.update(len(sentences))

# Convert embeddings to lists of floats
embedding_lists = [list(embed) for embed in embeddings]

# Define schema for the table
embedding_field = pa.field('embeddings', pa.list_(pa.float32()))
version_control_field = pa.field('Version Control', pa.string())
schema = pa.schema([embedding_field, version_control_field])

# Create a PyArrow Table
embedding_array = pa.array(embedding_lists, type=pa.list_(pa.float32()))
embedding_dict = {
    'embeddings': embedding_array,
    'Version Control': pa.array(version_control)
}
table = pa.table(embedding_dict, schema=schema)

pq.write_table(table, output_file)

end_time = time.time()
elapsed_time = (end_time - start_time) / 60

print(f"Operation took {elapsed_time:.2f} minutes")


  from tqdm.autonotebook import tqdm, trange


Processing sentences:   0%|          | 0/500 [00:00<?, ?it/s]

Operation took 0.42 minutes


In [3]:
import pyarrow.parquet as pq

input_file = 'hf_embed.parquet'

table = pq.read_table(input_file)
df = table.to_pandas()

print(df)


                                            Embeddings   Version Control
0    [-0.53130025, 0.57009095, -0.30176148, -0.4871...      MjAyNDA4MTkx
1    [-0.916644, 1.1611831, -0.46351388, 0.11054788...      MjAyNDA4MTky
2    [0.27030998, 0.3748276, -0.8021263, -0.0353029...      MjAyNDA4MTkz
3    [-0.0006129769, 0.10446445, -0.91086644, -0.41...      MjAyNDA4MTk0
4    [-0.92064404, 0.9358199, -0.8850981, -0.496975...      MjAyNDA4MTk1
..                                                 ...               ...
995  [-0.39280245, -0.21199724, -0.2767391, -0.6167...  MjAyNDA4MTk5OTY=
996  [0.02107662, 0.6222971, -0.31643283, -0.400008...  MjAyNDA4MTk5OTc=
997  [-1.2849243, 1.0070957, -0.7613241, -0.1102534...  MjAyNDA4MTk5OTg=
998  [-1.2849243, 1.0070957, -0.7613241, -0.1102534...  MjAyNDA4MTk5OTk=
999  [-1.2849243, 1.0070957, -0.7613241, -0.1102534...  MjAyNDA4MTkxMDAw

[1000 rows x 2 columns]


### Method 2. Generate Embeddings using Ray and Distributing the workload 

In [1]:
from accelerate import Accelerator
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import torch
import time

start_time = time.time()

accelerator = Accelerator()

print("Loading data...")
df = pd.read_parquet('bnwiki_all_abstract.parquet')
df = df.head(300)

print("Initializing model...")
model_name = 'BAAI/bge-m3'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model = model.to(accelerator.device)
model = accelerator.prepare(model)

def compute_embeddings(texts, tokenizer, model):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    inputs = {key: value.to(accelerator.device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

batch_size = 32
embeddings = []
version_control = df['Version Control'].tolist()

print("Processing batches...")
for i in range(0, len(df), batch_size):
    batch_texts = df['Abstract'].iloc[i:i + batch_size].tolist()
    batch_embeddings = compute_embeddings(batch_texts, tokenizer, model)
    embeddings.append(batch_embeddings)

print("Combining embeddings...")
all_embeddings = torch.cat(embeddings).cpu().numpy()

print("Saving results...")
output_df = pd.DataFrame({
    'Embeddings': list(all_embeddings),
    'Version Control': version_control
})
output_df.to_parquet('hf_embed.parquet', index=False)

end_time = time.time()
elapsed_time = (end_time - start_time) / 60

print(f"Processing completed in {elapsed_time:.2f} minutes.")

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Loading data...
Initializing model...
Processing batches...
Combining embeddings...
Saving results...
Processing completed in 0.42 minutes.


In [1]:
!nvidia-smi

Mon Sep  2 17:07:09 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        Off |   00000000:01:00.0 Off |                  N/A |
| 34%   25C    P8             36W /  370W |    7240MiB /  24576MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA GeForce RTX 3090        Off |   00

In [1]:
from accelerate import Accelerator
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import torch
import time

start_time = time.time()

accelerator = Accelerator()

print("Loading data...")
df = pd.read_parquet('bnwiki_all_abstract.parquet')
df = df.head(500)

print("Initializing model...")
model_name = 'BAAI/bge-m3'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model = accelerator.prepare(model)

def compute_embeddings(texts, tokenizer, model):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    inputs = {key: value.to(accelerator.device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

batch_size = 32
embeddings = []
version_control = df['Version Control'].tolist()

print("Processing batches...")
for i in range(0, len(df), batch_size):
    batch_texts = df['Abstract'].iloc[i:i + batch_size].tolist()
    with accelerator.split_between_processes(batch_texts) as split_batch_texts:
        batch_embeddings = compute_embeddings(split_batch_texts, tokenizer, model)
        embeddings.append(batch_embeddings)

print("Combining embeddings...")
all_embeddings = torch.cat(embeddings).cpu().numpy()

print("Saving results...")
output_df = pd.DataFrame({
    'Embeddings': list(all_embeddings),
    'Version Control': version_control
})
output_df.to_parquet('hf_embed.parquet', index=False)

end_time = time.time()
elapsed_time = (end_time - start_time) / 60

print(f"Processing completed in {elapsed_time:.2f} minutes.")

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Loading data...
Initializing model...
Processing batches...
Combining embeddings...
Saving results...
Processing completed in 0.60 minutes.


In [2]:
from accelerate import Accelerator, PartialState
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import torch
import time

start_time = time.time()

accelerator = Accelerator()
distributed_state = PartialState()

print("Loading data...")
df = pd.read_parquet('bnwiki_all_abstract.parquet')
df = df.head(1000)

print("Initializing model...")
model_name = 'BAAI/bge-m3'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model = model.to(accelerator.device)
model = accelerator.prepare(model)

def compute_embeddings(texts, tokenizer, model):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    inputs = {key: value.to(accelerator.device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

batch_size = 32
embeddings = []
version_control = df['Version Control'].tolist()

print("Processing batches...")
for i in range(0, len(df), batch_size):
    batch_texts = df['Abstract'].iloc[i:i + batch_size].tolist()
    with distributed_state.split_between_processes(batch_texts) as split_batch_texts:
        batch_embeddings = compute_embeddings(split_batch_texts, tokenizer, model)
        embeddings.append(batch_embeddings)

print("Combining embeddings...")
all_embeddings = torch.cat(embeddings).cpu().numpy()

print("Saving results...")
output_df = pd.DataFrame({
    'Embeddings': list(all_embeddings),
    'Version Control': version_control
})
output_df.to_parquet('hf_embed.parquet', index=False)

end_time = time.time()
elapsed_time = (end_time - start_time) / 60

print(f"Processing completed in {elapsed_time:.2f} minutes.")

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Loading data...
Initializing model...
Processing batches...
Combining embeddings...
Saving results...
Processing completed in 1.05 minutes.
