First, we'll use the Sentence Transformers library to compute our embeddings across multiple GPUs.

### Method 1. Sentence Transformers to generate embeddings using Multi-GPUs

Import the Sentence Transformers library.

In [3]:
from sentence_transformers import SentenceTransformer
from tqdm.autonotebook import tqdm, trange

In [4]:
# Load the "BAAI/bge-m3" embedding model, placing it on the CUDA device.
model = SentenceTransformer("BAAI/bge-m3", device="cuda")

Use the one-line helper from Sentence Transformers to start a multi-GPU worker pool.

In [5]:
# Start the worker pool that encode_multi_process() is given below.
# NOTE(review): no matching stop_multi_process_pool(pool) call is visible in
# this part of the notebook — confirm the workers are shut down when done.
pool = model.start_multi_process_pool()

In [6]:
# Three short example inputs for the multi-GPU encoding demo.
sentences = ["The weather is lovely today.", "It's so sunny outside!", "He drove to the stadium."]

In [7]:
# Encode the example sentences through the worker pool; `emb` holds one
# embedding row per input sentence.
emb = model.encode_multi_process(sentences, pool)

In [8]:
# Report the shape of the result as a quick sanity check.
print("Embeddings computed. Shape:", emb.shape)

Embeddings computed. Shape: (3, 1024)


In [1]:
import pyarrow.parquet as pq
import pyarrow as pa
from sentence_transformers import SentenceTransformer
import time
from tqdm.autonotebook import tqdm

# End-to-end run: read the first rows of the source parquet file, embed the
# 'Abstract' column on all GPUs, and write the embeddings together with the
# 'Version Control' column to a new parquet file. Wall-clock time is reported
# at the end.
start_time = time.time()

input_file = 'bnwiki_all_abstract.parquet'
output_file = 'hf_embed.parquet'
max_rows = 500  # number of leading rows embedded in this run

# Read the source file and keep only the first `max_rows` rows.
table = pq.read_table(input_file)
df_subset = table.slice(0, max_rows).to_pandas()

sentences = df_subset['Abstract'].tolist()
version_control = df_subset['Version Control'].tolist()

model = SentenceTransformer("BAAI/bge-m3", device="cuda")
pool = model.start_multi_process_pool()

try:
    # encode_multi_process() is a single blocking call, so this bar cannot
    # advance incrementally: it jumps from 0 to len(sentences) on return.
    with tqdm(total=len(sentences), desc="Processing sentences") as pbar:
        embeddings = model.encode_multi_process(sentences, pool)
        pbar.update(len(sentences))
finally:
    # Always shut the worker processes down, even when encoding raises;
    # the original cell never stopped the pool, leaving the workers alive.
    model.stop_multi_process_pool(pool)

# Convert the embedding matrix to nested Python lists in one call instead of
# building a list per row element by element.
embedding_lists = embeddings.tolist()

# Define schema for the table
embedding_field = pa.field('embeddings', pa.list_(pa.float32()))
version_control_field = pa.field('Version Control', pa.string())
schema = pa.schema([embedding_field, version_control_field])

# Create a PyArrow Table
embedding_array = pa.array(embedding_lists, type=pa.list_(pa.float32()))
embedding_dict = {
    'embeddings': embedding_array,
    'Version Control': pa.array(version_control)
}
table = pa.table(embedding_dict, schema=schema)

pq.write_table(table, output_file)

end_time = time.time()
elapsed_time = (end_time - start_time) / 60

print(f"Operation took {elapsed_time:.2f} minutes")


  from tqdm.autonotebook import tqdm, trange


Processing sentences:   0%|          | 0/500 [00:00<?, ?it/s]

Operation took 0.42 minutes


In [2]:
import pyarrow.parquet as pq

# Read back 'hf_embed.parquet' and print it as a DataFrame to inspect the
# stored columns.
input_file = 'hf_embed.parquet'

# Load the whole file as an Arrow table, then convert it to pandas.
table = pq.read_table(input_file)
df = table.to_pandas()

print(df)


                                            embeddings   Version Control
0    [-0.0043944786, -0.006028448, -0.024245542, -0...      MjAyNDA4MTkx
1    [-0.060743816, 0.047289453, -0.026209904, 0.01...      MjAyNDA4MTky
2    [0.010410685, 0.031769264, -0.051782552, -0.01...      MjAyNDA4MTkz
3    [-0.0097603975, 0.02948852, -0.056248866, -0.0...      MjAyNDA4MTk0
4    [0.0075047207, 0.037451863, -0.04594199, -0.02...      MjAyNDA4MTk1
..                                                 ...               ...
495  [-0.022085749, 0.010010308, -0.06106033, 0.027...  MjAyNDA4MTk0OTY=
496  [0.013045211, 0.05702561, -0.055616893, -0.007...  MjAyNDA4MTk0OTc=
497  [0.0017702369, 0.04213707, -0.039500177, -0.01...  MjAyNDA4MTk0OTg=
498  [0.004692101, 0.016187033, -0.019337796, -0.00...  MjAyNDA4MTk0OTk=
499  [0.015568068, 0.046629403, -0.050442986, -0.01...  MjAyNDA4MTk1MDA=

[500 rows x 2 columns]


### Method 2. Generate embeddings using Ray to distribute the workload

In [6]:
# Send SIGKILL to process 2923098.
# NOTE(review): the PID is hard-coded and specific to one session; the recorded
# output shows this attempt failed with "Operation not permitted" — confirm
# whether this cell should remain in the notebook.
!kill -9 2923098

/bin/bash: line 0: kill: (2923098) - Operation not permitted


In [7]:
# Retry the SIGKILL of process 2923098 with sudo.
# NOTE(review): the recorded output shows sudo prompting for a password, and
# the PID is hard-coded to one session — confirm this cell is still needed.
!sudo kill -9 2923098

[sudo] password for ammar: 
