In [None]:
# Authenticate this session with the Hugging Face Hub before any dataset or
# model is requested further down.
# NOTE(review): login() is called without arguments, so it presumably prompts
# for an access token interactively — confirm before running non-interactively.
from huggingface_hub import login
login()

In [1]:
import psutil
import torch


# Record the hardware the benchmark runs on so the timings further down can be
# read in context.
cpu_count = psutil.cpu_count(logical=False)  # logical=False: do not count logical (hyper-threaded) CPUs
gpu_count = torch.cuda.device_count()        # number of CUDA devices visible to torch
print(f"Available CPUs: {cpu_count}")
print(f"Available GPUs: {gpu_count}")

Available CPUs: 2
Available GPUs: 0


In [2]:
# Benchmark configuration: which corpus to sample from and how much of it.
DATASET = "bigcode/the-stack"  # dataset identifier passed to load_dataset
LANGUAGE = "c-sharp"           # selects the data/<LANGUAGE> directory of the dataset
SAMPLES_SIZE = 1_000           # number of code samples to collect
MAX_LEN = 1_000                # number of characters kept per sample

print(f"Samples size: {SAMPLES_SIZE} examples. Taken from {DATASET}, language = {LANGUAGE}. Each sample has {MAX_LEN} chars.")

Samples size: 1000 examples. Taken from bigcode/the-stack, language = c-sharp. Each sample has 1000 chars.


In [3]:
from datasets import load_dataset

# Prefixes that mark a line (after stripping surrounding whitespace) as a
# comment line to be dropped.
# NOTE(review): in C# a leading '#' introduces a preprocessor directive rather
# than a comment, and a leading '*' can also start a code line — confirm that
# dropping such lines is intended.
_COMMENT_PREFIXES = ('//', '/*', '*', '#')


def _drop_comment_lines(source_text):
    """Return ``source_text`` without the lines that start with a comment prefix.

    A line is dropped when its whitespace-stripped form starts with any entry
    of ``_COMMENT_PREFIXES``; the remaining lines are re-joined with newlines.
    """
    kept_lines = [
        row for row in source_text.split('\n')
        if not row.strip().startswith(_COMMENT_PREFIXES)
    ]
    return '\n'.join(kept_lines)


# Stream the dataset instead of downloading it; only the first qualifying
# files are consumed.
ds = load_dataset(DATASET, data_dir=f"data/{LANGUAGE}", streaming=True, split="train")

samples = []

for record in ds:
    code_only = _drop_comment_lines(record["content"])

    # Files shorter than MAX_LEN after filtering are skipped, so every
    # collected sample is exactly MAX_LEN characters long.
    if len(code_only) < MAX_LEN:
        continue
    samples.append(code_only[:MAX_LEN])

    # Stop as soon as enough samples have been gathered.
    if len(samples) >= SAMPLES_SIZE:
        break

# Number of samples actually collected.
count = len(samples)


In [4]:
# Identifier of the embedding model under test. It is passed to
# SentenceTransformer to load the model and printed as the label of the
# timing report.
model_name = 'intfloat/e5-small-v2'

In [5]:
from sentence_transformers import SentenceTransformer

# Load the embedding model selected by model_name.
model = SentenceTransformer(model_name)

# Alias for the model's encode method; run_samples calls it once per sample.
encode = model.encode

In [6]:
def run_samples(texts=None, encoder=None):
    """Embed each text individually and collect the results.

    Args:
        texts: Iterable of strings to embed. When omitted, the notebook-level
            ``samples`` list is used.
        encoder: Callable that maps a single text to its embedding. When
            omitted, the notebook-level ``encode`` callable is used.

    Returns:
        list: One encoder result per input text, in input order.
    """
    # Fall back to the notebook globals only at call time, so the function
    # can also be exercised with explicit inputs (other sample sets, other
    # models) without relying on hidden kernel state.
    if texts is None:
        texts = samples
    if encoder is None:
        encoder = encode

    # Texts are deliberately encoded one call at a time, which is what the
    # benchmark measures.
    return [encoder(text) for text in texts]

In [7]:
import time

# Use perf_counter() rather than time(): it is a monotonic, high-resolution
# clock meant for measuring intervals, whereas the wall clock can be adjusted
# while the benchmark is running.
start_time = time.perf_counter()

embeddings = run_samples()

end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"{model_name}\nExecution time: {elapsed_time:.2f} seconds.")
# Throughput is based on the number of embeddings actually produced, which can
# be lower than SAMPLES_SIZE if the dataset stream ran out of qualifying files.
print(f"{len(embeddings)/elapsed_time:.2f} samples per sec.")

# Additional info
# Guard against an empty result so the report does not fail with IndexError.
embedding_dim = len(embeddings[0]) if embeddings else 0
print(f"\nAdditional info: embeddings = {len(embeddings)}, "
      f"embeddings dim = {embedding_dim}")



intfloat/e5-small-v2
Execution time: 504.52 seconds.
1.98 samples per sec.

Additional info: embeddings = 1000, embeddings dim = 384


Model results:

sentence-transformers/all-MiniLM-L6-v2

Execution time: 69.14 seconds.

14.46 samples per sec.

Additional info: embeddings = 1000, embeddings dim = 384

TaylorAI/gte-tiny

Execution time: 290.09 seconds.
                       
3.45 samples per sec.

Additional info: embeddings = 1000, embeddings dim = 384

BAAI/bge-small-en-v1.5

Execution time: 467.23 seconds.

2.14 samples per sec.

Additional info: embeddings = 1000, embeddings dim = 384

thenlper/gte-small

Execution time: 506.21 seconds.
                       
1.98 samples per sec.

Additional info: embeddings = 1000, embeddings dim = 384

intfloat/e5-small-v2

Execution time: 504.52 seconds.
                       
1.98 samples per sec.

Additional info: embeddings = 1000, embeddings dim = 384