In [1]:
# Name of the embedding model handed to SentenceTransformer when the model is loaded.
model_name = "intfloat/e5-small-v2"
# When True, the evaluation cell prints each mismatched statement together with
# the expected snippet, the retrieved snippet and the similarity score.
print_mismatches = False

In [2]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model selected by `model_name`.
# NOTE(review): e5-family model cards recommend prefixing inputs with
# "query: " / "passage: "; no prefix is added anywhere in this notebook —
# confirm whether that is intended for this comparison.
model = SentenceTransformer(model_name)

# Short alias so later cells can call encode(text) directly.
encode = model.encode

In [3]:
import torch
import torch.nn.functional as F
import csv

# Read the paired dataset: column 0 of each CSV row goes to `code_snippets`,
# column 1 to `nl_statements`.
with open('code_snippets_set.csv', newline='') as csvfile:
    # Rows with fewer than two columns are skipped.
    usable_rows = [row for row in csv.reader(csvfile) if len(row) >= 2]

code_snippets = [row[0] for row in usable_rows]
nl_statements = [row[1] for row in usable_rows]

# The two lists are index-aligned: entry i of each comes from the same row.
assert len(code_snippets) == len(nl_statements)

In [4]:
def calculate_cosine_similarity(vector1, vector2):
    """Return the cosine similarity of two 1-D vectors as a Python float.

    Args:
        vector1: First vector; any sequence, NumPy array or tensor that
            ``torch.as_tensor`` can convert.
        vector2: Second vector, with the same length as ``vector1``.

    Returns:
        float: Cosine similarity computed along dimension 0.
    """
    # torch.as_tensor replaces the legacy torch.Tensor(...) constructor: it
    # always interprets its argument as data (never as a size), accepts
    # tensors of any dtype, and avoids a copy when the input is already a
    # float32 tensor. float32 matches what torch.Tensor produced by default.
    tensor1 = torch.as_tensor(vector1, dtype=torch.float32)
    tensor2 = torch.as_tensor(vector2, dtype=torch.float32)
    similarity = F.cosine_similarity(tensor1, tensor2, dim=0)

    # .item() unwraps the 0-d result tensor into a plain Python number.
    return similarity.item()

def find_most_similar(code_embeddings, nl_embedding):
    """Find the code embedding most similar to a natural-language embedding.

    Args:
        code_embeddings: Iterable of embedding vectors, one per code snippet.
        nl_embedding: Embedding vector of the natural-language statement.

    Returns:
        tuple: ``(max_similarity, most_similar_index)`` where the index is the
        position in ``code_embeddings`` of the first embedding with the
        highest similarity. If no candidate can be selected (empty input, or
        no similarity compares greater than negative infinity, e.g. all NaN),
        ``(-1, -1)`` is returned.
    """
    # Start below every possible score. The previous sentinel of -1 combined
    # with the strict ">" meant a candidate whose similarity is exactly -1
    # could never win, so the function returned index -1 even though
    # candidates existed (and callers would then index the *last* snippet).
    max_similarity = float("-inf")
    most_similar_index = -1

    for index, code_embedding in enumerate(code_embeddings):
        similarity = calculate_cosine_similarity(code_embedding, nl_embedding)

        # Strict comparison keeps the first of several equally good matches.
        if similarity > max_similarity:
            max_similarity = similarity
            most_similar_index = index

    if most_similar_index == -1:
        # Preserve the original "nothing found" result for existing callers.
        return -1, -1

    return max_similarity, most_similar_index

In [5]:

# Embed every code snippet once so each statement is compared against the
# same precomputed vectors.
code_embeddings = list(map(encode, code_snippets))
statements_size = len(nl_statements)

assert len(code_snippets) == statements_size

# One entry per statement whose best-matching snippet is not its own pair:
# (statement, expected snippet, retrieved snippet, similarity).
wrong_answers = []

for index, statement in enumerate(nl_statements):
    nl_emb = encode(statement)
    max_similarity, most_similar_index = find_most_similar(code_embeddings, nl_emb)
    if most_similar_index == index:
        # Retrieved the paired snippet; nothing to record.
        continue
    wrong_answers.append(
        (
            statement,
            code_snippets[index],
            code_snippets[most_similar_index],
            max_similarity,
        )
    )

mismatches = len(wrong_answers)

print(f"{model_name}\nNumber of mismatches: {mismatches} out of {statements_size} examples.")
if print_mismatches:
    print("Mismatches:")
    for statement, expected_snippet, found_snippet, similarity in wrong_answers:
        print(
            f"Statement: {statement}\nExpected: {expected_snippet}\n"
            f"Found: {found_snippet}\nSimilarity: {similarity}"
        )

intfloat/e5-small-v2
Number of mismatches: 49 out of 131 examples.


sentence-transformers/all-MiniLM-L6-v2

Number of mismatches: 37 out of 131 examples.

thenlper/gte-small

Number of mismatches: 38 out of 131 examples.

BAAI/bge-small-en-v1.5

Number of mismatches: 46 out of 131 examples.

intfloat/e5-small-v2

Number of mismatches: 49 out of 131 examples.

TaylorAI/bge-micro-v2

Number of mismatches: 51 out of 131 examples.

TaylorAI/gte-tiny

Number of mismatches: 52 out of 131 examples.