In [2]:
from transformers import RobertaTokenizer, RobertaModel
import torch

tokenizer = RobertaTokenizer.from_pretrained("microsoft/graphcodebert-base")
model = RobertaModel.from_pretrained("microsoft/graphcodebert-base")

def get_code_embedding(code):
    tokens = tokenizer(code, return_tensors="pt", max_length=512, truncation=True)
    with torch.no_grad():
        outputs = model(**tokens)
    return outputs.last_hidden_state[:, 0, :]  # [CLS] token


pytorch_model.bin:  35%|###5      | 176M/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [3]:
def compute_similarity(code1, code2):
    emb1 = get_code_embedding(code1)
    emb2 = get_code_embedding(code2)
    similarity = torch.nn.functional.cosine_similarity(emb1, emb2)
    return similarity.item()


In [4]:
# User-submitted (possibly brute-force) code
user_code = """
def two_sum(nums, target):
    for i in range(len(nums)):
        for j in range(i + 1, len(nums)):
            if nums[i] + nums[j] == target:
                print([i, j])

nums = [2, 7, 11, 15]
target = 9
two_sum(nums, target)
"""

# Reference (optimal) solution
reference_code = """
def two_sum(nums, target):
    hashmap = {}
    for i, num in enumerate(nums):
        if target - num in hashmap:
            print([hashmap[target - num], i])
        hashmap[num] = i

nums = [2, 7, 11, 15]
target = 9
two_sum(nums, target)
"""

similarity = compute_similarity(user_code, reference_code)
print(f"Code Similarity Score: {similarity:.4f}")


Code Similarity Score: 0.9753
