In [None]:
from datasets import load_dataset

# Fetch CodeSearchNet, restricted to its "python" configuration
dataset = load_dataset(path="code_search_net", name="python")

In [None]:
import torch

# Check if GPU is available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE: `model` is only created in the CodeBERT loading cell further down, so it
# cannot be moved to `device` here -- on a fresh kernel (Restart & Run All) doing so
# raises NameError. Call `model.to(device)` once the model has been loaded. The
# embedding loop sends every input tensor to `model.device`, so inputs and model
# always end up on the same device.

In [None]:
# Keep only the training split. load_dataset() without `split=` returns a
# DatasetDict (a dict subclass keyed by split name). The isinstance guard makes this
# cell safe to re-run: once `dataset` is already the train split, indexing it with
# "train" again would be treated as a column lookup and fail.
if isinstance(dataset, dict):
    dataset = dataset["train"]

In [None]:
# pandas is used below but was never imported anywhere in the notebook.
import pandas as pd

# Columns kept for the retrieval system: the docstring, the source code, the
# function name and the URL of the function.
include_metadata = ["func_documentation_string", "func_code_string", "func_name", "func_code_url"]
df = pd.DataFrame(dataset)[include_metadata]

In [None]:
# Print a summary of the dataframe (columns, non-null counts, dtypes) as a sanity check.
df.info()

In [None]:
# Lightly process the func_code_string value: strip trailing whitespace and collapse extra blank lines.
import re


def preprocess_func_code_string(code):
    """Normalize whitespace in a source-code string without touching its content.

    Args:
        code: Source code of one function as a single string.

    Returns:
        The code with trailing whitespace removed from every line and every run
        of three or more consecutive newlines (i.e. two or more blank lines)
        collapsed to exactly one blank line.
    """
    # rstrip() each line; splitlines() also drops the final line terminator.
    code = "\n".join([line.rstrip() for line in code.splitlines()])
    # The previous pattern r'n\{2,}' matched the literal text "n{2,}" instead of
    # newline runs, so blank lines were never collapsed and any code containing
    # that literal was corrupted.
    code = re.sub(r'\n{3,}', '\n\n', code)
    return code

In [None]:
# Show the code string stored under row label 0 (before preprocessing is applied).
df.loc[0, "func_code_string"]

In [None]:
# Series.apply returns a new Series and leaves `df` untouched, so the result must be
# assigned back; otherwise the preprocessing is silently thrown away.
df["func_code_string"] = df["func_code_string"].apply(preprocess_func_code_string)

In [None]:
# Show the code string stored under row label 0 again, to inspect it after the preprocessing cell.
df.loc[0, "func_code_string"]

In [None]:
#loading CodeBERT
!pip install transformers

from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = AutoModel.from_pretrained("microsoft/codebert-base")

In [None]:
# Tokenize every function body and store the raw token ids in a new dataframe column.
def tokenize_func_code_string(code):
    """Encode one source-code string into token ids with the CodeBERT tokenizer.

    Truncation handles code that is too long for the tokenizer; padding to
    "max_length" gives every row the same number of token ids.
    """
    token_ids = tokenizer.encode(
        code,
        truncation=True,
        padding="max_length",
    )
    return token_ids


df["func_code_string_token"] = df["func_code_string"].apply(
    tokenize_func_code_string
)

In [None]:
# Models such as CodeBERT consume PyTorch tensors rather than plain Python lists.
# A tensor can be thought of as an enhanced array that supports operations on both
# CPU and GPU.
def to_tensor(token_ids):
    """Wrap one list of token ids in a tensor and place it on `device`.

    The ids are nested in an outer list, so the resulting tensor carries a
    leading dimension of size 1 in front of the token dimension.
    """
    batch_of_one = torch.tensor([token_ids])
    return batch_of_one.to(device)

In [None]:
# Build one tensor per row from the stored token ids (via to_tensor) and keep it in a new column.
df["input_ids_tensor"] = df["func_code_string_token"].apply(to_tensor)

In [None]:
import torch

embeddings = []  # one CLS embedding (1-D NumPy array) per dataframe row

# Make sure the model is in eval mode (disables dropout and other train-only behaviour).
model.eval()

# Disable gradient computation: gradients are only needed while training, when the
# model is allowed to adjust its parameters. Turning them off saves memory and time.
with torch.no_grad():
    for tensor in df["input_ids_tensor"]:
        tensor = tensor.to(model.device)

        # to_tensor() already returns a (1, seq_len) batch. Unconditionally calling
        # unsqueeze(0) produced a 3-D (1, 1, seq_len) input, which the model cannot
        # unpack into (batch, seq_len). Only add a batch dimension if it is missing.
        if tensor.dim() == 1:
            tensor = tensor.unsqueeze(0)

        # The ids were padded to max length, so mask out the padding positions;
        # otherwise the padding tokens take part in attention and distort the
        # CLS embedding.
        attention_mask = (tensor != tokenizer.pad_token_id).long()

        outputs = model(input_ids=tensor, attention_mask=attention_mask)

        # Hidden state of the first (CLS) token of the single sequence in the batch.
        cls_embedding = outputs.last_hidden_state[0, 0, :].cpu().numpy()

        embeddings.append(cls_embedding)

In [None]:
!pip install numpy
!pip install faiss-gpu

#normalizing embedding will ensure all vectors lie on surface of a unit sphere so cosine similarity is applicable.Without normalization, original magnitudes of embedding will
#be retained and l2/euclidean distance is more applicable. 

#I require capturing semantic similarity for my project as its a code retrieval system so will be normalizing the embedding. 

import numpy as np
from sklearn.preprocessing import normalize
import faiss

embedding_dim = len(embeddings[0])
embeddings_np = np.array(embeddings).astype('float32')

#normalizing
embeddings_np = normalize(embeddings, axis=1)

#assign FAISS index
index = faiss.indexFlatIP(embedding_dim)

#add normalized embeddingg
index.add(embeddings_np)

print(f"No of embeddings indexed: {index.ntotal}")