In [None]:
from datasets import load_dataset

# Pull only the first 100,000 rows of the Python configuration's training split.
dataset = load_dataset(
    path="code_search_net",
    name="python",
    split="train[:100000]",
)

In [None]:
import torch

# Prefer CUDA when PyTorch can see a GPU; fall back to the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Print the loaded dataset object to inspect what was loaded.
print(dataset)

In [None]:
!pip install pandas

import pandas as pd

# Columns carried forward: docstring, source code, function name and source URL.
include_metadata = ["func_documentation_string", "func_code_string", "func_name", "func_code_url"]
# Dataset.to_pandas() is the library's own conversion to a DataFrame; it avoids
# handing the Dataset object to the generic pd.DataFrame constructor.
df = dataset.to_pandas()[include_metadata]

In [None]:
# Preview the first five rows of the dataframe.
df.head(5)

In [None]:
# Lightly process the func_code_string value: strip trailing whitespace from each line and collapse extra blank lines.
import re
def preprocess_func_code_string(code):
    """Lightly normalise a source-code string.

    Trailing whitespace is stripped from every line, and any run of three or
    more consecutive newlines (i.e. two or more blank lines) is collapsed to
    exactly one blank line.

    Args:
        code: The source code as a single string.

    Returns:
        The cleaned string. Lines are re-joined with "\n", so a trailing
        newline in the input is not preserved.
    """
    code = "\n".join(line.rstrip() for line in code.splitlines())
    # The previous pattern r'n\{2,}' matched the literal text "n{2,}" rather
    # than newline runs, so blank lines were never collapsed and code that
    # happened to contain "n{2,}" was corrupted.
    code = re.sub(r'\n{3,}', '\n\n', code)
    return code

In [None]:
# Preview the first five rows of the dataframe.
df.head(5)

In [None]:
# Series.apply returns a new Series; assign it back so the cleaned code is
# actually stored in the dataframe instead of being discarded.
df["func_code_string"] = df["func_code_string"].apply(preprocess_func_code_string)

In [None]:
# Preview the first five rows again, after the preprocessing cell has run.
df.head(5)

In [None]:
#loading CodeBERT
!pip install transformers

from transformers import AutoTokenizer, AutoModel

# Tokenizer and model are both loaded from the same pretrained checkpoint.
checkpoint = "microsoft/codebert-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

In [None]:
# Tokenize each function's code and store the resulting token ids in a new dataframe column.
def tokenize_func_code_string(code):
    """Encode a source-code string into token ids using the global `tokenizer`.

    Truncation is enabled for code that is too long for the tokenizer, and
    every sequence is padded to the maximum length so that all outputs have a
    consistent length.

    Args:
        code: The source code to encode.

    Returns:
        Whatever `tokenizer.encode` returns for these options (the token ids).
    """
    encode_options = {"truncation": True, "padding": "max_length"}
    return tokenizer.encode(code, **encode_options)

# Element-wise tokenization of every code string into a new column.
df["func_code_string_token"] = df["func_code_string"].map(tokenize_func_code_string)

In [None]:
#Converting the token ids to tensor for pytorch, models like CodeBERT are built to process data in tensor form. Tensor form can be imagined as an 
#enhanced version of an array that supports operations on both CPU and GPU.
def to_tensor(token_ids):
    """Turn one sequence of token ids into a tensor on the global `device`.

    The ids are wrapped in an outer list, so the resulting tensor has a
    leading dimension of size 1 in front of the sequence dimension.

    Args:
        token_ids: The token ids of a single example.

    Returns:
        A torch tensor holding the ids, moved to `device`.
    """
    batch_of_one = torch.tensor([token_ids])
    return batch_of_one.to(device)

In [None]:
# Convert every row's token ids into a tensor and keep it in its own column.
df["input_ids_tensor"] = df["func_code_string_token"].map(to_tensor)

In [None]:
import torch
from tqdm import tqdm

# Choose the device once, report the CPU fallback, then move the model there.
cuda_ok = torch.cuda.is_available()
device = torch.device("cuda" if cuda_ok else "cpu")
if not cuda_ok:
    print("CUDA is not available. Using CPU.")
model = model.to(device)

# Put the model in evaluation mode before running inference.
model.eval()

embeddings = []  # one CLS vector (NumPy array) per entry of df["input_ids_tensor"]

# NOTE(review): only input_ids are passed to the model (no attention_mask). If the
# inputs were padded, the pad positions are presumably attended to as well - confirm
# this is intended; changing it would require regenerating any saved embeddings.
with torch.no_grad():  # gradients are not needed; saves memory and time
    for input_ids in tqdm(df["input_ids_tensor"], desc="Processing tensors"):
        hidden_states = model(input_ids=input_ids.to(device)).last_hidden_state
        # Position 0 of the sequence is taken as the CLS token embedding.
        cls_vector = hidden_states[:, 0, :].squeeze()
        embeddings.append(cls_vector.cpu().numpy())

In [1]:
import numpy as np

# Read the saved embeddings array back from its .npy file.
embeddings_path = '/kaggle/input/my-unique-embeddings-dataset/embeddings.npy'
embeddings = np.load(embeddings_path)

In [2]:
!pip install faiss-gpu

import numpy as np
from sklearn.preprocessing import normalize
import faiss
from tqdm import tqdm

# Materialise the embeddings as a float32 NumPy array and read the vector
# width from its second axis.
raw_vectors = np.array(embeddings, dtype=np.float32)
embedding_dim = raw_vectors.shape[1]

# L2-normalise each row so that inner product equals cosine similarity, and
# make sure the result is contiguous in memory before handing it to FAISS.
print("Normalizing embeddings...")
embeddings_np = np.ascontiguousarray(normalize(raw_vectors, norm='l2', axis=1))

# Flat inner-product index over the normalised vectors.
index = faiss.IndexFlatIP(embedding_dim)

# All vectors are added in a single call.
print("Adding embeddings to FAISS index...")
index.add(embeddings_np)

print(f"No of embeddings indexed: {index.ntotal}")

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2
Normalizing embeddings...
Adding embeddings to FAISS index...
No of embeddings indexed: 100000


In [3]:
#loading CodeBERT
!pip install transformers

from transformers import AutoTokenizer, AutoModel

# Tokenizer and model are both loaded from the same pretrained checkpoint.
checkpoint = "microsoft/codebert-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)



tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [4]:
import torch

# Prefer CUDA when PyTorch can see a GPU, otherwise use the CPU, then place
# the model on the chosen device.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dr

In [8]:
import pandas as pd
import torch
def retrieval_func(query, top_k=5):
    """Search the FAISS index for the functions most similar to a text query.

    The query is tokenized (truncated and padded to max length), run through
    the model without gradients, and the position-0 (CLS) hidden state is
    L2-normalised and used as the search vector.

    Relies on notebook-level globals: `tokenizer`, `model`, `device`,
    `normalize`, `index` and `df`.
    NOTE(review): assumes row i of `df` corresponds to vector i in `index` -
    confirm `df` is loaded in the same order in this session.

    Args:
        query: The query text.
        top_k: Maximum number of results to request from the index.

    Returns:
        A list of dicts with keys "function_name", "code_snippet" and
        "similarity_score", in the order returned by `index.search`. The list
        is shorter than `top_k` when the index cannot supply that many hits.
    """
    query_tokens = tokenizer.encode(query, truncation=True, padding='max_length')
    query_tensor = torch.tensor([query_tokens]).to(device)

    with torch.no_grad():
        query_embedding = model(query_tensor).last_hidden_state[:, 0, :].squeeze().cpu().numpy()

    # Normalise the same way the indexed vectors were normalised so the inner
    # product returned by the index is a cosine similarity.
    query_embedding_np = normalize(query_embedding.reshape(1, -1), axis=1, norm='l2')

    distances, indices = index.search(query_embedding_np, top_k)

    results = []

    for idx, distance in zip(indices[0], distances[0]):
        if idx < 0:
            # FAISS fills unused result slots with -1 when fewer than top_k
            # vectors are available; df.iloc[-1] would silently return the
            # last row as a bogus hit, so skip these slots.
            continue

        row = df.iloc[idx]  # single positional lookup per hit

        results.append({
            "function_name": row["func_name"],
            "code_snippet": row["func_code_string"],
            "similarity_score": distance
        })

    return results

In [None]:
# Define the query once and pass that variable to the search, so the text
# stored in `query` is the text that is actually searched for.
query = "how to verify unwanted transactions"

result = retrieval_func(query)