In [1]:
from datasets import load_dataset

# Load the first 100,000 examples from the 'train' split of the 'python' subset
dataset = load_dataset("code_search_net", "python", split="train[:100000]")

code_search_net.py:   0%|          | 0.00/8.44k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/12.9k [00:00<?, ?B/s]

The repository for code_search_net contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/code_search_net.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


python.zip:   0%|          | 0.00/941M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/412178 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/22176 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/23107 [00:00<?, ? examples/s]

In [2]:
import torch

# Check if GPU is available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
print(dataset)

Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
    num_rows: 100000
})


In [5]:
!pip install pandas

import pandas as pd

include_metadata = ["func_documentation_string", "func_code_string", "func_name", "func_code_url"]
df = pd.DataFrame(dataset)[include_metadata]



In [7]:
df[:5]

Unnamed: 0,func_documentation_string,func_code_string,func_name,func_code_url
0,Estimate discontinuity in basis of low resolut...,def __msgc_step3_discontinuity_localization(se...,ImageGraphCut.__msgc_step3_discontinuity_local...,https://github.com/mjirik/imcut/blob/1b38e7cd1...
1,Run Graph-Cut segmentation with refinement of ...,"def __multiscale_gc_lo2hi_run(self): # , pyed...",ImageGraphCut.__multiscale_gc_lo2hi_run,https://github.com/mjirik/imcut/blob/1b38e7cd1...
2,Run Graph-Cut segmentation with simplifiyng of...,"def __multiscale_gc_hi2lo_run(self): # , pyed...",ImageGraphCut.__multiscale_gc_hi2lo_run,https://github.com/mjirik/imcut/blob/1b38e7cd1...
3,Return values (intensities) by indexes.\n\n ...,"def __ordered_values_by_indexes(self, data, in...",ImageGraphCut.__ordered_values_by_indexes,https://github.com/mjirik/imcut/blob/1b38e7cd1...
4,Function computes multiscale indexes of ndarra...,"def __hi2lo_multiscale_indexes(self, mask, ori...",ImageGraphCut.__hi2lo_multiscale_indexes,https://github.com/mjirik/imcut/blob/1b38e7cd1...


In [8]:
#Lightly process the func_code_string value. Remove trailing whitespaces and extra lines
import re
def preprocess_func_code_string(code):
    code = "\n".join([line.rstrip() for line in code.splitlines()])
    code = re.sub(r'n\{2,}', '\n\n', code)
    return code

In [9]:
df[:5]

Unnamed: 0,func_documentation_string,func_code_string,func_name,func_code_url
0,Estimate discontinuity in basis of low resolut...,def __msgc_step3_discontinuity_localization(se...,ImageGraphCut.__msgc_step3_discontinuity_local...,https://github.com/mjirik/imcut/blob/1b38e7cd1...
1,Run Graph-Cut segmentation with refinement of ...,"def __multiscale_gc_lo2hi_run(self): # , pyed...",ImageGraphCut.__multiscale_gc_lo2hi_run,https://github.com/mjirik/imcut/blob/1b38e7cd1...
2,Run Graph-Cut segmentation with simplifiyng of...,"def __multiscale_gc_hi2lo_run(self): # , pyed...",ImageGraphCut.__multiscale_gc_hi2lo_run,https://github.com/mjirik/imcut/blob/1b38e7cd1...
3,Return values (intensities) by indexes.\n\n ...,"def __ordered_values_by_indexes(self, data, in...",ImageGraphCut.__ordered_values_by_indexes,https://github.com/mjirik/imcut/blob/1b38e7cd1...
4,Function computes multiscale indexes of ndarra...,"def __hi2lo_multiscale_indexes(self, mask, ori...",ImageGraphCut.__hi2lo_multiscale_indexes,https://github.com/mjirik/imcut/blob/1b38e7cd1...


In [10]:
df["func_code_string"].apply(preprocess_func_code_string)

0        def __msgc_step3_discontinuity_localization(se...
1        def __multiscale_gc_lo2hi_run(self):  # , pyed...
2        def __multiscale_gc_hi2lo_run(self):  # , pyed...
3        def __ordered_values_by_indexes(self, data, in...
4        def __hi2lo_multiscale_indexes(self, mask, ori...
                               ...                        
99995    def close_stream(self):\n        """Close the ...
99996    def read_response(self):\n        """Read an i...
99997    def identify(self):\n        """Update client ...
99998    def auth(self):\n        """Send authorization...
99999    def subscribe(self, topic, channel):\n        ...
Name: func_code_string, Length: 100000, dtype: object

In [11]:
df[:5]

Unnamed: 0,func_documentation_string,func_code_string,func_name,func_code_url
0,Estimate discontinuity in basis of low resolut...,def __msgc_step3_discontinuity_localization(se...,ImageGraphCut.__msgc_step3_discontinuity_local...,https://github.com/mjirik/imcut/blob/1b38e7cd1...
1,Run Graph-Cut segmentation with refinement of ...,"def __multiscale_gc_lo2hi_run(self): # , pyed...",ImageGraphCut.__multiscale_gc_lo2hi_run,https://github.com/mjirik/imcut/blob/1b38e7cd1...
2,Run Graph-Cut segmentation with simplifiyng of...,"def __multiscale_gc_hi2lo_run(self): # , pyed...",ImageGraphCut.__multiscale_gc_hi2lo_run,https://github.com/mjirik/imcut/blob/1b38e7cd1...
3,Return values (intensities) by indexes.\n\n ...,"def __ordered_values_by_indexes(self, data, in...",ImageGraphCut.__ordered_values_by_indexes,https://github.com/mjirik/imcut/blob/1b38e7cd1...
4,Function computes multiscale indexes of ndarra...,"def __hi2lo_multiscale_indexes(self, mask, ori...",ImageGraphCut.__hi2lo_multiscale_indexes,https://github.com/mjirik/imcut/blob/1b38e7cd1...


In [12]:
#loading CodeBERT
!pip install transformers

from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = AutoModel.from_pretrained("microsoft/codebert-base")



The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [13]:
#Tokenizing and appending raw tokens to the dataframe. 
def tokenize_func_code_string(code):
    return tokenizer.encode(code, truncation=True, padding="max_length") #for consistent token length and code too long for tokenizer

df["func_code_string_token"] = df["func_code_string"].apply(tokenize_func_code_string)

In [14]:
#Converting the token ids to tensor for pytorch, models like CodeBERT are built to process data in tensor form. Tensor form can be imagined as an 
#enhanced version of an array that supports operations on both CPU and GPU.
def to_tensor(token_ids):
    return torch.tensor([token_ids]).to(device)

In [15]:
df["input_ids_tensor"] = df["func_code_string_token"].apply(to_tensor)

In [20]:
import torch
from tqdm import tqdm

# Set device to GPU if available
if torch.cuda.is_available():
    device = torch.device("cuda")
    model = model.to(device)
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")
    model = model.to(device)

embeddings = []  # Embeddings list

# Ensure the model is in evaluation mode
model.eval()

# Disable gradient computation for memory efficiency and speed
with torch.no_grad():
    # Wrap the loop in tqdm for a progress bar
    for tensor in tqdm(df["input_ids_tensor"], desc="Processing tensors"):
        tensor = tensor.to(device)
        
        outputs = model(input_ids=tensor)
        
        # Extract CLS token embedding
        cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()
        
        embeddings.append(cls_embedding)

Processing tensors: 100%|██████████| 100000/100000 [30:46<00:00, 54.15it/s] 


In [25]:
import numpy as np
from sklearn.preprocessing import normalize
import faiss
from tqdm import tqdm  # Import tqdm for progress bars

# Get the embedding dimension from the first embedding
embedding_dim = len(embeddings[0])

# Convert embeddings to a NumPy array and ensure it's in float32 format
embeddings_np = np.array(embeddings, dtype=np.float32)

# Normalize embeddings with progress bar
print("Normalizing embeddings...")
embeddings_np = normalize(embeddings_np, axis=1, norm='l2')

# Ensure embeddings are contiguous in memory
embeddings_np = np.ascontiguousarray(embeddings_np)

# Initialize FAISS index with inner product (IP) similarity for cosine similarity search
index = faiss.IndexFlatIP(embedding_dim)

# Add embeddings to the FAISS index
print("Adding embeddings to FAISS index...")
index.add(embeddings_np)  # Batch adding to FAISS

print(f"No of embeddings indexed: {index.ntotal}")

Normalizing embeddings...
Adding embeddings to FAISS index...
No of embeddings indexed: 100000


In [35]:
def retrieval_func(query, top_k=5):
    query_tokens = tokenizer.encode(query, truncation=True, padding='max_length')
    query_tensor = torch.tensor([query_tokens]).to(device)
    
    with torch.no_grad():
        query_embedding = model(query_tensor).last_hidden_state[:, 0, :].squeeze().cpu().numpy()
        
    query_embedding_np = normalize(query_embedding.reshape(1, -1), axis=1, norm='l2')
        
    distances, indices = index.search(query_embedding_np, top_k)
    
    results = []
    
    for idx, distance in zip(indices[0], distances[0]):
        func_name = df.iloc[idx]["func_name"]
        code_snippet = df.iloc[idx]["func_code_string"]
        
        results.append({
            "function_name": func_name,
            "code_snippet": code_snippet,
            "similarity_score": distance
        })
        
    return results

In [42]:
query = "how to add two numbers"

result = retrieval_func("how to verify unwanted transactions")

In [43]:
for result in result:
    print("Function Name:", result["function_name"])
    print("Code Snippet:", result["code_snippet"])
    print("Similarity Score:", result["similarity_score"])
    print("\n" + "-"*50 + "\n")

Function Name: add
Code Snippet: async def add(ctx, left: int, right: int):
    """Adds two numbers together."""
    await ctx.send(left + right)
Similarity Score: 0.99436134

--------------------------------------------------

Function Name: Color.asList
Code Snippet: def asList(self):
        """ returns the value as the list object"""
        return [self._red, self._green, self._blue, self._alpha]
Similarity Score: 0.99401593

--------------------------------------------------

Function Name: AccidentalDignity.inSignJoy
Code Snippet: def inSignJoy(self):
        """ Returns if the object is in its sign of joy. """
        return props.object.signJoy[self.obj.id] == self.obj.sign
Similarity Score: 0.9937881

--------------------------------------------------

Function Name: Job.delete
Code Snippet: def delete(self):
        """Delete this job."""
        self.conn.delete(self.jid)
        self.reserved = False
Similarity Score: 0.9937362

--------------------------------------------