In [7]:
from datasets import load_dataset

# Load the first 100,000 examples from the 'train' split of the 'python' subset
dataset = load_dataset("code_search_net", "python", split="train[:100000]")

code_search_net.py:   0%|          | 0.00/8.44k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/12.9k [00:00<?, ?B/s]

The repository for code_search_net contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/code_search_net.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


python.zip:   0%|          | 0.00/941M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/412178 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/22176 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/23107 [00:00<?, ? examples/s]

In [9]:
print(dataset)

Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
    num_rows: 100000
})


In [10]:
!pip install pandas

import pandas as pd

include_metadata = ["func_documentation_string", "func_code_string", "func_name", "func_code_url"]
df = pd.DataFrame(dataset)[include_metadata]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [11]:
#Lightly process the func_code_string value. Remove trailing whitespaces and extra lines
import re
def preprocess_func_code_string(code):
    code = "\n".join([line.rstrip() for line in code.splitlines()])
    code = re.sub(r'n\{2,}', '\n\n', code)
    return code

In [12]:
df["func_code_string"].apply(preprocess_func_code_string)

0        def __msgc_step3_discontinuity_localization(se...
1        def __multiscale_gc_lo2hi_run(self):  # , pyed...
2        def __multiscale_gc_hi2lo_run(self):  # , pyed...
3        def __ordered_values_by_indexes(self, data, in...
4        def __hi2lo_multiscale_indexes(self, mask, ori...
                               ...                        
99995    def close_stream(self):\n        """Close the ...
99996    def read_response(self):\n        """Read an i...
99997    def identify(self):\n        """Update client ...
99998    def auth(self):\n        """Send authorization...
99999    def subscribe(self, topic, channel):\n        ...
Name: func_code_string, Length: 100000, dtype: object

In [None]:
import torch

# Check if CUDA is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [33]:
!pip install sentence-transformers

from sentence_transformers import SentenceTransformer
model = SentenceTransformer("mchochlov/codebert-base-cd-ft", device=device)

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [34]:
embeddings = model.encode(
    df['func_code_string'].tolist(),
    convert_to_numpy=True,
    show_progress_bar=True,
    device=device,           
    batch_size=64            
)

Batches:   0%|          | 0/1563 [00:00<?, ?it/s]

In [39]:
import faiss
from sklearn.preprocessing import normalize

# Normalize embeddings
embeddings_np = normalize(embeddings, axis=1, norm='l2')

# Build FAISS index
embedding_dim = embeddings_np.shape[1]
index = faiss.IndexFlatIP(embedding_dim)
index.add(embeddings_np)

In [40]:
def retrieval_func(query, top_k=5):
    # Compute query embedding
    query_embedding = model.encode(query, convert_to_numpy=True)
    
    # Normalize query embedding
    query_embedding_np = normalize(query_embedding.reshape(1, -1), axis=1, norm='l2')
    
    # Search in FAISS index
    distances, indices = index.search(query_embedding_np, top_k)
    
    results = []
    for idx, distance in zip(indices[0], distances[0]):
        func_name = df.iloc[idx]["func_name"]
        code_snippet = df.iloc[idx]["func_code_string"]
        results.append({
            "function_name": func_name,
            "code_snippet": code_snippet,
            "similarity_score": distance
        })
        
    return results

In [43]:
query = "how to add numbers"

result = retrieval_func("how to verify unwanted transactions")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [44]:
for i in result:
    print(i['function_name']+"\n\n\n")
    print(i['code_snippet']+"\n\n\n")

cassists



def cassists(self,dc,dt,dt2,nodiag=False,memlimit=-1):
	"""Calculates probability of gene i regulating gene j with continuous data assisted method,
	with multiple tests, by converting log likelihoods into probabilities per A for all B.
	Probabilities are converted from likelihood ratios separately for each A. This gives better
	dc:	numpy.ndarray(nt,ns,dtype=ftype(='f4' by default)) Continuous anchor data.
		Entry dc[i,j] is anchor i's value for sample j.
		Anchor i is used to infer the probability of gene i -> any other gene.
	dt:	numpy.ndarray(nt,ns,dtype=ftype(='=f4' by default)) Gene expression data for A
		Entry dt[i,j] is gene i's expression level for sample j.
	dt2:numpy.ndarray(nt2,ns,dtype=ftype(='=f4' by default)) Gene expression data for B.
		dt2 has the same format as dt, and can be identical with, different from, or a superset of dt.
		When dt2 is a superset of (or identical with) dt, dt2 must be arranged
		to be identical with dt at its upper submatrix, i.e. dt