<a href="https://colab.research.google.com/github/silvererudite/code_search/blob/main/notebooks/code_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Install Cohere for embeddings, Umap to reduce embeddings to 2 dimensions, 
# Altair for visualization, Annoy for approximate nearest neighbor search
!pip install cohere umap-learn altair annoy datasets tqdm

Successfully installed annoy-1.17.1 cohere-3.1.3 datasets-2.8.0 huggingface-hub-0.11.1 multiprocess-0.70.14 pynndescent-0.5.8 responses-0.18.0 umap-learn-0.5.3 urllib3-1.25.11 xxhash-3.1.0


In [6]:
import getpass
import os
import re
import warnings

import altair as alt
import cohere
import numpy as np
import pandas as pd
import umap
from annoy import AnnoyIndex
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', None)

In [2]:
# Load the extracted functions table (name, docstring, body, source file path).
functions_csv = '/content/functions_data.csv'
data_df = pd.read_csv(functions_csv)
# Preview the first rows as the cell output.
data_df.head()

Unnamed: 0,function_name,docstring,function_body,file_path
0,diffusion_from_config,,"def diffusion_from_config(config: Dict[str, An...",point_e\diffusion\configs.py
1,get_beta_schedule,This is the deprecated API for creating beta s...,"def get_beta_schedule(beta_schedule, *, beta_s...",point_e\diffusion\gaussian_diffusion.py
2,get_named_beta_schedule,Get a pre-defined beta schedule for the given ...,"def get_named_beta_schedule(schedule_name, num...",point_e\diffusion\gaussian_diffusion.py
3,betas_for_alpha_bar,Create a beta schedule that discretizes the gi...,def betas_for_alpha_bar(num_diffusion_timestep...,point_e\diffusion\gaussian_diffusion.py
4,space_timesteps,Create a list of timesteps to use from an orig...,"def space_timesteps(num_timesteps, section_cou...",point_e\diffusion\gaussian_diffusion.py


In [9]:
# Replace missing docstrings with a placeholder so every row has text to embed.
# Assign the result back instead of using `inplace=True` on the selected column:
# the in-place form is chained assignment, which does not update `data_df` under
# pandas Copy-on-Write, and re-running this cell stays idempotent.
data_df['docstring'] = data_df['docstring'].fillna('not specified')

In [10]:
# Create a Cohere API key at dashboard.cohere.ai/welcome/register.
# The key is read from the COHERE_API_KEY environment variable, falling back to a
# hidden prompt, so it is never stored in the notebook source or its outputs.
api_key = os.environ.get('COHERE_API_KEY') or getpass.getpass('Cohere API key: ')
if not api_key:
    # Fail here with a clear message rather than on the first API request.
    raise ValueError('A Cohere API key is required to compute embeddings.')

co = cohere.Client(api_key)

# Get one embedding per docstring, in the same order as the rows of data_df.
# NOTE(review): truncate='LEFT' presumably trims over-long inputs from the start;
# confirm against the Cohere embed documentation for the installed SDK version.
embeds = co.embed(
    texts=data_df['docstring'].tolist(),
    model='large',
    truncate='LEFT',
).embeddings

In [14]:
# Stack the returned embeddings into a single array, one row per docstring.
em = np.array(embeds)
# Later cells read em.shape[1] and map row i of `em` back to data_df.iloc[i],
# so check here that the array is 2-D and lines up with the table row-for-row.
assert em.ndim == 2 and em.shape[0] == len(data_df), (
    f"expected one embedding per row of data_df, got array of shape {em.shape}"
)

In [15]:
# Build an Annoy index over the docstring embeddings using angular distance.
# The index is sized by the embedding dimensionality (number of columns in `em`).
embedding_dim = em.shape[1]
search_index = AnnoyIndex(embedding_dim, 'angular')

# Register each embedding under its row position so that hits can be mapped
# back to rows of the functions table by position.
for item_id, vector in enumerate(em):
    search_index.add_item(item_id, vector)

search_index.build(10)  # 10 trees
# Persist the built index to disk; the call's return value is the cell output.
search_index.save('code.ann')

True

In [18]:
query = "compute diffusion of given data"

# Embed the query with the same model/settings used for the docstrings.
query_embed = co.embed(texts=[query], model="large", truncate="LEFT").embeddings

# Look up the 3 nearest neighbours; with include_distances=True the result is
# a pair (item ids, distances).
similar_item_ids = search_index.get_nns_by_vector(
    query_embed[0], 3, include_distances=True
)

# Item ids are row positions in data_df, so select the matching rows once and
# pull the two columns to display from that selection.
matched_rows = data_df.iloc[similar_item_ids[0]]
results = pd.DataFrame(
    data={
        'function': matched_rows['function_body'],
        'file path': matched_rows['file_path'],
    }
)

print(f"Query:'{query}'\nNearest neighbors:")
results

Query:'compute diffusion of given data'
Nearest neighbors:


Unnamed: 0,function,file path
13,"def q_sample(self, x_start, t, noise=None):\n """"""""""""\n if noise is None:\n noise = th.randn_like(x_start)\n assert noise.shape == x_start.shape\n return _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape\n ) * x_start + _extract_into_tensor(self.\n sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise\n",point_e\diffusion\gaussian_diffusion.py
14,"def q_posterior_mean_variance(self, x_start, x_t, t):\n """"""""""""\n assert x_start.shape == x_t.shape\n posterior_mean = _extract_into_tensor(self.posterior_mean_coef1, t, x_t\n .shape) * x_start + _extract_into_tensor(self.posterior_mean_coef2,\n t, x_t.shape) * x_t\n posterior_variance = _extract_into_tensor(self.posterior_variance, t,\n x_t.shape)\n posterior_log_variance_clipped = _extract_into_tensor(self.\n posterior_log_variance_clipped, t, x_t.shape)\n assert posterior_mean.shape[0] == posterior_variance.shape[0\n ] == posterior_log_variance_clipped.shape[0] == x_start.shape[0]\n return posterior_mean, posterior_variance, posterior_log_variance_clipped\n",point_e\diffusion\gaussian_diffusion.py
12,"def q_mean_variance(self, x_start, t):\n """"""""""""\n mean = _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape\n ) * x_start\n variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape\n )\n log_variance = _extract_into_tensor(self.log_one_minus_alphas_cumprod,\n t, x_start.shape)\n return mean, variance, log_variance\n",point_e\diffusion\gaussian_diffusion.py
