<a href="https://colab.research.google.com/github/rosafilgueira/PyCodeSearch/blob/main/Registry_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Deep Learning Transformer models for code


## UnixCoder RepoSIM

It uses `Lazyhope/unixcoder-nine-advtest`, which was trained following the UniXcoder code-search instructions: https://github.com/microsoft/CodeBERT/blob/master/UniXcoder/downstream-tasks/code-search/README.md

This model (which uses a bi-encoder approach) works very well for both code similarity and text similarity.

In [2]:
!pip install tensorflow
!pip install -U accelerate
!pip install docarray
!pip install pandas
!pip install torch
!pip install transformers
!pip install tqdm

Collecting accelerate
  Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.20.3
Collecting docarray
  Downloading docarray-0.35.0-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.2/233.2 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
Collecting orjson>=3.8.2 (from docarray)
  Downloading orjson-3.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (136 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.0/137.0 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
Collecting types-requests>=2.28.11.6 (from docarray)
  Downloading types_requests-2.31.0.1-py3-none-any.whl (14 kB)
Collecting typing-inspect>=0.8.0 (from docarray)
  Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)
Collecting types-urllib3 (from types

## Add database example

In [35]:
import pandas as pd

# Toy registry entries: each pair holds the source of a small arithmetic
# function and the natural-language description that goes with it.
snippets = [
    ("def add(a, b):\n    return a + b",
     "This function adds two numbers."),
    ("def subtract(a, b):\n    return a - b",
     "This function subtracts the second number from the first."),
    ("def multiply(a, b):\n    return a * b",
     "This function multiplies two numbers."),
    ("def divide(a, b):\n    return a / b",
     "This function divides the first number by the second."),
    ("def power(a, b):\n    return a ** b",
     "This function raises the first number to the power of the second."),
    ("def modulus(a, b):\n    return a % b",
     "This function returns the remainder when the first number is divided by the second."),
]

# Split the pairs into the two parallel lists used as DataFrame columns
codes = [source for source, _ in snippets]
docs = [description for _, description in snippets]

# One row per function: 'code' holds the source, 'doc' its description
registry = pd.DataFrame({'code': codes, 'doc': docs})

# Display the freshly built registry
registry

Unnamed: 0,code,doc
0,"def add(a, b):\n return a + b",This function adds two numbers.
1,"def subtract(a, b):\n return a - b",This function subtracts the second number from...
2,"def multiply(a, b):\n return a * b",This function multiplies two numbers.
3,"def divide(a, b):\n return a / b",This function divides the first number by the ...
4,"def power(a, b):\n return a ** b",This function raises the first number to the p...
5,"def modulus(a, b):\n return a % b",This function returns the remainder when the f...


## Load the model


In [25]:
from transformers import pipeline

# Checkpoint that provides the RepoSim pipeline used by encode() below.
REPOSIM_CHECKPOINT = "Lazyhope/RepoSim"

# NOTE(review): trust_remote_code=True lets transformers run code shipped with
# the checkpoint — confirm the repository is trusted before running this cell.
model = pipeline(model=REPOSIM_CHECKPOINT, trust_remote_code=True, device_map="auto")



[*] Consider setting GitHub token to avoid hitting rate limits. 
For more info, see: https://docs.github.com/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token


## Turn code and docstring into torch embeddings

In [36]:
import torch

def encode(string, max_length=512):
    """Embed a piece of text (source code or natural language) with ``model``.

    ``model`` is looked up in the notebook's globals at call time, so it must
    still refer to the RepoSim pipeline when this function runs.

    Args:
        string: The text to embed.
        max_length: Forwarded as the second positional argument of
            ``model.encode``. Defaults to 512, the value that used to be
            hard-coded here.
            NOTE(review): presumably the maximum token length — confirm
            against the RepoSim pipeline implementation.

    Returns:
        The embedding returned by ``model.encode`` with singleton dimensions
        squeezed out and moved to the CPU.
    """
    # Inference only: no gradients are needed for embedding lookups.
    with torch.no_grad():
        embedding = model.encode(string, max_length)

    return embedding.squeeze().cpu()

# Embed the source code first, then the docstrings; each result is stored in
# its own "<column>_embeddings" column of the registry.
for text_column, embedding_column in (
    ("code", "code_embeddings"),
    ("doc", "doc_embeddings"),
):
    registry[embedding_column] = registry[text_column].apply(encode)

# Display the registry with the two new embedding columns
registry

Unnamed: 0,code,doc,code_embeddings,doc_embeddings
0,"def add(a, b):\n return a + b",This function adds two numbers.,"[tensor(-0.4317), tensor(-0.0293), tensor(2.66...","[tensor(-1.5903), tensor(-0.3973), tensor(3.99..."
1,"def subtract(a, b):\n return a - b",This function subtracts the second number from...,"[tensor(-1.2667), tensor(-0.6932), tensor(-0.4...","[tensor(-2.4678), tensor(-2.2801), tensor(0.66..."
2,"def multiply(a, b):\n return a * b",This function multiplies two numbers.,"[tensor(-1.0338), tensor(1.4185), tensor(1.093...","[tensor(-2.3068), tensor(1.3498), tensor(3.078..."
3,"def divide(a, b):\n return a / b",This function divides the first number by the ...,"[tensor(-0.9333), tensor(2.0973), tensor(0.789...","[tensor(-3.0391), tensor(-0.0748), tensor(1.79..."
4,"def power(a, b):\n return a ** b",This function raises the first number to the p...,"[tensor(-2.0816), tensor(1.8068), tensor(0.650...","[tensor(-2.9461), tensor(-0.5175), tensor(2.37..."
5,"def modulus(a, b):\n return a % b",This function returns the remainder when the f...,"[tensor(-0.2191), tensor(0.7266), tensor(-0.20...","[tensor(-2.4903), tensor(-1.3584), tensor(1.60..."


### Text-to-code Similarity (natural-language query matched against the docstring embeddings)

In [27]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [37]:
# Embed the natural-language query with the same encoder used for the registry
user_query_docs = "Function that adds two numbers"
user_query_docs_emb = encode(user_query_docs)


# Store every docstring embedding as a numpy array
registry["doc_embeddings"] = registry["doc_embeddings"].apply(lambda emb: np.array(emb))

# Score the query (shaped as a single-row matrix) against the stacked docstring embeddings
user_query_emb = np.array(user_query_docs_emb)
doc_query_row = user_query_emb.reshape(1, -1)
doc_matrix = np.vstack(registry["doc_embeddings"])
cos_similarities = cosine_similarity(doc_query_row, doc_matrix)

# Attach the scores to a copy so the registry itself gains no score column
registry_doc = registry.copy()
registry_doc["cosine_similarity_doc"] = cos_similarities[0]

# Best matches first
sorted_df = registry_doc.sort_values(by="cosine_similarity_doc", ascending=False)

# Keep the five highest-scoring rows
top_5_similar_docs = sorted_df.head(5)

In [29]:
# Show the five registry rows whose docstrings scored highest against the query
top_5_similar_docs

Unnamed: 0,code,doc,code_embeddings,doc_embeddings,cosine_similarity
0,"def add(a, b):\n return a + b",This function adds two numbers.,"[tensor(-0.4317), tensor(-0.0293), tensor(2.66...","[-1.590327, -0.39731297, 3.997745, 2.6590729, ...",0.976803
2,"def multiply(a, b):\n return a * b",This function multiplies two numbers.,"[tensor(-1.0338), tensor(1.4185), tensor(1.093...","[-2.3068383, 1.3498025, 3.0785234, 1.3164992, ...",0.704091
1,"def subtract(a, b):\n return a - b",This function subtracts the second number from...,"[tensor(-1.2667), tensor(-0.6932), tensor(-0.4...","[-2.4677727, -2.2801313, 0.66812015, 1.4311575...",0.700668
3,"def divide(a, b):\n return a / b",This function divides the first number by the ...,"[tensor(-0.9333), tensor(2.0973), tensor(0.789...","[-3.0390856, -0.07484892, 1.7925591, 0.2812537...",0.57978
5,"def modulus(a, b):\n return a % b",This function returns the remainder when the f...,"[tensor(-0.2191), tensor(0.7266), tensor(-0.20...","[-2.4903457, -1.3584496, 1.6064596, -0.0300406...",0.545701


### Code-to-Text (code query matched against the code embeddings)

In [38]:
# Embed the code query with the same encoder used for the registry
user_query_code = "def add_numbers(a, b):\n return a +"
user_query_code_emb = encode(user_query_code)


# Store every code embedding as a numpy array
registry["code_embeddings"] = registry["code_embeddings"].apply(lambda emb: np.array(emb))

# Score the query (shaped as a single-row matrix) against the stacked code embeddings
user_query_emb_c = np.array(user_query_code_emb)
code_query_row = user_query_emb_c.reshape(1, -1)
code_matrix = np.vstack(registry["code_embeddings"])
cos_similarities = cosine_similarity(code_query_row, code_matrix)

# Attach the scores to a copy so the registry itself gains no score column
registry_code = registry.copy()
registry_code["cosine_similarity_code"] = cos_similarities[0]

# Best matches first
sorted_df_code = registry_code.sort_values(by="cosine_similarity_code", ascending=False)

# Keep the five highest-scoring code snippets
top_5_similar_code = sorted_df_code.head(5)

In [39]:
# Show the five registry rows whose code scored highest against the code query
top_5_similar_code

Unnamed: 0,code,doc,code_embeddings,doc_embeddings,cosine_similarity_code
0,"def add(a, b):\n return a + b",This function adds two numbers.,"[-0.43167147, -0.029331975, 2.6694589, 3.79384...","[-1.590327, -0.39731297, 3.997745, 2.6590729, ...",0.892051
2,"def multiply(a, b):\n return a * b",This function multiplies two numbers.,"[-1.0338316, 1.4184728, 1.093852, 3.0212612, 0...","[-2.3068383, 1.3498025, 3.0785234, 1.3164992, ...",0.618545
1,"def subtract(a, b):\n return a - b",This function subtracts the second number from...,"[-1.2666721, -0.69317156, -0.43216228, 2.70539...","[-2.4677727, -2.2801313, 0.66812015, 1.4311575...",0.51237
5,"def modulus(a, b):\n return a % b",This function returns the remainder when the f...,"[-0.21913658, 0.7265661, -0.20258226, 1.877978...","[-2.4903457, -1.3584496, 1.6064596, -0.0300406...",0.489054
3,"def divide(a, b):\n return a / b",This function divides the first number by the ...,"[-0.9332766, 2.097301, 0.78919584, 1.6547529, ...","[-3.0390856, -0.07484892, 1.7925591, 0.2812537...",0.423482


# Code Summarization

In [None]:
from transformers import RobertaTokenizer, T5ForConditionalGeneration

# The tokenizer and the summarization model are loaded from the same checkpoint.
# NOTE: this rebinds the global `model`, which encode() also reads at call
# time — re-run the RepoSim loading cell before calling encode() again.
SUMMARIZATION_CHECKPOINT = 'Salesforce/codet5-base-multi-sum'

tokenizer = RobertaTokenizer.from_pretrained(SUMMARIZATION_CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(SUMMARIZATION_CHECKPOINT)


In [43]:
def generate_summary(text, max_length=20):
    """Generate a short natural-language summary for a piece of source code.

    Uses the global ``tokenizer`` and ``model`` (looked up at call time).

    Args:
        text: The source code to summarize.
        max_length: Passed to ``model.generate`` as ``max_length``. Defaults
            to 20, the value that used to be hard-coded here.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens skipped.
    """
    input_ids = tokenizer.encode(text, return_tensors="pt")
    generated_ids = model.generate(input_ids, max_length=max_length)
    # Only the first generated sequence is decoded.
    summary = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return summary

In [46]:
# Work on a copy so the shared registry does not gain the summary column
registry_summary = registry.copy()

# Summarize each function's source and store the text next to it
registry_summary["summarization"] = registry_summary["code"].map(generate_summary)

# Display the registry together with the generated summaries
registry_summary


Unnamed: 0,code,doc,code_embeddings,doc_embeddings,summarization
0,"def add(a, b):\n return a + b",This function adds two numbers.,"[-0.43167147, -0.029331975, 2.6694589, 3.79384...","[-1.590327, -0.39731297, 3.997745, 2.6590729, ...",Add two vectors.
1,"def subtract(a, b):\n return a - b",This function subtracts the second number from...,"[-1.2666721, -0.69317156, -0.43216228, 2.70539...","[-2.4677727, -2.2801313, 0.66812015, 1.4311575...",Subtract two vectors.
2,"def multiply(a, b):\n return a * b",This function multiplies two numbers.,"[-1.0338316, 1.4184728, 1.093852, 3.0212612, 0...","[-2.3068383, 1.3498025, 3.0785234, 1.3164992, ...",Multiply two vectors.
3,"def divide(a, b):\n return a / b",This function divides the first number by the ...,"[-0.9332766, 2.097301, 0.78919584, 1.6547529, ...","[-3.0390856, -0.07484892, 1.7925591, 0.2812537...",Divide two numbers.
4,"def power(a, b):\n return a ** b",This function raises the first number to the p...,"[-2.0816362, 1.8067644, 0.6504534, 2.8207457, ...","[-2.9460752, -0.51749575, 2.379704, 2.370711, ...",Returns a power of b.
5,"def modulus(a, b):\n return a % b",This function returns the remainder when the f...,"[-0.21913658, 0.7265661, -0.20258226, 1.877978...","[-2.4903457, -1.3584496, 1.6064596, -0.0300406...",Returns a % b.
