<a href="https://colab.research.google.com/github/rosafilgueira/PyCodeSearch/blob/main/Registry_search_multimodel_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Deep Learning Transformer models for code


## UnixCoders and BM25 Models

We are going to use two models:
-  Lazyhope/unixcoder-nine-advtest, wich is used by the original RepoSim pipeline - **text-to-code search**

- BM25 Model: Rethink Training of BERT Rerankers in Multi-Stage Retrieval Pipeline - **code-completion search**



In [1]:
!pip install tensorflow
!pip install -U accelerate
!pip install docarray
!pip install pandas
!pip install torch
!pip install transformers
!pip install tqdm

Collecting accelerate
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.21.0
Collecting docarray
  Downloading docarray-0.37.0-py3-none-any.whl (263 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m263.2/263.2 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting orjson>=3.8.2 (from docarray)
  Downloading orjson-3.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (140 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.3/140.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting types-requests>=2.28.11.6 (from docarray)
  Downloading types_requests-2.31.0.2-py3-none-any.whl (14 kB)
Collecting typing-inspect>=0.8.0 (from docarray)
  Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)
Collecting types-urllib3 (from types

## Add database example

In [37]:
import pandas as pd

# Define the codes and their corresponding docstrings
codes = [
    "def add(a, b):\n    return a + b",
    "def subtract(a, b):\n    return a - b",
    "def multiply(a, b):\n    return a * b",
    "def divide(a, b):\n    return a / b",
    "def power(a, b):\n    return a ** b",
    "def modulus(a, b):\n    return a % b"
]

docs = [
    "This function adds two numbers.",
    "This function subtracts the second number from the first.",
    "This function multiplies two numbers.",
    "This function divides the first number by the second.",
    "This function raises the first number to the power of the second.",
    "This function returns the remainder when the first number is divided by the second."
]

# Create the dataframe
registry = pd.DataFrame({
    'code': codes,
    'doc': docs
})

### dataframe
registry

Unnamed: 0,code,doc
0,"def add(a, b):\n return a + b",This function adds two numbers.
1,"def subtract(a, b):\n return a - b",This function subtracts the second number from...
2,"def multiply(a, b):\n return a * b",This function multiplies two numbers.
3,"def divide(a, b):\n return a / b",This function divides the first number by the ...
4,"def power(a, b):\n return a ** b",This function raises the first number to the p...
5,"def modulus(a, b):\n return a % b",This function returns the remainder when the f...


## Load two models for two different tasks

- model_bm25  -- for code-completion search
- model_text_to_code -- for text-to-code search

In [43]:
#NEW: I have included two new imports here
from transformers import pipeline, AutoModel, AutoTokenizer

## Note: this pipeline uses "Lazyhope/unixcoder-nine-advtest" underneed
model_text_to_code = pipeline(
    model="Lazyhope/RepoSim",
    trust_remote_code=True,
    device_map="auto")

#NEW
model_name_= "Luyu/bert-base-mdoc-bm25"
model_code_to_code = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name_)


[*] Consider setting GitHub token to avoid hitting rate limits. 
For more info, see: https://docs.github.com/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token


## Turn code and docstring into torch embeddings

In [44]:
#NEW

import torch

def encode(string, model_type):
    if model_type == 1:
        with torch.no_grad():
            embedding = model_text_to_code.encode(string, 512)
        final_t=embedding.squeeze()

    else:
        with torch.no_grad():
            inputs = tokenizer_1(string, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
            outputs = model_code_completion_1(**inputs)
            embedding = outputs.last_hidden_state.mean(dim=1)  # You can use any pooling strategy here
        final_t = embedding.squeeze()
    return final_t

## model_type =1 -- for text-to_code
registry["doc_embeddings"] = registry["doc"].apply(encode, model_type=1)
## model_type=2 -- for code-to-code
registry["code_embeddings"] = registry["code"].apply(encode, model_type=2)
registry

Unnamed: 0,code,doc,doc_embeddings,code_embeddings,code_complete_embeddings
0,"def add(a, b):\n return a + b",This function adds two numbers.,"[tensor(-1.5903), tensor(-0.3973), tensor(3.99...","[tensor(0.3194), tensor(-0.1994), tensor(0.234...","[tensor(0.0198), tensor(0.0969), tensor(0.1743..."
1,"def subtract(a, b):\n return a - b",This function subtracts the second number from...,"[tensor(-2.4678), tensor(-2.2801), tensor(0.66...","[tensor(0.2588), tensor(-0.2688), tensor(0.324...","[tensor(0.1709), tensor(0.1219), tensor(0.2758..."
2,"def multiply(a, b):\n return a * b",This function multiplies two numbers.,"[tensor(-2.3068), tensor(1.3498), tensor(3.078...","[tensor(0.2858), tensor(-0.2316), tensor(0.240...","[tensor(-0.1580), tensor(0.6568), tensor(0.131..."
3,"def divide(a, b):\n return a / b",This function divides the first number by the ...,"[tensor(-3.0391), tensor(-0.0748), tensor(1.79...","[tensor(0.3486), tensor(-0.2342), tensor(0.236...","[tensor(-0.1537), tensor(0.2867), tensor(0.402..."
4,"def power(a, b):\n return a ** b",This function raises the first number to the p...,"[tensor(-2.9461), tensor(-0.5175), tensor(2.37...","[tensor(0.4351), tensor(-0.2007), tensor(0.289...","[tensor(-0.1428), tensor(0.5299), tensor(0.217..."
5,"def modulus(a, b):\n return a % b",This function returns the remainder when the f...,"[tensor(-2.4903), tensor(-1.3584), tensor(1.60...","[tensor(0.2112), tensor(-0.1506), tensor(0.531...","[tensor(-0.2249), tensor(0.5351), tensor(0.080..."


### Text-to-code Similarity

In [45]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [46]:
# Compute user query embeddings
user_query_docs = "Function that adds two numbers"
#NEW -- indicate the model_type=1 for unixcoder
user_query_docs_emb = encode(user_query_docs, model_type=1)


# Convert document embeddings to numpy arrays
registry["doc_embeddings"] = registry["doc_embeddings"].apply(lambda x: np.array(x))

# Compute cosine similarity
user_query_emb = np.array(user_query_docs_emb)
cos_similarities = cosine_similarity(user_query_emb.reshape(1, -1), np.vstack(registry["doc_embeddings"]))

# Add cosine similarity scores as a new column
registry_doc=registry.copy()
registry_doc["cosine_similarity_doc"] = cos_similarities[0]

# Sort the dataframe based on cosine similarity
sorted_df = registry_doc.sort_values(by="cosine_similarity_doc", ascending=False)

# Retrieve the top 5 most similar documents
top_5_similar_docs = sorted_df.head(5)

In [47]:
top_5_similar_docs

Unnamed: 0,code,doc,doc_embeddings,code_embeddings,code_complete_embeddings,cosine_similarity_doc
0,"def add(a, b):\n return a + b",This function adds two numbers.,"[-1.590325, -0.39731324, 3.997744, 2.6590736, ...","[tensor(0.3194), tensor(-0.1994), tensor(0.234...","[tensor(0.0198), tensor(0.0969), tensor(0.1743...",0.976803
2,"def multiply(a, b):\n return a * b",This function multiplies two numbers.,"[-2.3068378, 1.3498034, 3.0785217, 1.3164998, ...","[tensor(0.2858), tensor(-0.2316), tensor(0.240...","[tensor(-0.1580), tensor(0.6568), tensor(0.131...",0.704091
1,"def subtract(a, b):\n return a - b",This function subtracts the second number from...,"[-2.4677699, -2.280132, 0.6681174, 1.4311584, ...","[tensor(0.2588), tensor(-0.2688), tensor(0.324...","[tensor(0.1709), tensor(0.1219), tensor(0.2758...",0.700667
3,"def divide(a, b):\n return a / b",This function divides the first number by the ...,"[-3.0390859, -0.07484988, 1.7925602, 0.2812554...","[tensor(0.3486), tensor(-0.2342), tensor(0.236...","[tensor(-0.1537), tensor(0.2867), tensor(0.402...",0.57978
5,"def modulus(a, b):\n return a % b",This function returns the remainder when the f...,"[-2.490343, -1.3584495, 1.6064605, -0.03004119...","[tensor(0.2112), tensor(-0.1506), tensor(0.531...","[tensor(-0.2249), tensor(0.5351), tensor(0.080...",0.545701


### Code Completion

In [48]:
# Compute user query embeddings
user_query_code = "def add_numbers(a, b):\n return a "
#NEW -- indicate the model_type=2 for BM25
user_query_code_emb = encode(user_query_code, model_type=2)
# Convert document embeddings to numpy arrays
registry["code_embeddings"] = registry["code_embeddings"].apply(lambda x: np.array(x))

# Compute cosine similarity
user_query_emb_c = np.array(user_query_code_emb)
cos_similarities = cosine_similarity(user_query_emb_c.reshape(1, -1), np.vstack(registry["code_embeddings"]))

# Add cosine similarity scores as a new column
registry_code=registry.copy()
registry_code["cosine_similarity_code_2"] = cos_similarities[0]

# Sort the dataframe based on cosine similarity
sorted_df_code = registry_code.sort_values(by="cosine_similarity_code_2", ascending=False)

# Retrieve the top 5 most similar documents
top_5_similar_code = sorted_df_code.head(5)
top_5_similar_code

Unnamed: 0,code,doc,doc_embeddings,code_embeddings,code_complete_embeddings,cosine_similarity_code_2
0,"def add(a, b):\n return a + b",This function adds two numbers.,"[-1.590325, -0.39731324, 3.997744, 2.6590736, ...","[0.31938395, -0.19940263, 0.23405677, -0.42716...","[tensor(0.0198), tensor(0.0969), tensor(0.1743...",0.972584
2,"def multiply(a, b):\n return a * b",This function multiplies two numbers.,"[-2.3068378, 1.3498034, 3.0785217, 1.3164998, ...","[0.28581232, -0.23162104, 0.2407104, -0.225170...","[tensor(-0.1580), tensor(0.6568), tensor(0.131...",0.968535
1,"def subtract(a, b):\n return a - b",This function subtracts the second number from...,"[-2.4677699, -2.280132, 0.6681174, 1.4311584, ...","[0.2588312, -0.2687941, 0.3244955, 0.07137667,...","[tensor(0.1709), tensor(0.1219), tensor(0.2758...",0.943508
3,"def divide(a, b):\n return a / b",This function divides the first number by the ...,"[-3.0390859, -0.07484988, 1.7925602, 0.2812554...","[0.34864905, -0.23415264, 0.2369114, 0.0368931...","[tensor(-0.1537), tensor(0.2867), tensor(0.402...",0.920648
4,"def power(a, b):\n return a ** b",This function raises the first number to the p...,"[-2.946074, -0.51749504, 2.3797052, 2.3707116,...","[0.43510747, -0.20073725, 0.2897749, 0.0957916...","[tensor(-0.1428), tensor(0.5299), tensor(0.217...",0.919372


# Code Sumarization

In [49]:
from transformers import RobertaTokenizer, T5ForConditionalGeneration

tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-base-multi-sum')
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base-multi-sum')


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/902 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [50]:
def generate_summary(text):
    input_ids = tokenizer.encode(text, return_tensors="pt")
    generated_ids = model.generate(input_ids, max_length=20)
    summary = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return summary

In [52]:
registry_summary=registry.copy()
# Assuming the registry dataframe is already defined
registry_summary["summarization"] = registry["code"].apply(generate_summary)
registry_summary


Unnamed: 0,code,doc,doc_embeddings,code_embeddings,code_complete_embeddings,summarization
0,"def add(a, b):\n return a + b",This function adds two numbers.,"[-1.590325, -0.39731324, 3.997744, 2.6590736, ...","[0.31938395, -0.19940263, 0.23405677, -0.42716...","[tensor(0.0198), tensor(0.0969), tensor(0.1743...",Add two vectors.
1,"def subtract(a, b):\n return a - b",This function subtracts the second number from...,"[-2.4677699, -2.280132, 0.6681174, 1.4311584, ...","[0.2588312, -0.2687941, 0.3244955, 0.07137667,...","[tensor(0.1709), tensor(0.1219), tensor(0.2758...",Subtract two vectors.
2,"def multiply(a, b):\n return a * b",This function multiplies two numbers.,"[-2.3068378, 1.3498034, 3.0785217, 1.3164998, ...","[0.28581232, -0.23162104, 0.2407104, -0.225170...","[tensor(-0.1580), tensor(0.6568), tensor(0.131...",Multiply two vectors.
3,"def divide(a, b):\n return a / b",This function divides the first number by the ...,"[-3.0390859, -0.07484988, 1.7925602, 0.2812554...","[0.34864905, -0.23415264, 0.2369114, 0.0368931...","[tensor(-0.1537), tensor(0.2867), tensor(0.402...",Divide two numbers.
4,"def power(a, b):\n return a ** b",This function raises the first number to the p...,"[-2.946074, -0.51749504, 2.3797052, 2.3707116,...","[0.43510747, -0.20073725, 0.2897749, 0.0957916...","[tensor(-0.1428), tensor(0.5299), tensor(0.217...",Returns a power of b.
5,"def modulus(a, b):\n return a % b",This function returns the remainder when the f...,"[-2.490343, -1.3584495, 1.6064605, -0.03004119...","[0.21118452, -0.15059435, 0.5316688, 0.0912161...","[tensor(-0.2249), tensor(0.5351), tensor(0.080...",Returns a % b.
