## Imports

In [None]:
pip install datasets

In [2]:
from datasets import load_dataset
dataset = load_dataset("code_search_net", "ruby")

In [None]:
!pip install faiss-cpu
!pip install -U sentence-transformers

In [4]:
import numpy as np
import torch
import os
import pandas as pd
import faiss
import time
from sentence_transformers import SentenceTransformer

## Semantic Search

In [None]:
documents = dataset['train']['func_documentation_string']

In [6]:
model = SentenceTransformer('BAAI/bge-base-en-v1.5')

In [7]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [None]:
encoded_data = model.encode(documents)

In [9]:
index = faiss.IndexIDMap(faiss.IndexFlatIP(model.get_sentence_embedding_dimension()))
index.add_with_ids(encoded_data, np.array(range(0, len(documents))))

In [10]:
#serializing index to export it across different host
faiss.write_index(index, 'sample_documents')

#de-serializing the index
index = faiss.read_index('sample_documents')

In [11]:
def semantic_search(query):
    t = time.time()
    query_vector = model.encode([query])
    # Search for top k results
    k = 100
    top_k = index.search(query_vector, k)

    return top_k[1].tolist()[0]

## TF-IDF Search

In [13]:
func_tokens = dataset['train']['func_code_tokens']

In [14]:
from nltk.util import bigrams

def generate_bigrams(token_list):
    return [" ".join(bigram) for bigram in list(bigrams(token_list))]

In [15]:
def inverted_index(code_tokens):
    inverted_index = {}
    
    for i in range(len(code_tokens)):
        code = code_tokens[i]
        bi = generate_bigrams(code)
        all_token = code + bi
        for token in all_token:
            if token in inverted_index.keys():
                inverted_index[token].append(i)
            else:
                inverted_index[token] = [i]
    
    return inverted_index

In [16]:
inverted_ind = inverted_index(func_tokens)

In [17]:
import math
total_documents = len(func_tokens)

def compute_tfidf(word, index):
    
    documents_with_term = len(inverted_ind[word])
    
    inner = total_documents/documents_with_term

    doc_length = len(func_tokens[index])
    
    document_frequency = inverted_ind[word].count(index)
    
    total = (document_frequency / doc_length) * math.log(inner)
    return total

In [18]:
def tfidf_search(user_input):
    
    import numpy as np
    
    track_docs = [0] * len(func_tokens)
    
    for word in (user_input.split() + generate_bigrams(user_input.split())):
        if word in inverted_ind:
            for doc_index in inverted_ind[word]:
                track_docs[doc_index] += compute_tfidf(word, doc_index)
                
    sorted_idx = list(np.argsort(track_docs)[::-1])[0:100]

    return sorted_idx

## Combined Search

In [20]:
def search_results(user_input):
    
    s = semantic_search(user_input)
    t = tfidf_search(user_input)
    overlap_results = list(set(s) & set(t))
    
    top_10_docs = overlap_results
        
    while len(top_10_docs) < 10:
        for i in s:
            if i not in top_10_docs:
                top_10_docs.append(i)
    
    if len(top_10_docs) > 10:
        top_10_docs = overlap_results[:10]
        
    function_name = []
    doc_string = []
    for i in top_10_docs:
        function_name.append(dataset['train']['func_name'][i])
        doc_string.append(dataset['train']['func_documentation_string'][i])
        
    results_df = pd.DataFrame({'Document': top_10_docs, 'Function': function_name, 'Documentation': doc_string})
    
    return results_df

## Search Results

In [21]:
search_results("enumerable")

Unnamed: 0,Document,Function,Documentation
0,34424,Pandata.DataFormatter.custom_sort,Sorts alphabetically ignoring the initial 'The...
1,3890,Twitter.Utils.flat_pmap,Returns a new array with the concatenated resu...
2,3891,Twitter.Utils.pmap,Returns a new array with the results of runnin...
3,38981,TeradataExtractor.Query.enumerable,"returns an enumerable, each element of which i..."
4,43259,Doublylinkedlist.Doublylinkedlist.each,Método para que la lista sea enumerable
5,42390,Yargi.ElementSet.grep,See Enumerable.grep
6,42720,MMETools.Enumerable.classify,Interessant iterador que classifica un enumera...
7,30079,Wbem.WsmanClient.each_instance,Enumerate instances
8,46812,StixSchemaSpy.SimpleType.enumeration_values,Returns the list of values for this enumeration
9,5229,Magick.ImageList.reject,override Enumerable's reject
