In [1]:
import pickle
import re
import shutil
import sys

from annoy import AnnoyIndex
from docopt import docopt
from dpu_utils.utils import RichPath
import pandas as pd
from tqdm import tqdm
import wandb
from wandb.apis import InternalApi

from dataextraction.python.parse_python_data import tokenize_docstring_from_string
import model_restore_helper

In [2]:
def query_model(query, model, indices, language, topk=100):
    """Look up the nearest indexed code vectors for a natural-language query.

    The query string is tokenized with ``tokenize_docstring_from_string``,
    embedded through ``model.get_query_representations`` and then searched for
    in ``indices`` via ``get_nns_by_vector``.

    Args:
        query: Natural-language search string.
        model: Object exposing ``get_query_representations``.
        indices: Nearest-neighbour index exposing ``get_nns_by_vector``.
        language: Value stored under the ``'language'`` key of the query sample.
        topk: Number of neighbours to request from the index.

    Returns:
        The ``(idxs, distances)`` pair produced by ``get_nns_by_vector``.
    """
    sample = {
        'docstring_tokens': tokenize_docstring_from_string(query),
        'language': language,
    }
    # The model works on batches; send a batch of one and take its only row.
    embedding = model.get_query_representations([sample])[0]
    neighbour_ids, neighbour_distances = indices.get_nns_by_vector(
        embedding, topk, include_distances=True)
    return neighbour_ids, neighbour_distances

In [4]:
# Restore a model checkpoint stored on the local filesystem, in inference mode.
# NOTE(review): a later cell assigns `model` again from a W&B run, so only one
# of the two restore cells needs to be executed.
local_model_path = '../resources/saved_models/neuralbowmodel-2020-09-28-04-57-49_model_best.pkl.gz'
model_path = RichPath.create(local_model_path, None)
print("Restoring model from %s" % (model_path,))
model = model_restore_helper.restore(path=model_path, is_train=False, hyper_overrides=dict())

Restoring model from ../resources/saved_models/neuralbowmodel-2020-09-28-04-57-49_model_best.pkl.gz
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [3]:
# Fetch the best-model checkpoint of a W&B run and restore it for inference.
args_wandb_run_id = "ligerfotis/semantic-code-search-src/2o0bi8kg"
# Validate the run id format (user/project/hash) before contacting W&B.
if len(args_wandb_run_id.split('/')) != 3:
    print("ERROR: Invalid wandb_run_id format: %s (Expecting: user/project/hash)" % args_wandb_run_id, file=sys.stderr)
    sys.exit(1)
wandb_api = wandb.Api()
# Retrieve the saved model from W&B for this run.
print("Fetching run from W&B...")
try:
    run = wandb_api.run(args_wandb_run_id)
except wandb.CommError:
    print("ERROR: Problem querying W&B for wandb_run_id: %s" % args_wandb_run_id, file=sys.stderr)
    sys.exit(1)

print("Fetching run files from W&B...")
gz_run_files = [f for f in run.files() if f.name.endswith('gz')]
if not gz_run_files:
    # Report on stderr like the other error paths in this cell.
    print("ERROR: Run contains no model-like files", file=sys.stderr)
    sys.exit(1)
# Per the wandb API, download() returns an open file object. Only its path is
# needed here, so close the handle right away instead of leaking it.
with gz_run_files[0].download(replace=True) as model_file:
    local_model_path = model_file.name
run_id = args_wandb_run_id.split('/')[-1]

model_path = RichPath.create(local_model_path, None)
print("Restoring model from %s" % model_path)
model = model_restore_helper.restore(
    path=model_path,
    is_train=False,
    hyper_overrides={})

Fetching run from W&B...
Fetching run files from W&B...
Restoring model from ./neuralbowmodel-2020-10-18-19-21-28_model_best.pkl.gz
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [4]:
# Load the deduplicated function definitions and embed a subset with the model.
predictions = []
language = 'python'
print("Evaluating language: %s" % language)
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
definitions_path = '../resources/data/{}_dedupe_definitions_v2.pkl'.format(language)
# Use a context manager so the file handle is closed once the data is loaded.
with open(definitions_path, 'rb') as definitions_file:
    definitions = pickle.load(definitions_file)
indexes = [{'code_tokens': d['function_tokens'], 'language': d['language']} for d in tqdm(definitions)]
# Only the first fifth of the samples is embedded. Positions in
# `code_representations` therefore still line up with positions in `definitions`.
code_representations = model.get_code_representations(indexes[:len(indexes) // 5])

Evaluating language: python


100%|██████████| 1156085/1156085 [00:03<00:00, 334462.19it/s]


get_code_representations


In [None]:
# Number of code samples built from `definitions` (before taking the embedded subset).
print(len(indexes))

In [5]:
# Build an Annoy index (angular distance) over the code embeddings.
# The loop below skips None vectors, so take the dimensionality from the first
# vector that is actually present rather than assuming element 0 exists.
embedding_dim = next(
    (vector.shape[0] for vector in code_representations if vector is not None),
    None)
if embedding_dim is None:
    raise ValueError("No code representations available to index")
indices = AnnoyIndex(embedding_dim, 'angular')
# Wrap the sequence (not the enumerate object) so tqdm can show total and ETA.
for index, vector in enumerate(tqdm(code_representations)):
    if vector is not None:
        # Item id == position in `code_representations`, used later to look up `definitions`.
        indices.add_item(index, vector)
indices.build(200)

231217it [00:06, 36358.27it/s]


True

In [6]:
# Natural-language search string handed to query_model in the next cell.
query = 'matrix multiply'

In [8]:
# Run the query and show the URLs of the best matches.
# `predictions` is rebuilt on every run. Appending to the existing list would
# keep results of earlier runs at the head, so "Top ten results" would go stale
# after `query` is changed and the cell is re-executed.
idxs, _distances = query_model(query, model, indices, language)
predictions = [
    (query, language, definitions[idx]['identifier'], definitions[idx]['url'])
    for idx in idxs
]

df = pd.DataFrame(predictions, columns=['query', 'language', 'identifier', 'url'])
urls = list(df['url'])
print("Top ten results")
for link in urls[:10]:
    print(link)

Top ten results
https://github.com/apache/spark/blob/618d6bff71073c8c93501ab7392c3cc579730f0b/python/pyspark/mllib/linalg/distributed.py#L373-L389
https://github.com/apache/spark/blob/618d6bff71073c8c93501ab7392c3cc579730f0b/python/pyspark/mllib/linalg/distributed.py#L705-L720
https://github.com/pymupdf/PyMuPDF/blob/917f2d83482510e26ba0ff01fd2392c26f3a8e90/fitz/fitz.py#L270-L275
https://github.com/pikepdf/pikepdf/blob/07154f4dec007e2e9c0c6a8c07b964fd06bc5f77/src/pikepdf/models/matrix.py#L63-L79
https://github.com/tensorflow/cleverhans/blob/97488e215760547b81afc53f5e5de8ba7da5bd98/cleverhans/utils_tf.py#L561-L569
https://github.com/pymupdf/PyMuPDF/blob/917f2d83482510e26ba0ff01fd2392c26f3a8e90/fitz/fitz.py#L277-L284
https://github.com/pandas-dev/pandas/blob/9feb3ad92cc0397a04b665803a49299ee7aa1037/pandas/core/frame.py#L1026-L1030
https://github.com/quantumlib/Cirq/blob/0827da80dd7880e5b923eb69407e980ed9bc0bd2/cirq/ops/matrix_gates.py#L38-L47
https://github.com/materialsproject/pymatgen/b

In [None]:
# Display the first ten rows of the results table (query, language, identifier, url).
df.head(10)