In [2]:
import pickle
import re
import shutil
import sys

from annoy import AnnoyIndex
from docopt import docopt
from dpu_utils.utils import RichPath
import pandas as pd
from tqdm import tqdm
import wandb
from wandb.apis import InternalApi

from dataextraction.python.parse_python_data import tokenize_docstring_from_string
import model_restore_helper

In [3]:
def query_model(query, model, indices, language, topk=100):
    """Look up the nearest indexed items for a free-text query.

    The query string is tokenized with ``tokenize_docstring_from_string``,
    embedded through ``model.get_query_representations`` and then searched
    in ``indices`` via ``get_nns_by_vector``.

    Args:
        query: Free-text query string.
        model: Object exposing ``get_query_representations``; called with a
            one-element list and its first result is used.
        indices: Nearest-neighbour index exposing ``get_nns_by_vector``
            (the notebook passes an ``AnnoyIndex``).
        language: Language tag stored alongside the query tokens.
        topk: Number of neighbours requested from the index.

    Returns:
        A ``(idxs, distances)`` tuple as produced by the index lookup.
    """
    query_record = {
        'docstring_tokens': tokenize_docstring_from_string(query),
        'language': language,
    }
    # The model works on batches; submit a batch of one and take its only row.
    embedded_batch = model.get_query_representations([query_record])
    query_embedding = embedded_batch[0]
    idxs, distances = indices.get_nns_by_vector(
        query_embedding, topk, include_distances=True
    )
    return idxs, distances

In [4]:
# Relative path of the saved model checkpoint to restore.
local_model_path = '../resources/saved_models/neuralbowmodel-2020-09-28-04-57-49_model_best.pkl.gz'
# Wrap the path in a RichPath; the second argument is passed as None.
model_path = RichPath.create(local_model_path, None)
print("Restoring model from %s" % (model_path,))
# Restore with is_train=False and no hyperparameter overrides
# (presumably an inference-only restore -- confirm against model_restore_helper).
model = model_restore_helper.restore(path=model_path, is_train=False, hyper_overrides=dict())

Restoring model from ../resources/saved_models/neuralbowmodel-2020-09-28-04-57-49_model_best.pkl.gz
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [5]:
# Accumulator for (query, language, identifier, url) result rows.
predictions = []
language = 'python'
print("Evaluating language: %s" % language)
# NOTE(security): pickle.load can execute arbitrary code -- only load definition
# files that come from a trusted source.
# The context manager closes the file handle as soon as the pickle is read.
with open('../resources/data/{}_dedupe_definitions_v2.pkl'.format(language), 'rb') as definitions_file:
    definitions = pickle.load(definitions_file)
# One record per definition, keeping only the fields handed to the model.
indexes = [{'code_tokens': d['function_tokens'], 'language': d['language']} for d in tqdm(definitions)]
# Only the first fifth of the records is embedded, so positions in
# code_representations line up with the first fifth of `definitions`.
code_representations = model.get_code_representations(indexes[:len(indexes) // 5])

Evaluating language: python


100%|██████████| 1156085/1156085 [00:03<00:00, 339016.92it/s]


get_code_representations


In [None]:
# Size of the full candidate list built from the pickled definitions; only a
# slice of it (the first fifth) is passed to model.get_code_representations.
print(len(indexes))

In [6]:
# Take the embedding size from the first vector that is actually present:
# entries may be None (the loop below skips them), so position 0 is not
# guaranteed to hold a vector.
embedding_dim = next(
    (vector.shape[0] for vector in code_representations if vector is not None),
    None,
)
if embedding_dim is None:
    raise ValueError("No code representations available to build the search index.")

indices = AnnoyIndex(embedding_dim, 'angular')
# Wrapping the list (rather than the enumerate object) lets tqdm know the
# total and render a real progress bar. Item ids are the positions in
# code_representations, so they can be used to index `definitions` later.
for index, vector in enumerate(tqdm(code_representations)):
    if vector is not None:
        indices.add_item(index, vector)
# Build the index with 200 trees.
indices.build(200)

231217it [00:05, 39596.55it/s]


True

In [7]:
# Free-text search query; it is tokenized and embedded by query_model when the
# search cell runs.
query = 'matrix multiply'

In [9]:
# Rebuild the result rows on every run instead of appending to the global
# list: appending made this cell non-idempotent, so re-running it (or running
# it after changing `query`) stacked rows and "Top ten results" kept showing
# the rows of the earliest run.
result_idxs, _ = query_model(query, model, indices, language)
predictions = [
    (query, language, definitions[idx]['identifier'], definitions[idx]['url'])
    for idx in result_idxs
]

# Tabulate the hits; row order follows the order returned by the index lookup.
df = pd.DataFrame(predictions, columns=['query', 'language', 'identifier', 'url'])
urls = list(df['url'])
print("Top ten results")
for link in urls[:10]:
    print(link)

Top ten results
https://github.com/keon/algorithms/blob/4d6569464a62a75c1357acc97e2dd32ee2f9f4a3/algorithms/dfs/pacific_atlantic.py#L56-L65
https://github.com/quantumlib/Cirq/blob/0827da80dd7880e5b923eb69407e980ed9bc0bd2/cirq/linalg/predicates.py#L27-L42
https://github.com/vaexio/vaex/blob/a45b672f8287afca2ada8e36b74b604b9b28dd85/packages/vaex-core/vaex/column.py#L39-L43
https://github.com/mlperf/training/blob/1c6ae725a81d15437a2b2df05cac0673fde5c3a4/data_generation/fractal_graph_expansions/graph_reduction.py#L61-L64
https://github.com/quantumlib/Cirq/blob/0827da80dd7880e5b923eb69407e980ed9bc0bd2/cirq/ops/matrix_gates.py#L38-L47
https://github.com/tensorflow/probability/blob/e87fe34111d68c35db0f9eeb4935f1ece9e1a8f5/tensorflow_probability/python/internal/distribution_util.py#L686-L689
https://github.com/mlperf/training/blob/1c6ae725a81d15437a2b2df05cac0673fde5c3a4/data_generation/fractal_graph_expansions/graph_reduction.py#L31-L33
https://github.com/quantumlib/Cirq/blob/0827da80dd7880e5

In [None]:
# Preview the first ten rows of the results table
# (columns: query, language, identifier, url).
df.head(10)