© 2020 Nokia

Licensed under the BSD 3 Clause license

SPDX-License-Identifier: BSD-3-Clause

## Setup

In [6]:
%load_ext autoreload
%autoreload 2

import os
import json
import time
import numpy as np
import sys

from codesearch.encoders import BasicEncoder
from codesearch import embedding_pretraining
from codesearch.embedding_pretraining import train_fasttext_model_from_snippets, load_fasttext_model
from codesearch.utils import SaveableFunction
from codesearch.data import load_snippet_collection, EVAL_DATASETS, SNIPPET_COLLECTIONS, eval_datasets_from_regex
from codesearch.ncs.ncs_embedder import TfidfCodeEmbedder, NcsEmbedder
from codesearch.evaluation import evaluate_and_dump 
from codesearch.embedding_retrieval import EmbeddingRetrievalModel
start = time.time()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Read configuration parameters from environment variables (when this notebook is run as a script).

In [7]:
fast_text_checkpoint = os.environ.get("fast_text_checkpoint", None)
model_filename = os.environ.get("model_filename", None)

snippets_collection = os.environ.get("snippets_collection", "so-ds-feb20")
train_snippets_collection = os.environ.get("train_snippets_collection", "so-ds-feb20")
valid_dataset = os.environ.get("valid_dataset", None)
test_dataset = os.environ.get("test_dataset", "conala-curated-0.5-test")

text_overrides = json.loads(os.environ.get("text_overrides", "{}"))
code_overrides = json.loads(os.environ.get("code_overrides", "{}"))
fast_text_overrides = json.loads(os.environ.get("fast_text_overrides", "{}"))
zip_fn_name = os.environ.get("zip_fn", "zip_descr_end")
output_dir = os.environ.get("output_dir", ".")

In [3]:
model_filename, fast_text_checkpoint

(None, None)

In [4]:
snippets_collection

'so-ds-feb20'

In [5]:
text_overrides, code_overrides, fast_text_overrides, zip_fn_name

({}, {}, {}, 'zip_descr_end')

## Load data

In [8]:
if valid_dataset and valid_dataset not in EVAL_DATASETS and valid_dataset not in SNIPPET_COLLECTIONS:
    raise ValueError()
test_datasets = eval_datasets_from_regex(test_dataset)
snippets = load_snippet_collection(snippets_collection)
train_snippets = load_snippet_collection(train_snippets_collection) 


In [5]:
train_snippets[-1]

{'attribution': ['https://stackoverflow.com',
  'https://stackoverflow.com/questions/7939954'],
 'language': 'python',
 'id': 'xml-parsing-4',
 'code': 'import xml.etree.ElementTree as et\nimport csv\n\nxmltext = """\n<dicts>\n    <key>1375</key>\n    <dict>\n        <key>Key 1</key><integer>1375</integer>\n        <key>Key 2</key><string>Some String</string>\n        <key>Key 3</key><string>Another string</string>\n        <key>Key 4</key><string>Yet another string</string>\n        <key>Key 5</key><string>Strings anyone?</string>\n    </dict>\n</dicts>\n"""\n\nf = open(\'output.txt\', \'w\')\n\nwriter = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC)\n\ntree = et.fromstring(xmltext)\n\n# iterate over the dict elements\nfor dict_el in tree.iterfind(\'dict\'):\n    data = []\n    # get the text contents of each non-key element\n    for el in dict_el:\n        if el.tag == \'string\':\n            data.append(el.text)\n        # if it\'s an integer element convert to int so csv wont quote i

In [5]:
f=open('../../processed_search_1.json')
train_snippets=json.load(f)
snippets=train_snippets

In [6]:
snippets[0]

{'attribution': ['https://github.com',
  'https://github.com/cymcsg/UltimateRecyclerView/blob/0fefad7a0a432e957fef4c1e8b198069c27b5cd2/UltimateRecyclerView/app/src/main/java/com/marshalchen/ultimaterecyclerview/demo/LauncherActivity.java#L52-L221'],
 'language': 'java',
 'id': 1,
 'code': '    protected void onCreate(Bundle savedInstanceState) {\n        super.onCreate(savedInstanceState);\n        setContentView(R.layout.activity_launcher);\n\n        toolbar = (Toolbar) findViewById(R.id.tool_bar);\n        setSupportActionBar(toolbar);\n        getSupportActionBar().setDisplayShowTitleEnabled(false);\n\n\n        ultimateRecyclerView = (UltimateRecyclerView) findViewById(R.id.ultimate_recycler_view);\n        ultimateRecyclerView.setHasFixedSize(false);\n        final List<String> stringList = new ArrayList<>();\n\n        stringList.add("111");\n        stringList.add("aaa");\n        stringList.add("222");\n        stringList.add("33");\n        stringList.add("44");\n        stri

In [8]:
train_snippets

28'],
  'language': 'python',
  'id': 'attributeerror-datetime-module-has-no-attribute-strptime',
  'code': 'from datetime import datetime\nimport datetime\n\ndatetime.datetime.strptime(date, "%Y-%m-%d")\n\n\n# module  class    method\ndatetime.datetime.strptime(date, "%Y-%m-%d")',
  'description': "Attributeerror: 'datetime' module has no attribute 'strptime'"},
 {'attribution': ['https://stackoverflow.com',
   'https://stackoverflow.com/questions/19480028'],
  'language': 'python',
  'id': 'attributeerror-datetime-module-has-no-attribute-strptime-2',
  'code': 'self.date = datetime.datetime.strptime(self.d, "%Y-%m-%d")',
  'description': "Attributeerror: 'datetime' module has no attribute 'strptime'"},
 {'attribution': ['https://stackoverflow.com',
   'https://stackoverflow.com/questions/1263451'],
  'language': 'python',
  'id': 'decorators-in-classes',
  'code': 'import stackoverflow\n\n\nclass Test(object):\n    def _decorator(foo):\n        def magic(self):\n            print("st

## Train or load embedding model

In [9]:
enc = BasicEncoder(text_preprocessing_params=text_overrides, code_preprocessing_params=code_overrides)
zip_fn = getattr(sys.modules[embedding_pretraining.__name__], zip_fn_name)
model = train_fasttext_model_from_snippets(train_snippets, enc, zip_fn, fast_text_overrides, "./", save=True)

Initializing spacy nlp /
Initialized spacy nlp


In [10]:
if fast_text_checkpoint:
    model, enc = load_fasttext_model(fast_text_checkpoint)
    print("Loaded fast text checkpoint")
    
else:
    enc = BasicEncoder(text_preprocessing_params=text_overrides, code_preprocessing_params=code_overrides)
    zip_fn = getattr(sys.modules[embedding_pretraining.__name__], zip_fn_name)
    model = train_fasttext_model_from_snippets(train_snippets, enc, zip_fn, fast_text_overrides, "./", save=False)

## Unsupervised retrieval baseline

A first baseline that computes a representation a snippet representation as a tfidf weighted average of their embeddings and a query representation by averaging all terms.

### Embedding code & queries

In [10]:
tfidf_model = TfidfCodeEmbedder.create_tfidf_model(enc, model, snippets)
embedder = NcsEmbedder(model, enc, tfidf_model)

100%|██████████| 12137/12137 [00:16<00:00, 754.58it/s]


### Create retrieval model

In [11]:
retrieval_model = EmbeddingRetrievalModel(embedder)
retrieval_model.add_snippets(snippets)

Embedding snippets: 100%|██████████| 12137/12137 [00:42<00:00, 287.73it/s]Contains all zero rows



In [13]:
if model_filename: embedder.save(model_filename)

In [14]:
embedder.save('best_ncs_embedder')

## Evaluation

In [12]:
sample_queries = ["train a tensorflow model", "plot a bar chart", "merge two dataframes", "sort a list", "read a pandas dataframe from a file", "plot an image"]
config = {"text": text_overrides, "code": code_overrides, "fasttext": fast_text_overrides}
evaluate_and_dump(
    retrieval_model, 
    config, 
    output_dir, 
    valid_dataset, 
    test_datasets,
    sample_queries=sample_queries
)

Embedding sequences: 100%|██████████| 762/762 [00:00<00:00, 3447.68it/s]
Embedding sequences: 100%|██████████| 1/1 [00:00<00:00, 1544.86it/s]
Embedding sequences: 100%|██████████| 1/1 [00:00<00:00, 2552.83it/s]
Embedding sequences: 100%|██████████| 1/1 [00:00<00:00, 3679.21it/s]
Embedding sequences: 100%|██████████| 1/1 [00:00<00:00, 3539.50it/s]
Embedding sequences: 100%|██████████| 1/1 [00:00<00:00, 2589.08it/s]
Embedding sequences: 100%|██████████| 1/1 [00:00<00:00, 3578.76it/s]****************************************
QUERY: train a tensorflow model
****************************************

----------------------------------------
RANK 1
----------------------------------------
DESCRIPTION: Load a model from an hdf5 file in keras
CODE:
def create_model():
    model = Sequential()
    model.add(Dense(64, input_dim=14, init='uniform'))
    model.add(LeakyReLU(alpha=0.3))
    model.add(
        BatchNormalization(
            epsilon=1e-06,
            mode=0,
            momentum=0.9,

{'conala-curated-0.5-test': {'mrr': 0.0, 'recall@3': 0.0, 'recall@10': 0.0}}

In [16]:
duration = time.time() - start
f"Running the notebook took {duration} seconds"

'Running the notebook took 96.61421632766724 seconds'

In [17]:
give_your_query=['sort a list']