# NL-to-Leaf-Path Search
Treating each root-to-leaf path on AST as a unit. Make these units searchable (by converting them to vectors).

In [1]:
from codemend import BackupHandler, relative_path
from codemend.models.annotate_code_with_api import get_fu_fau

In [2]:
fu,fau = get_fu_fau()

In [3]:
bh2 = BackupHandler(relative_path('demo/data'))
element_index = bh2.load('element_index')

Restored from /Users/ronxin/Dropbox/git/codemend/codemend/demo/data/element_index.pickle


In [4]:
len(element_index)

9060

In [5]:
fav_keys = filter(lambda x:len(x) == 3, element_index.keys())
len(fav_keys)

3628

### Generate Text Representation for Each Root-to-Leaf Path

Naive strategy: simply concatenating docstrings (and repr strings of values)

**Alternative**: just use `fa` instead of `fav`  (but still based on real usage)

In [6]:
favu = []  # (f,a,v,u)
for f,a,v in fav_keys:
    u_f = fu[f] if f in fu else ''
    u_a = fau[f,a] if (f,a) in fau else ''
    assert isinstance(v, basestring)
    u_v = v
    u_cat = ' '.join((u_f, u_a, u_v))
    favu.append((f,a,v,u_cat))

In [7]:
len(favu)

3628

In [8]:
allelems_u = fu.items()+fau.items()+favu

In [9]:
len(allelems_u)

9056

In [10]:
f_u = {}
for f in fu:
    f_u[f] = fu[f] + ' ' + f

In [11]:
fa_catu = {}
for f,a in fau:
    if f in fu:
        fa_catu[f,a] = fu[f] + ' ' + fau[f,a] + ' ' + f + ' ' + a
    else:
        fa_catu[f,a] = fau[f,a] + ' ' + f + ' ' + a

In [12]:
twoelems_u = f_u.items() + [(x,y,z) for (x,y),z in fa_catu.items()]
len(twoelems_u)

5428

Build vector representations of these strings

In [35]:
from __future__ import division
from collections import defaultdict
from gensim import matutils
from gensim.models.word2vec import Word2Vec
from numpy import float32 as REAL
import numpy as np
import pattern.en
import pattern.vector
import math
import gensim
from codemend.models.word2vec_util import load_gensim_from_binary_file
from codemend.models.ngram_util import ngram_partition

In [17]:
w2v_model = load_gensim_from_binary_file(relative_path('demo/data/vectors-flat-mpl-0205.bin'))



In [140]:
class PathVector:
    def __init__(self, w2v_model, element_index, maxngram=1, any_u=None, use_lemma=False, dbg_ref=None):
        self.maxngram = maxngram
        self.use_lemma = use_lemma
        self.element_index = element_index

        if isinstance(w2v_model, basestring):
            # it is a file name
            self.model = load_gensim_from_binary_file(w2v_model)
            self.model.filename = w2v_model.split('/')[-1]
        else:
            assert isinstance(w2v_model, gensim.models.word2vec.Word2Vec)
            self.model = w2v_model

        self.model.init_sims()  # normalize the vectors

        if dbg_ref:
            attrs = ['idfs', 'elems', 'elem_lookup', 'elem_u', 'elem_vecmat']
            for att in attrs:
                setattr(self, att, getattr(dbg_ref, att))
        else:
            print 'building any-utterance indexes...'

            assert isinstance(any_u, list)

            self.idfs = self.get_idf([x[-1] for x in any_u])
            self.elems = [x[:-1] for x in any_u]
            self.elem_lookup = dict((y,x) for (x,y) in enumerate(self.elems))
            self.elem_u = dict((x[:-1],x[-1]) for x in any_u)

            elem_vecs = []
            for x in any_u:
                elem = x[:-1]
                u = x[-1]
                v = self.get_bow_representation(' '.join(elem) + ' ' + u)
                elem_vecs.append(v)
            self.elem_vecmat = np.array(elem_vecs)
            assert self.elem_vecmat.shape == (len(self.elems), self.model.vector_size)

            print 'Finished building indexes.'

        print 'Model intialized.'

    def get_idf(self, documents):
        """
        Get inverse document frequency based on a given vocabulary and a given
        corpus (`documents`). For words in the vocabulary that are unseen in the
        given corpus, assign an IDF of 5.

        documents: a list of strings.

        Returns a list of docfreqs. Indexed by word index.

        """
        counts = [0] * len(self.model.index2word)
        num_doc = len(documents)
        for doc in documents:
            word_idxs = set(self.get_word_indexes(doc))
            for wi in word_idxs:
                counts[wi] += 1
        idfs = []
        for i in xrange(len(counts)):
            if counts[i] > 0:
                idfs.append(np.log(num_doc / counts[i]))
            else:
                # tricky !!!
                idfs.append(5)
        return idfs
    
    def get_word_indexes(self, query):
        """
        Input will be tokenized and matched against the given vocabulary.

        If `maxngram` > 1, then n-gram partition is performed prior to matching.

        TODO: considers removing stop words. The `ngram_partition` function
        already supports this.
        """
        tokens = tokenize(query)
        if self.use_lemma:
            tokens = map(lemma, tokens)
        if self.maxngram > 1:
            tokens_ngrams = ngram_partition(' '.join(tokens), self.model.vocab)
            tokens = list(set(tokens) | set(tokens_ngrams))
        idxs = [self.model.vocab[w].index for w in tokens if w in self.model.vocab]
        if not idxs:
            idxs.append(0)  # the null word
        return idxs

    def get_bow_representation(self, query):
        """
        Returns the mean vector.

        Vectors are weighted by inverse document frequency.

        """
        idxs = self.get_word_indexes(query)

        if hasattr(self, 'idfs'):
            idf_weights = [self.idfs[x] for x in idxs]

            # tricky !!!  intention: stop word removal
            idf_weights = map(lambda x: x if x > 2 else 0, idf_weights)
        else:
            assert not self.bow
            idf_weights = [1] * len(idxs)

        idf_weights = np.array(idf_weights).reshape((1,-1))
        raw_vecs = self.model.syn0norm[idxs]
        weighted_sum = np.dot(idf_weights, raw_vecs)

        weighted_average = (weighted_sum / len(idxs))[0]
        return matutils.unitvec(weighted_average).astype(REAL)

    def find_nearest_favu(self, query, N=10):
        query = query.lower()
        q_vec = self.get_bow_representation(query)

        scores = np.dot(q_vec, self.elem_vecmat.T)
        sorted_elems = sorted(zip(self.elems, scores), key=lambda x:x[1], reverse=True)
        return sorted_elems[:N]
    
    def find_nearest_favu_with_history(self, query, called_funcs, N=10, level=None, particular=None):
        query = query.lower()
        q_vec = self.get_bow_representation(query)

        scores = np.dot(q_vec, self.elem_vecmat.T)
        for i, e in enumerate(self.elems):
            if level and len(e) != level:
                scores[i] = 0
            if e[0] in called_funcs:
                scores[i] *= 1.5
            e_key = e[0] if len(e)==1 else e
            if e_key in self.element_index:
                scores[i] *= math.log(self.element_index[e_key].count + 10)

        sorted_elems = sorted(zip(self.elems, scores), key=lambda x:x[1], reverse=True)
        if particular:
            for x,y in sorted_elems:
                if x == particular:
                    print 'Particular Example:', x, y
        
        return sorted_elems[:N]
    
    def eval_sim(self, query, elem):
        query = query.lower()
        q_vec = self.get_bow_representation(query)

        assert elem in self.elem_lookup
        return np.dot(q_vec, self.elem_vecmat[self.elem_lookup[elem],:].T)
    
def tokenize(s):
    return ' '.join(pattern.en.tokenize(s)).split()

def lemma(token):
    return pattern.vector.stem(token, stemmer=pattern.vector.LEMMA)

# test
pv = PathVector(w2v_model, element_index, any_u=twoelems_u, use_lemma=True, dbg_ref=pv)

Model intialized.


In [91]:
pv.find_nearest_favu_with_history("change the color of bar", ['bar','title','gca'], level=None)

[(('bar', 'color'), 6.8199506),
 (('bar',), 6.2778654),
 (('bar', 'width'), 5.5286312),
 (('plot', 'color'), 4.2947183),
 (('bar', 'bottom'), 4.1417375),
 (('bar', 'yerr'), 3.066117),
 (('tick_params',), 3.0513346),
 (('bar', 'linewidth'), 3.0113461),
 (('bar', 'edgecolor'), 2.9614539),
 (('colorbar',), 2.7744157)]

In [167]:
pv.find_nearest_favu_with_history("change the color", ['bar','title','gca'], level=None)

[(('bar', 'color'), 4.9586401),
 (('plot', 'color'), 4.0374556),
 (('bar', 'width'), 3.8815238),
 (('bar',), 3.2502491),
 (('tick_params',), 3.0480475),
 (('bar', 'linewidth'), 2.887253),
 (('bar', 'ecolor'), 2.7672811),
 (('plot',), 2.6812239),
 (('gca',), 2.632405),
 (('scatter', 'c'), 2.5525353)]

In [89]:
pv.find_nearest_favu_with_history("change the color of bar", ['bar','title','gca'], level=1)

[(('bar',), 6.2778654),
 (('tick_params',), 3.0513346),
 (('colorbar',), 2.7744157),
 (('plot',), 2.7485089),
 (('title',), 2.5615344),
 (('scatter',), 2.1229429),
 (('gca',), 2.0433226),
 (('pie',), 1.9941124),
 (('legend',), 1.9623019),
 (('set_title',), 1.9434626)]

In [90]:
pv.find_nearest_favu_with_history("change the color of bar", ['bar','title','gca'], level=2)

[(('bar', 'color'), 6.8199506),
 (('bar', 'width'), 5.5286312),
 (('plot', 'color'), 4.2947183),
 (('bar', 'bottom'), 4.1417375),
 (('bar', 'yerr'), 3.066117),
 (('bar', 'linewidth'), 3.0113461),
 (('bar', 'edgecolor'), 2.9614539),
 (('bar', 'ecolor'), 2.744487),
 (('axhline', 'color'), 2.7029259),
 (('bar', 'align'), 2.6506591)]

In [75]:
pv.find_nearest_favu_with_history("change the color of error bar", ['bar','title','gca'], level=2)

[(('bar', 'ecolor'), 1.0792642),
 (('bar', 'capsize'), 1.078748),
 (('bar', 'color'), 1.0319688),
 (('bar', 'tick_label'), 1.0238355),
 (('bar', 'width'), 0.99080426),
 (('bar', 'yerr'), 0.98589897),
 (('bar', 'edgecolor'), 0.94784379),
 (('bar', 'xerr'), 0.91548765),
 (('bar', 'bottom'), 0.89567578),
 (('bar', 'linewidth'), 0.84561592)]

In [76]:
pv.elem_u['bar', 'capsize']

'make a bar plot . determines the length in points of the error bar caps default : none , which will take the value from the errorbar.capsize rcparam . bar capsize'

In [93]:
pv.find_nearest_favu_with_history('bar chart', [])

[(('bar',), 5.3597655),
 (('bar', 'color'), 3.8867948),
 (('bar', 'width'), 3.3611932),
 (('pie',), 3.3467035),
 (('bar', 'bottom'), 3.2608292),
 (('barh',), 3.2043087),
 (('pie', 'labels'), 2.7181654),
 (('bar', 'yerr'), 2.51087),
 (('pie', 'colors'), 2.2094915),
 (('barh', 'color'), 2.1676953)]

In [94]:
pv.find_nearest_favu_with_history('add a legend', ['bar','title','gca'])

[(('legend',), 6.0455313),
 (('legend', 'loc'), 4.8694849),
 (('plot', 'label'), 4.4699159),
 (('set_title',), 4.2849402),
 (('title',), 4.177937),
 (('set_ylabel',), 3.8466082),
 (('set_xlabel',), 3.7347579),
 (('plot',), 3.2389994),
 (('legend', 'ncol'), 3.2349129),
 (('legend', 'numpoints'), 3.2005026)]

In [107]:
pv.find_nearest_favu_with_history('add label to axis', ['bar','title','gca'], N=20)

[(('set_ylabel',), 5.9215784),
 (('set_xlabel',), 5.749393),
 (('plot', 'label'), 5.2198696),
 (('title',), 5.1264553),
 (('set_xticklabels',), 4.6034708),
 (('legend',), 4.4691153),
 (('set_yticklabels',), 4.1659832),
 (('set_title',), 4.0020509),
 (('legend', 'loc'), 3.8035519),
 (('clabel',), 3.6635616),
 (('xticks',), 3.6516464),
 (('set_xticks',), 3.5326865),
 (('ylabel',), 3.4802456),
 (('set_ticklabels',), 3.4652359),
 (('set_yticks',), 3.3189418),
 (('xlabel',), 3.0925903),
 (('annotate',), 2.9764721),
 (('get_xticklabels',), 2.9295442),
 (('text',), 2.8735266),
 (('tick_params',), 2.8572853)]

In [114]:
pv.find_nearest_favu_with_history('label', ['bar','title','gca'])

[(('set_ylabel',), 5.9215784),
 (('set_xlabel',), 5.749393),
 (('plot', 'label'), 5.2198696),
 (('title',), 5.1264553),
 (('set_xticklabels',), 4.6034708),
 (('legend',), 4.4691153),
 (('set_yticklabels',), 4.1659832),
 (('set_title',), 4.0020509),
 (('legend', 'loc'), 3.8035519),
 (('clabel',), 3.6635616)]

In [162]:
pv.find_nearest_favu_with_history('add label to bar', ['barh','title','gca'])

[(('bar',), 5.1364307),
 (('set_ylabel',), 4.4863582),
 (('title',), 4.3627024),
 (('set_xlabel',), 4.3597617),
 (('barh',), 4.3539686),
 (('plot', 'label'), 4.2438855),
 (('bar', 'color'), 3.9847426),
 (('barh', 'label'), 3.72363),
 (('barh', 'align'), 3.6429608),
 (('legend',), 3.6261966)]

In [108]:
pv.element_index['xlabel'].count

274

In [109]:
pv.element_index['set_xlabel'].count

313

In [112]:
pv.element_index['set_ylabel'].count

372

In [111]:
pv.idfs[pv.model.vocab['x'].index]

1.8766962260993116

In [163]:
pv.find_nearest_favu_with_history('add title', ['bar'])

[(('title',), 5.7267838),
 (('set_title',), 5.4599924),
 (('set_ylabel',), 3.6360991),
 (('set_xlabel',), 3.5334954),
 (('figure',), 3.3880913),
 (('ylabel',), 3.3253629),
 (('plot', 'label'), 3.3251779),
 (('xlabel',), 3.1879299),
 (('legend',), 3.1693764),
 (('suptitle',), 3.1600735)]

In [170]:
pv.find_nearest_favu_with_history('hatching',['barh'],N=30)

[(('plot',), 3.0003369),
 (('plot', 'linestyle'), 2.6804135),
 (('use',), 2.450583),
 (('plot', 'alpha'), 2.2735984),
 (('set_color',), 2.1326993),
 (('set_hatch',), 2.1158412),
 (('barh', 'alpha'), 2.0963635),
 (('plot', 'marker'), 2.0929823),
 (('figure', 'figsize'), 1.9898123),
 (('add_patch',), 1.9872736),
 (('plot', 'color'), 1.9526453),
 (('plot', 'linewidth'), 1.8575439),
 (('scatter', 'alpha'), 1.8561339),
 (('barh', 'color'), 1.8245343),
 (('plot', 'markersize'), 1.7813454),
 (('scatter', 'marker'), 1.7598672),
 (('plot', 'label'), 1.7513134),
 (('set_alpha',), 1.7428889),
 (('barh', 'hatch'), 1.7377423),
 (('barh', 'ecolor'), 1.7346425),
 (('plot', 'zorder'), 1.7045667),
 (('barh', 'rasterized'), 1.6843882),
 (('get_hatch',), 1.6644897),
 (('grid', 'linestyle'), 1.6404513),
 (('plot', 'markerfacecolor'), 1.5937103),
 (('axhline', 'linestyle'), 1.5898454),
 (('tick_params',), 1.5878882),
 (('axis',), 1.578981),
 (('subplots_adjust',), 1.5780613),
 (('subplots', 'figsize'), 1.5

In [142]:
pv.element_index['barh','hatch'].count, pv.element_index['barh','color'].count

(0, 33)

In [144]:
pv.find_nearest_favu_with_history('add pattern',['barh'])

[(('plot',), 2.0971491),
 (('spy',), 2.0067892),
 (('set_hatch',), 1.9396838),
 (('set_fontconfig_pattern', 'pattern'), 1.8075762),
 (('plot', 'color'), 1.7272801),
 (('barh', 'hatch'), 1.5883023),
 (('subplot',), 1.5775084),
 (('imshow', 'extent'), 1.5291213),
 (('barbs', 'hatch'), 1.5160178),
 (('pcolormesh', 'hatch'), 1.4493353)]

In [145]:
pv.find_nearest_favu_with_history('add shadow to legend', ['barh'])

[(('legend',), 4.5363936),
 (('legend', 'loc'), 3.5968735),
 (('plot', 'label'), 3.3333888),
 (('legend', 'shadow'), 2.7488227),
 (('plot',), 2.7461987),
 (('set_title',), 2.6677158),
 (('set_ylabel',), 2.4933481),
 (('legend', 'prop'), 2.4445868),
 (('legend', 'ncol'), 2.4395103),
 (('legend', 'bbox_to_anchor'), 2.4271677)]

In [147]:
pv.element_index['legend','shadow'].count

22

In [149]:
pv.element_index['pie','explode'].count

12

In [164]:
pv.find_nearest_favu_with_history('add error bar', ['bar'], level=2)

[(('errorbar', 'color'), 3.1700554),
 (('bar', 'color'), 3.0637844),
 (('errorbar', 'yerr'), 2.7081172),
 (('errorbar', 'elinewidth'), 2.6738045),
 (('errorbar', 'marker'), 2.5789244),
 (('errorbar', 'label'), 2.5593166),
 (('bar', 'yerr'), 2.5365443),
 (('plot', 'color'), 2.4367342),
 (('errorbar', 'ecolor'), 2.4015684),
 (('plot', 'label'), 2.39098)]

In [155]:
pv.find_nearest_favu_with_history('change width of the bar', ['bar'], level=2)

[(('bar', 'width'), 6.2909636),
 (('bar', 'color'), 5.5728292),
 (('bar', 'bottom'), 4.5272994),
 (('bar', 'linewidth'), 3.8438463),
 (('bar', 'align'), 3.6277189),
 (('bar', 'edgecolor'), 3.4315317),
 (('bar', 'height'), 3.2594993),
 (('bar', 'ecolor'), 3.1723094),
 (('plot', 'linewidth'), 3.1693923),
 (('bar', 'yerr'), 3.101994)]

In [156]:
pv.elem_u['bar','yerr']

'make a bar plot . if not none , will be used to generate errorbar(s ) on the bar chart default : none bar yerr'

In [160]:
pv.find_nearest_favu_with_history('errorbar',['bar'], level=2)

[(('errorbar', 'color'), 3.1700554),
 (('bar', 'color'), 3.0637844),
 (('errorbar', 'yerr'), 2.7081172),
 (('errorbar', 'elinewidth'), 2.6738045),
 (('errorbar', 'marker'), 2.5789244),
 (('errorbar', 'label'), 2.5593166),
 (('bar', 'yerr'), 2.5365443),
 (('plot', 'color'), 2.4367342),
 (('errorbar', 'ecolor'), 2.4015684),
 (('plot', 'label'), 2.39098)]