In [1]:
# Master switch for this notebook. When True, the next cell downloads pre-processed
# prerequisite files and the save / train / fine-tune cells further down are skipped
# (each of them is guarded by `if not use_cache`). When False, everything is recomputed.
use_cache = False

In [2]:
## Download pre-processed data if you want to run the tutorial from this step. ##
from general_utils import get_step2_prerequisite_files

if use_cache:
    # The target directory must be the one the data-loading cell reads from
    # (`./data/stackoverflow/processed_data/`); otherwise the downloaded files
    # are never found.
    # NOTE(review): assumes the helper writes the token files directly into
    # `output_directory` -- confirm against general_utils.
    get_step2_prerequisite_files(output_directory = './data/stackoverflow/processed_data/')

# Build Language Model From Stackoverflow post content

In [3]:
import torch,cv2
from lang_model_utils import lm_vocab, load_lm_vocab, train_lang_model
from general_utils import save_file_pickle, load_file_pickle
import logging
from pathlib import Path
from fastai.text import *

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [10]:

source_path = Path('./data/stackoverflow/processed_data/')


def _read_token_lines(filename):
    "Return all lines (line endings kept) of `filename` located under `source_path`."
    with open(source_path/filename, 'r') as f:
        return f.readlines()


# One list of raw text lines per split.
trn_raw = _read_token_lines('train.content_token')
val_raw = _read_token_lines('valid.content_token')
test_raw = _read_token_lines('test.content_token')

## Pre-process data for language model

We will use the class `lm_vocab` to prepare our data for the language model.

In [5]:
# Build the vocabulary helper. `max_vocab` and `min_freq` are passed straight to lm_vocab
# (presumably a vocabulary-size cap and a minimum token count -- confirm in lang_model_utils).
vocab = lm_vocab(max_vocab=50000,
                 min_freq=10)

# Fit the vocabulary on the training lines only, then convert those lines into one
# flat sequence of token ids (previewed in the following cells).
trn_flat_idx = vocab.fit_transform_flattened(trn_raw)



Look at the transformed data

In [6]:
trn_flat_idx[:10]

array([  7,  53, 368, 259, 146, 447,  23,  15, 308,   6])

In [7]:
[vocab.itos[x] for x in trn_flat_idx[:10]]

['_xbos_',
 'an',
 'approach',
 'without',
 'any',
 'built',
 '-',
 'in',
 'functions',
 ':']

In [8]:
# Apply the vocabulary that was fitted on the training data to the validation lines
# (transform only -- the vocabulary is not re-fitted here).
val_flat_idx = vocab.transform_flattened(val_raw)



Save files for later use

In [9]:
if not use_cache:
    # Create the output directory up front so the writes below cannot fail on a fresh
    # checkout; this is a no-op when the directory already exists.
    Path('./data/stackoverflow/lang_model/').mkdir(parents=True, exist_ok=True)

    # Persist the fitted vocabulary and both flattened id sequences; the training
    # section reloads exactly these three paths.
    vocab.save('./data/stackoverflow/lang_model/vocab.cls')
    save_file_pickle('./data/stackoverflow/lang_model/trn_flat_idx_list.pkl', trn_flat_idx)
    save_file_pickle('./data/stackoverflow/lang_model/val_flat_idx_list.pkl', val_flat_idx)



## Train Fast.AI Language Model

This model will read in files that were created and train a [fast.ai](https://github.com/fastai/fastai/tree/master/fastai) language model.  This model learns to predict the next word in the sentence using fast.ai's implementation of [AWD LSTM](https://github.com/salesforce/awd-lstm-lm).  

The goal of training this model is to build a general purpose feature extractor for text that can be used in downstream models.  In this case, we will utilize this model to produce embeddings for function docstrings.

In [10]:
# Reload the vocabulary and the flattened train / validation token ids from the
# paths the "Save files for later use" cell writes to.
vocab = load_lm_vocab('./data/stackoverflow/lang_model/vocab.cls')
trn_flat_idx = load_file_pickle('./data/stackoverflow/lang_model/trn_flat_idx_list.pkl')
val_flat_idx = load_file_pickle('./data/stackoverflow/lang_model/val_flat_idx_list.pkl')



In [11]:
if use_cache:
    # Nothing to train: the notebook is running from cached artifacts.
    logging.warning('Not re-training language model because use_cache=True')
else:
    # Train from the flattened id sequences; returns both the fastai learner
    # (used for further fitting below) and the language model itself.
    fastai_learner, lang_model = train_lang_model(
        model_path='./data/stackoverflow/lang_model_weights',
        trn_indexed=trn_flat_idx,
        val_indexed=val_flat_idx,
        vocab_size=vocab.vocab_size,
        lr=3e-3,
        em_sz=500,
        nh=500,
        bptt=20,
        cycle_len=1,
        n_cycle=2,
        bs=200,
        wd=1e-6,
    )

HBox(children=(IntProgress(value=0, description='Epoch', max=2), HTML(value='')))

epoch      trn_loss   val_loss                                    
    0      3.601241   3.474715  
                                                                  

data/stackoverflow/lang_model_weights/models/langmodel_best.h5


    1      3.481606   3.358989  



In [None]:
# if not use_cache:
#     fastai_learner.fit(1e-3, 3, wds=1e-6, cycle_len=2)

In [12]:
if not use_cache:
    # Keep training the learner returned by train_lang_model at a lower rate (1e-3).
    # With 2 cycles of cycle_len=3 the progress bar below counts 6 epochs in total.
    fastai_learner.fit(1e-3, 2, wds=1e-6, cycle_len=3)

HBox(children=(IntProgress(value=0, description='Epoch', max=6), HTML(value='')))

epoch      trn_loss   val_loss                                    
    0      3.472972   3.333834  
    1      3.394681   3.286406                                    
    2      3.385816   3.272807                                    
    3      3.41238    3.283735                                    
 68%|██████▊   | 3456/5086 [3:25:59<1:37:09,  3.58s/it, loss=3.36]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 47%|████▋     | 2384/5086 [2:25:33<2:44:58,  3.66s/it, loss=3.29]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



    5      3.348129   3.232127                                  



In [None]:
# if not use_cache:
#     fastai_learner.fit(1e-3, 2, wds=1e-6, cycle_len=3, cycle_mult=10)

Save language model and learner

In [13]:
if not use_cache:
    # Save the learner under the name 'lang_model_learner.fai'
    # (where it lands on disk is decided by fastai -- not visible here).
    fastai_learner.save('lang_model_learner.fai')
    # Put the underlying model into evaluation mode before serializing it.
    lang_model_new = fastai_learner.model.eval()
    # Optional GPU copy of the model, left disabled:
#     torch.save(lang_model_new, './data/stackoverflow/lang_model/lang_model_gpu_v2.torch')
    # Move the model to CPU and serialize the whole module object; this is the file
    # the encoding and search sections load later.
    torch.save(lang_model_new.cpu(), './data/stackoverflow/lang_model/lang_model_cpu.torch')

# Load Model and Encode All Docstrings

Now that we have trained the language model, the next step is to use the language model to encode all of the docstrings into a vector. 

**Note that checkpointed versions of the language model artifacts are available for download:**

1. `lang_model_cpu_v2.torch` : https://storage.googleapis.com/kubeflow-examples/code_search/data/lang_model/lang_model_cpu_v2.torch 
2. `lang_model_gpu_v2.torch` : https://storage.googleapis.com/kubeflow-examples/code_search/data/lang_model/lang_model_gpu_v2.torch
3. `vocab_v2.cls` : https://storage.googleapis.com/kubeflow-examples/code_search/data/lang_model/vocab_v2.cls

In [14]:
from lang_model_utils import load_lm_vocab
# Reload the fitted vocabulary from disk.
vocab = load_lm_vocab('./data/stackoverflow/lang_model/vocab.cls')
# Turn the train + validation lines into per-document id sequences; max_seq_len=30 and
# padding=False are passed to the vocab (presumably truncating to 30 tokens, unpadded -- confirm).
idx_docs = vocab.transform(trn_raw + val_raw, max_seq_len=30, padding=False)
# map_location returns each storage unchanged, i.e. tensors stay on the CPU while loading.
lang_model = torch.load('./data/stackoverflow/lang_model/lang_model_cpu.torch', 
                        map_location=lambda storage, loc: storage)



In [15]:
lang_model.eval()

SequentialRNN(
  (0): RNN_Encoder(
    (encoder): Embedding(27877, 500, padding_idx=1)
    (encoder_with_dropout): EmbeddingDropout(
      (embed): Embedding(27877, 500, padding_idx=1)
    )
    (rnns): ModuleList(
      (0): WeightDrop(
        (module): LSTM(500, 500)
      )
      (1): WeightDrop(
        (module): LSTM(500, 500)
      )
      (2): WeightDrop(
        (module): LSTM(500, 500)
      )
    )
    (dropouti): LockedDropout(
    )
    (dropouths): ModuleList(
      (0): LockedDropout(
      )
      (1): LockedDropout(
      )
      (2): LockedDropout(
      )
    )
  )
  (1): LinearDecoder(
    (decoder): Linear(in_features=500, out_features=27877, bias=False)
    (dropout): LockedDropout(
    )
  )
)

**Note:** the below code extracts embeddings for docstrings one docstring at a time, which is very inefficient.  Ideally, you want to extract embeddings in batch but account for the fact that you will have padding, etc. when extracting the hidden states.  For this tutorial, we only provide this minimal example, however you are welcome to improve upon this and submit a PR!

In [16]:
def list2arr(l):
    "Turn a list into a CPU pytorch Variable with an extra trailing axis of size 1."
    column = np.array(l)[..., np.newaxis]  # same effect as np.expand_dims(..., -1)
    return V(column).cpu()

def make_prediction_from_list(model, l):
    """
    Run a single token-id sequence through the language model and pool the
    resulting hidden states into three fixed-size vectors.

    Parameters
    -----------
    model : fastai language model
        Must support ``reset()`` and be callable on the output of ``list2arr``.
    l : list
        list of integers, representing a sequence of tokens that you want to encode

    Returns
    -------
    tuple
        ``(mean, max, last)`` of the selected hidden-state tensor, each taken along
        its first axis (presumably the time-step axis -- TODO confirm).
    """
    seq = list2arr(l)  # pytorch Variable holding the sequence with bs=1
    model.reset()  # the language model is stateful: clear its state before every prediction
    outputs = model(seq)
    # The RNN hidden-layer outputs are the last element of the model output;
    # of those, only the final layer is needed.
    last_layer = outputs[-1][-1]

    avg_pooled = last_layer.mean(0)
    max_pooled, _ = last_layer.max(0)  # max(dim) yields (values, indices); keep the values
    return avg_pooled, max_pooled, last_layer[-1]


def get_embeddings(lm_model, list_list_int):
    """
    Encode every sequence in `list_list_int` with a fast.ai language model,
    one sequence at a time.

    Parameters
    ----------
    lm_model : fastai language model
        ``lm_model[0].nhid`` determines the width of the returned arrays.
    list_list_int : List[List[int]]
        A list of sequences to encode

    Returns
    -------
    tuple: (avg, max, last)
        Three numpy arrays of shape ``(len(list_list_int), nhid)`` holding, per
        sequence, the average-pooled, max-pooled and last-time-step vectors
        produced by ``make_prediction_from_list``.
    """
    n_rows = len(list_list_int)
    out_shape = (n_rows, lm_model[0].nhid)
    avgarr, maxarr, lastarr = (np.empty(out_shape) for _ in range(3))

    for row in tqdm_notebook(range(n_rows)):
        avg_, max_, last_ = make_prediction_from_list(lm_model, list_list_int[row])
        # Copy each pooled vector into the matching row of its output array.
        for target, vec in ((avgarr, avg_), (maxarr, max_), (lastarr, last_)):
            target[row, :] = vec.data.numpy()

    return avgarr, maxarr, lastarr

In [17]:
%%time
# Encode the train + validation sequences (idx_docs). get_embeddings runs the model
# once per sequence, so this cell is slow (see the timing printed below).
avg_hs, max_hs, last_hs = get_embeddings(lang_model, idx_docs)

HBox(children=(IntProgress(value=0, max=560685), HTML(value='')))


CPU times: user 1d 7h 33min 42s, sys: 6min 46s, total: 1d 7h 40min 28s
Wall time: 4h 31min 13s


### Do the same thing for the test set

In [18]:
# Index the test lines with the same vocabulary and the same transform settings
# used for train + validation, then embed them with the same model.
idx_docs_test = vocab.transform(test_raw, max_seq_len=30, padding=False)
avg_hs_test, max_hs_test, last_hs_test = get_embeddings(lang_model, idx_docs_test)



HBox(children=(IntProgress(value=0, max=83781), HTML(value='')))




# Save Language Model Embeddings For Docstrings

In [19]:
savepath = Path('./data/stackoverflow/lang_model_emb/')
# np.save does not create missing directories; make sure the target exists so the
# (very expensive) embeddings computed above are not lost to a FileNotFoundError.
savepath.mkdir(parents=True, exist_ok=True)

# One .npy file per pooling strategy for the train + validation embeddings.
np.save(savepath/'avg_emb_dim500.npy', avg_hs)
np.save(savepath/'max_emb_dim500.npy', max_hs)
np.save(savepath/'last_emb_dim500.npy', last_hs)

In [20]:
# Save the test-set embeddings as well, into the same `savepath` directory as the
# train + validation files, distinguished by the `_test` suffix.
np.save(savepath/'avg_emb_dim500_test.npy', avg_hs_test)
np.save(savepath/'max_emb_dim500_test.npy', max_hs_test)
np.save(savepath/'last_emb_dim500_test.npy', last_hs_test)

**Note that the embeddings saved to disk above have also been cached and are available for download:**

Train + Validation docstrings vectorized:

1. `avg_emb_dim500_v2.npy` : https://storage.googleapis.com/kubeflow-examples/code_search/data/lang_model_emb/avg_emb_dim500_v2.npy
2. `max_emb_dim500_v2.npy` : https://storage.googleapis.com/kubeflow-examples/code_search/data/lang_model_emb/max_emb_dim500_v2.npy
3. `last_emb_dim500_v2.npy` : https://storage.googleapis.com/kubeflow-examples/code_search/data/lang_model_emb/last_emb_dim500_v2.npy

Test set docstrings vectorized:

1. `avg_emb_dim500_test_v2.npy`: https://storage.googleapis.com/kubeflow-examples/code_search/data/lang_model_emb/avg_emb_dim500_test_v2.npy

2. `max_emb_dim500_test_v2.npy`: https://storage.googleapis.com/kubeflow-examples/code_search/data/lang_model_emb/max_emb_dim500_test_v2.npy

3. `last_emb_dim500_test_v2.npy`: https://storage.googleapis.com/kubeflow-examples/code_search/data/lang_model_emb/last_emb_dim500_test_v2.npy

# Evaluate Sentence Embeddings

One popular way of evaluating sentence embeddings is to measure the efficacy of these embeddings in downstream tasks like sentiment analysis, textual similarity etc.  Usually you can use general-purpose benchmarks such as the examples outlined [here](https://github.com/facebookresearch/SentEval) to measure the quality of your embeddings.  However, since this is a very domain specific dataset - those general purpose benchmarks may not be appropriate.  Unfortunately, we have not designed downstream tasks that we can open source at this point.

In the absence of these downstream tasks, we can at least sanity check that these embeddings contain semantic information by doing the following:

1. Manually examine similarity between sentences, by supplying a statement and examining if the nearest phrase found is similar. 

2. Visualize the embeddings.

We will do the first approach, and leave the second approach as an exercise for the reader.  **It should be noted that this is only a sanity check -- a more rigorous approach is to measure the impact of these embeddings on a variety of downstream tasks** and use that to form a more objective opinion about the quality of your embeddings.

Furthermore, there are many different ways of constructing a sentence embedding from the language model.  For example, we can take the average, the maximum or even the last value of the hidden states (or concatenate them all together).  **For simplicity, we will only evaluate the sentence embedding that is constructed by taking the average over the hidden states** (and leave other possibilities as an exercise for the reader). 

### Create search index using `nmslib` 

[nmslib](https://github.com/nmslib/nmslib) is a great library for doing nearest neighbor lookups, which we will use as a search engine for finding nearest neighbors of comments in vector-space.  

The convenience function `create_nmslib_search_index` builds this search index given a matrix of vectors as input.



In [57]:
from general_utils import create_nmslib_search_index
import nmslib
from lang_model_utils import Query2Emb
from pathlib import Path
import numpy as np
from lang_model_utils import load_lm_vocab
import torch

In [72]:
# Load matrix of vectors: the average-pooled *test-set* embeddings saved earlier
# (file name carries the `_test` suffix).
loadpath = Path('./data/stackoverflow/lang_model_emb/')
avg_emb_dim500 = np.load(loadpath/'avg_emb_dim500_test.npy')

In [73]:
# Build search index (takes about an hour on a p3.8xlarge)
# The index is built over the matrix loaded in the previous cell, one vector per row.
dim500_avg_searchindex = create_nmslib_search_index(avg_emb_dim500)

In [74]:
# Save the search index next to the embeddings so it can be reloaded without rebuilding.
dim500_avg_searchindex.saveIndex('./data/stackoverflow/lang_model_emb/dim500_avg_searchindex.nmslib')

Note that if you did not train your own language model and are downloading the pre-trained model artifacts instead, you can similarly download the pre-computed search index here: 

https://storage.googleapis.com/kubeflow-examples/code_search/data/lang_model_emb/dim500_avg_searchindex.nmslib

After you have built this search index with nmslib, you can do fast nearest-neighbor lookups.  We use the `Query2Emb` object to help convert strings to the embeddings: 

In [75]:
# Create an empty hnsw / cosine-similarity index and load the saved index into it.
# NOTE(review): method and space presumably have to match what
# create_nmslib_search_index used when building the index -- confirm in general_utils.
dim500_avg_searchindex = nmslib.init(method='hnsw', space='cosinesimil')
dim500_avg_searchindex.loadIndex('./data/stackoverflow/lang_model_emb/dim500_avg_searchindex.nmslib')

In [76]:
# Reload the two artifacts needed to embed free-text queries.
vocab = load_lm_vocab('./data/stackoverflow/lang_model/vocab.cls')
lang_model = torch.load('./data/stackoverflow/lang_model/lang_model_cpu.torch')

# Query2Emb pairs the (CPU) language model with its vocabulary.
q2emb = Query2Emb(lang_model=lang_model.cpu(), vocab=vocab)

The method `Query2Emb.emb_mean` will allow us to use the language model we trained earlier to generate a sentence embedding given a string.   Here is an example, `emb_mean` will return a numpy array of size (1, 500).

In [77]:
query = q2emb.emb_mean('Read data into pandas dataframe')
query.shape

(1, 500)

**Make search engine to inspect semantic similarity of phrases**.  This will take 3 inputs:

1. `nmslib_index` - this is the search index we built above.  This object takes a vector and will return the index of the closest vector(s) according to cosine distance.  
2. `ref_data` - this is the data to which the index refers; in this case, the docstrings. 
3. `query2emb_func` - this is a function that will convert a string into an embedding.

In [78]:
class search_engine:
    """Print the nearest neighbours of a free-text query.

    Parameters
    ----------
    nmslib_index : object exposing ``knnQuery(vector, k=...)`` that returns a
        pair ``(ids, distances)``.
    ref_data : indexable collection; ``ref_data[i]`` is what gets printed for
        neighbour id ``i``.
    query2emb_func : callable converting the query string into the value that
        is handed to ``knnQuery``.
    """
    def __init__(self, nmslib_index, ref_data, query2emb_func):
        self.search_index = nmslib_index
        self.data = ref_data
        self.query2emb_func = query2emb_func

    def search(self, str_search, k=50):
        """Embed `str_search`, fetch its `k` nearest neighbours and print, for each
        one, the distance followed by the matching entry of ``self.data``.

        Returns nothing; results are written to stdout only.
        """
        embedded_query = self.query2emb_func(str_search)
        neighbour_ids, distances = self.search_index.knnQuery(embedded_query, k=k)

        for neighbour_id, distance in zip(neighbour_ids, distances):
            header = f'cosine dist:{distance:.4f}\n---------------\n'
            print(header, self.data[neighbour_id])

In [106]:
# The index was built from the test-set embeddings, so the raw test lines serve as
# the lookup table (assumes row i of the embeddings corresponds to test_raw[i]).
se = search_engine(nmslib_index=dim500_avg_searchindex,
                   ref_data = test_raw,
                   query2emb_func = q2emb.emb_mean)

## Manually Inspect Phrase Similarity

Compare a user-supplied query vs. vectorized docstrings on test set.  We can see that similar phrases are not exactly the same, but the nearest neighbors are reasonable.  

In [107]:
import logging
# Raise the root logger's threshold to ERROR so lower-severity log messages
# do not clutter the search results printed below.
logging.getLogger().setLevel(logging.ERROR)

In [108]:
se.search('Read data into pandas dataframe')

cosine dist:0.1122
---------------
 first read csv into pandas df :

cosine dist:0.1260
---------------
 the easiest way is pandas . read data from file into dataframe :

cosine dist:0.1292
---------------
 read the documentation : manage data in containers

cosine dist:0.1358
---------------
 you can read from multiple sheets with pandas :

cosine dist:0.1380
---------------
 you can read data in chunks :

cosine dist:0.1402
---------------
 read the csvs 's in using pd.read_csv(file )

cosine dist:0.1418
---------------
 read your dataframe -

cosine dist:0.1438
---------------
 you can read the file as two separate parts ( stats and csv )

cosine dist:0.1442
---------------
 first read your csv file with pandas with

cosine dist:0.1499
---------------
 you can easily read your data into a dictionary of dictionaries :

cosine dist:0.1501
---------------
 you can read the json file all in once like :

cosine dist:0.1565
---------------
 read the input & error stream in separated threa

In [104]:
se.search('train a deep learning model')

cosine dist:0.2142
---------------
 there is currently no other way that using a third party app to create a rest api in django . this is something not natively supported in django .

cosine dist:0.2367
---------------
 you can initialize a with 0.0flag_val = 0.0 def frozenindex(x ) : if x>5 : current = gen.next ( ) flag_val = current else : current = flag_val return current

cosine dist:0.2393
---------------
 use a list comprehension :

cosine dist:0.2595
---------------
 "well i could n't figure out what 's wrong with your code yet . but from the question , i guess this is what you 're looking for"

cosine dist:0.2603
---------------
 "to replace that loop at the bottom , you could do something like :"

cosine dist:0.2638
---------------
 i think you 're making this harder than it needs to be . either sum them and divide by the number of terms :

cosine dist:0.2659
---------------
 "we can do this in one pass : > > > from collections import defaultdict > > > counter = defaultdict(la

In [97]:
se.search('visualize location distribution')

cosine dist:0.1614
---------------
 draw keypoints as filled white circles :

cosine dist:0.1798
---------------
 try to threshold the input image .

cosine dist:0.1899
---------------
 find affine transformation to make rectangle axis - aligned . it is just rotation by angle

cosine dist:0.1936
---------------
 just plot with the dataframe values :

cosine dist:0.1979
---------------
 you can pick the color in the hsl space

cosine dist:0.1986
---------------
 save the bins and use pd.cut again :

cosine dist:0.1999
---------------
 provide the overlay alpha mask parameter and see if this yields results you expected :

cosine dist:0.2001
---------------
 it sounds like you need to focus on the interpolation of data and then extract values from desired coordinates . for 1d splrep and 2d bisplrep are the interpolation functions you need to check out ( a good overview ) . both of these functions can be weighted and provide fine tune control over the spline function you interpolate with .

In [105]:
se.search('start a web server')

cosine dist:0.2197
---------------
 "i modified your and it appears to work for me . if you still get an error , please post the full error . > > > import gmpy2 > > > > > > def unpack(x , b ) : ... try : ... return [ x for x in gmpy2.unpack(gmpy2.mpz(x ) , b ) ] ... except nameerror : ... b = 2 * * b ... r = [ ] ... while x : ... x , temp = divmod(x , b ) ... r.append(temp ) ... return r ... > > > unpack(123456**7 , 15 ) [ mpz(0 ) , mpz(0 ) , mpz(4096 ) , mpz(25855 ) , mpz(24508 ) , mpz(31925 ) , mpz(15111 ) , mpz(10775 ) ] > > > del(gmpy2 ) > > > unpack(123456**7 , 15 ) [ 0 , 0 , 4096 , 25855 , 24508 , 31925 , 15111 , 10775 ]"

cosine dist:0.2203
---------------
 every python object has a special member called which is a dictionary containing all the instance 's member .

cosine dist:0.2217
---------------
 "this is similar to @user3238855 's answer , but i 'm using python 3 's function , with a filter ( ) instead of a comprehension , and not saving a reference to a list of lines.with

In [99]:
se.search('send out email')

cosine dist:0.1530
---------------
 you should send get request with cookies :

cosine dist:0.1683
---------------
 try to send csrf token

cosine dist:0.1858
---------------
 send a multipart email with the appropriate mime types .

cosine dist:0.1993
---------------
 get the username :

cosine dist:0.2006
---------------
 you have to send the and connection : close headers .

cosine dist:0.2054
---------------
 you need to send a valid http request . for example :

cosine dist:0.2067
---------------
 "you need to send the output string ,"

cosine dist:0.2074
---------------
 you should generate e - mail in html format and insert needed tags

cosine dist:0.2097
---------------
 to handle cookies you should use a session that stores cookies between requests :

cosine dist:0.2110
---------------
 you can pass username and password as arguments :

cosine dist:0.2121
---------------
 you need to send json with additional header entries :

cosine dist:0.2141
---------------
 "since you are

In [100]:
se.search('parse html')

cosine dist:0.1476
---------------
 parse from file :

cosine dist:0.1544
---------------
 "parse the html using beautifulsoup , then only retrieve the text ."

cosine dist:0.1860
---------------
 you can use beautifulsoup for parsing the html string .

cosine dist:0.1887
---------------
 "you can parse the json , then output it again with indents like this :"

cosine dist:0.1947
---------------
 "use json . generate the css dynamically , using caching to reduce load ."

cosine dist:0.1996
---------------
 "use lxml.html , it handles invalid xhtml better ."

cosine dist:0.2003
---------------
 load the data first :

cosine dist:0.2005
---------------
 this can be done by generating html in python to replace the html of a text area .

cosine dist:0.2011
---------------
 "first , unescape html entities , then remove punctuation chars :"

cosine dist:0.2025
---------------
 simply sanitize your input :

cosine dist:0.2033
---------------
 you ca n't just parse the pdf with a regex to extr

### Visualize Embeddings (Optional)

We highly recommend using [tensorboard](https://www.tensorflow.org/versions/r1.0/get_started/embedding_viz) as way to visualize embeddings.  Tensorboard contains an interactive search that makes it easy (and fun) to explore embeddings.  We leave this as an exercise to the reader.