In [22]:
use_cache = False

In [23]:
## Download pre-processed data if you want to run the tutorial from this step. ##
from general_utils import get_step2_prerequisite_files

if use_cache:
    # Download into the directory the loading cell reads from
    # (`./data/stackoverflow/processed_data/`). The previous target,
    # `./data/processed_data/stackoverflow/`, is not read anywhere in this notebook.
    # NOTE(review): assumes the helper writes its files directly into output_directory — confirm.
    get_step2_prerequisite_files(output_directory = './data/stackoverflow/processed_data/')

# Build Language Model From Stackoverflow post content

In [24]:
import torch,cv2
from lang_model_utils import lm_vocab, load_lm_vocab, train_lang_model
from general_utils import save_file_pickle, load_file_pickle
import logging
from pathlib import Path
from fastai.text import *

In [26]:

source_path = Path('./data/stackoverflow/processed_data/')

# Read the three tokenised splits in a fixed order: train, valid, test.
raw_splits = []
for token_file in ('train.content_token', 'valid.content_token', 'test.content_token'):
    with open(source_path/token_file, 'r') as f:
        # one list entry per line of the file, trailing newline included
        raw_splits.append(list(f))

trn_raw, val_raw, test_raw = raw_splits

## Pre-process data for language model

We will use the class `lm_vocab` to prepare our data for the language model.

In [27]:
# vocabulary limits handed to lm_vocab
vocab = lm_vocab(min_freq=10, max_vocab=50000)

# fit on the training split only; the result is one flat array of token ids
trn_flat_idx = vocab.fit_transform_flattened(trn_raw)



Look at the transformed data

In [28]:
trn_flat_idx[:10]

array([   7, 1470,  211,  354,   21,   16,   34,    1,   12,  672])

In [29]:
[vocab.itos[x] for x in trn_flat_idx[:10]]

['_xbos_',
 'unfortunately',
 'right',
 'now',
 'it',
 'is',
 'not',
 '_pad_',
 'to',
 'filter']

In [30]:
# apply transform to validation data
val_flat_idx = vocab.transform_flattened(val_raw)



Save files for later use

In [33]:
if not use_cache:
    # Persist the fitted vocabulary, then the flattened train/valid index arrays.
    vocab.save('./data/stackoverflow/lang_model/vocab.cls')
    for _pkl_path, _flat_idx in (
        ('./data/stackoverflow/lang_model/trn_flat_idx_list.pkl', trn_flat_idx),
        ('./data/stackoverflow/lang_model/val_flat_idx_list.pkl', val_flat_idx),
    ):
        save_file_pickle(_pkl_path, _flat_idx)



## Train Fast.AI Language Model

This model will read in files that were created and train a [fast.ai](https://github.com/fastai/fastai/tree/master/fastai) language model.  This model learns to predict the next word in the sentence using fast.ai's implementation of [AWD LSTM](https://github.com/salesforce/awd-lstm-lm).  

The goal of training this model is to build a general purpose feature extractor for text that can be used in downstream models.  In this case, we will utilize this model to produce embeddings for function docstrings.

In [35]:
vocab = load_lm_vocab('./data/stackoverflow/lang_model/vocab.cls')
trn_flat_idx = load_file_pickle('./data/stackoverflow/lang_model/trn_flat_idx_list.pkl')
val_flat_idx = load_file_pickle('./data/stackoverflow/lang_model/val_flat_idx_list.pkl')



In [36]:
if use_cache:
    # Nothing is trained in this session, so `fastai_learner` and `lang_model`
    # are not created by this cell.
    logging.warning('Not re-training language model because use_cache=True')
else:
    # Train from the flattened index arrays; returns the fastai learner and the model.
    fastai_learner, lang_model = train_lang_model(
        model_path = './data/stackoverflow/lang_model_weights',
        trn_indexed = trn_flat_idx,
        val_indexed = val_flat_idx,
        vocab_size = vocab.vocab_size,
        lr=3e-3,
        em_sz= 500,
        nh= 500,
        bptt=20,
        cycle_len=1,
        n_cycle=3,
        cycle_mult=2,
        bs = 200,
        wd = 1e-6,
    )

HBox(children=(IntProgress(value=0, description='Epoch', max=7), HTML(value='')))

epoch      trn_loss   val_loss                            
    0      5.289824   5.073163  
    1      4.991432   4.777453                            
    2      4.633141   4.359899                            
    3      4.261553   4.03212                             
    4      3.991656   3.857163                            
    5      3.810928   3.743084                            
                                                          

data/stackoverflow/lang_model_weights/models/langmodel_best.h5


    6      3.688868   3.666875  


In [37]:
# Continue training the learner created in the training cell, now at a lower
# learning rate (1e-3 instead of 3e-3).
# NOTE(review): the positional 3 is presumably the number of cycles; with
# cycle_len=2 that agrees with the 6 epochs in the recorded progress bar —
# confirm against the fastai version in use.
# `fastai_learner` only exists when training ran, hence the use_cache guard.
if not use_cache:
    fastai_learner.fit(1e-3, 3, wds=1e-6, cycle_len=2)

HBox(children=(IntProgress(value=0, description='Epoch', max=6), HTML(value='')))

epoch      trn_loss   val_loss                            
    0      3.53591    3.620921  
    1      3.457347   3.601915                            
    2      3.446788   3.576479                            
    3      3.397346   3.554816                            
    4      3.389691   3.543317                            
    5      3.332148   3.523645                            


In [38]:
# Another round at lr=1e-3.
# NOTE(review): presumably 2 cycles whose length starts at 3 epochs and is
# multiplied by cycle_mult=2 (3 + 6 = 9, matching the recorded progress bar) —
# confirm against the fastai version in use.
if not use_cache:
    fastai_learner.fit(1e-3, 2, wds=1e-6, cycle_len=3, cycle_mult=2)

HBox(children=(IntProgress(value=0, description='Epoch', max=9), HTML(value='')))

epoch      trn_loss   val_loss                            
    0      3.336175   3.512969  
    1      3.282112   3.495233                            
    2      3.234097   3.486641                            
    3      3.245711   3.495026                            
    4      3.207955   3.48001                             
    5      3.157338   3.463748                            
    6      3.100554   3.447244                            
    7      3.061632   3.443354                            
    8      3.032302   3.435668                            


In [39]:
# Long final round at lr=1e-3: presumably 2 cycles of 3 and 30 epochs
# (cycle_mult=10), matching the 33 epochs in the recorded progress bar.
# NOTE(review): in the recorded output below, val_loss is lowest at epoch 1
# (~3.43) and then climbs to ~3.51 while trn_loss keeps falling to ~2.54.
# That pattern looks like overfitting; consider a shorter schedule or keeping
# the checkpoint with the best validation loss before exporting the model.
if not use_cache:
    fastai_learner.fit(1e-3, 2, wds=1e-6, cycle_len=3, cycle_mult=10)

HBox(children=(IntProgress(value=0, description='Epoch', max=33), HTML(value='')))

epoch      trn_loss   val_loss                            
    0      3.085778   3.455039  
    1      3.026549   3.433519                            
    2      3.001236   3.44187                             
    3      3.034877   3.44887                             
    4      3.008628   3.45341                             
    5      2.96828    3.451785                            
    6      2.918167   3.444601                            
    7      2.881839   3.450093                            
    8      2.815383   3.453834                            
    9      2.763045   3.47167                             
    10     2.714031   3.4795                              
    11     2.677741   3.485371                            
    12     2.620355   3.487961                            
    13     2.597635   3.487397                            
    14     2.557454   3.507987                            
    15     2.544845   3.505275                            
    16     2.546671   3

Save language model and learner

In [40]:
if not use_cache:
    # Checkpoint the learner through its own save() method.
    fastai_learner.save('lang_model_learner.fai')
    # eval() presumably switches the model to inference mode (dropout off) —
    # TODO confirm for this fastai/torch version.
    lang_model_new = fastai_learner.model.eval()
    # (disabled) GPU export, kept for reference:
#     torch.save(lang_model_new, './data/stackoverflow/lang_model/lang_model_gpu_v2.torch')
    # Move the model to the CPU before serialising; the encoding cells below
    # load this exact file.
    torch.save(lang_model_new.cpu(), './data/stackoverflow/lang_model/lang_model_cpu.torch')

# Load Model and Encode All Docstrings

Now that we have trained the language model, the next step is to use the language model to encode all of the docstrings into a vector. 

**Note that checkpointed versions of the language model artifacts are available for download:**

1. `lang_model_cpu_v2.torch` : https://storage.googleapis.com/kubeflow-examples/code_search/data/lang_model/lang_model_cpu_v2.torch 
2. `lang_model_gpu_v2.torch` : https://storage.googleapis.com/kubeflow-examples/code_search/data/lang_model/lang_model_gpu_v2.torch
3. `vocab_v2.cls` : https://storage.googleapis.com/kubeflow-examples/code_search/data/lang_model/vocab_v2.cls

In [41]:
from lang_model_utils import load_lm_vocab
vocab = load_lm_vocab('./data/stackoverflow/lang_model/vocab.cls')
idx_docs = vocab.transform(trn_raw + val_raw, max_seq_len=30, padding=False)
lang_model = torch.load('./data/stackoverflow/lang_model/lang_model_cpu.torch', 
                        map_location=lambda storage, loc: storage)



In [42]:
lang_model.eval()

SequentialRNN(
  (0): RNN_Encoder(
    (encoder): Embedding(1520, 500, padding_idx=1)
    (encoder_with_dropout): EmbeddingDropout(
      (embed): Embedding(1520, 500, padding_idx=1)
    )
    (rnns): ModuleList(
      (0): WeightDrop(
        (module): LSTM(500, 500)
      )
      (1): WeightDrop(
        (module): LSTM(500, 500)
      )
      (2): WeightDrop(
        (module): LSTM(500, 500)
      )
    )
    (dropouti): LockedDropout(
    )
    (dropouths): ModuleList(
      (0): LockedDropout(
      )
      (1): LockedDropout(
      )
      (2): LockedDropout(
      )
    )
  )
  (1): LinearDecoder(
    (decoder): Linear(in_features=500, out_features=1520, bias=False)
    (dropout): LockedDropout(
    )
  )
)

**Note:** the code below extracts embeddings one docstring at a time, which is very inefficient.  Ideally, you want to extract embeddings in batches while accounting for the padding that batching introduces when you read out the hidden states.  For this tutorial, we only provide this minimal example; however, you are welcome to improve upon this and submit a PR!

In [43]:
def list2arr(l):
    "Wrap a list as a pytorch Variable on the CPU, with a trailing axis of size 1 added."
    column = np.array(l)[..., np.newaxis]
    return V(column).cpu()

def make_prediction_from_list(model, l):
    """
    Run a single token sequence through the language model and pool the
    hidden states it produces, yielding fixed-size encodings of the phrase.

    Parameters
    -----------
    model : fastai language model
    l : list
        list of integers, representing a sequence of tokens that you want to encode

    Returns
    -------
    tuple
        (mean over axis 0, max over axis 0, last entry along axis 0) of the
        hidden states taken from the final element of the model's last output.
    """
    token_var = list2arr(l)  # sequence as a pytorch Variable, batch size 1
    # the language model keeps state between calls, so clear it before encoding
    model.reset()
    model_outputs = model(token_var)
    # last output holds the RNN hidden states; keep only its last layer
    top_layer_states = model_outputs[-1][-1]

    pooled_mean = top_layer_states.mean(0)
    pooled_max, _ = top_layer_states.max(0)  # max() gives (values, indices)
    final_state = top_layer_states[-1]
    return pooled_mean, pooled_max, final_state


def get_embeddings(lm_model, list_list_int):
    """
    Vectorize a list of sequences List[List[int]] using a fast.ai language model.

    Each sequence is encoded on its own through `make_prediction_from_list`,
    with a notebook progress bar tracking the loop.

    Parameters
    ----------
    lm_model : fastai language model
        Its first sub-module must expose `nhid`, which sets the embedding width.
    list_list_int : List[List[int]]
        A list of sequences to encode

    Returns
    -------
    tuple: (avg, max, last)
        Three numpy arrays of shape (len(list_list_int), nhid) holding the
        average-pooling and max-pooling over time steps, and the last time step.
    """
    n_rows = len(list_list_int)
    n_dim = lm_model[0].nhid
    avgarr = np.empty((n_rows, n_dim))
    maxarr = np.empty((n_rows, n_dim))
    lastarr = np.empty((n_rows, n_dim))

    for i in tqdm_notebook(range(n_rows)):
        avg_, max_, last_ = make_prediction_from_list(lm_model, list_list_int[i])
        # copy each pooled result into row i of its output matrix as numpy
        avgarr[i,:] = avg_.data.numpy()
        maxarr[i,:] = max_.data.numpy()
        lastarr[i,:] = last_.data.numpy()

    return avgarr, maxarr, lastarr

In [44]:
%%time
avg_hs, max_hs, last_hs = get_embeddings(lang_model, idx_docs)

HBox(children=(IntProgress(value=0, max=7125), HTML(value='')))

CPU times: user 20min 39s, sys: 4.27 s, total: 20min 43s
Wall time: 2min 35s


### Do the same thing for the test set

In [45]:
idx_docs_test = vocab.transform(test_raw, max_seq_len=30, padding=False)
avg_hs_test, max_hs_test, last_hs_test = get_embeddings(lang_model, idx_docs_test)



HBox(children=(IntProgress(value=0, max=1065), HTML(value='')))

# Save Language Model Embeddings For Docstrings

In [46]:
savepath = Path('./data/stackoverflow/lang_model_emb/')
# np.save does not create missing parent directories, so make sure the
# target directory exists before writing (no-op when it is already there).
savepath.mkdir(parents=True, exist_ok=True)
np.save(savepath/'avg_emb_dim500.npy', avg_hs)
np.save(savepath/'max_emb_dim500.npy', max_hs)
np.save(savepath/'last_emb_dim500.npy', last_hs)

In [47]:
# save the test set embeddings also
np.save(savepath/'avg_emb_dim500_test.npy', avg_hs_test)
np.save(savepath/'max_emb_dim500_test.npy', max_hs_test)
np.save(savepath/'last_emb_dim500_test.npy', last_hs_test)

**Note that the embeddings saved to disk above have also been cached and are available for download:**

Train + Validation docstrings vectorized:

1. `avg_emb_dim500_v2.npy` : https://storage.googleapis.com/kubeflow-examples/code_search/data/lang_model_emb/avg_emb_dim500_v2.npy
2. `max_emb_dim500_v2.npy` : https://storage.googleapis.com/kubeflow-examples/code_search/data/lang_model_emb/max_emb_dim500_v2.npy
3. `last_emb_dim500_v2.npy` : https://storage.googleapis.com/kubeflow-examples/code_search/data/lang_model_emb/last_emb_dim500_v2.npy

Test set docstrings vectorized:

1. `avg_emb_dim500_test_v2.npy`: https://storage.googleapis.com/kubeflow-examples/code_search/data/lang_model_emb/avg_emb_dim500_test_v2.npy

2. `max_emb_dim500_test_v2.npy`: https://storage.googleapis.com/kubeflow-examples/code_search/data/lang_model_emb/max_emb_dim500_test_v2.npy

3. `last_emb_dim500_test_v2.npy`: https://storage.googleapis.com/kubeflow-examples/code_search/data/lang_model_emb/last_emb_dim500_test_v2.npy

# Evaluate Sentence Embeddings

One popular way of evaluating sentence embeddings is to measure the efficacy of these embeddings in downstream tasks like sentiment analysis, textual similarity etc.  Usually you can use general-purpose benchmarks such as the examples outlined [here](https://github.com/facebookresearch/SentEval) to measure the quality of your embeddings.  However, since this is a very domain specific dataset - those general purpose benchmarks may not be appropriate.  Unfortunately, we have not designed downstream tasks that we can open source at this point.

In the absence of these downstream tasks, we can at least sanity check that these embeddings contain semantic information by doing the following:

1. Manually examine similarity between sentences, by supplying a statement and examining if the nearest phrase found is similar. 

2. Visualize the embeddings.

We will do the first approach, and leave the second approach as an exercise for the reader.  **It should be noted that this is only a sanity check -- a more rigorous approach is to measure the impact of these embeddings on a variety of downstream tasks** and use that to form a more objective opinion about the quality of your embeddings.

Furthermore, there are many different ways of constructing a sentence embedding from the language model.  For example, we can take the average, the maximum or even the last value of the hidden states (or concatenate them all together).  **For simplicity, we will only evaluate the sentence embedding that is constructed by taking the average over the hidden states** (and leave other possibilities as an exercise for the reader). 

### Create search index using `nmslib` 

[nmslib](https://github.com/nmslib/nmslib) is a great library for doing nearest neighbor lookups, which we will use as a search engine for finding nearest neighbors of comments in vector-space.  

The convenience function `create_nmslib_search_index` builds this search index given a matrix of vectors as input.



In [48]:
from general_utils import create_nmslib_search_index
import nmslib
from lang_model_utils import Query2Emb
from pathlib import Path
import numpy as np
from lang_model_utils import load_lm_vocab
import torch

In [49]:
# Load matrix of vectors
loadpath = Path('./data/stackoverflow/lang_model_emb/')
avg_emb_dim500 = np.load(loadpath/'avg_emb_dim500_test.npy')

In [50]:
# Build search index (takes about an hour on a p3.8xlarge)
dim500_avg_searchindex = create_nmslib_search_index(avg_emb_dim500)

In [51]:
# save search index
dim500_avg_searchindex.saveIndex('./data/stackoverflow/lang_model_emb/dim500_avg_searchindex.nmslib')

Note that if you did not train your own language model and are downloading the pre-trained model artifacts instead, you can similarly download the pre-computed search index here: 

https://storage.googleapis.com/kubeflow-examples/code_search/data/lang_model_emb/dim500_avg_searchindex.nmslib

After you have built this search index with nmslib, you can do fast nearest-neighbor lookups.  We use the `Query2Emb` object to help convert strings to the embeddings: 

In [52]:
dim500_avg_searchindex = nmslib.init(method='hnsw', space='cosinesimil')
dim500_avg_searchindex.loadIndex('./data/stackoverflow/lang_model_emb/dim500_avg_searchindex.nmslib')

In [53]:
lang_model = torch.load('./data/stackoverflow/lang_model/lang_model_cpu.torch')
vocab = load_lm_vocab('./data/stackoverflow/lang_model/vocab.cls')

q2emb = Query2Emb(lang_model = lang_model.cpu(),
                  vocab = vocab)



The method `Query2Emb.emb_mean` will allow us to use the language model we trained earlier to generate a sentence embedding given a string.  Here is an example: `emb_mean` will return a numpy array of size (1, 500).

In [55]:
query = q2emb.emb_mean('Read data into pandas dataframe')
query.shape



(1, 500)

**Make search engine to inspect semantic similarity of phrases**.  This will take 3 inputs:

1. `nmslib_index` - this is the search index we built above.  This object takes a vector and will return the index of the closest vector(s) according to cosine distance.  
2. `ref_data` - this is the data that the index refers to; in this case it will be the docstrings. 
3. `query2emb_func` - this is a function that will convert a string into an embedding.

In [61]:
class search_engine:
    """
    Nearest-neighbour lookup that prints the reference entries closest to a text query.

    Parameters
    ----------
    nmslib_index : index object exposing `knnQuery(vector, k=...)`
    ref_data : indexable collection; the ids returned by the index are used
        to look entries up in it
    query2emb_func : callable that turns a query string into the vector
        handed to `knnQuery`
    """
    def __init__(self, nmslib_index, ref_data, query2emb_func):
        self.query2emb_func = query2emb_func
        self.data = ref_data
        self.search_index = nmslib_index

    def search(self, str_search, k=10):
        "Embed `str_search`, fetch its `k` nearest neighbours and print each distance with its entry."
        embedded_query = self.query2emb_func(str_search)
        idxs, dists = self.search_index.knnQuery(embedded_query, k=k)

        neighbours = zip(idxs, dists)
        for idx, dist in neighbours:
            banner = f'cosine dist:{dist:.4f}\n---------------\n'
            print(banner, self.data[idx])

In [62]:
se = search_engine(nmslib_index=dim500_avg_searchindex,
                   ref_data = test_raw,
                   query2emb_func = q2emb.emb_mean)

## Manually Inspect Phrase Similarity

Compare a user-supplied query vs. vectorized docstrings on test set.  We can see that similar phrases are not exactly the same, but the nearest neighbors are reasonable.  

In [63]:
import logging
logging.getLogger().setLevel(logging.ERROR)

In [65]:
se.search('read csv into pandas dataframe')

cosine dist:0.2382
---------------
 read your file into a numpy array and use numpy broadcast feature :

cosine dist:0.2522
---------------
 starting with this :

cosine dist:0.2800
---------------
 install this package using :

cosine dist:0.2863
---------------
 if you want to convert a python dictionary to json using the json.dumps ( ) method .

cosine dist:0.2909
---------------
 reading up on floating point numbers may be worth while : https://docs.python.org/3/tutorial/floatingpoint.html

cosine dist:0.3021
---------------
 use list append for adding items into list

cosine dist:0.3036
---------------
 install tesseract from https://github.com/ub-mannheim/tesseract/wiki and add the path of tesseract.exe to the path environment variable .

cosine dist:0.3074
---------------
 use zip :

cosine dist:0.3214
---------------
 "use httpresponseredirect instead of redirect function ,"

cosine dist:0.3256
---------------
 "as @jonrsharpe mentions , use actual datetime objects . here i def

In [67]:
se.search('train a random forest')

cosine dist:0.3063
---------------
 1.install scrapy for your python version

cosine dist:0.3514
---------------
 starting with this :

cosine dist:0.3614
---------------
 javax.net.ssl.sslhandshakeexception : no cipher suits in common

cosine dist:0.3845
---------------
 ipopt has a bunch of different convergence tolerances . check out some documentation at : http://www.coin-or.org/ipopt/documentation/node42.html

cosine dist:0.3906
---------------
 "including in a path means "" from the current directory "" ."

cosine dist:0.3945
---------------
 stick with one service(either resource or client ) .

cosine dist:0.4113
---------------
 accordind to your own code :

cosine dist:0.4120
---------------
 output :

cosine dist:0.4157
---------------
 straight forward :

cosine dist:0.4233
---------------
 your original function returns a numeric value .



In [69]:
se.search('download files')

cosine dist:0.1265
---------------
 use glob.glob

cosine dist:0.1859
---------------
 prints

cosine dist:0.1880
---------------
 use zip :

cosine dist:0.1964
---------------
 output :

cosine dist:0.1985
---------------
 edit :

cosine dist:0.2099
---------------
 use this urlpattern

cosine dist:0.2269
---------------
 run php script in background

cosine dist:0.2475
---------------
 straight forward :

cosine dist:0.2574
---------------
 install this package using :

cosine dist:0.2610
---------------
 data.json



In [72]:
se.search('start webserver')

cosine dist:0.1899
---------------
 straight forward :

cosine dist:0.2053
---------------
 idp generic means

cosine dist:0.2181
---------------
 data.json

cosine dist:0.2181
---------------
 michael

cosine dist:0.2307
---------------
 output :

cosine dist:0.2516
---------------
 things changes :

cosine dist:0.2702
---------------
 prints

cosine dist:0.2732
---------------
 for me it works nice :

cosine dist:0.2753
---------------
 code

cosine dist:0.2831
---------------
 iiuc you want :



In [73]:
se.search('send out email notification')

cosine dist:0.2420
---------------
 "use httpresponseredirect instead of redirect function ,"

cosine dist:0.2597
---------------
 try model structure syntax :

cosine dist:0.2649
---------------
 reading up on floating point numbers may be worth while : https://docs.python.org/3/tutorial/floatingpoint.html

cosine dist:0.2823
---------------
 you have to convert your latitudes ans longitudes to map projection before calling scatter :

cosine dist:0.2872
---------------
 "each file is a module . for a module to access another module 's content , it needs to import it first.from mysecondclass import mysecondclass x = mysecondclass ( )"

cosine dist:0.2973
---------------
 you need to change this part :

cosine dist:0.2973
---------------
 telegram api names it as . you can send message to chat_id which goes to private chat .

cosine dist:0.3023
---------------
 you need to pass some of the information along with the request to the server . following code should work ... you can play alo

In [75]:
se.search('save file')

cosine dist:0.1796
---------------
 output :

cosine dist:0.1954
---------------
 use glob.glob

cosine dist:0.1987
---------------
 prints

cosine dist:0.2077
---------------
 use zip :

cosine dist:0.2395
---------------
 use this urlpattern

cosine dist:0.2468
---------------
 edit :

cosine dist:0.2548
---------------
 starting with this :

cosine dist:0.2803
---------------
 run php script in background

cosine dist:0.2877
---------------
 straight forward :

cosine dist:0.2966
---------------
 change the following code



### Visualize Embeddings (Optional)

We highly recommend using [tensorboard](https://www.tensorflow.org/versions/r1.0/get_started/embedding_viz) as a way to visualize embeddings.  Tensorboard contains an interactive search that makes it easy (and fun) to explore embeddings.  We leave this as an exercise for the reader.