In [3]:
# When True, pre-processed data is downloaded and the cells guarded by `if not use_cache`
# (saving the vocabulary, training and saving the language model) are skipped.
use_cache = False

In [4]:
## Download pre-processed data if you want to run the tutorial from this step. ##
from general_utils import get_processed_data

# Download into the directory that the data-loading cell further down reads from
# (`./data/stackoverflow/processed_data/`); the previous target
# (`./data/processed_data/stackoverflow/`) did not match that path.
# NOTE(review): assumes get_processed_data writes directly into `output_directory` -- confirm.
if use_cache:
    get_processed_data(output_directory = './data/stackoverflow/processed_data/')

# Build Language Model From Stackoverflow post content

In [5]:
import torch,cv2
from lang_model_utils import lm_vocab, load_lm_vocab, train_lang_model
from general_utils import save_file_pickle, load_file_pickle
import logging
from pathlib import Path
from fastai.text import *

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [6]:

# Directory holding the train/valid/test `.content_token` files.
source_path = Path('./data/stackoverflow/processed_data/')

# Read each split into a list of raw lines (newline characters are kept).
with (source_path/'train.content_token').open('r') as train_file:
    trn_raw = list(train_file)

with (source_path/'valid.content_token').open('r') as valid_file:
    val_raw = list(valid_file)

with (source_path/'test.content_token').open('r') as test_file:
    test_raw = list(test_file)

## Pre-process data for language model

We will use the class `lm_vocab` to prepare our data for the language model.

In [5]:
# Build the vocabulary object.
# NOTE(review): presumably `max_vocab` caps the vocabulary size and `min_freq` is a
# minimum token-frequency cutoff -- confirm against lang_model_utils.lm_vocab.
vocab = lm_vocab(max_vocab=50000, min_freq=10)

# Fit the vocabulary on the training data and, in the same call, convert the
# training data into a single flat array of token ids.
trn_flat_idx = vocab.fit_transform_flattened(trn_raw)



Look at the transformed data

In [6]:
# Show the first ten token ids of the flattened training data.
trn_flat_idx[:10]

array([  7,  53, 368, 259, 146, 447,  23,  15, 308,   6])

In [7]:
# Translate the first ten token ids back into their string tokens via `vocab.itos`.
[vocab.itos[token_id] for token_id in trn_flat_idx[:10]]

['_xbos_',
 'an',
 'approach',
 'without',
 'any',
 'built',
 '-',
 'in',
 'functions',
 ':']

In [8]:
# Encode the validation data with the vocabulary that was fit on the training data.
val_flat_idx = vocab.transform_flattened(val_raw)



Save files for later use

In [9]:
# Persist the fitted vocabulary and the flattened train/validation index data so that
# later steps can reload them. Skipped when use_cache is True.
if not use_cache:
    vocab.save('./data/stackoverflow/lang_model/vocab.cls')
    save_file_pickle('./data/stackoverflow/lang_model/trn_flat_idx_list.pkl', trn_flat_idx)
    save_file_pickle('./data/stackoverflow/lang_model/val_flat_idx_list.pkl', val_flat_idx)



## Train Fast.AI Language Model

This model will read in files that were created and train a [fast.ai](https://github.com/fastai/fastai/tree/master/fastai) language model.  This model learns to predict the next word in the sentence using fast.ai's implementation of [AWD LSTM](https://github.com/salesforce/awd-lstm-lm).  

The goal of training this model is to build a general purpose feature extractor for text that can be used in downstream models.

In [10]:
# Reload the vocabulary and the flattened train/validation index data from the same
# paths used by the save step, so training can start from the files on disk.
vocab = load_lm_vocab('./data/stackoverflow/lang_model/vocab.cls')
trn_flat_idx = load_file_pickle('./data/stackoverflow/lang_model/trn_flat_idx_list.pkl')
val_flat_idx = load_file_pickle('./data/stackoverflow/lang_model/val_flat_idx_list.pkl')



In [11]:
if use_cache:
    # Nothing is trained in cached mode; just tell the reader why.
    logging.warning('Not re-training language model because use_cache=True')
else:
    # Train the language model on the flattened index data; returns both the fastai
    # learner and the model.
    # NOTE(review): presumably `em_sz` is the embedding size and `nh` the hidden size
    # (the model printed later shows Embedding(..., 500) and LSTM(500, 500)) -- confirm.
    fastai_learner, lang_model = train_lang_model(
        model_path='./data/stackoverflow/lang_model_weights',
        trn_indexed=trn_flat_idx,
        val_indexed=val_flat_idx,
        vocab_size=vocab.vocab_size,
        lr=3e-3,
        em_sz=500,
        nh=500,
        bptt=20,
        cycle_len=1,
        n_cycle=2,
        bs=200,
        wd=1e-6,
    )

HBox(children=(IntProgress(value=0, description='Epoch', max=2), HTML(value='')))

epoch      trn_loss   val_loss                                    
    0      3.601241   3.474715  
                                                                  

data/stackoverflow/lang_model_weights/models/langmodel_best.h5


    1      3.481606   3.358989  



In [12]:
# Continue fitting the learner returned by train_lang_model. Skipped when use_cache is True.
# NOTE(review): the positional arguments are presumably (learning rate, number of cycles);
# with cycle_len=3 the progress bar below reports 6 epochs -- confirm against fastai's `fit`.
if not use_cache:
    fastai_learner.fit(1e-3, 2, wds=1e-6, cycle_len=3)

HBox(children=(IntProgress(value=0, description='Epoch', max=6), HTML(value='')))

epoch      trn_loss   val_loss                                    
    0      3.472972   3.333834  
    1      3.394681   3.286406                                    
    2      3.385816   3.272807                                    
    3      3.41238    3.283735                                    
 68%|██████▊   | 3456/5086 [3:25:59<1:37:09,  3.58s/it, loss=3.36]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 47%|████▋     | 2384/5086 [2:25:33<2:44:58,  3.66s/it, loss=3.29]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



    5      3.348129   3.232127                                  



In [None]:
# if not use_cache:
#     fastai_learner.fit(1e-3, 2, wds=1e-6, cycle_len=3, cycle_mult=10)

Save language model and learner

In [13]:
# Save the learner, then serialize the underlying model separately.
# Skipped when use_cache is True.
if not use_cache:
    # NOTE(review): the name is relative; the location the learner writes to is decided
    # by fastai -- confirm.
    fastai_learner.save('lang_model_learner.fai')
    # Switch the model to evaluation mode before exporting it.
    lang_model_new = fastai_learner.model.eval()
    # Move the model to the CPU before pickling it with torch.save.
    torch.save(lang_model_new.cpu(), './data/stackoverflow/lang_model/lang_model_cpu.torch')

# Load Model and Encode All Post answers

In [14]:
from lang_model_utils import load_lm_vocab
vocab = load_lm_vocab('./data/stackoverflow/lang_model/vocab.cls')
# Encode the training and validation posts together, one id sequence per post.
# NOTE(review): presumably `max_seq_len=30` truncates each sequence to 30 tokens and
# `padding=False` leaves shorter sequences unpadded -- confirm against the vocab class.
idx_docs = vocab.transform(trn_raw + val_raw, max_seq_len=30, padding=False)
# The map_location callable returns each storage unchanged, which keeps the loaded
# tensors on the CPU.
lang_model = torch.load('./data/stackoverflow/lang_model/lang_model_cpu.torch', 
                        map_location=lambda storage, loc: storage)



In [15]:
# Put the loaded model into evaluation mode; the returned module is shown as the cell output.
lang_model.eval()

SequentialRNN(
  (0): RNN_Encoder(
    (encoder): Embedding(27877, 500, padding_idx=1)
    (encoder_with_dropout): EmbeddingDropout(
      (embed): Embedding(27877, 500, padding_idx=1)
    )
    (rnns): ModuleList(
      (0): WeightDrop(
        (module): LSTM(500, 500)
      )
      (1): WeightDrop(
        (module): LSTM(500, 500)
      )
      (2): WeightDrop(
        (module): LSTM(500, 500)
      )
    )
    (dropouti): LockedDropout(
    )
    (dropouths): ModuleList(
      (0): LockedDropout(
      )
      (1): LockedDropout(
      )
      (2): LockedDropout(
      )
    )
  )
  (1): LinearDecoder(
    (decoder): Linear(in_features=500, out_features=27877, bias=False)
    (dropout): LockedDropout(
    )
  )
)

In [16]:
def list2arr(l):
    """Turn a list into a CPU pytorch Variable with an extra trailing axis of length 1."""
    # Append a new last axis, so a flat list of n ids becomes an (n, 1) array.
    column = np.array(l)[..., np.newaxis]
    variable = V(column)
    return variable.cpu()

def make_prediction_from_list(model, l):
    """
    Run a single sequence of token ids through the language model and pool
    the resulting hidden states.

    Parameters
    -----------
    model : fastai language model
    l : list
        list of integers, the token ids of the sentence or phrase to encode

    Returns
    -------
    tuple
        (mean over axis 0, max over axis 0, last entry along axis 0) of the
        hidden states taken from the model output.
    """
    # Wrap the ids as a pytorch Variable (batch size 1).
    token_column = list2arr(l)

    # The language model is stateful: clear its state before every prediction.
    model.reset()

    # The last element of the model output is taken, and from it the last entry.
    # NOTE(review): presumably that is the per-layer RNN outputs and the final
    # layer respectively -- confirm against the fastai model.
    outputs = model(token_column)
    top_layer_states = outputs[-1][-1]

    avg_pooled = top_layer_states.mean(0)
    max_pooled = top_layer_states.max(0)[0]  # .max returns (values, indices); keep values
    final_state = top_layer_states[-1]
    return avg_pooled, max_pooled, final_state


def get_embeddings(lm_model, list_list_int):
    """
    Vectorize a list of sequences List[List[int]] using a fast.ai language model.

    Parameters
    ----------
    lm_model : fastai language model
        Must be indexable; `lm_model[0].nhid` is used as the embedding width.
    list_list_int : List[List[int]]
        A list of sequences to encode

    Returns
    -------
    tuple: (avg, max, last)
        Three numpy arrays of shape (len(list_list_int), nhid) holding, per sequence,
        the average-pooling and max-pooling over time steps as well as the last time step.
    """
    n_rows = len(list_list_int)
    n_dim = lm_model[0].nhid

    # One output row per input sequence; every row is filled in the loop below.
    avgarr = np.empty((n_rows, n_dim))
    maxarr = np.empty((n_rows, n_dim))
    lastarr = np.empty((n_rows, n_dim))

    # Sequences are encoded one at a time because make_prediction_from_list
    # handles a single sequence per call.
    for i, seq in enumerate(tqdm_notebook(list_list_int)):
        avg_, max_, last_ = make_prediction_from_list(lm_model, seq)
        avgarr[i,:] = avg_.data.numpy()
        maxarr[i,:] = max_.data.numpy()
        lastarr[i,:] = last_.data.numpy()

    return avgarr, maxarr, lastarr

In [17]:
%%time
# Encode every training + validation document in `idx_docs`; the cell is timed because
# the sequences are processed one by one.
avg_hs, max_hs, last_hs = get_embeddings(lang_model, idx_docs)

HBox(children=(IntProgress(value=0, max=560685), HTML(value='')))


CPU times: user 1d 7h 33min 42s, sys: 6min 46s, total: 1d 7h 40min 28s
Wall time: 4h 31min 13s


### Do the same thing for the test set

In [18]:
# Index the test posts with the same transform settings used for the train/validation
# posts, then encode them with the language model.
idx_docs_test = vocab.transform(test_raw, max_seq_len=30, padding=False)
avg_hs_test, max_hs_test, last_hs_test = get_embeddings(lang_model, idx_docs_test)



HBox(children=(IntProgress(value=0, max=83781), HTML(value='')))




# Save Language Model Embeddings

In [19]:
savepath = Path('./data/stackoverflow/lang_model_emb/')
# Create the output directory first: np.save fails if it is missing, and the
# embeddings above take hours to recompute.
savepath.mkdir(parents=True, exist_ok=True)

# Save the average-pooled, max-pooled and last-step embeddings of the train + validation posts.
np.save(savepath/'avg_emb_dim500.npy', avg_hs)
np.save(savepath/'max_emb_dim500.npy', max_hs)
np.save(savepath/'last_emb_dim500.npy', last_hs)

In [20]:
# Save the test set embeddings as well, in the same directory (`savepath`) with a
# `_test` suffix in the file names.
np.save(savepath/'avg_emb_dim500_test.npy', avg_hs_test)
np.save(savepath/'max_emb_dim500_test.npy', max_hs_test)
np.save(savepath/'last_emb_dim500_test.npy', last_hs_test)

### Create search index using `nmslib` 

In [57]:
from general_utils import create_nmslib_search_index
import nmslib
from lang_model_utils import Query2Emb
from pathlib import Path
import numpy as np
from lang_model_utils import load_lm_vocab
import torch

In [72]:
# Load matrix of vectors.
# Note: despite the variable name, this loads the *test set* average-pooled embeddings
# (`avg_emb_dim500_test.npy`), so the search index below covers the test posts.
loadpath = Path('./data/stackoverflow/lang_model_emb/')
avg_emb_dim500 = np.load(loadpath/'avg_emb_dim500_test.npy')

In [73]:
# Build the nmslib search index over the loaded embedding matrix.
dim500_avg_searchindex = create_nmslib_search_index(avg_emb_dim500)

In [74]:
# Save the search index to disk so it can be reloaded without rebuilding it.
dim500_avg_searchindex.saveIndex('./data/stackoverflow/lang_model_emb/dim500_avg_searchindex.nmslib')

In [75]:
# Re-create an empty index object and load the saved index from disk into it.
# NOTE(review): `method`/`space` presumably have to match the settings used by
# create_nmslib_search_index when the index was built -- confirm.
dim500_avg_searchindex = nmslib.init(method='hnsw', space='cosinesimil')
dim500_avg_searchindex.loadIndex('./data/stackoverflow/lang_model_emb/dim500_avg_searchindex.nmslib')

In [76]:
# Load the saved language model and vocabulary, then wrap them in a Query2Emb helper
# that is used below to embed query strings.
lang_model = torch.load('./data/stackoverflow/lang_model/lang_model_cpu.torch')
vocab = load_lm_vocab('./data/stackoverflow/lang_model/vocab.cls')

# The model is explicitly placed on the CPU before being handed to Query2Emb.
q2emb = Query2Emb(lang_model = lang_model.cpu(),
                  vocab = vocab)

The method `Query2Emb.emb_mean` allows us to use the language model we trained earlier to generate a sentence embedding from a string. In the example below, `emb_mean` returns a numpy array of shape (1, 500).

In [77]:
# Embed an example query string; the shape of the result is shown as the cell output.
query = q2emb.emb_mean('Read data into pandas dataframe')
query.shape

(1, 500)