In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import nmslib

from lang_model_utils import load_lm_vocab, Query2Emb

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# from general_utils import create_nmslib_search_index

# Directory holding the pre-processed StackOverflow splits read by this notebook.
input_path = Path('./data/stackoverflow/processed_data/')
# code2emb_path = Path('./data/code2emb/')
# Directory for the search artifacts produced by this notebook.
output_path = Path('./data/stackoverflow/search')
# parents=True so the cell also works on a fresh checkout where the
# intermediate ./data/stackoverflow directories do not exist yet;
# exist_ok=True keeps the cell safe to re-run.
output_path.mkdir(parents=True, exist_ok=True)

# Rank results with similarity, votes, and comment sentiments

## Read in contents

In [3]:
# Load the test-split content tokens into a one-column frame named 'content'.
body_df = pd.read_csv(
    input_path / 'test.content_token',
    names=['content'],
    header=None,
)


## Read in votes

In [4]:
# Votes for the test split; the file has no header row, so the single
# column is named explicitly.
vote_df = pd.read_csv(
    input_path / 'test.vote',
    names=['vote'],
    header=None,
)
# The vote file must have exactly as many rows as the content file.
assert len(vote_df) == len(body_df)

## Read in comments

In [5]:
# Comments for the test split; the file has no header row, so the single
# column is named explicitly.
comment_df = pd.read_csv(
    input_path / 'test.comment',
    names=['comment'],
    header=None,
)
# The comment file must have exactly as many rows as the content file.
assert len(comment_df) == len(body_df)

## Read in URL

In [6]:
# URLs for the test split; the file has no header row, so the single
# column is named explicitly.
url_df = pd.read_csv(
    input_path / 'test.url',
    names=['url'],
    header=None,
)
# The URL file must have exactly as many rows as the content file.
assert len(url_df) == len(body_df)

In [7]:
# Join the four single-column frames (url, content, comment, vote) side by
# side into one reference table with a fresh 0..n-1 index.
ref_df = pd.concat(
    [url_df, body_df, comment_df, vote_df],
    axis='columns',
).reset_index(drop=True)
ref_df.head()

Unnamed: 0,url,content,comment,vote
0,https://stackoverflow.com/a/18622422,"first , create a list containing one hundred d...","Hi Kevin, I tried the latter code and it worke...",4
1,https://stackoverflow.com/a/5185748,i 've had a similar problem and i 've ended up...,IMO Fabio's solution is quite to the point :),1
2,https://stackoverflow.com/a/59841,two options that do n't require copying the wh...,"`next(iter(your_list or []), None)` to handle ...",371
3,https://stackoverflow.com/a/2659296,i do n't think you can do this in one database...,"Thanks. Note that in the example you gave, the...",7
4,https://stackoverflow.com/a/26097790,for python 3 :,@MartijnPieters Do you know what version intro...,4


## Retrieve top 1000

In [8]:
# Load the raw content-token lines of each split. readlines() keeps the
# trailing newline on every entry.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default; pd.read_csv (used above on test.content_token)
# also decodes as UTF-8 by default.
with open(input_path/'train.content_token', 'r', encoding='utf-8') as f:
    trn_raw = f.readlines()

with open(input_path/'valid.content_token', 'r', encoding='utf-8') as f:
    val_raw = f.readlines()

with open(input_path/'test.content_token', 'r', encoding='utf-8') as f:
    test_raw = f.readlines()

In [9]:
# Load the language-model vocabulary from its saved file.
vocab_file = './data/stackoverflow/lang_model/vocab.cls'
vocab = load_lm_vocab(vocab_file)



In [10]:
def _keep_storage(storage, loc):
    """map_location hook for torch.load: hand back each storage unchanged."""
    return storage


# Load the saved language model.
# NOTE(review): torch.load unpickles the file, so only load checkpoints
# from a trusted source.
lang_model = torch.load(
    './data/stackoverflow/lang_model/lang_model_cpu.torch',
    map_location=_keep_storage,
)

In [11]:
# Query embedder built from the language model (moved to CPU) and its vocab.
q2emb = Query2Emb(lang_model=lang_model.cpu(), vocab=vocab)

# use the pre-built index instead of building one here
#TODO: rebuild index for bigger data set
index_file = './data/stackoverflow/lang_model_emb/dim500_avg_searchindex.nmslib'
search_index = nmslib.init(method='hnsw', space='cosinesimil')
search_index.loadIndex(index_file)



In [12]:
# Smoke test: embed a sample sentence and inspect the embedding's shape.
sample_sentence = 'Hello World!  This is a test.'
test = q2emb.emb_mean(sample_sentence)
test.shape



(1, 500)

In [14]:
class search_engine:
    """Bundles a search index, a reference table and a query embedder."""

    def __init__(self, nmslib_index, ref_df, query2emb_func):
        """
        Parameters
        ==========
        nmslib_index : nmslib object
            Pre-computed search index; only its ``knnQuery`` method is used.
        ref_df : pandas.DataFrame
            Meta-data for the search results; must contain the columns
            'url' and 'content' (checked with assertions).
        query2emb_func : callable
            Takes a query string and returns the vector passed to the index.
            It should embed into the same vector space as the vectors the
            index was built from.

        """
        # Fail early if the reference table lacks a column that search() prints.
        for required_col in ('url', 'content'):
            assert required_col in ref_df.columns

        self.query2emb_func = query2emb_func
        self.ref_df = ref_df
        self.search_index = nmslib_index

    def search(self, str_search, k=20):
        """
        Prints the url and content of the nearest neighbors of a query,
        in the order the index returns them, each with its distance
        (labelled as cosine distance in the output). Returns nothing.

        Parameters
        ==========
        str_search : str
            a search query.  Ex: "read data into pandas dataframe"
        k : int
            the number of nearest neighbors to request.  Defaults to 20.

        """
        query_vec = self.query2emb_func(str_search)
        neighbor_ids, distances = self.search_index.knnQuery(query_vec, k=k)

        for position, distance in zip(neighbor_ids, distances):
            # Ids returned by the index are used as positional rows of ref_df.
            row = self.ref_df.iloc[position]
            print(f'cosine dist:{distance:.4f}  url: {row.url}\n---------------\n')
            print(row.content)

In [15]:
# Wire the loaded index, the reference table and the mean-embedding query
# function into a single search engine object.
se = search_engine(search_index, ref_df, q2emb.emb_mean)

In [16]:
# Example query; search() prints each hit's distance, url and content.
se.search(str_search="read from csv")



cosine dist:0.0904  url: https://stackoverflow.com/a/20262821
---------------

read in your file :
cosine dist:0.1003  url: https://stackoverflow.com/a/33471224
---------------

read the log :
cosine dist:0.1052  url: https://stackoverflow.com/a/40093543
---------------

read in binary mode :
cosine dist:0.1168  url: https://stackoverflow.com/a/7368575
---------------

read the help :
cosine dist:0.1238  url: https://stackoverflow.com/a/14187749
---------------

read the documentation for :
cosine dist:0.1314  url: https://stackoverflow.com/a/12149173
---------------

read this : http://www.michelepasin.org/techblog/2010/07/20/the-power-of-djangos-q-objects/
cosine dist:0.1367  url: https://stackoverflow.com/a/15184426
---------------

read the documentation on the queue .
cosine dist:0.1381  url: https://stackoverflow.com/a/28818016
---------------

read the csvs 's in using pd.read_csv(file )
cosine dist:0.1431  url: https://stackoverflow.com/a/5913767
---------------

read about the

## Rerank

# Return top 50