In [40]:
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import nmslib
from sklearn import preprocessing

from lang_model_utils import load_lm_vocab, Query2Emb

In [257]:
# from lang_model_utils import load_lm_vocab, Query2Emb
# from general_utils import create_nmslib_search_index

# Directory holding the pre-processed StackOverflow files read by the cells below.
input_path = Path('./data/stackoverflow/processed_data/')
# code2emb_path = Path('./data/code2emb/')
# Directory reserved for the search artefacts of this notebook.
output_path = Path('./data/stackoverflow/search')
# parents=True creates any missing ancestor directory as well, so a fresh
# checkout does not fail with FileNotFoundError; exist_ok=True keeps re-runs safe.
output_path.mkdir(parents=True, exist_ok=True)

# Rank results with similarity, votes, and comment sentiment

## Read in contents

In [258]:
# Contents of the test split: the file has no header row, so its single
# column is named explicitly.
body_df = pd.read_csv(
    input_path / 'test.content_token',
    header=None,
    names=['content'],
)


## Read in votes

In [259]:
# Votes of the test split: header-less file, single column named 'vote'.
vote_df = pd.read_csv(
    input_path / 'test.vote',
    header=None,
    names=['vote'],
)
# The frames are joined side by side later on, so the row counts must agree.
assert len(vote_df) == len(body_df)

## Read in comments

In [260]:
# Comments of the test split. Unlike the other inputs, this file is read
# with its own header row (no header=None / names=...).
comment_df = pd.read_csv(input_path / 'test.comment')
# The frames are joined side by side later on, so the row counts must agree.
assert len(comment_df) == len(body_df)

## Read in URL

In [261]:
# URLs of the test split: header-less file, single column named 'url'.
url_df = pd.read_csv(
    input_path / 'test.url',
    header=None,
    names=['url'],
)
# The frames are joined side by side later on, so the row counts must agree.
assert len(url_df) == len(body_df)

In [262]:
# Put the four frames (urls, contents, comments, votes) side by side in a
# single reference table; concat aligns the rows on their index.
ref_df = pd.concat(
    [url_df, body_df, comment_df, vote_df],
    axis='columns',
).reset_index(drop=True)
ref_df.head()

Unnamed: 0,url,content,comment,sentiment_polarity,sentiment_subjectivity,vote
0,https://stackoverflow.com/a/18622422,"first , create a list containing one hundred d...","Hi Kevin, I tried the latter code and it worke...",0.090909,0.292424,4
1,https://stackoverflow.com/a/5185748,i 've had a similar problem and i 've ended up...,IMO Fabio's solution is quite to the point :),0.5,1.0,1
2,https://stackoverflow.com/a/59841,two options that do n't require copying the wh...,"`next(iter(your_list or []), None)` to handle ...",0.0375,0.570833,371
3,https://stackoverflow.com/a/2659296,i do n't think you can do this in one database...,"Thanks. Note that in the example you gave, the...",0.167308,0.442949,7
4,https://stackoverflow.com/a/26097790,for python 3 :,@MartijnPieters Do you know what version intro...,-0.1,0.275,4


## Use tanh to squash votes and min-max scale the sentiment scores

In [263]:
# Squash the votes with tanh (tanh(0) == 0, negative votes stay negative).
# NOTE: the column is overwritten in place, so re-running this cell applies
# tanh a second time.
ref_df['vote'] = np.tanh(ref_df['vote'])

# Min-max scale both sentiment columns; the smallest observed value maps to 0.
sentiment_cols = ['sentiment_polarity', 'sentiment_subjectivity']
ref_df[sentiment_cols] = preprocessing.MinMaxScaler().fit_transform(ref_df[sentiment_cols])
ref_df.head()

Unnamed: 0,url,content,comment,sentiment_polarity,sentiment_subjectivity,vote
0,https://stackoverflow.com/a/18622422,"first , create a list containing one hundred d...","Hi Kevin, I tried the latter code and it worke...",0.545455,0.292424,0.999329
1,https://stackoverflow.com/a/5185748,i 've had a similar problem and i 've ended up...,IMO Fabio's solution is quite to the point :),0.75,1.0,0.761594
2,https://stackoverflow.com/a/59841,two options that do n't require copying the wh...,"`next(iter(your_list or []), None)` to handle ...",0.51875,0.570833,1.0
3,https://stackoverflow.com/a/2659296,i do n't think you can do this in one database...,"Thanks. Note that in the example you gave, the...",0.583654,0.442949,0.999998
4,https://stackoverflow.com/a/26097790,for python 3 :,@MartijnPieters Do you know what version intro...,0.45,0.275,0.999329


## Retrieve top 1000

In [264]:
# Raw contents of every split: one list entry per line of the file, with the
# trailing newline kept.
with (input_path / 'train.content_token').open('r') as fh:
    trn_raw = list(fh)

with (input_path / 'valid.content_token').open('r') as fh:
    val_raw = list(fh)

with (input_path / 'test.content_token').open('r') as fh:
    test_raw = list(fh)

In [265]:
# load vocab
# load_lm_vocab comes from the project module lang_model_utils; the object it
# returns is handed to Query2Emb further down.
vocab = load_lm_vocab('./data/stackoverflow/lang_model/vocab.cls')



In [266]:
# load language model
# The map_location callable returns every storage unchanged instead of moving
# it to the device it was saved from.
# NOTE(review): torch.load unpickles the file and can therefore execute
# arbitrary code -- only load checkpoints from a trusted source.
# NOTE(review): this loads a whole pickled model object; newer torch releases
# may need weights_only=False for that -- confirm against the installed version.
lang_model = torch.load('./data/stackoverflow/lang_model/lang_model_cpu.torch', 
                        map_location=lambda storage, loc: storage)

In [267]:
# Query embedder built from the language model (moved to the CPU) and its vocab.
q2emb = Query2Emb(lang_model=lang_model.cpu(), vocab=vocab)

# HNSW index using the 'cosinesimil' space; its contents come from a
# pre-built index file rather than being computed here.
# TODO: rebuild index for bigger data set
search_index = nmslib.init(method='hnsw', space='cosinesimil')
search_index.loadIndex('./data/stackoverflow/lang_model_emb/dim500_avg_searchindex.nmslib')



In [268]:
# Smoke test: embed a sample sentence and look at the shape of the result
# (the recorded output of this cell is (1, 500)).
test = q2emb.emb_mean('Hello World!  This is a test.')
test.shape



(1, 500)

In [269]:
class search_engine:
    """Bundles a nearest-neighbour index, its metadata table and a query embedder."""

    def __init__(self,
                 nmslib_index,
                 ref_df,
                 query2emb_func):
        """
        Parameters
        ==========
        nmslib_index : nmslib object
            Pre-computed search index; it must provide ``knnQuery(vector, k=...)``.
        ref_df : pandas.DataFrame
            Meta-data for the search results; must contain the columns
            'url' and 'content'.
        query2emb_func : callable
            Takes a string and returns a vector that lives in the same vector
            space as the vectors stored in the search index.

        Raises
        ======
        AssertionError
            If ``ref_df`` lacks the 'url' or the 'content' column.
        """
        # Fail early, with a readable message, when the metadata table is unusable.
        assert 'url' in ref_df.columns, "ref_df must contain a 'url' column"
        assert 'content' in ref_df.columns, "ref_df must contain a 'content' column"

        self.search_index = nmslib_index
        self.ref_df = ref_df
        self.query2emb_func = query2emb_func

    def search(self, str_search, k=1000):
        """
        Return the nearest neighbours of the search query.

        Nothing is printed here; the query is embedded with ``query2emb_func``
        and the raw answer of the index is handed back to the caller.

        Parameters
        ==========
        str_search : str
            a search query.  Ex: "read data into pandas dataframe"
        k : int
            the number of nearest neighbors to request.  Defaults to 1000.

        Returns
        =======
        The result of ``knnQuery`` on the index, unchanged.  The caller in
        this notebook unpacks it as a pair ``(ids, distances)``.
        """
        query = self.query2emb_func(str_search)
        return self.search_index.knnQuery(query, k=k)


In [278]:
# Wire the loaded index, the reference table and the query embedder together.
se = search_engine(nmslib_index=search_index, ref_df=ref_df, query2emb_func=q2emb.emb_mean)

# Weights of the vote and sentiment terms relative to the embedding distance.
_VOTE_WEIGHT = 0.01
_POLARITY_WEIGHT = 0.001
# Smallest denominator accepted in the weight / value terms.  Values at or
# below it (zero, negative, NaN) are replaced by it, which turns the term into
# a large finite penalty instead of a bonus (negative value) or inf (zero).
_MIN_SIGNAL = 1e-6


def _inverse_penalty(weight, value):
    """Return ``weight / value`` with ``value`` clamped from below to ``_MIN_SIGNAL``."""
    value = float(value)
    # Written as `not (value > floor)` so that NaN is clamped as well.
    if not value > _MIN_SIGNAL:
        value = _MIN_SIGNAL
    return weight / value


def _score(idx, dist, df):
    """Score of row ``idx`` of ``df`` for a hit at distance ``dist`` (lower is better)."""
    row = df.iloc[idx]  # a single positional lookup serves both columns
    return (float(dist)
            + _inverse_penalty(_VOTE_WEIGHT, row['vote'])
            + _inverse_penalty(_POLARITY_WEIGHT, row['sentiment_polarity']))


def rank(input):
    """
    Sort key for one search hit; lower values rank first.

    Parameters
    ==========
    input : tuple
        ``(idx, dist)``: positional row index into the global ``ref_df`` and
        the distance reported by the search index.  (The parameter name
        shadows the builtin but is kept for compatibility.)

    Returns
    =======
    float
        ``dist + 0.01 / vote + 0.001 / sentiment_polarity``, where a vote or
        polarity that is zero, negative or NaN is replaced by ``_MIN_SIGNAL``.
        Such rows receive a large penalty, so a down-voted answer can no
        longer outrank an up-voted one and no division by zero occurs.
    """
    idx, dist = input
    return _score(idx, dist, ref_df)


def search(se, query, top_n=10):
    """
    Run ``query`` through ``se`` and print the best re-ranked hits.

    Parameters
    ==========
    se : search_engine
        Engine whose ``search`` method yields ``(ids, distances)`` and whose
        ``ref_df`` holds the 'content', 'url', 'vote' and
        'sentiment_polarity' columns of the hits.
    query : str
        Free-text search query.
    top_n : int
        Number of hits to print.  Defaults to 10.

    Returns
    =======
    None.  Content, score and url of every printed hit go to stdout.
    """
    df = se.ref_df
    idxs, dists = se.search(query)
    # Score every candidate once, so the printed score is exactly the value
    # the hits were sorted by.
    scored = [(_score(idx, dist, df), idx) for idx, dist in zip(idxs, dists)]
    # list.sort is stable: equal scores keep the order delivered by the index.
    scored.sort(key=lambda pair: pair[0])
    for score, idx in scored[:top_n]:
        row = df.iloc[idx]
        content = row['content']
        url = row['url']
        print(content)
        print(f'score: {score}')
        print(f'url: {url}\n---------------\n')


In [279]:
search(se, "start a web server")

  # Remove the CWD from sys.path while we load stuff.


another solution is to use a proxy for the d_file .
score: 0.23250860159443879
url: https://stackoverflow.com/a/11521614
---------------

a pretty common way to communicate between a webpage and a python program is to run the python as a wsgi server . effectively the python program is a separate server which communicates with the webpage using gets and posts .
score: 0.2354370473625922
url: https://stackoverflow.com/a/43161022
---------------

one way to solve the issue is to start the container using a host network mode
score: 0.23627890943989055
url: https://stackoverflow.com/a/46407386
---------------

you could set up a simple ssh tunnel on a remote machine :
score: 0.23661329907125153
url: https://stackoverflow.com/a/10880851
---------------

you can start your server with following command :
score: 0.24460984548008444
url: https://stackoverflow.com/a/6111790
---------------

you can run a web browser or web control within xvfb , and use something like to capture it .
score: 0.262

In [280]:
search(se, "read data into dataframe")

  # Remove the CWD from sys.path while we load stuff.


read the documentation : manage data in containers
score: 0.12819459751618018
url: https://stackoverflow.com/a/35772167
---------------

you can read data in chunks :
score: 0.1290179330433333
url: https://stackoverflow.com/a/44203901
---------------

you can read the json file all in once like :
score: 0.15426600991689515
url: https://stackoverflow.com/a/44381211
---------------

you can read the file as two separate parts ( stats and csv )
score: 0.15745706546333071
url: https://stackoverflow.com/a/44666621
---------------

first read csv into pandas df :
score: 0.15783050736039206
url: https://stackoverflow.com/a/44158879
---------------

read the csvs 's in using pd.read_csv(file )
score: 0.16292514826609952
url: https://stackoverflow.com/a/28818016
---------------

you can easily read your data into a dictionary of dictionaries :
score: 0.16328864815394495
url: https://stackoverflow.com/a/44234874
---------------

you can read from multiple sheets with pandas :
score: 0.1633837493

In [289]:
search(se, 'plot time series')

  # Remove the CWD from sys.path while we load stuff.


index the string :
score: 0.18749885437462094
url: https://stackoverflow.com/a/46774736
---------------

draw keypoints as filled white circles :
score: 0.18912973460082574
url: https://stackoverflow.com/a/47157299
---------------

creating random sample data :
score: 0.22811340574390707
url: https://stackoverflow.com/a/34001321
---------------

find affine transformation to make rectangle axis - aligned . it is just rotation by angle
score: 0.22844695922053726
url: https://stackoverflow.com/a/48354958
---------------

i have tried applying a gaussian blur then processing it with adaptive thresholding and result removed noise in the image and blurriness .
score: 0.2285950980838155
url: https://stackoverflow.com/a/51081432
---------------

draw a line segment between those points :
score: 0.2306625744563891
url: https://stackoverflow.com/a/10573237
---------------

save the bins and use pd.cut again :
score: 0.23214419030079306
url: https://stackoverflow.com/a/42749333
---------------



## Rerank

# Return top 50