In [1]:
import gensim
#import logging

#import string
from nltk.tokenize import RegexpTokenizer # tokenizing
from nltk.corpus import stopwords  # list of stop words
from nltk.stem.wordnet import WordNetLemmatizer # lemmatizer

from itertools import product
import numpy as np

from bs4 import BeautifulSoup
import requests

import time
import pandas as pd

import html

from collections import defaultdict
# Logging code taken from http://rare-technologies.com/word2vec-tutorial/
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Loading model

Choose one option from below.  The first 2 are the same model, which can be used to calculate a probability score for a phrase. The first is just the word vectors (no hidden layer weights). The second is the full model (can calculate score).

The last file is the negative-sampling model, which allows for use of the 'predict_output_word' function.

In [2]:
# load full model with softmax, CBOW, and no negative sampling
# (presumably what the sg0 / hs1 / neg0 parts of the file name stand for — TODO confirm)
# NOTE(review): `path` is an absolute, machine-specific directory; adjust it before running elsewhere.
path = "/Users/stevenfelix/Documents/DataScience_local/Insight/"
file = 'model_full_50M_sg0_sz250_win5_min3_hs1_neg0'
model= gensim.models.word2vec.Word2Vec.load(path+file)

# Prediction algorithm

In [3]:
"""take a single phrase as a string, and preprocess, return tokens"""
tokenizer = RegexpTokenizer(r'\w+') # each token is a run of word characters (regex \w+)
stops = set(stopwords.words('english')) # set of English stop words
lemma = WordNetLemmatizer() # lemmatizer applied to every token in clean()

def clean(title, rmv_stop_words=False):
    """Turn a single phrase into a list of normalized tokens.

    The phrase is lower-cased and split with the module-level ``tokenizer``;
    every remaining token is passed through ``lemma.lemmatize``.

    Parameters
    ----------
    title : str
        The phrase to preprocess.
    rmv_stop_words : bool, optional
        When true, tokens found in the module-level ``stops`` set are
        dropped before lemmatizing.

    Returns
    -------
    list
        The lemmatized tokens, in their original order.
    """
    words = tokenizer.tokenize(title.lower())
    if rmv_stop_words:
        words = [word for word in words if word not in stops]
    return [lemma.lemmatize(word) for word in words]

In [None]:
""" These generate alternative queries and score them and filter them """
def generate_alternatives(query, n, model, rmv_stop_words=False):
    """Generate alternative phrasings of ``query`` and keep the plausible ones.

    Every word of the cleaned query is expanded with its ``n`` most similar
    words (``get_similar``), all word combinations are built
    (``get_combinations``) and each combination is scored with
    ``model.score``. Only combinations whose score lies within one standard
    deviation (taken over all combination scores) of the original query's
    score are returned.

    Parameters
    ----------
    query : str
        The original search phrase.
    n : int
        Number of similar words requested for each query word.
    model : object
        Model exposing ``score(sentences)`` and whatever ``get_similar`` needs.
    rmv_stop_words : bool, optional
        Forwarded to the preprocessing of the query.

    Returns
    -------
    list of (score, tuple of str)
        Retained combinations, sorted in descending order.
    """
    print('getting similar words')
    syns = get_similar(query, n, model, rmv_stop_words)
    print('making combinations')
    combs = get_combinations(syns)

    # One score per candidate combination, paired with the combination itself.
    probs = [model.score([sug])[0] for sug in combs]
    preds_probs = list(zip(probs, combs))

    # Score the original query with the SAME preprocessing as the candidates;
    # otherwise, with rmv_stop_words=True, a query that still contains stop
    # words would be compared against candidates that do not.
    q_score = model.score([clean(query, rmv_stop_words=rmv_stop_words)])[0]

    sd = get_sd(preds_probs)
    preds_1sd = [(score, comb) for score, comb in preds_probs
                 if np.abs(score - q_score) <= sd]
    preds_1sd.sort(reverse=True)
    return preds_1sd

def get_similar(query, n, model, rmv_stop_words):
    """Expand every word of ``query`` with its most similar words.

    Parameters
    ----------
    query : str
        The phrase to expand; it is preprocessed with ``clean``.
    n : int
        Number of similar words requested per query word (``topn``).
    model : object
        Either a model carrying its vectors on ``model.wv`` or an object that
        itself exposes ``most_similar(word, topn=...)``.
    rmv_stop_words : bool
        Forwarded to ``clean``.

    Returns
    -------
    list of list of str
        One list per query word: the word itself followed by its similar
        words. A word the model does not know is kept on its own.
    """
    words = clean(query, rmv_stop_words=rmv_stop_words)
    # Prefer the keyed vectors when the model has them (Word2Vec.most_similar
    # is gone in recent gensim releases); otherwise query the model directly.
    vectors = getattr(model, 'wv', model)
    expanded = []
    for word in words:
        try:
            neighbours = [syn for syn, _ in vectors.most_similar(word, topn=n)]
        except KeyError:
            # Out-of-vocabulary word: keep it unchanged rather than aborting
            # the whole query.
            neighbours = []
        expanded.append([word] + neighbours)
    return expanded

def get_combinations(l):
    """Return the Cartesian product of the lists in ``l`` as a list of tuples."""
    return list(product(*l))

def get_sd(tups):
    """Return the standard deviation (``np.std``) of the first element of each pair in ``tups``."""
    return np.std([score for score, _ in tups])

def clean_preds(pred_scores, topn=3):
    """Keep the first ``topn`` predictions and join each one's words into a string.

    Parameters
    ----------
    pred_scores : iterable of (score, sequence of str)
        Scored word sequences, already in the desired order.
    topn : int, optional
        Maximum number of entries to keep.

    Returns
    -------
    list of (score, str)
        The kept entries, with the words joined by single spaces.
    """
    kept = []
    for rank, (score, query) in enumerate(pred_scores, start=1):
        if rank > topn:
            break
        kept.append((score, ' '.join(query)))
    return kept

# Validation algorithms

In [33]:
"""These query stack overflow and return and parse the serach results for validation"""

# Browser-like request headers.
# NOTE(review): get_query_results does not pass this dict to requests.get,
# so as written it has no effect on the requests that are sent — confirm intent.
# NOTE(review): HTTP header names are normally hyphenated ("Accept-Encoding",
# "Accept-Language"); the underscore spellings are kept unchanged here.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/56.0.2924.87 Safari/537.36'
    ),
    'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    'Accept_Encoding': 'gzip, deflate, sdch, br',
    'Accept_Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}

def get_query_results(query, timeout=30):
    """Run a Stack Overflow search for ``query`` and return the parsed results.

    Parameters
    ----------
    query : iterable of str
        The search terms; they are joined with '+' in the URL.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises a timeout
        error, so a stalled connection cannot block the notebook forever.

    Returns
    -------
    list of tuple
        Whatever ``parse_results`` extracts from the response page.
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import quote_plus

    # Escape every term so characters with a meaning in URLs (e.g. '+', '#',
    # '&') are sent literally; purely alphanumeric terms are left unchanged.
    url = 'https://stackoverflow.com/search?q=' + '+'.join(quote_plus(term) for term in query)
    print(url)
    r = requests.get(url, timeout=timeout)
    print(r.status_code)
    soup = BeautifulSoup(r.text, 'lxml')
    x = parse_results(soup)
    print('found {} results'.format(len(x)))
    return x
    
def parse_results(soup):
    """Extract up to ten search hits from a parsed results page.

    For every ``div`` with class "question-summary search-result" the texts of
    its ``<strong>`` elements are collected (downstream code reads them as the
    vote and answer counts); when only one is present, the integer 1 is
    appended as the second value. The title and href of the link inside the
    "result-link" div are added last.

    Parameters
    ----------
    soup : object
        Parsed page exposing ``find_all`` (e.g. a BeautifulSoup document).

    Returns
    -------
    list of tuple
        One tuple per hit: the ``<strong>`` texts followed by title and URL.
    """
    parsed = []
    hits = soup.find_all("div", class_="question-summary search-result")
    for position, hit in enumerate(hits):
        if position >= 10:
            break
        fields = [strong.get_text() for strong in hit.find_all("strong")]
        if len(fields) == 1:
            fields.append(1)
        anchor = hit.find("div", class_="result-link").find('a')
        fields.extend([anchor.attrs['title'], anchor.attrs['href']])
        parsed.append(tuple(fields))
    return parsed

In [38]:
""" given dictionaries of suggested-queries' results and the original query results,
    compares metrics of the returned queries"""
def compare(suggestions, original):
    """Compare each suggested query's search results with the original query's.

    Parameters
    ----------
    suggestions : dict
        Maps a suggested query to its list of result tuples
        ``(votes, answers, title, url)``.
    original : dict
        Holds the original query's result list; only the entry under its
        first key is used.

    Returns
    -------
    (summaries, orig_summaries) : tuple of defaultdict(dict)
        Both are keyed by suggested query. ``summaries`` describes the results
        found only by the suggestion, ``orig_summaries`` those found only by
        the original query. Each entry has the keys 'unique', 'titles',
        'concordance', 'avg_unique_votes' and 'avg_unique_answers'.
        'concordance' is the number of shared titles divided by the number of
        suggestion results (``np.nan`` when the suggestion has no results);
        the averages are 0 when there are no unique results.
    """
    orig_results = original[list(original)[0]]
    orig_titles = set(title for _, _, title, _ in orig_results)

    def _unique_averages(results, shared_titles, n_unique):
        # Sum votes/answers over results whose title is not shared, then
        # divide by the number of unique titles (0 for both when there are none).
        votes = 0
        answers = 0
        for vote, answer, title, _ in results:
            if title in shared_titles:
                continue
            votes += int(vote)
            answers += int(answer)
        try:
            return votes * 1.0 / n_unique, answers * 1.0 / n_unique
        except ZeroDivisionError:
            return 0, 0

    summaries = defaultdict(dict)
    orig_summaries = defaultdict(dict)
    for key in suggestions:
        sug_results = suggestions[key]
        sug_title_list = [title for _, _, title, _ in sug_results]
        sug_titles = set(sug_title_list)

        # Titles returned by both searches, and those exclusive to each side.
        shared = orig_titles & sug_titles
        orig_unique = list(orig_titles.difference(sug_titles))
        sug_unique = list(sug_titles.difference(orig_titles))

        if sug_title_list:
            concordance = len(shared) * 1.0 / len(sug_title_list)
        else:
            concordance = np.nan

        # Original-query side.
        avg_votes, avg_answers = _unique_averages(orig_results, shared, len(orig_unique))
        orig_entry = orig_summaries[key]
        orig_entry['unique'] = len(orig_unique)
        orig_entry['avg_unique_votes'] = avg_votes
        orig_entry['avg_unique_answers'] = avg_answers
        orig_entry['titles'] = orig_unique
        orig_entry['concordance'] = concordance

        # Suggested-query side.
        avg_votes, avg_answers = _unique_averages(sug_results, shared, len(sug_unique))
        sug_entry = summaries[key]
        sug_entry['unique'] = len(sug_unique)
        sug_entry['titles'] = sug_unique
        sug_entry['concordance'] = concordance
        sug_entry['avg_unique_votes'] = avg_votes
        sug_entry['avg_unique_answers'] = avg_answers
    return summaries, orig_summaries

"""
def make_summaries(key, dic):
    d = defaultdict(dict)
    votes = 0
    answers = 0
    for vote,answer,query,_ in dic[list(original)[0]]:
        if query in shared:
            continue
            votes += int(vote)
            answers += int(answer)
        d[key]['votes'] = votes
        d[key]['answers'] = answers
        d[key]['unique'] = orig_unique
        d[key]['unique_perc'] = len(orig_unique)*1.0/len(orig_items)
        """

"\ndef make_summaries(key, dic):\n    d = defaultdict(dict)\n    votes = 0\n    answers = 0\n    for vote,answer,query,_ in dic[list(original)[0]]:\n        if query in shared:\n            continue\n            votes += int(vote)\n            answers += int(answer)\n        d[key]['votes'] = votes\n        d[key]['answers'] = answers\n        d[key]['unique'] = orig_unique\n        d[key]['unique_perc'] = len(orig_unique)*1.0/len(orig_items)\n        "

In [42]:
def validate(queries, n_syns, topn, model, dic=None):
    """Evaluate every query and collect the results in a dictionary.

    Parameters
    ----------
    queries : iterable of str
        The queries to evaluate.
    n_syns : int
        Number of similar words per query word, forwarded to ``evaluate_query``.
    topn : int
        Number of best suggestions to keep, forwarded to ``evaluate_query``.
    model : object
        The model forwarded to ``evaluate_query``.
    dic : dict, optional
        Dictionary that receives one entry per query (updated in place).
        A new dictionary is created when omitted.

    Returns
    -------
    dict
        ``dic`` (or the newly created dictionary), mapping each query to the
        value returned by ``evaluate_query``.
    """
    if dic is None:
        dic = {}
    for i, q in enumerate(queries, start=1):
        print('starting query {}'.format(i))
        dic[q] = evaluate_query(q, n_syns, topn, model)
        # Identity check: `== None` would invoke the result's own __eq__.
        if dic[q] is None:
            print('query {} failed'.format(i))
            continue
        print('query {} completed'.format(i))
    return dic

def evaluate_query(orig_query, n_syns, topn, model):
    """Measure how the model's suggested rewrites of a query compare with it.

    Suggestions are generated with ``generate_alternatives``, reduced to the
    ``topn`` best with ``clean_preds`` and stripped of any suggestion equal to
    the cleaned original query. Each remaining suggestion and the original
    query are searched with ``get_query_results`` (pausing 2.5 seconds after
    every suggestion search), and the result sets are compared with ``compare``.

    Parameters
    ----------
    orig_query : str
        The original search phrase.
    n_syns : int
        Number of similar words per query word.
    topn : int
        Number of best suggestions to keep.
    model : object
        Model exposing ``score`` and whatever ``generate_alternatives`` needs.

    Returns
    -------
    dict
        Averages over the suggestions: 'avg_unique_items',
        'avg_answers_dif_score', 'avg_votes_dif_score' (suggestion minus
        original) and 'avg_prob_dif_score' (difference of ``np.exp`` of the
        suggestion score and of the original query score).
    """
    orig_tokens = clean(orig_query)
    orig_text = ' '.join(orig_tokens)

    # Generate, rank and filter the suggestions.
    print('generating predictions')
    ranked = generate_alternatives(orig_query, n_syns, model)
    top_ranked = clean_preds(ranked, topn)
    candidates = [(score, text) for score, text in top_ranked if text != orig_text]

    # Search every suggestion, pausing between requests.
    sug_q_results = {}
    for _, text in candidates:
        print('getting search results for \"{}\"'.format(text))
        sug_q_results[text] = get_query_results(clean(text))
        print(sug_q_results[text])
        time.sleep(2.5)

    # Search the original query.
    orig_q_results = {orig_query: get_query_results(orig_tokens)}

    print('making comparisons')
    summaries, orig_summaries = compare(sug_q_results, orig_q_results)

    # Per-suggestion differences (suggestion minus original), then their means.
    answer_diffs = [summaries[key]['avg_unique_answers'] - orig_summaries[key]['avg_unique_answers']
                    for key in summaries]
    avg_answers_dif_score = np.mean(answer_diffs)
    vote_diffs = [summaries[key]['avg_unique_votes'] - orig_summaries[key]['avg_unique_votes']
                  for key in summaries]
    avg_votes_dif_score = np.mean(vote_diffs)
    avg_unique_items = np.mean([summaries[key]['unique'] for key in summaries])

    orig_prob = model.score([orig_tokens])[0]
    prob_diffs = [np.exp(score) - np.exp(orig_prob) for score, _ in candidates]

    results = {}
    results['avg_unique_items'] = avg_unique_items
    results['avg_answers_dif_score'] = avg_answers_dif_score
    results['avg_votes_dif_score'] = avg_votes_dif_score
    results['avg_prob_dif_score'] = np.mean(prob_diffs)
    return results

# Brief test

In [40]:
queries = ['initializing row pandas dataframe']

In [41]:
r = defaultdict(dict)
results = validate(queries, n_syns=5, topn=5, model=model, dic=r)

starting query 1
generating predictions
getting search results for "initialising column panda dataframe"
https://stackoverflow.com/search?q=initialising+column+panda+dataframe
200
found 5 results
[('3', '1', 'creating subclass form class returning pandas dataFrame', '/questions/14430263/creating-subclass-form-class-returning-pandas-dataframe'), ('2', '1', 'Add pandas Series to a DataFrame, preserving index', '/questions/29805126/add-pandas-series-to-a-dataframe-preserving-index'), ('13', '6', 'Performance issues with pandas and filtering on datetime column', '/questions/38902239/performance-issues-with-pandas-and-filtering-on-datetime-column'), ('0', '2', 'How can I get the <select> tag option chosen by the user using Flask?', '/questions/39741302/how-can-i-get-the-select-tag-option-chosen-by-the-user-using-flask'), ('1', 1, 'Selectively converting float to whole number and decimals in Python pandas', '/questions/45902991/selectively-converting-float-to-whole-number-and-decimals-in-pyt

In [43]:
results

defaultdict(dict,
            {'initializing row pandas dataframe': {'avg_answers_dif_score': -0.41999999999999993,
              'avg_prob_dif_score': 4.7898663e-09,
              'avg_unique_items': 2.2000000000000002,
              'avg_votes_dif_score': -1.8800000000000001}})

In [44]:
pd.DataFrame(results).T

Unnamed: 0,avg_answers_dif_score,avg_prob_dif_score,avg_unique_items,avg_votes_dif_score
initializing row pandas dataframe,-0.42,4.789866e-09,2.2,-1.88


# Getting queries for validation

In [None]:
from stackapi import StackAPI
from datetime import datetime, timedelta

In [None]:
# create connection
# NOTE(review): page_size / max_pages presumably cap results per page and pages
# fetched per call — confirm against the StackAPI documentation.
SITE = StackAPI('stackoverflow')
SITE.page_size = 100
SITE.max_pages = 10

In [None]:
def get_attrib(posts, tag):
    """Return the value stored under ``tag`` for every entry of ``posts['items']``."""
    values = []
    for item in posts['items']:
        values.append(item[tag])
    return values

def append(filename, dat):
    """Append every string in ``dat`` to ``filename``, one per line.

    The file is opened in 'a+' mode, so existing content is preserved.
    """
    with open(filename, 'a+') as out:
        out.writelines(item + '\n' for item in dat)

In [None]:
def get_posts(iters, connection, typ, path,fname):
    """Fetch posts ``iters`` times and append their titles to ``path + fname``.

    After every fetch the function sleeps for 60 seconds plus the value found
    under the response's 'backoff' key.

    Parameters
    ----------
    iters : int
        Number of fetch rounds.
    connection : object
        Object exposing ``fetch(typ)`` that returns a dict with 'items' and
        'backoff' entries.
    typ : str
        Passed unchanged to ``connection.fetch``.
    path, fname : str
        Concatenated to form the output file name.
    """
    for _ in range(iters):
        posts = connection.fetch(typ)

        # Extract the titles and append them to the output file.
        append(path + fname, get_attrib(posts, 'title'))

        wait = 60 + posts['backoff']
        print('waiting {} seconds'.format(wait))
        time.sleep(wait)

In [None]:
# Directory that receives the fetched titles.
# NOTE(review): absolute, machine-specific path — adjust it before running elsewhere.
path = '/Users/stevenfelix/Documents/DataScience_local/Insight/'
# One fetch round of 'questions'; titles are appended to recent_titles.txt under `path`.
get_posts(1, SITE, 'questions', path, 'recent_titles.txt')

# Validation

In [45]:
# Read the saved titles, decoding HTML entities and trimming surrounding
# whitespace (including the trailing newline) from every line.
with open(path + 'recent_titles.txt', 'r') as f:
    queries = [html.unescape(line).strip() for line in f]

In [None]:
queries
## Note: my pre-processor strips most special characters,
## so the original query will almost always perform better.
## TODO: retain proper names and common tokens (C++, robots.txt, C#, function()).

In [60]:
#queries = ['deep copy pandas dataframe']
qs = queries[10:13]

In [61]:
qs

['Building spiral-type function in Python',
 "Couldn't register a user on Firebase android",
 'How can I get a unique set of values?']

In [85]:
clean(qs[2], rmv_stop_words=True)
# do remove stop words
# don't remove trailing s !!

['get', 'unique', 'set', 'value']

In [78]:
a = np.exp(model.score([clean('function in Haskell')]))
b = np.exp(model.score([clean('raise error Python')]))

In [79]:
b/a

array([ 0.29275721], dtype=float32)

In [63]:
r = defaultdict(dict)
results = validate(qs, n_syns=3, topn=5, model=model, dic=r)

starting query 1
generating predictions
getting search results for "building quadtree type function with haskell"
https://stackoverflow.com/search?q=building+quadtree+type+function+with+haskell
200
found 0 results
[]
getting search results for "building quadtree type function inside haskell"
https://stackoverflow.com/search?q=building+quadtree+type+function+inside+haskell
200
found 0 results
[]
getting search results for "creating spiral type function in haskell"
https://stackoverflow.com/search?q=creating+spiral+type+function+in+haskell
200
found 0 results
[]
getting search results for "creating quadtree type function in haskell"
https://stackoverflow.com/search?q=creating+quadtree+type+function+in+haskell
200
found 0 results
[]
getting search results for "building quadtree type function within haskell"
https://stackoverflow.com/search?q=building+quadtree+type+function+within+haskell
200
found 0 results
[]
https://stackoverflow.com/search?q=building+spiral+type+function+in+python
200


KeyboardInterrupt: 

In [58]:
pd.DataFrame(results).T

Unnamed: 0,avg_answers_dif_score,avg_prob_dif_score,avg_unique_items,avg_votes_dif_score
iterating over pandas dataframe,0.125,4e-06,8.0,-25.825
