In [1]:
import gensim
#import logging

#import string
from nltk.tokenize import RegexpTokenizer # tokenizing
from nltk.corpus import stopwords  # list of stop words
from nltk.stem.wordnet import WordNetLemmatizer # lemmatizer

from itertools import product
import numpy as np

from bs4 import BeautifulSoup
import requests

import time
import pandas as pd

from collections import defaultdict
# Logging code taken from http://rare-technologies.com/word2vec-tutorial/
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Loading model

Choose one option from below.  The first 2 are the same model, which can be used to calculate a probability score for a phrase. The first is just the word vectors (no hidden layer weights). The second is the full model (can calculate score).

The last file is the negative sampling model, which always for the 'predict_output_word' function

In [2]:
# load full model with softmax, CBOW, and no negative sampling
path = "/Users/stevenfelix/Documents/DataScience_local/Insight/"
file = 'model_full_50M_sg0_sz250_win5_min3_hs1_neg0'
model= gensim.models.word2vec.Word2Vec.load(path+file)

# Prediction algorithm

In [3]:
"""take a single phrase as a string, and preprocess, return tokens"""
tokenizer = RegexpTokenizer(r'\w+') # tokens separated by white spice
stops = set(stopwords.words('english')) # list of english stop words
lemma = WordNetLemmatizer()

def clean(title, rmv_stop_words=False):
    tokens = tokenizer.tokenize(title.lower())     # tokenize
    if rmv_stop_words:
        tokens = [i for i in tokens if not i in stops] # remove stop words
    normalized = [lemma.lemmatize(token) for token in tokens] # lemma
    return normalized

In [4]:
""" These generate alternative queries and score them and filter them """
def generate_alternatives(query, n, model, rmv_stop_words=False):
    syns = get_similar(query, n, model, rmv_stop_words) # synonyms
    combs = get_combinations(syns) # combinations
    probs = [model.score([sug])[0] for sug in combs] # probabilities
    preds_probs =[(p,q) for p,q in zip(probs,combs)] # combine with queries
    q_score = model.score([clean(query)])[0] # score for original query
    sd = get_sd(preds_probs)
    preds_1sd = [(x,y) for x,y in preds_probs if np.abs(x-q_score)<=sd] # keep just those within 1 sd
    preds_1sd.sort(reverse=True)
    print("original query: {}".format(query))
    print("score: {}".format(q_score))
    print("sd of all results: {}".format(sd))
    print("number of results within 1 SD of original query score: {}".format(len(preds_1sd)))
    return preds_1sd

def get_similar(query, n, model, rmv_stop_words):
    q = clean(query, rmv_stop_words=rmv_stop_words)
    # turn each word  of query into its own list
    d = [[x] for x in q]
    for x in d:
        # for each word in original query, add topn similar words to list
        x.extend([syn for syn,_ in model.most_similar(x[0],topn=n)])
    return d

def get_combinations(l):
    combs = [x for x in product(*l)]
    return combs

def get_sd(tups):
    vals = [x for x,_ in tups]
    return np.std(vals)

def clean_preds(pred_scores, topn=3):
    clean = []
    i = 0
    for score,query in pred_scores:
        i+=1
        if i > topn: break
        clean.append((score, ' '.join(query)))
    return clean

In [44]:
"""These query stack overflow and return and parse the serach results for validation"""

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
            'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            'Accept_Encoding': 'gzip, deflate, sdch, br', 'Accept_Language':'en-US,en;q=0.8',
            'Connection': 'keep-alive'}

def get_query_results(query):
    url = 'https://stackoverflow.com/search?q='+'+'.join(query)
    print(url)
    r = requests.get(url)
    print(r.status_code)
    soup = BeautifulSoup(r.text, 'lxml')
    x = parse_results(soup)
    return x
    
def parse_results(soup):
    l = []
    results = soup.find_all("div", class_="question-summary search-result")
    i = 0
    for result in results:
        i += 1
        if i > 10:
            return l
        votes = [v.get_text() for v in result.find_all("strong")]
        if len(votes) == 1:
            votes.append(1)
        link = result.find("div", class_="result-link").find('a')
        query = link.attrs['title']
        url = link.attrs['href']
        votes.extend([query,url])
        l.append(tuple(votes))
    return l

In [38]:
""" given dictionaries of suggested-queries' results and the original query results,
    compares metrics of the returned queries"""
def compare(suggestions, original):
    # get result titles from original in a list
    print(original)
    print(list(original))
    orig_items = [title for _,_,title,_ in original[list(original)[0]]]
    summaries = defaultdict(dict)
    orig_summaries = defaultdict(dict)
    for k in suggestions:
        # isolate titles for suggested results
        sug_items = [title for _,_,title,_ in suggestions[k]]
        # identify which results are shared with the original search results
        shared = set(orig_items) & set(sug_items)
        orig_unique = list(set(orig_items).difference(set(sug_items)))
        sug_unique = list(set(sug_items).difference(set(orig_items)))
        concordance = len(shared)*1.0/len(sug_items)
        
        votes = 0
        answers = 0
        for vote,answer,query,_ in original[list(original)[0]]:
            if query in shared:
                continue
            votes += int(vote)
            answers += int(answer)
        orig_summaries[k]['unique'] = len(orig_unique)
        try: 
            orig_summaries[k]['avg_unique_votes'] = votes*1.0/orig_summaries[k]['unique']
            orig_summaries[k]['avg_unique_answers'] = answers*1.0/orig_summaries[k]['unique']
        except ZeroDivisionError:
            orig_summaries[k]['avg_unique_votes'] = 0
            orig_summaries[k]['avg_unique_answers'] = 0
        orig_summaries[k]['titles'] = orig_unique
        orig_summaries[k]['concordance'] = concordance
        
        # calculate sum of votes and answers for all unique search results in suggestion
        votes = 0
        answers = 0
        #distances = []
        for vote,answer,query,url in suggestions[k]:
            if query in shared:
                continue
            votes += int(vote)
            answers += int(answer)
            #distances.append ()# cosine distance fucntion here)
            # concordance
        summaries[k]['unique'] = len(sug_unique)
        try: 
            summaries[k]['avg_unique_votes'] = votes*1.0/summaries[k]['unique']
            summaries[k]['avg_unique_answers'] = answers*1.0/summaries[k]['unique']
        except ZeroDivisionError:
            summaries[k]['avg_unique_votes'] = 0
            summaries[k]['avg_unique_answers'] = 0
        summaries[k]['unique'] = len(sug_unique)
        summaries[k]['titles'] = sug_unique
        summaries[k]['concordance'] = concordance
    return summaries, orig_summaries

"""
def make_summaries(key, dic):
    d = defaultdict(dict)
    votes = 0
    answers = 0
    for vote,answer,query,_ in dic[list(original)[0]]:
        if query in shared:
            continue
            votes += int(vote)
            answers += int(answer)
        d[key]['votes'] = votes
        d[key]['answers'] = answers
        d[key]['unique'] = orig_unique
        d[key]['unique_perc'] = len(orig_unique)*1.0/len(orig_items)
        """

"\ndef make_summaries(key, dic):\n    d = defaultdict(dict)\n    votes = 0\n    answers = 0\n    for vote,answer,query,_ in dic[list(original)[0]]:\n        if query in shared:\n            continue\n            votes += int(vote)\n            answers += int(answer)\n        d[key]['votes'] = votes\n        d[key]['answers'] = answers\n        d[key]['unique'] = orig_unique\n        d[key]['unique_perc'] = len(orig_unique)*1.0/len(orig_items)\n        "

In [45]:
def validate(queries, n_syns, topn, model):
    results_dict = defaultdict(dict)
    for q in queries:
        results_dict[q] = evaluate_query(q, n_syns, topn, model)
    return results_dict

def evaluate_query(orig_query, n_syns, topn, model):
    # improtant local varaibles
    orig_query_vec = clean(orig_query)
    
    # generate, rank, filter suggestions
    preds = generate_alternatives(orig_query, n_syns, model)
    best_preds = clean_preds(preds, topn)
    best_preds2 = [(s,q) for s,q in best_preds if q!=' '.join(orig_query_vec)]
    #orig = (model.score([orig_query_vec])[0], ' '.join(orig_query_vec))
    
    # query each suggestion and scrape relevent metrics
    sug_q_results ={}
    for _,q in best_preds2:
        sug_q_results[q] = get_query_results(clean(q))
        if sug_q_results[q] == []:
            print('Website returned no results.')

        time.sleep(1)
    
    # get query results for original query
    orig_q_results = {orig_query: get_query_results(orig_query_vec)}
    
    # get metrics
    summaries,orig_summaries = compare(sug_q_results, orig_q_results)
    
    
    # calculate average metrics
    avg_answers_dif_score = np.mean([summaries[k]['avg_unique_answers'] - 
                                  orig_summaries[k]['avg_unique_answers'] for k in summaries])
    avg_votes_dif_score = np.mean([summaries[k]['avg_unique_votes'] - 
                                orig_summaries[k]['avg_unique_votes'] for k in summaries])
    avg_unique_items = np.mean([summaries[k]['unique'] for k in summaries])

    results = {}
    orig_prob = model.score([orig_query_vec])[0]
    results['avg_unique_items'] = avg_unique_items
    results['avg_answers_dif_score'] = avg_answers_dif_score
    results['avg_votes_dif_score'] = avg_votes_dif_score
    results['avg_prob_dif_score'] = np.mean([np.exp(prob)-np.exp(orig_prob) for prob,q in best_preds2])
    return results

In [36]:
queries = ['all combinations of two lists',
          'converting python string to unicode']

In [41]:
results = validate(queries, n_syns=5, topn=5, model=model)

original query: all combinations of two lists
score: -21.941810607910156
sd of all results: 8.711952209472656
number of results within 1 SD of original query score: 93
https://stackoverflow.com/search?q=all+sum+of+two+list
200
https://stackoverflow.com/search?q=all+combination+of+multiple+list
200
https://stackoverflow.com/search?q=all+permutation+of+two+list
200
https://stackoverflow.com/search?q=all+combination+of+three+list
200
https://stackoverflow.com/search?q=all+combination+of+two+list
200
{'all combinations of two lists': [('3351', 1, 'The definitive guide to form-based website authentication', '/questions/549/the-definitive-guide-to-form-based-website-authentication/477578#477578'), ('5', '4', 'all combination of a complicated list', '/questions/25665046/all-combination-of-a-complicated-list'), ('3', '2', 'How to remove all value where the combination of two attributes equals a string?', '/questions/15303656/how-to-remove-all-value-where-the-combination-of-two-attributes-equal

In [43]:
pd.DataFrame(results).T

Unnamed: 0,avg_answers_dif_score,avg_prob_dif_score,avg_unique_items,avg_votes_dif_score
all combinations of two lists,0.685897,-1.97984e-10,13.5,-52.285897
converting python string to unicode,-0.536484,4.75839e-10,12.0,-25.420147
