In [6]:
import gensim
#import logging

#import string
from nltk.tokenize import RegexpTokenizer # tokenizing
from nltk.corpus import stopwords  # list of stop words
from nltk.stem.wordnet import WordNetLemmatizer # lemmatizer

from itertools import product
import numpy as np

from bs4 import BeautifulSoup
import requests

import time

from collections import defaultdict
# Logging code taken from http://rare-technologies.com/word2vec-tutorial/
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Loading model

Choose one of the options below. The first two come from the same model, which can be used to calculate a probability score for a phrase: the first file contains just the word vectors (no hidden-layer weights), while the second is the full model (which can calculate the score).

The last file is the negative-sampling model, which allows for the `predict_output_word` function.

In [None]:
# Load only the word vectors (KeyedVectors) of the model trained with
# softmax, CBOW, and no negative sampling.
path = "/Users/stevenfelix/Documents/DataScience_local/Insight/"
file = 'model_full_50M_sg0_sz250_win5_min3_hs1_neg0_kv'
# NOTE(review): the file name suggests CBOW (sg0), vector size 250 (sz250),
# window 5 (win5), min_count 3 (min3), hierarchical softmax (hs1) and no
# negative sampling (neg0) -- confirm against the training script.
# Release a previously loaded `model_full` if there is one. A bare
# `del model_full` raises NameError on a fresh kernel, because no visible cell
# defines that name before this one runs.
globals().pop('model_full', None)
model_kv = gensim.models.keyedvectors.KeyedVectors.load_word2vec_format(path+file)

In [4]:
# Load the full Word2Vec model (softmax, CBOW, no negative sampling).
# Later cells call `model.score(...)` and `model.most_similar(...)` on this object.
path = "/Users/stevenfelix/Documents/DataScience_local/Insight/"
file = 'model_full_50M_sg0_sz250_win5_min3_hs1_neg0'
model= gensim.models.word2vec.Word2Vec.load(path+file)

In [None]:
# Load the full Word2Vec model trained with skip-gram and negative sampling.
# NOTE(review): the file name presumably encodes sg=1, size=250, window=5,
# min_count=3 -- confirm against the training script.
path = "/Users/stevenfelix/Documents/DataScience_local/Insight/"
file = 'model_full_50M_1_250_5_3'
model_ns = gensim.models.word2vec.Word2Vec.load(path+file)

# Prediction algorithm

In [9]:
tokenizer = RegexpTokenizer(r'\w+') # tokenizer built from the pattern \w+ (runs of word characters)
stops = set(stopwords.words('english')) # set of English stop words
lemma = WordNetLemmatizer() # lemmatizer applied to every token in clean()

def clean(title, rmv_stop_words=False):
    """Turn a raw title/query string into a list of lemmatized tokens.

    The text is lower-cased and split with the module-level ``tokenizer``;
    every surviving token is then passed through ``lemma.lemmatize``.

    Parameters
    ----------
    title : str
        Raw text to normalise.
    rmv_stop_words : bool, optional
        When true, tokens found in the module-level ``stops`` set are dropped
        before lemmatization.

    Returns
    -------
    list of str
        The normalised tokens, in their original order.
    """
    words = tokenizer.tokenize(title.lower())
    if rmv_stop_words:
        # filter before lemmatizing, exactly like the stop-word check on raw tokens
        words = [word for word in words if word not in stops]
    return [lemma.lemmatize(word) for word in words]

def predict_similar(query, model, rmv_stop_words=False):
    """Build alternative queries by swapping one word at a time for a similar word.

    The query is normalised with ``clean``. For each token position, the three
    most similar words reported by ``model.most_similar`` are substituted at
    that position while the rest of the query is left untouched.

    Parameters
    ----------
    query : str
        The original query text.
    model : object
        Anything exposing ``most_similar(words, topn=...)`` that returns
        ``(word, similarity)`` pairs.
    rmv_stop_words : bool, optional
        Forwarded to ``clean``.

    Returns
    -------
    list of str
        The alternative queries, grouped by substituted position.
    """
    tokens = clean(query, rmv_stop_words=rmv_stop_words)
    print('Original query: {}\n'.format(query))
    suggestions = []
    # Substitute by position rather than by value: looking the word up with
    # list.index()/remove() always hits the first occurrence, so a word that
    # appears twice was never replaced at its second position.
    for position, word in enumerate(tokens):
        for neighbour, _ in model.most_similar([word], topn=3):
            variant = tokens[:position] + [neighbour] + tokens[position + 1:]
            suggestions.append(' '.join(variant))
    return suggestions

In [5]:
""" These generate alternative queries and score them and filter them """
def generate_alternatives(query, n, model, rmv_stop_words=False):
    """Generate, score and filter alternative phrasings of ``query``.

    Every token of the cleaned query is expanded with its ``n`` most similar
    words (``get_similar``), and all combinations are formed
    (``get_combinations``). Because each per-token list starts with the
    original token, the original query is itself one of the combinations.
    Each combination is scored with ``model.score``; only those whose score
    lies within one standard deviation (computed over all combinations) of the
    original query's score are kept.

    Parameters
    ----------
    query : str
        The original query text.
    n : int
        Number of similar words to fetch per token.
    model : object
        Model exposing ``most_similar`` and ``score``.
    rmv_stop_words : bool, optional
        Forwarded to ``clean`` for both the candidates and the original query.

    Returns
    -------
    list of (score, tuple of str)
        Kept candidates, sorted by descending score.
    """
    syns = get_similar(query, n, model, rmv_stop_words)  # per-token candidates
    combs = get_combinations(syns)                       # every combination
    preds_probs = [(model.score([sug])[0], sug) for sug in combs]
    # Score the original query with the SAME tokenisation as the candidates.
    # Ignoring rmv_stop_words here would compare candidates without stop
    # words against a baseline that still contains them.
    q_score = model.score([clean(query, rmv_stop_words=rmv_stop_words)])[0]
    sd = get_sd(preds_probs)
    # keep just those within 1 sd of the original query's score
    preds_1sd = sorted(
        ((score, sug) for score, sug in preds_probs if np.abs(score - q_score) <= sd),
        reverse=True,
    )
    print("original query: {}".format(query))
    print("score: {}".format(q_score))
    print("sd of all results: {}".format(sd))
    print("number of results within 1 SD of original query score: {}".format(len(preds_1sd)))
    return preds_1sd

def get_similar(query, n, model, rmv_stop_words):
    """Expand each token of ``query`` with its ``n`` most similar words.

    Returns one list per cleaned token. Each list starts with the token itself,
    followed by the words returned by ``model.most_similar(token, topn=n)``
    (similarity values are discarded).
    """
    return [
        [token] + [neighbour for neighbour, _ in model.most_similar(token, topn=n)]
        for token in clean(query, rmv_stop_words=rmv_stop_words)
    ]

def get_combinations(l):
    """Return the Cartesian product of the lists in ``l`` as a list of tuples."""
    return list(product(*l))

def get_sd(tups):
    """Return ``np.std`` of the first element of every 2-tuple in ``tups``."""
    return np.std([score for score, _ in tups])

def clean_preds(pred_scores, topn=3):
    """Keep the first ``topn`` predictions and join their tokens into strings.

    ``pred_scores`` yields ``(score, tokens)`` pairs; the result is a list of
    ``(score, ' '.join(tokens))`` pairs in the same order.
    """
    joined = []
    for position, (score, query) in enumerate(pred_scores):
        if position >= topn:
            break
        joined.append((score, ' '.join(query)))
    return joined

In [73]:
"""These query stack overflow and return and parse the serach results"""

# Browser-like headers sent with every search request.
# HTTP header names use hyphens, so the former 'Accept_Encoding' and
# 'Accept_Language' keys were not the standard header names.
# Only encodings that requests decodes without extra packages are advertised.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
            'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            'Accept-Encoding': 'gzip, deflate', 'Accept-Language':'en-US,en;q=0.8',
            'Connection': 'keep-alive'}

def get_query_results(query):
    """Search Stack Overflow for ``query`` and return the parsed result rows.

    Parameters
    ----------
    query : list of str or str
        Search terms. A plain string is split on whitespace first; joining a
        string directly with '+' would separate its individual characters.

    Returns
    -------
    list of tuple
        Whatever ``parse_results`` extracts from the result page.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. Parsing an error page
        would otherwise silently produce an empty result list.
    """
    if isinstance(query, str):
        query = query.split()
    url = 'https://stackoverflow.com/search?q='+'+'.join(query)
    print(url)
    # send the module-level headers (previously defined but never used) and
    # never wait indefinitely for a response
    r = requests.get(url, headers=headers, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'lxml')
    return parse_results(soup)
    
def parse_results(soup):
    """Extract one tuple per search result from a parsed result page.

    For every ``div`` with class ``question-summary search-result`` the tuple
    holds the text of each ``<strong>`` element in document order, followed by
    the ``title`` and ``href`` attributes of the anchor inside the
    ``result-link`` div.
    NOTE(review): callers unpack four fields, so presumably each result has
    exactly two ``<strong>`` elements (votes, answers) -- confirm against the
    live page markup.
    """
    rows = []
    for hit in soup.find_all("div", class_="question-summary search-result"):
        fields = [strong.get_text() for strong in hit.find_all("strong")]
        anchor = hit.find("div", class_="result-link").find('a')
        fields += [anchor.attrs['title'], anchor.attrs['href']]
        rows.append(tuple(fields))
    return rows

In [66]:
best_preds

[(-11.837975, 'looping through panda dataframe'),
 (-12.610452, 'loop through panda dataframe'),
 (-12.617397, 'iterating over panda dataframe'),
 (-12.801682, 'iterating through panda dataframe')]

In [95]:
# Generate alternative phrasings for one query and keep the best-scoring ones.
orig_query = 'iterate over pandas dataframe'
orig_query_vec = clean(orig_query)  # cleaned token list (not a numeric vector, despite the name)
# candidates built from the 3 most similar words per token
preds = generate_alternatives(orig_query, 3, model)
best_preds = clean_preds(preds, 4)  # first 4 candidates, tokens joined into strings
# remove any top queries identical to original (so i can make valid comparison)
best_preds2 = [(s,q) for s,q in best_preds if q!=' '.join(orig_query_vec)]
# (score, text) pair for the original query
orig = (model.score([orig_query_vec])[0], ' '.join(orig_query_vec))
# Abandoned idea, left disabled (unbalanced brackets and an undefined `query`):
#if [query==q for _,q in best_preds]):
#    best_preds.insert(0,(model.score([clean(query)])[0], ' '.join(clean(query))))

original query: iterate over pandas dataframe
score: -11.328330993652344
sd of all results: 10.13688850402832
number of results within 1 SD of original query score: 15


In [133]:
# query each suggestion and scrape relevent metrics
sug_q_results ={}
for _,q in best_preds2:
    sug_q_results[q] = get_query_results(clean(q))
    time.sleep(5)
orig_q_results = {}
orig_q_results[orig_query] = get_query_results(orig_query_vec)

https://stackoverflow.com/search?q=looping+through+panda+dataframe
https://stackoverflow.com/search?q=iterate+through+panda+dataframe
https://stackoverflow.com/search?q=iterating+over+panda+dataframe
https://stackoverflow.com/search?q=iterate+over+panda+dataframe


In [134]:
# just for convenience do to manual checks later
queries = list(sug_q_results)
print(queries)

['looping through panda dataframe', 'iterate through panda dataframe', 'iterating over panda dataframe']


In [211]:
def _summarize_unique(results, shared, concordance):
    """Summarise the rows of ``results`` whose title is not in ``shared``."""
    titles = []
    seen = set()
    votes = 0
    answers = 0
    for vote, answer, title, _ in results:
        if title in shared:
            continue
        votes += int(vote)
        answers += int(answer)
        if title not in seen:
            # keep first-seen order so the 'titles' list is deterministic
            seen.add(title)
            titles.append(title)
    count = len(titles)
    return {
        'unique': count,
        # averages are per distinct unique title; 0 when there is none
        'avg_unique_votes': votes * 1.0 / count if count else 0,
        'avg_unique_answers': answers * 1.0 / count if count else 0,
        'titles': titles,
        'concordance': concordance,
    }


def compare(suggestions, original):
    """Compare each suggestion's search results with the original query's.

    Parameters
    ----------
    suggestions : dict
        Maps a suggested query to its result rows. Each row is a 4-tuple that
        is unpacked as ``(votes, answers, title, url)``.
    original : dict
        Result rows of the original query, in the same row format. Only the
        entry under the first key is used.

    Returns
    -------
    (summaries, orig_summaries) : tuple of defaultdict
        Both are keyed by suggestion. ``summaries[k]`` describes the results
        found only by suggestion ``k``; ``orig_summaries[k]`` describes the
        results found only by the original query relative to ``k``. Each entry
        holds ``unique``, ``avg_unique_votes``, ``avg_unique_answers``,
        ``titles`` and ``concordance`` (shared titles divided by the number of
        suggestion results).

    A suggestion with no results gets a concordance of 0 rather than raising
    ZeroDivisionError.
    """
    orig_results = original[list(original)[0]]
    orig_titles = set(title for _, _, title, _ in orig_results)
    summaries = defaultdict(dict)
    orig_summaries = defaultdict(dict)
    for k, sug_results in suggestions.items():
        sug_titles = [title for _, _, title, _ in sug_results]
        # titles returned by both the original and the suggested search
        shared = orig_titles & set(sug_titles)
        concordance = len(shared) * 1.0 / len(sug_titles) if sug_titles else 0
        orig_summaries[k] = _summarize_unique(orig_results, shared, concordance)
        summaries[k] = _summarize_unique(sug_results, shared, concordance)
    return summaries, orig_summaries

"""
def make_summaries(key, dic):
    d = defaultdict(dict)
    votes = 0
    answers = 0
    for vote,answer,query,_ in dic[list(original)[0]]:
        if query in shared:
            continue
            votes += int(vote)
            answers += int(answer)
        d[key]['votes'] = votes
        d[key]['answers'] = answers
        d[key]['unique'] = orig_unique
        d[key]['unique_perc'] = len(orig_unique)*1.0/len(orig_items)
        """

"\ndef make_summaries(key, dic):\n    d = defaultdict(dict)\n    votes = 0\n    answers = 0\n    for vote,answer,query,_ in dic[list(original)[0]]:\n        if query in shared:\n            continue\n            votes += int(vote)\n            answers += int(answer)\n        d[key]['votes'] = votes\n        d[key]['answers'] = answers\n        d[key]['unique'] = orig_unique\n        d[key]['unique_perc'] = len(orig_unique)*1.0/len(orig_items)\n        "

In [212]:
summaries,orig_summaries = compare(sug_q_results, orig_q_results)

In [263]:
# For every suggestion, subtract the original query's per-unique-result
# averages from the suggestion's, then average those differences over all
# suggestions.
answers_dif_scores = []
votes_dif_scores = []
count_unqiue_items = []  # number of unique results per suggestion (name is misspelled, kept as is)
for k in summaries:
    answers_dif_scores.append(summaries[k]['avg_unique_answers'] - orig_summaries[k]['avg_unique_answers'])
    votes_dif_scores.append(summaries[k]['avg_unique_votes'] - orig_summaries[k]['avg_unique_votes'])
    count_unqiue_items.append(summaries[k]['unique'])
avg_answers_dif_score = np.mean(answers_dif_scores)
avg_votes_dif_score = np.mean(votes_dif_scores)
avg_unique_items = np.mean(count_unqiue_items)

In [264]:
# Collect the validation metrics for this query, keyed by the raw query string.
validation_results = defaultdict(dict)

# add results to validation results dictionary
orig_prob = model.score([orig_query_vec])[0]
validation_results[orig_query]['avg_unique_items'] = avg_unique_items
validation_results[orig_query]['avg_answers_dif_score'] = avg_answers_dif_score
validation_results[orig_query]['avg_votes_dif_score'] = avg_votes_dif_score
# scores are exponentiated before differencing
# NOTE(review): presumably model.score returns log-probabilities -- confirm
validation_results[orig_query]['avg_prob_dif_score'] = np.mean([np.exp(prob)-np.exp(orig_prob) for prob,q in best_preds2])

In [265]:
validation_results

defaultdict(dict,
            {'iterate over pandas dataframe': {'avg_answers_dif_score': -0.61666666666666659,
              'avg_prob_dif_score': -6.3273023e-06,
              'avg_unique_items': 11.333333333333334,
              'avg_votes_dif_score': -11.927777777777779}})

In [104]:
def summarize_results(suggestions, original=None):
    """Total the votes and answers of every query's search results.

    Parameters
    ----------
    suggestions : dict
        Maps a query to its result rows. Each row is a 4-tuple that is
        unpacked as ``(votes, answers, title, url)``.
    original : optional
        Unused. It is kept, now optional, so that both one- and two-argument
        calls work.

    Returns
    -------
    defaultdict
        ``{query: {'votes': int, 'answers': int}}``.
    """
    summaries = defaultdict(dict)
    for k, results in suggestions.items():
        # read the rows of the dict that was passed in; the body previously
        # indexed `d`, a name that is not defined in this function
        summaries[k]['votes'] = sum(int(vote) for vote, _, _, _ in results)
        summaries[k]['answers'] = sum(int(answer) for _, answer, _, _ in results)
    return summaries

In [105]:
sug_q_summaries = summarize_results(sug_q_results)
orig_q_summary = summarize_results(orig_q_results)

In [108]:
orig_q_summary[orig_query]

{'answers': 36, 'votes': 594}

In [109]:
for k in sug_q_summaries:
    print(sug_q_summaries[k])

{'votes': 518, 'answers': 29}
{'votes': 136, 'answers': 19}
{'votes': 593, 'answers': 35}


**looks pretty good**

The highest-rated sentences are pretty good substitutes.

I think the score method works well for capturing the whole meaning of the sentence because it calculates the probability of the sentence itself --

The problem is that it doesn't compare the suggestion to the original query: a suggested query may have a higher probability of occurring (i.e., it makes more sense as a phrase), but that doesn't necessarily mean it conveys the same question/meaning as the original query. We would need something like cosine distance to measure similarity. Ideally we want both -- a more probable phrasing that still stays similar to the original query.

To me this suggests that this method might be a good choice for determining which sentences to suggest in the first place -- low probability sentences probably shouldn't be shown (except maybe if there is one rare word in the sentence that is intentional)

...from here, I could grab the top 10 suggestions (minus the original query), get query results from Stack Overflow, and calculate a few things:

-- amount of overlap with the top 5

In [None]:
print(q)
model.score([clean(q)])

**note**
you can also get a score for the original word and then show only the suggestions that have a better score ... or within a certain range.

In [None]:
clean(q)[1:]

### exploring cosine similarity

In [None]:
def sent2vec(sentence, model):
    """Return the average of the model vectors of the cleaned tokens of ``sentence``.

    ``sentence`` is normalised with ``clean``; each token is looked up with
    ``model[token]`` and the vectors are summed and divided by the token count.
    Using the sum or the average makes no difference to cosine similarity.
    """
    tokens = clean(sentence)
    total = 0.0
    for vector in [model[token] for token in tokens]:
        total = total + vector
    return total / len(tokens)

In [None]:
q_vec = sent2vec(q, model_kv)

In [None]:
vecs = []
for sentence in preds:
    vecs.append(sent2vec(sentence, model))

In [None]:
from scipy.spatial.distance import cosine

In [None]:
#gensim.models.keyedvectors.KeyedVectors.
dist = []
for vec in vecs:
    dist.append(cosine(q_vec, vec))

In [None]:
preds_cos =[(p,q) for p,q in zip(dist,preds)]
preds_cos.sort()
preds_cos

I'm not liking this method too much -- the lowest-ranked item (i.e., the one with the largest cosine distance) is actually a pretty good substitute for the original.

**notes**:
  - 'iteration' substitutions work well, possibly because it's a verb?
  - 

In [None]:
model_full.mo

In [None]:
model_full.most_similar(['someone'])

In [None]:
# predict_similar(query, model, rmv_stop_words=False) requires the model as
# its second positional argument; without it the call raises TypeError.
predict_similar('Someone flagged my question as already answered, but it\'s not', model, rmv_stop_words=True)

In [None]:
a = clean('Adding new column to existing DataFrame in Python pandas')
b = clean('How can I add a new computed column in a dataframe?') # [duplicate]
c = clean('pandas create new column based on values from other columns')
d = clean('Add new column in Pandas DataFrame Python')
e = clean('Calling an external command in Python')

In [None]:
import numpy as np

In [None]:
def title_vec(token_list, model):
    """Return the sum of ``model[token]`` over every token in ``token_list``.

    The accumulator starts as the first token's vector multiplied by 0, so the
    result has the same type as the model's vectors.
    """
    vectors = [model[token] for token in token_list]
    total = vectors[0] * 0
    for vector in vectors:
        total = total + vector
    return total

In [None]:
for x in [a,b,c]:
    print(len(x))

In [None]:
a_vec = title_vec(a, model_full)
b_vec = title_vec(b, model_full)
c_vec = title_vec(c, model_full)
d_vec = title_vec(d, model_full)
e_vec = title_vec(e, model_full)

In [None]:
import scipy as sp

print(sp.spatial.distance.euclidean(a_vec,b_vec))
print(sp.spatial.distance.euclidean(a_vec,c_vec))
print(sp.spatial.distance.euclidean(b_vec,c_vec))
print(sp.spatial.distance.euclidean(a_vec,e_vec))
print(sp.spatial.distance.euclidean(a_vec,d_vec))

# looking at AOL data for possible validation

In [None]:
import pandas as pd

In [None]:
path='/Users/stevenfelix/Downloads/'
file='user-ct-test-collection-01.txt'
data = pd.read_table(path+file, delimiter='\t')
data.iloc[0:100,:]

In [None]:
import numpy as np

In [None]:
'.com' in 'rentdirect.com'

In [None]:
# Drop rows whose query contains a URL/domain fragment.
rem = ['.gov', '.com', '.edu', 'www', '.net', 'http', '.org']
# str() every value first, so a missing query becomes the text 'nan' and is
# kept, as before.
query_text = data.Query.map(str)
keep = pd.Series(True, index=data.index)
for x in rem:
    # Series.iteritems() was removed in pandas 2.0; a literal (non-regex)
    # substring test per marker gives the same row selection.
    keep &= ~query_text.str.contains(x, regex=False)
# positional boolean mask, like the list of booleans used previously
data = data[keep.values]

In [None]:
data.shape

In [None]:
data.iloc[1:200,:]