In [1]:
import os
import re
from itertools import product

import gensim
import numpy as np
#import logging

#import string
from nltk.tokenize import RegexpTokenizer # tokenizing
from nltk.corpus import stopwords  # list of stop words
from nltk.stem.wordnet import WordNetLemmatizer # lemmatizer

# Loading model

Choose one of the options below. The first two are the same underlying model: the first contains only the word vectors (no hidden-layer weights), while the second is the full model, which can be used to calculate a probability score for a phrase.

The last file is the negative-sampling model, which allows for the `predict_output_word` function.

In [2]:
# Load the full model (softmax, CBOW, no negative sampling). This is the variant
# the scoring code in this notebook needs, because it calls `model.score`.
#
# The model directory defaults to the original author's location but can be
# overridden with the W2V_MODEL_DIR environment variable, so the notebook can
# run on another machine without editing this cell.
path = os.environ.get(
    "W2V_MODEL_DIR",
    "/Users/stevenfelix/Documents/DataScience_local/Insight/Demo_App/flaskexample/models/",
)
file = 'model_full_50M_sg0_sz250_win5_min3_hs1_neg0'
# os.path.join tolerates a directory given with or without a trailing separator.
model = gensim.models.word2vec.Word2Vec.load(os.path.join(path, file))

# Prediction algorithm

**text preprocessing**

In [3]:
# Module-level patterns and resources used by clean() below.

# apostrophes, hyphens and double quotes; clean() deletes these outright, so the
# two halves of a contraction or hyphenated word are fused rather than split
contractions = re.compile(r"'|-|\"")
# runs of non-word characters (anything not matched by \w); captured so that
# clean() can re-insert the run padded with spaces
symbols = re.compile(r'(\W+)', re.U)
# single character removal: one non-whitespace character with whitespace on both sides
# NOTE(review): each match consumes its trailing whitespace, so of two adjacent
# isolated characters ("x a b y") only the first is removed -- confirm this is intended.
# re.I has no effect here because the pattern contains no letters.
singles = re.compile(r'(\s\S\s)', re.I|re.U)
# separators (any run of whitespace)
seps = re.compile(r'\s+')
# tokenizer
tokenizer = RegexpTokenizer(r'\w+') # each token is a run of word characters (\w+)
# stop words
stops = set(stopwords.words('english')) # set of English stop words

# cleaner (order matters)
def clean(text, rmv_stop_words=True, return_tokens=False):
    """Normalise a raw query string for lookup in the word2vec model.

    The substitutions run in a fixed order (order matters): lower-case,
    delete apostrophes/hyphens/double quotes, pad runs of non-word
    characters with spaces, drop isolated single characters, collapse
    whitespace, then tokenize.

    Args:
        text: The raw string to clean.
        rmv_stop_words: When true, English stop words are removed from the
            tokens and the returned text is rebuilt from the remaining tokens.
        return_tokens: When true, return the token list instead of a string.

    Returns:
        A list of tokens if ``return_tokens`` is true, otherwise a string.
        Note that with ``rmv_stop_words=False`` the returned string is the
        regex-normalised text (non-word characters are still present), not
        the tokens joined back together.
    """
    normalized = text.lower()
    # Apply each (pattern, replacement) pair in sequence.
    for pattern, replacement in (
        (contractions, ''),
        (symbols, r' \1 '),
        (singles, ' '),
        (seps, ' '),
    ):
        normalized = pattern.sub(replacement, normalized)

    words = tokenizer.tokenize(normalized)
    if rmv_stop_words:
        words = [word for word in words if word not in stops]
        normalized = ' '.join(words)

    return words if return_tokens else normalized

**generating and ranking alternatives**

In [14]:
""" These generate alternative queries and score them and filter them """

def generate_alternatives(query, n, model, rmv_stop_words=True, return_tokens=True, tags=None):
    """Suggest up to ``n`` alternative phrasings of ``query``, best-scoring first.

    Each word of the cleaned query is expanded with similar words from the
    model, every combination of those words is scored with ``model.score``,
    and the highest-scoring combinations are returned.

    Args:
        query: The raw query string.
        n: Maximum number of suggestions to return.
        model: A model object providing ``most_similar`` and ``score``.
        rmv_stop_words: Passed through to ``get_similar``.
        return_tokens: Passed through to ``get_similar``.
        tags: Words that must be kept as-is (no synonyms are generated for
            them). ``None`` (the default) means no such words.

    Returns:
        On success, a tuple ``(suggestions, syns)`` where ``suggestions`` is a
        list of space-joined strings and ``syns`` is the per-word list of
        candidates returned by ``get_similar``.
        If ``get_similar`` raises, the exception message is returned as a
        plain ``str`` instead of a tuple; callers that unpack the result
        should check for this first.
    """
    if tags is None:
        # Avoid a shared mutable default argument.
        tags = []
    try:
        syns = get_similar(query, model, rmv_stop_words=rmv_stop_words,
                           return_tokens=return_tokens, tags=tags)  # synonyms
    except Exception as e:
        # Best-effort behaviour kept for existing callers: report the failure
        # as text rather than raising.
        return str(e)

    if not syns:
        # Nothing left after cleaning (empty or all-stop-word query); without
        # this guard the single empty combination would surface as [''].
        return [], syns

    combs = get_combinations(syns)  # every one-word-per-slot combination
    # Pair each candidate with its score; the score comes first so that the
    # tuples sort by score, with ties broken by the words themselves.
    scored = [(model.score([candidate])[0], candidate) for candidate in combs]
    scored.sort(reverse=True)
    suggestions = [' '.join(candidate) for _, candidate in scored[:n]]
    return suggestions, syns

def get_similar(query, model, rmv_stop_words, return_tokens, tags, threshold=.55):
    """Expand each word of ``query`` into a list of itself plus similar words.

    Args:
        query: The raw query string; it is cleaned with ``clean`` first.
        model: A model object providing ``most_similar(word)``, which yields
            ``(word, score)`` pairs.
        rmv_stop_words: Whether ``clean`` should drop stop words.
        return_tokens: Kept for interface compatibility. The query is always
            tokenized, because iterating over the string form of the cleaned
            query would expand individual characters instead of words.
        tags: Words that are kept without synonyms. ``None`` is treated as
            "no tags".
        threshold: Only similar words whose score is strictly greater than
            this value are kept.

    Returns:
        A list with one entry per query word; each entry is a list whose
        first element is the original word, followed by its accepted
        similar words.

    Raises:
        Whatever ``model.most_similar`` raises for a word it cannot handle
        (this is left to the caller on purpose).
    """
    tokens = clean(query, rmv_stop_words=rmv_stop_words, return_tokens=True)
    if tags is None:
        tags = ()

    expanded = []
    for token in tokens:
        candidates = [token]
        if token not in tags:
            # TO DO: decide how to handle words the model cannot look up;
            # for now the exception propagates to the caller.
            candidates.extend(
                syn for syn, score in model.most_similar(token) if score > threshold
            )
        expanded.append(candidates)
    return expanded

def get_combinations(l):
    """Return the Cartesian product of the lists in ``l`` as a list of tuples.

    Each tuple takes one element from every inner list, in order. An empty
    ``l`` yields a single empty tuple; any empty inner list yields no
    combinations at all.
    """
    return list(product(*l))

In [44]:
# Demo: request the five best-scoring rewrites of a sample query.
suggestions, synonyms = generate_alternatives(
    query='create column pandas dataframe',
    n=5,
    model=model,
    rmv_stop_words=True,
    return_tokens=True,
    tags=[],
)

In [45]:
suggestions

['add column pandas dataframe',
 'create columns pandas dataframe',
 'adding columns pandas dataframe',
 'adding column pandas dataframe',
 'add columns pandas dataframe']

In [12]:
synonyms

[['create',
  'creating',
  'add',
  'generate',
  'creat',
  'creation',
  'construct',
  'adding',
  'creates',
  'generating',
  'created'],
 ['column',
  'columns',
  'row',
  'colum',
  'table',
  'colums',
  'coloumn',
  'rows',
  'cell',
  'field'],
 ['pandas', 'multiindexed'],
 ['dataframe',
  'dataframes',
  'multiindexing',
  'datetimeindex',
  'df',
  'read_csv',
  'pivot_table',
  'rolling_mean',
  'asfreq',
  'value_counts']]