# Include-2-Shared-Functions

- Import packages for computation and data manipulation
- Set up plotting if needed
- Set how dataframes are rendered in Jupyter notebooks
- Define functions used in the other notebooks

In [None]:
# Python logging to monitor gensim models
#### TURN ON AS NEEDED ####
#import logging
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Packages for computation and data manipulation

In [None]:
import numpy as np # for number crunching
import pandas as pd # for data loading and manipulation
import time
import pickle
import re
import random
import itertools

## Set up plotting packages, modules, and styles

In [None]:
# Import the pyplot module from the matplotlib library
from matplotlib import pyplot as plt
# Use Jupyter magics to plot inline without needing to call plt.show()
# From the documentation (https://stackoverflow.com/questions/43027980/)
# "With backend = 'inline', the output of plotting commands is displayed inline within frontends 
#   like the Jupyter notebook, directly below the code cell that produced it. 
#   The resulting plots will then also be stored in the notebook document."
%matplotlib inline

In [None]:
# Import the Seaborn library (by Michael Waskom)
import seaborn as sns
# Set the visual styles
sns.set(context = 'notebook', 
        style = 'darkgrid',
        palette = 'deep', 
        font = 'sans-serif', 
        font_scale = 1.3, 
        color_codes = True, 
        rc = None
       )

In [None]:
# List the matplotlib styles available
#print(plt.style.available)

In [None]:
#### SET the matplotlib style HERE ####
# The bundled seaborn style sheets were renamed to 'seaborn-v0_8-<name>' in
# matplotlib 3.6 and the old names were removed later, so fall back to the
# new name when the old one is not available in this matplotlib install.
style = 'seaborn-darkgrid'
if style not in plt.style.available and 'seaborn-v0_8-darkgrid' in plt.style.available:
    style = 'seaborn-v0_8-darkgrid'
plt.style.use(style)

In [None]:
#### UNCOMMENT TO TEST ####
# Test out the style settings
#print("Here's what the {} style looks like...".format(style))
#fig, axes = plt.subplots(1, 4, figsize=(12, 4))
#axes[0].set_xlim(0, 0.5)

In [None]:
# Plotnine for ggplot
## NOTE: This will throw the following warning:
### FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. 
### Please use the pandas.tseries module instead. from pandas.core import datetools
### Not sure how to handle it.

#from plotnine import *

## Jupyter notebook display settings

In [None]:
# Make sure all columns of a dataframe are displayed
# https://stackoverflow.com/questions/47022070/
from IPython.display import display
pd.options.display.max_columns = None

In [None]:
# Make sure that a dataframe column value (e.g., a large text field) is not truncated
# https://stackoverflow.com/questions/25351968
# None means "never truncate" in pandas >= 1.0; the old sentinel -1 is
# deprecated there and rejected by current pandas. Older pandas only accepts
# integers for this option, so fall back to -1 when None is refused.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
# Show at most 100 rows when a dataframe is rendered
pd.set_option('display.max_rows', 100)

In [None]:
# Configure slide scrolling
# from hfinger at https://github.com/damianavila/RISE/issues/185
#### NOTE: Have to restart notebook server after running it the first time ####
from notebook.services.config import ConfigManager
cm = ConfigManager()
cm.update('livereveal', {'width': 1024, 'height': 768, 'scroll': True})

## Data pre-processing functions

In [None]:
# Get a list of each attribute and the first n values for that attribute in the data set
#### SET n HERE ####
display_n = 3

def get_first_n_vals(dataFrame, n=display_n):
    """
    Summarise each column of dataFrame by its first n values.

    Returns a list of (position, (column_name, first_n_values)) tuples, one
    per column, in column order. n defaults to display_n.
    """
    summary = []
    for position, column_name in enumerate(list(dataFrame)):
        leading_values = list(dataFrame[column_name][0:n])
        summary.append((position, (column_name, leading_values)))
    return summary

In [None]:
# For each feature, how many/what percentage of rows are missing values?
# From https://datascience.stackexchange.com/questions/12645/

def num_missing_values_per_feature(dataFrame, display='percentage'):
    """
    Report how many values are missing in each column of dataFrame.

    display='count' returns the number of null values per column; any other
    value returns that count divided by the number of rows (a fraction
    between 0 and 1, not multiplied by 100).
    """
    missing_per_column = dataFrame.isnull().sum(axis=0)
    if display != 'count':
        return missing_per_column / len(dataFrame)
    return missing_per_column

## Gradient Descent

### Penalty or Cost Functions

In [None]:
# PENALTY FUNCTIONS - SOME EXAMPLES

# multiplier is a positive number > 0 that determines the slope

# Linear Penalty Function
def linearPenalty(x, multiplier=1):
    """Linear penalty: the error x scaled by multiplier (the slope)."""
    scaled_error = x * multiplier
    return scaled_error

# Flipped/Inverse Linear Penalty Function
def invLinearPenalty(x, multiplier=1):
    """Flipped linear penalty: the negated error scaled by multiplier."""
    flipped_error = -x
    return flipped_error * multiplier

# Linear for negative x and zero for positive x
def leftLinearPenalty(x, multiplier=1):
    """Penalise only negative errors: -x * multiplier when x < 0, otherwise 0."""
    return -x * multiplier if x < 0 else 0
    
# Linear for positive x and zero for negative x
def rightLinearPenalty(x, multiplier=1):
    """Penalise only non-negative errors: 0 when x < 0, otherwise x * multiplier."""
    return 0 if x < 0 else x * multiplier

# V shape penalty
def VPenalty(x, multiplier=1):
    """
    V-shaped penalty: the magnitude of the error scaled by multiplier.

    Both arms of the V use the same slope. The positive arm previously
    ignored multiplier, which made the shape lopsided for any multiplier
    other than 1.
    """
    if (x < 0): return -x * multiplier
    else: return x * multiplier
    
# Inverted V shape penalty
def invertedVPenalty(x, multiplier=1):
    """Inverted V penalty: the negated magnitude of the error scaled by multiplier."""
    return x * multiplier if x < 0 else -x * multiplier
    
# Positive parabola penalty
def squaredPenalty(x, multiplier=1):
    """Upward parabola penalty: the squared error scaled by multiplier."""
    squared_error = x ** 2
    return squared_error * multiplier

# Inverted parabola penalty
def invertedSquaredPenalty(x, multiplier=1):
    """Downward parabola penalty: the negated squared error scaled by multiplier."""
    negated_square = -(x ** 2)
    return negated_square * multiplier

# Non-linear penalty
def nonLinearPenalty(x, multiplier=1):
    """
    Cubic polynomial penalty: (x + x**2 + x**3) scaled by multiplier.

    multiplier was previously accepted but silently ignored. It is now
    applied like in the other penalty functions; the default of 1 leaves
    earlier results unchanged.
    """
    return (x + x**2 + x**3) * multiplier

In [None]:
# Lookup table from each penalty function object to a human-readable name.
# penaltyPlot uses it to title its plot, so a penalty function has to be
# registered here before it can be passed to penaltyPlot.
penaltyFunctions = {linearPenalty: "Linear Penalty", 
                    invLinearPenalty: "Inverse Linear Penalty",
                    leftLinearPenalty: "Left-Linear Penalty",
                    rightLinearPenalty: "Right-Linear Penalty",
                    VPenalty: "V Penalty",
                    invertedVPenalty: "Inverted-V Penalty",
                    squaredPenalty: "Squared Penalty",
                    invertedSquaredPenalty: "Inverted Squared Penalty",
                    nonLinearPenalty: "Non-Linear Penalty"
                   }

In [None]:
# Plot the penalty function for a given list of error values and a given penalty function
def penaltyPlot(errorList, penaltyFunction):
    """
    Plot penaltyFunction over the range of errors spanned by errorList.

    errorList only determines the x-axis limits (its min and max); the
    penalty is evaluated on 200 evenly spaced points between them.
    penaltyFunction must be a key of penaltyFunctions, whose value is used
    as the plot title.
    """
    num_points = 200
    error_grid = np.linspace(min(errorList), max(errorList), num_points)

    fig, ax = plt.subplots(figsize=(6,4))
    ax.set(xlabel='Predicted Value - Actual Value', ylabel='Penalty')
    # Draw both axes through the origin
    ax.axvline(x=0, color='black')
    ax.axhline(y=0, color='black')
    ax.set(title=penaltyFunctions[penaltyFunction])

    # Evaluate point by point: several penalty functions branch on a scalar x
    penalties = [penaltyFunction(error) for error in error_grid]
    ax.plot(error_grid, penalties)

In [None]:
# Add a column of ones to the first column of a dataframe
# and turn it into a matrix
def df_addOnes(dataFrame):
    """
    Return the values of dataFrame as a NumPy array with a leading column
    of ones prepended (the intercept column of a linear model).
    """
    intercept_column = np.ones(len(dataFrame))
    return np.column_stack((intercept_column, dataFrame.values))

In [None]:
# Making it easy to calculate the total penalty over the entire dataset
def penalty(df_features, df_output, paramater_value_list, penalty_function):
    """
    Normalised total penalty of a linear model over the whole dataset.

    df_features is a dataframe of the features (no column of ones added;
      df_addOnes prepends it here).
    df_output is a dataframe of the output column (target variable).
    paramater_value_list is the list of parameters w0, w1, ..., wn where n
      is the number of columns in df_features.
    penalty_function is applied to each row of the error matrix.

    Returns the sum of the per-row penalties divided by 2m, where m is the
    number of rows in df_features.
    """
    # Bring the inputs into matrix form so that X * w is a matrix product
    X = df_addOnes(df_features)
    y = np.matrix(df_output.values)
    w = np.matrix(paramater_value_list).T

    # Predicted value minus actual value
    residuals = (X * w) - y

    # One penalty per row of the error matrix, then the 1/(2m) normalisation
    row_penalties = [penalty_function(residual) for residual in residuals]
    return np.sum(row_penalties) / (2 * len(df_features))

In [None]:
# Implement Gradient Descent 
# **NOTE: ONLY for a squared penalty function**
def gradientDescent(df_features, 
                    df_output, 
                    init_params_list, 
                    num_iterations=100, 
                    learning_rate=0.0001, 
                    penalty_function=squaredPenalty):
    """
    Gradient descent for a linear model.
    NOTE: the update formula implemented here is only valid for squaredPenalty.

    df_features is a dataframe with the features (no column of ones added).
    df_output is a dataframe of the output column, i.e. its values form an
      m x 1 column (the update step does not work with a 1-D Series).
    init_params_list is the list of initial W values, e.g., [-1.0, 3.53].
    num_iterations is the number of steps taken down the penalty surface.
    learning_rate is the multiplier that determines step size.
    penalty_function is used only for the reported penalties, not for the
      parameter updates.

    Returns (w, final_penalty, running_w, running_penalty):
      w               - column matrix of the parameters after num_iterations
      final_penalty   - penalty_function applied to the summed
                        (predicted - actual) error of the final parameters
      running_w       - list of tuples of parameter values: the initial
                        parameters first, then one tuple per iteration
      running_penalty - array with, per iteration, penalty_function applied
                        to the summed error scaled by learning_rate / m
    """
    # Get the inputs into matrix form so we can use matrix multiplication
    feature_matrix = df_addOnes(df_features)
    m = len(feature_matrix) # number of rows of data
    output_matrix = np.matrix(df_output.values)
    # Force a float matrix: integer initial values (e.g. [0, 0]) would
    # otherwise create an integer matrix and truncate every in-place update.
    parameter_matrix = np.matrix(init_params_list, dtype=float).T

    # w starts at the initial parameters and is updated in place below
    w = parameter_matrix

    # Running results; running_w starts with a copy of the initial parameters
    running_w = np.array(parameter_matrix)
    running_error = np.zeros(num_iterations)
    running_normError = np.zeros(num_iterations)
    running_penalty = np.zeros(num_iterations)

    # Iterate num_iterations times, adjusting the parameters each time
    for i in range(num_iterations):
        # Parameters are updated one at a time, so w[j] is computed with the
        # already-updated values of w[0..j-1] from this same iteration.
        for j in range(len(parameter_matrix)):
            error = ((feature_matrix * w) - output_matrix).T * np.matrix(feature_matrix[:,j]).T
            normError = (learning_rate/m) * error
            w[j] = w[j] - normError

        # Record w, error, normError and penalty after each iteration
        running_w = np.append(running_w, w, axis=0)
        # Both operands are m x 1 columns. Transposing output_matrix here
        # (as was done before) broadcast the difference to an m x m matrix
        # and inflated the summed error by a factor of m.
        running_error[i] = np.sum((feature_matrix * w) - output_matrix)
        running_normError[i] = (learning_rate/m) * running_error[i]
        running_penalty[i] = penalty_function(running_normError[i])

    # Regroup the flat stack of parameter rows into one tuple per snapshot
    # From https://jasonstitt.com/python-group-iterator-list-function
    running_w = list(zip(*[iter(running_w)] * len(parameter_matrix)))

    # Summed error after num_iterations (same m x 1 shapes as above)
    final_error = np.sum((feature_matrix * w) - output_matrix)

    # Penalty after num_iterations
    final_penalty = penalty_function(final_error)

    return w, final_penalty, running_w, running_penalty

## Text cleaning functions

Jason Brownlee has a good [tutorial](https://machinelearningmastery.com/clean-text-machine-learning-python/) on cleaning text using plain Python or using NLTK.

In [13]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Translation table for removing punctuations
table = str.maketrans('', '', string.punctuation)

STOP_WORDS = set(stopwords.words('english'))
# Identified after an initial analysis of the corpus
#REMOVE_WORDS = ['bc', 'at', 'asking', 'we', 'hoping', 'meeting', 'understand', 'inquiry', 
#                'could', 'need', 'request', 'looking', 'v', 'u', 'etc', 'client', 'would', 
#                'you', 'like', 'speak', 'schedule', 'call', 'analyst', 'discus', 'me', 'hi', 
#                'hello', 'follow', 'up', 'set', 'question', 'thought', 'please', 'thank',
#               ]

REMOVE_WORDS = ['i', 'im', 'id', 'would', 'hi']

wordnet_lemmatizer = WordNetLemmatizer()

def prep_doc(document):
    '''
    Turn a raw text document into a list of cleaned tokens.

    Following https://machinelearningmastery.com/clean-text-machine-learning-python/

    Each whitespace-separated token goes through these steps, in order:
    1. Strip punctuation (translation table `table`).
    2. Lower-case.
    3. Drop it if it is in STOP_WORDS.
    4. Lemmatize with the WordNet lemmatizer.
    5. Remove non-ASCII characters (e.g., '\\x96').
    6. Drop it if it is now empty, consists only of digits, or is in REMOVE_WORDS.

    Returns the surviving tokens in their original order.
    '''
    prepared_tokens = []
    for raw_token in document.split():
        token = raw_token.translate(table).lower()
        if token in STOP_WORDS:
            continue

        token = wordnet_lemmatizer.lemmatize(token)
        token = re.sub(r'[^\x00-\x7F]', r'', token)

        # An empty string is falsy, so `not token` catches tokens that were
        # reduced to nothing by the steps above
        if not token or token.isdigit() or token in REMOVE_WORDS:
            continue

        prepared_tokens.append(token)

    return prepared_tokens

In [None]:
#### TO DO ####
# Given a field name of a text field in a dataframe, create a text corpus.
## The text corpus cleans each piece of text using prep_doc above
## It then combines all of the cleaned_docs into one huge list of tokenized lists where
## each tokenized list is a sentence from a document.

## Simple text functions

In [None]:
NUM_RESULTS = 50

# Frequency counts of words in a corpus, most frequent first: either the top
## of the ranking down to a given point, or from a given point to the bottom
from collections import defaultdict
import operator

def word_freq(corpus_text, num_results=NUM_RESULTS):
    """
    Count how often each word occurs in corpus_text.

    corpus_text MUST be a list of tokenized lists of words,
    e.g., [[word1, ..., word7], ..., [word23, ..., word99]]

    Returns a list of (word, count) tuples sorted by descending count:
      num_results > 0  -> the num_results most frequent words
      num_results < 0  -> the abs(num_results) least frequent words
      num_results == 0 -> the entire vocabulary
    """
    frequency = defaultdict(int)
    for sentence in corpus_text:
        for word in sentence:
            frequency[word] += 1

    ranked = sorted(frequency.items(), key=operator.itemgetter(1), reverse=True)

    if num_results > 0:
        return ranked[0:num_results]
    if num_results < 0:
        return ranked[num_results:]
    return ranked

In [None]:
# Return words that occur between 2 frequency values
def word_freq_between(text_corpus, greater_than=200, less_than=500):
    """
    Return the (word, count) pairs whose count lies in the half-open range
    [greater_than, less_than), ordered as word_freq orders the full
    vocabulary (most frequent first).
    """
    return [
        (term, count)
        # 0 asks word_freq for the entire vocabulary
        for term, count in word_freq(text_corpus, 0)
        if count >= greater_than and count < less_than
    ]

## Simple search functions that don't require an NLP model

In [None]:
# Very simple search function
## Given a document and a word in that document, find all occurrences of the word
## Then return the word along with its n neighbors on either side of it

# If the corpus is small, then it can be turned into a single document -- i.e.,
## a single list of words.

def get_word_context(document, word, window=6):
    """
    Find every occurrence of word in document (a list of tokens) and return
    each one together with up to `window` neighbors on either side.

    Returns a list with one slice of document per occurrence; the list is
    empty when word does not occur. Contexts are shorter near the start or
    end of the document.
    """
    contexts = []
    for index, token in enumerate(document):
        if token != word:
            continue
        # Clamp the start at 0: a negative start would be interpreted as an
        # offset from the END of the document and give an empty/wrong slice
        # for matches closer than `window` to the beginning.
        start = max(0, index - window)
        # +1 because the slice end is exclusive; without it only window - 1
        # neighbors were returned on the right-hand side.
        contexts.append(document[start:index + window + 1])

    return contexts

In [None]:
# Very simple search function to get word context across the entire corpus
# This is for a corpus that's built out of individual files
def get_word_context_corpus(corpus, file_list, word, window=6):
    """
    Search every document of corpus for word and collect its contexts.

    corpus is an iterable of tokenized documents and file_list holds the
    name belonging to the document at the same position. Only documents
    containing word are reported.

    Returns a dataframe with the columns 'Call Note' (the document name)
    and 'Matching Phrases' (list of context strings for that document).
    """
    matching_files = []
    matching_phrases = []

    for doc_index, document in enumerate(corpus):
        doc_name = file_list[doc_index]
        contexts = get_word_context(document, word, window)
        if not contexts:
            continue
        matching_files.append(doc_name)
        matching_phrases.append([" ".join(tokens) for tokens in contexts])

    # Put it into a dataframe for display
    return pd.DataFrame({'Call Note': matching_files, 'Matching Phrases': matching_phrases})

## Serialization (save and load) functions

In [None]:
# pickle a list 
import pickle
def pickle_list(file_path, list_to_pickle):
    """Serialize list_to_pickle with pickle and write it to file_path (overwriting it)."""
    with open(file_path, 'wb') as out_file:
        pickle.dump(list_to_pickle, out_file)

In [None]:
# Load a pickled list
import pickle
def load_list(file_path):
    """
    Read file_path and return the object that was pickled into it.

    NOTE: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    with open(file_path, 'rb') as in_file:
        return pickle.load(in_file)

In [None]:
# Serialize the pyLDAvis prepared file to disk
def save_LDAvis_content(LDAvis_content, file_path):
    """Pickle LDAvis_content and write it to file_path (overwriting it)."""
    with open(file_path, 'wb') as out_file:
        pickle.dump(LDAvis_content, out_file)

In [None]:
# load the pre-prepared ldavis_content from disk
def load_LDAvis_content(file_path):
    """
    Read file_path and return the object that was pickled into it.

    NOTE: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    with open(file_path, 'rb') as in_file:
        return pickle.load(in_file)

## Topic modeling functions

In [None]:
from gensim.corpora import Dictionary, MmCorpus
from gensim import corpora, models, similarities
from gensim.models.phrases import Phrases, Phraser

from gensim.models.ldamulticore import LdaMulticore
from gensim.models import Word2Vec 
from gensim.models.word2vec import LineSentence # use when reading sentences from large files
from gensim.models import TfidfModel

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
# pyLDAvis for visualizing topic models
import pyLDAvis
import pyLDAvis.gensim
import warnings

In [None]:
# For the NMF topic model
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize

# scikit learn model persistence -- saving and loading
from sklearn.externals import joblib
# joblib.dump(scikit_model, 'filename.pkl') # save the model
# scikit_model = joblib.load('filename.pkl') # load the model

In [None]:
TOP_N = 20

def explore_topic(nlp_model, topic_number, topn=TOP_N):
    """
    Print a formatted list of the top terms of one topic.

    nlp_model is any model with a show_topic(topic_number, topn=...) method
      that yields (term, frequency) pairs.
    topic_number is the topic to display.
    topn is the number of terms to print (defaults to TOP_N).
    """
        
    print(u'{:20} {}'.format(u'Term', u'Frequency'))

    # Pass the caller's topn through; it used to be ignored in favour of TOP_N
    for term, frequency in nlp_model.show_topic(topic_number, topn=topn):
        print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))

In [None]:
# From https://medium.com/ml2vec/topic-modeling-is-an-unsupervised-learning-approach-to-clustering-documents-to-discover-topics-fdfbf30e27df

def get_lda_topics(model, num_topics, top_n=TOP_N):
    '''
    Show the words that make up the topics in an LDA topic model.

    Returns a dataframe with one column per topic, labelled
    'Topic # 01', 'Topic # 02', ..., holding the top_n words of topics
    0 .. num_topics - 1 as returned by model.show_topic.
    '''
    topic_columns = {}
    for topic_idx in range(num_topics):
        top_entries = model.show_topic(topic_idx, topn=top_n)
        # Column labels are 1-based and zero-padded to two digits
        column_label = 'Topic # ' + '{:02d}'.format(topic_idx + 1)
        # Each entry starts with the word; keep only that
        topic_columns[column_label] = [entry[0] for entry in top_entries]

    return pd.DataFrame(topic_columns)

In [None]:
# From https://medium.com/ml2vec/topic-modeling-is-an-unsupervised-learning-approach-to-clustering-documents-to-discover-topics-fdfbf30e27df

def get_nmf_topics(model, num_topics, n_top_words=TOP_N, vectorizer=None):
    '''
    Show the words that make up the topics in an NMF topic model.

    model is a fitted model exposing components_ (one row of term weights
      per topic).
    num_topics is the number of topics to report.
    n_top_words is the number of highest-weighted words listed per topic.
    vectorizer is the FITTED vectorizer that produced the document-term
      matrix the model was trained on. It is needed to map term ids back to
      words. A brand-new, unfitted vectorizer (which is what this function
      used to create internally) has no vocabulary and cannot do that.

    Returns a dataframe with one column per topic, labelled
    'Topic # 01', 'Topic # 02', ...

    Raises ValueError when no vectorizer is supplied.
    '''
    if vectorizer is None:
        raise ValueError("get_nmf_topics needs the fitted vectorizer used to train the model: "
                         "pass it as vectorizer=...")

    # The word ids obtained need to be reverse-mapped to the words.
    # Newer scikit-learn only offers get_feature_names_out; older releases
    # only offer get_feature_names.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feat_names = vectorizer.get_feature_names_out()
    else:
        feat_names = vectorizer.get_feature_names()
    
    word_dict = {}
    for i in range(num_topics):
        
        # For each topic, take the ids of the n_top_words largest weights
        # (argsort is ascending, so walk it backwards from the end)
        words_ids = model.components_[i].argsort()[:-n_top_words - 1:-1]
        words = [feat_names[key] for key in words_ids]
        word_dict['Topic # ' + '{:02d}'.format(i+1)] = words
    
    return pd.DataFrame(word_dict)

## Search functions on already built NLP models

In [None]:
# Return the results of a query
def get_query_results(query_string, 
                      search_dict, 
                      df_corpus, 
                      df_cols_to_display, 
                      num_results=25
                     ):
    
    '''
    Run a free-text query against a prepared search setup and return the
    best matching rows of the corpus.

    query_string is a string of any length
    search_dict is a dict with the keys 'phraser', 'dictionary', 'model'
      and 'index' that describe the search:
        'phraser'    - phrase model applied to the query tokens, or '' to skip
        'dictionary' - used to turn the tokens into a bag of words
                       (not used for Doc2Vec)
        'model'      - the model that maps the query into its vector space
        'index'      - similarity index over the corpus, or the string
                       'doc2vec' to search with the Doc2Vec model itself
    df_corpus is the complete dataframe of the corpus being searched  
    df_cols_to_display are the cols of df_corpus to display in the search results dataframe
    num_results is the maximum number of results returned

    Prints the elapsed time and the query, and returns a dataframe with the
    df_cols_to_display columns of the matching rows, best match first.
    '''
    t0 = time.time()
    # Process the query string into a list of tokens
    clean_query = prep_doc(query_string)
    
    # Convert the list of tokens into phrases if necessary
    if search_dict['phraser'] != '':
        phrased_query = search_dict['phraser'][clean_query]
    else:
        phrased_query = clean_query
    
    # For everything EXCEPT Doc2Vec proceed as follows
    if search_dict['index'] != 'doc2vec':
        # Use the dictionary to transform the phrased_query into a bag of words vector
        bow_query = search_dict['dictionary'].doc2bow(phrased_query)
    
        # Transform the bag of words vector into a vector in the topic model's space
        model_query = search_dict['model'][bow_query]
    
        # Calculate the similarity of the query to each document in the corpus
        sims = search_dict['index'][model_query]
    
        # Sort the similarity scores in descending order
        sims_sorted = sorted(enumerate(sims), key=lambda item: -item[1])[0:num_results]
    else:
        # Create the Doc2Vec sims on the fly.
        # Search the document vectors of the SAME model that inferred the
        # query vector; this used to reference a notebook-level global
        # (d2v_trigram_100) that is not defined in this file.
        d2v_model = search_dict['model']
        query_vector = d2v_model.infer_vector(phrased_query)
        sims_sorted = d2v_model.docvecs.most_similar(positive=[query_vector], topn=num_results)
    
    # Build a dataframe for displaying the search results.
    # item[0] is used as a positional row index into df_corpus
    # (for Doc2Vec this assumes the document tags are row positions -- TODO confirm).
    dataFrame_content = []
    for item in sims_sorted:
        dataFrame_content.append(df_corpus.iloc[item[0]][df_cols_to_display].values)
        
    df_results = pd.DataFrame.from_records(dataFrame_content, columns=df_cols_to_display)
    
    t1 = time.time()
    print("Search results obtained in {:.3} seconds.".format(t1-t0))
    print("The query is: {}".format(query_string))
    print("Here are the top {} results:".format(num_results))
    
    return df_results

In [None]:
# Jaccard similarity is a metric for measuring how well a model matches an observation to an 
# existing observation.
# Slightly modified from 
#    http://dataconomy.com/2015/04/implementing-the-five-most-popular-similarity-measures-in-python/
def jaccard_similarity(x,y):
    """
    Jaccard similarity of two tokenized sentences x and y: the number of
    distinct tokens they share divided by the number of distinct tokens in
    either. Returns 0. when both are empty.
    """
    tokens_x, tokens_y = set(x), set(y)
    shared_count = len(tokens_x & tokens_y)
    combined_count = len(tokens_x | tokens_y)

    # Two empty inputs have an empty union; define their similarity as 0
    if combined_count == 0:
        return 0.

    return shared_count / float(combined_count)

In [None]:
# Measure the jaccard similarity between the search results returned for any query
## This is a measure of the variation in the search results that are output for any single query string
def intra_search_overlap(df_search_output, plot_title=''): 
    '''
    Measure the jaccard similarity between the search results returned for any query.
    This is a measure of the variation in the search results that are output for any single query string.
    Display the intra-search result similarity as a heatmap.
    
    df_search_output is the result of a search which returns ONLY the 'CLIENT_QUESTION_PROCESSED' column;
    the first value of each row is taken as that result's token list.
    '''
    num_results = len(df_search_output)

    # Token list of every search result (first value of each row)
    result_tokens = [df_search_output.iloc[row].values[0] for row in range(num_results)]

    # Pairwise similarity matrix: entry [i][j] compares result i with result j
    j_scores_intra = [
        [jaccard_similarity(tokens_i, tokens_j) for tokens_j in result_tokens]
        for tokens_i in result_tokens
    ]

    # Put the jaccard scores in a dataframe for display using Seaborn
    df_display = pd.DataFrame(j_scores_intra, columns=list(range(0, num_results)))
    
    # Create the heatmap
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.set_title(plot_title)
    sns.heatmap(df_display, cmap="YlGnBu")

In [None]:
def inter_search_overlap(df_search_ouputs, 
                         column_name='Short Description Tokens', 
                         plot_title='Inter-Search Overlap of Tech Disciplies'):
    '''
    Measure the jaccard similarity between the results of different searches
    and display it as a heatmap.

    df_search_ouputs is a list of search result dataframes, one per query.
      (The misspelled parameter name is kept so existing keyword calls keep
      working.)
    column_name is the column of each dataframe that holds the token lists.
    plot_title is the title of the heatmap.

    All tokens of one search are pooled into a single list, and every pair
    of searches is then compared on those pooled tokens.
    '''
    # The body used to read `df_search_outputs`, a name that does not exist
    # because the parameter is spelled differently, so every call raised
    # NameError. Bind the correctly spelled local to the parameter.
    df_search_outputs = df_search_ouputs

    # Join the tokens returned by each search result into a single big list of tokens for that search query
    search_output_tokens = []
    for df_out in df_search_outputs:
        flat_tokens = [token for tokens in df_out[column_name].values for token in tokens]
        search_output_tokens.append(flat_tokens)
        
    # For each pair of lists in search_output_tokens, get the jaccard similarity
    j_scores_inter = []
    for tokens_i in search_output_tokens:
        j_score_row = [jaccard_similarity(tokens_i, tokens_j) for tokens_j in search_output_tokens]
        j_scores_inter.append(j_score_row)

    # Put the jaccard scores in a dataframe for display using Seaborn
    df_display = pd.DataFrame(j_scores_inter, columns=list(range(0,len(search_output_tokens))))
    
    # Create the heatmap
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.set_title(plot_title)
    sns.heatmap(df_display, cmap="BuPu") 

## Word2Vec functions

In [1]:
# Display the entire table of vector embedding values
def display_vectors(w2v_KeyedVecs):
    """
    Build a dataframe of the embedding vectors of a Word2Vec vocabulary.

    NOTE: the full model is not used - only the model's KeyedVectors, i.e.
    an object with a `vocab` mapping (term -> entry with .index and .count)
    and a `vectors` array.

    Returns a dataframe with one row per term, labelled by the term and
    ordered from the highest to the lowest term count.
    """
    # Collect (term, row index into the vectors, term count) per vocabulary entry
    vocab_rows = []
    for term, voc in w2v_KeyedVecs.vocab.items():
        vocab_rows.append((term, voc.index, voc.count))

    # Most common terms first
    vocab_rows.sort(key=lambda row: -row[2])

    # Unzip into parallel sequences of terms, indices and counts
    ordered_terms, term_indices, term_counts = zip(*vocab_rows)

    # Pick the vector rows in that same order and label them with the terms
    word_vectors = w2v_KeyedVecs.vectors[term_indices, :]
    return pd.DataFrame(word_vectors, index=ordered_terms)

In [None]:
NUM_RESULTS = 20

# Based on Patrick Harrison and Radim Řehůřek

def pos_related_terms(w2v_KeyedVec, token, topn=NUM_RESULTS):
    
    """
    look up the topn most similar terms to token
    and print them as a formatted list

    Prints an apology instead when the lookup raises KeyError
    (e.g. token is not in the vocabulary).
    """
    try:
        # Pass the caller's topn through; it used to be ignored in favour of NUM_RESULTS
        for word, similarity in w2v_KeyedVec.most_similar(positive=[token], topn=topn):
            print(u'{:20} {}'.format(word, round(similarity, 3)))
    except KeyError:
        print("Sorry, try a different term")
        
def neg_related_terms(w2v_KeyedVec, token, topn=NUM_RESULTS):
    """
    look up the topn terms returned by most_similar when token is used
    as a NEGATIVE example, and print them as a formatted list

    Prints an apology instead when the lookup raises KeyError
    (e.g. token is not in the vocabulary).
    """
    try:
        # Pass the caller's topn through; it used to be ignored in favour of NUM_RESULTS
        for word, similarity in w2v_KeyedVec.most_similar(negative=[token], topn=topn):
            print(u'{:20} {}'.format(word, round(similarity, 3)))
    except KeyError:
        print("Sorry, try a different term")
        

def word_algebra(w2v_KeyedVec, add_string, subtract_string, topn=NUM_RESULTS):
    """
    combine the vectors associated with the words provided
    in add_string and subtract_string, look up the topn most similar
    terms to the combined vector, and print the result(s)
    Use add_string=None or '' or subtract=None or '' to leave the fields empty

    Both strings are tokenized and cleaned with prep_doc; None is passed
    through unchanged. Prints an apology instead when the lookup raises
    KeyError (a term is not in the vocabulary).
    """
    # Prep the strings (None means "no terms" and is handed on as-is)
    add = prep_doc(add_string) if add_string is not None else add_string
    subtract = prep_doc(subtract_string) if subtract_string is not None else subtract_string
    
    try:
        # Pass the caller's topn through; it used to be ignored in favour of NUM_RESULTS
        answers = w2v_KeyedVec.most_similar(positive=add, negative=subtract, topn=topn)
        for term, similarity in answers:
            print(term)
    except KeyError:
        print("Sorry, one or more terms is not in the vocabulary - please try different terms.")
    
        
def odd_one_out(w2v_KeyedVec, token_string):
    """
    Clean token_string with prep_doc and return the term that
    w2v_KeyedVec.doesnt_match picks as not belonging with the others.

    Returns an apology string instead when doesnt_match raises ValueError.
    """
    candidate_terms = prep_doc(token_string)

    try:
        return w2v_KeyedVec.doesnt_match(candidate_terms)
    except ValueError:
        return "Sorry, one or more terms is not in the vocabulary - please try different terms."

## t-SNE Visualizations

In [None]:
# Import the Scikit Learn t-SNE model
from sklearn.manifold import TSNE

In [None]:
def prep_tsne_input(keyed_vectors, num_words=1000):
    
    '''
    Take a set of KeyedVectors produced by a Word2Vec model, run the
    num_words most frequent words through Scikit Learn's TSNE model, and
    return the resulting coordinates.

    num_words cuts down the complexity by selecting a subset of words from
    the vocabulary (the first num_words rows of display_vectors, which are
    ordered by descending term count).

    Prints how long the two steps took and returns a dataframe indexed by
    word with the columns 'x_coord' and 'y_coord', which can be used to
    visualize t-SNE using Bokeh.
    '''
    # Step 1: the embedding table, most frequent words first
    table_started = time.time()
    df_vecs = display_vectors(keyed_vectors)
    table_finished = time.time()
    print("t-SNE input dataframe created in {:.2f} secs.".format(table_finished - table_started))

    # Step 2: fit t-SNE on the leading num_words rows only
    tsne = TSNE()
    tsne_input = df_vecs.head(num_words)
    fit_started = time.time()
    tsne_vectors = tsne.fit_transform(tsne_input.values)
    fit_finished = time.time()
    print("t-SNE vectors created in {:.2f} secs.".format(fit_finished - fit_started))

    # Label the coordinates with the words they belong to
    return pd.DataFrame(tsne_vectors,
                        index=pd.Index(tsne_input.index),
                        columns=[u'x_coord', u'y_coord'])

In [None]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, value

output_notebook()

def plot_tsne(df_tsne_vectors, dot_color='orange', title_add=''):
    '''
    Visualize the t-SNE vectors using Bokeh.

    df_tsne_vectors is a dataframe indexed by word with the columns
      'x_coord' and 'y_coord'. It is NOT modified.
    dot_color is the color of the circles.
    title_add is appended to the plot title 't-SNE Word Embeddings'.

    Shows the plot; returns nothing.
    '''
    
    # Map the vocabulary to the t-SNE vectors.
    # assign() works on a copy, so the caller's dataframe does not silently
    # gain a 'word' column (it used to be added in place).
    plot_frame = df_tsne_vectors.assign(word=df_tsne_vectors.index)
    
    # add the frame as a ColumnDataSource for Bokeh
    plot_data = ColumnDataSource(plot_frame)
    
    # create the plot and configure the
    ## title, dimensions, and tools
    tsne_plot = figure(title=u't-SNE Word Embeddings' + title_add,
                       plot_width = 800,
                       plot_height = 800,
                       tools= (u'pan, wheel_zoom, box_zoom,'
                               u'box_select, reset'),
                       active_scroll=u'wheel_zoom')

    # add a hover tool to display words on roll-over
    tsne_plot.add_tools( HoverTool(tooltips = u'@word') )

    # draw the words as circles on the plot
    tsne_plot.circle(u'x_coord', u'y_coord', source=plot_data,
                     color=dot_color, line_alpha=0.2, fill_alpha=0.1,
                     size=10, hover_line_color=u'black')

    # configure visual elements of the plot
    tsne_plot.title.text_font_size = value(u'16pt')
    tsne_plot.xaxis.visible = False
    tsne_plot.yaxis.visible = False
    tsne_plot.grid.grid_line_color = None
    tsne_plot.outline_line_color = None

    # Display the plot
    show(tsne_plot);