In [1]:
import gensim
import os
import string
import itertools
from nltk.corpus import brown, movie_reviews, treebank, webtext, gutenberg
import sense2vec
from operator import itemgetter
from joke_model import JokeModel
from language_models import Sense2VecModel, Word2VecModel
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt

In [2]:
# define the results array parts with "constants"
class RESULT:
    PlainText, TaggedText, TaggedWords, SimGrid, EntGrid, MinEnt, MaxEnt = range(7)

In [3]:
# DEBUGGING
_VERBOSE = True

In [4]:
def load_stopwords(fname='stopwords.txt'):
    # stopwords =  ['a','to','of', 'so', 'on', 'the', 'into']
    # stopwords += ['i', 'me', 'my', 'you', 'us', 'we', 'them', 'she', 'her', 'he', 'him']
    # stopwords += ['and', 'or', 'but']
    # stopwords += ['had', 'have', "'ve"]
    # stopwords += ['is', 'are', 'am', "'m", 'be']
    # stopwords += ["'s", "'d"]
    stopWords = set(stopwords.words('english')) | {'would'}
    return stopWords

In [5]:
def load_stoptags(fname='stoppos.txt'):
    allpos = ['ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 
            'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X', 'NORP', 
            'FACILITY', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LANGUAGE']
    keeppos = ['ADJ', 'ADV', 'INTJ', 'NOUN',  
            'PROPN', 'SCONJ', 'SYM', 'VERB', 'X', 'NORP', 
            'FACILITY', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LANGUAGE']
    stoppos = list(set(allpos) - set(keeppos))
    return stoppos


In [6]:
def get_similarities(this_model, this_joke):
    # probably want to make these global so it doesn't have to do this for EVERY joke
    # or put them in the model?
    stop_words = load_stopwords()
    stop_tags = load_stoptags()

    # remove stopwords
    joke_words = [w for w in this_joke.split() if w.split('|')[0].lower() not in stop_words]
    # remove unwanted tags
    joke_words = [w for w in joke_words if w.split('|')[1] not in stop_tags]
    # remove OOV words
    joke_words = [w for w in joke_words if this_model.in_vocab(w)]

    sim_grid = pd.DataFrame(index=joke_words, columns=joke_words)
    #sim_grid = sim_grid.fillna(-1.0)

    pairs = list(itertools.combinations(joke_words,2))
    for (left_word,right_word) in pairs:
        try:
            this_sim = this_model.similarity(left_word, right_word)
            sim_grid[left_word][right_word] = this_sim
            sim_grid[right_word][left_word] = this_sim
        except:
            # we could use this to build a stopword list
            # or we could use ContVec? to reconstruct a new vector for the OOV word?
            print("one of these words is not in vocab: {0}, {1}".format(left_word,right_word))
    return [sim_grid, this_joke, joke_words]


In [7]:
def rank_similarities(this_joke, ascending=True):
    sim_list = sorted([(this_joke[RESULT.SimGrid][x][y],'{} {}'.format(x,y)) 
                   for xi,x in enumerate(this_joke[RESULT.TaggedWords])
                   for yi,y in enumerate(this_joke[RESULT.TaggedWords]) if xi < yi])
    return sim_list

def meandiff_similarities(this_joke):
    sim_grid = this_joke[RESULT.SimGrid].replace(-1,np.nan).mean()
    sim_grid = np.abs(this_joke[RESULT.SimGrid].replace(-1,np.nan) - sim_grid.mean())
    sim_list = sorted([(sim_grid[x][y],'{} {}'.format(x,y)) 
                       for xi,x in enumerate(this_joke[RESULT.TaggedWords])
                       for yi,y in enumerate(this_joke[RESULT.TaggedWords]) if xi < yi])
    return sim_list

# print(results[10][0],meandiff_similarities(results[10]))
# for r in results:
#      print('{}\n\t{}\n\t{}\n\t{}\n\t{}'.format(r[0],
#                                                rank_similarities(r)[0],
#                                                rank_similarities(r)[-1],
#                                                meandiff_similarities(r)[0],
#                                                meandiff_similarities(r)[-1]))

In [8]:
def plot_similarities(this_joke,save_plot=False):
    fig, ax1 = plt.subplots(figsize=(7,4))
    
    similarities = this_joke[3].replace(-1,1) # show unit similarity
    for xi, x in enumerate(this_joke[2]):
        for y in range(len(this_joke[2])):
            similarities[x][y] = np.nan if xi > y else similarities[x][y]
    heatmap = ax1.imshow(similarities, cmap='hot', interpolation='nearest')
    ax1.set_xticks(ticks=range(len(this_joke[2])))
    ax1.set_xticklabels(this_joke[2], rotation=60,ha='right')
    ax1.set_yticks(ticks=range(len(this_joke[2])))
    ax1.set_yticklabels(this_joke[2])

    plt.colorbar(heatmap)
    plt.title("Pairwise word similarity (cosine similarity of word vectors)")
    if save_plot:
        print('not saved.')
        pass
        # but we need to maybe save the plots in a folder plots/joke_id
    
    plt.show()

In [9]:
def get_entropies(pos_tagged_list):
    entropies = [model.entropy(string) for string in pos_tagged_list]
#     print(entropies)
#     print(np.cumsum(entropies))
    return list(entropies)


In [10]:
def plot_entropy(this_joke,save_plot=False,show_sims=True,unit_height=False):
    # twin scales code suggested by https://matplotlib.org/devdocs/gallery/api/two_scales.html
    # this_joke[4] should contain the word-by-word entropy list
    # this_joke[5] should contain the minimum entropy "parse"
    # this_joke[6] should contain the maximum entropy "parse"
    fig, ax1 = plt.subplots(figsize=(7,4))
    
    individual_entropies = this_joke[RESULT.EntGrid]
    cumulative_entropy = list(np.cumsum(this_joke[RESULT.EntGrid]))
    min_pos_entropy = get_entropies(this_joke[5])
    max_pos_entropy = get_entropies(this_joke[6])
        
    if unit_height:
        max_ent = cumulative_entropy[-1]
        individual_entropies /= max_ent
        cumulative_entropy /= max_ent
        min_pos_entropy /= max_ent
        max_pos_entropy /= max_ent
        ax1.set_ylim([-0.1,1.1])
        
    ax1.plot(cumulative_entropy, label='cumulative')
    ax1.plot(individual_entropies, label='jokePOS')
    ax1.plot(min_pos_entropy, label='minPOS')
    ax1.plot(max_pos_entropy, label='maxPOS')
    ax1.set_ylabel('Entropy (nat)')

    if show_sims:
        # plot the similarity ranges on ax2
        maxes = this_joke[RESULT.SimGrid].replace(-1,np.nan).max(axis=1).tolist()
        mins = this_joke[RESULT.SimGrid].replace(-1,np.nan).min(axis=1).tolist()
        averages = this_joke[RESULT.SimGrid].mean().tolist()
        ax2 = ax1.twinx()
        if unit_height:
            ax2.set_ylim([-0.1,1.1])
    #    ax2.set_position([box.x0, box.y0, box.width * 0.8, box.height])
        ax2.plot(maxes,'g^',linestyle='None')
        ax2.plot(mins,'gv',linestyle='None')
        ax2.plot(averages, 'kx', linestyle='dotted')
        ax2.set_ylabel('Min/Max Similarity')


    
    box = ax1.get_position()

    # Shrink current axis by 20%
#     ax1.set_position([box.x0, box.y0, box.width * 0.8, box.height])

#    ax1.legend()
    # Put a legend to the right of the current axis
#     ax1.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    ax1.legend(loc='upper center', bbox_to_anchor=(0.5, 1.12), ncol=4, fancybox=False, shadow=False)

    ax1.set_xticks(ticks=range(len(this_joke[2])))
    ax1.set_xticklabels(this_joke[2], rotation=23, ha='right')


    if save_plot:
        print('not saved.')
        pass
        # but we need to maybe save the plots in a folder plots/joke_id
    
    plt.show()

In [None]:
def main(model_choice='s2v',joke_choice='jokes.txt'):
    print("Load the model")
    if model_choice == 'w2v':
        print(">>word2vec - extended corpora")
        model = Word2VecModel(model_choice)
    elif model_choice == 's2v':
        print(">>sense2vec - reddit hivemind corpus")
        model = Sense2VecModel(model_choice)
    else:
        raise NotImplementedError

    try:
        print("Load the jokes")
        jokes = JokeModel(joke_choice,named_entities=False)
    except:
        raise Exception('Could not load file "'+joke_choice+'"')


    results = [[j,None,None,[None]] for j in jokes.raw_jokes()]
    joke_id = 0
    for joke in jokes.tagged_jokes():
        print(results[joke_id][0])
        mns, mnw, mxs, mxw, grid, pos_joke, pos_joke_words = get_similarities(model, joke)
        results[joke_id][1] = pos_joke
        results[joke_id][2] = pos_joke_words
        results[joke_id][3] = grid
        joke_id += 1
        
    with open(model_choice+'_'+joke_choice+'.pkl','wb') as pkl_file:
        pickle.dump(results, pkl_file)



In [11]:
def load_model(model, model_size=None, recalculate=False, write=True):
    '''
    model_choice: string representing a language model: Sense2VecModel, Word2VecModel
    model_size: model-specific size string
    from_file: string for loading instead of recalculating
    '''
    # These are the current known language model classes
    model_functions = {'Sense2Vec' : Sense2VecModel,
                       'Word2Vec'  : Word2VecModel} 

    # validate model_choice
    if model not in model_functions:
        valid_values = ', '.join([k for k,v in model_functions.items()])
        raise ValueError("model value must be one of the following: '{}'".format(valid_values))
    
    
    file_name = model + ('_' + model_size if model_size is not None else '') + '.pkl'
    if not recalculate:
        try:
            if _VERBOSE: print('Loading {} model from {} ...'.format(model, file_name))
            with open(file_name,'rb') as pkl_file:
                language_model = pickle.load(pkl_file)
            if _VERBOSE: print('Loaded.')
        except:
            recalculate = True
            
    if recalculate:
        if _VERBOSE: print('Building {} model ...'.format(model))
        try:
            language_model = model_functions[model](model, model_size)
        except:
            raise NotImplementedError
        if _VERBOSE: print("Done.")
        if write:
            with open(file_name,'wb') as pkl_file:
                pickle.dump(language_model, pkl_file)
            if _VERBOSE: print('results saved to {}'.format(file_name))

    return language_model

In [12]:
def load_text(text_choice='jokes.txt'):
    try:
        if _VERBOSE: print("Loading text from {}".format(text_choice))
        jokes = JokeModel(text_choice,named_entities=False)
        if _VERBOSE: print("Done.")
    except:
        raise Exception('Could not load file "'+text_choice+'"')
    return jokes

In [37]:
def get_text_metrics(lang_model, joke_model, recalculate=False, write=True):
    '''
    lang_model: Language Model
              : selected language model to use
    joke_model: Joke Model 
              : model_choice
    returns   : results list

    '''
    file_name = lang_model.model_type+'_'+joke_model.joke_file+'.pkl'
    if not recalculate:
        try:
            with open(file_name,'rb') as pkl_file:
                results = pickle.load(pkl_file)
            if _VERBOSE: print('previous results loaded from {}'.format(file_name))
        except:
            recalculate=True
    
    if recalculate:
        results = [[j,None,None,[None],None,None,None] for j in joke_model.raw_jokes()]
        for joke_id, joke in enumerate(joke_model.tagged_jokes()):
            if _VERBOSE: print(results[joke_id][RESULT.PlainText])
            grid, pos_joke, pos_joke_words = get_similarities(lang_model, joke)
            results[joke_id][RESULT.TaggedText] = pos_joke
            results[joke_id][RESULT.TaggedWords] = pos_joke_words
            results[joke_id][RESULT.SimGrid] = grid

            # maybe we don't store this?
#            results[joke_id][RESULT.EntGrid] = get_entropies(pos_joke_words) # might only work for s2v
            results[joke_id][RESULT.EntGrid] = list([lang_model.entropy(string) for string in pos_joke_words])

            # should probably make this a function

            permuted_tagged_sentence = [' '.join(item) for item 
                                        in list(itertools.product(*[lang_model.pos_list(w) for w in pos_joke_words]))]
            pts_sorted = sorted([[lang_model.entropy(p),p] for p in permuted_tagged_sentence])
            results[joke_id][5] = list([lang_model.entropy(string) for string in pts_sorted[0][1].split()]) # minimum entropy version tagged_string list
            results[joke_id][6] = list([lang_model.entropy(string) for string in pts_sorted[-1][1].split()]) # maximum entropy version tagged_string list

        if write:
            with open(file_name,'wb') as pkl_file:
                pickle.dump(results, pkl_file)
            if _VERBOSE: print('results saved to {}'.format(file_name))

    return results

In [30]:
s2v_model = load_model(model='Sense2Vec', recalculate=True)

Building Sense2Vec model ...
counting tokens
Done.
results saved to Sense2Vec.pkl


In [15]:
jokes = load_text('jokes.txt')
non_jokes = load_text('non_jokes.txt')

Loading text from jokes.txt
Done.
Loading text from non_jokes.txt
Done.


In [35]:
#print(get_similarities(s2v_model,"i|PRON am|VERB on|ADP a|DET whiskey|NOUN diet|NOUN .|PUNCT i|PRON have|VERB lost|VERB three|QUANTITY pounds|QUANTITY already|ADV .|PUNCT"))
j = "i|PRON am|VERB on|ADP a|DET whiskey|NOUN diet|NOUN .|PUNCT i|PRON have|VERB lost|VERB three|QUANTITY ducks|NOUN already|ADV .|PUNCT"
stop_words = load_stopwords()
stop_tags = load_stoptags()
this_model = s2v_model
joke_words = [w for w in j.split() if w.split('|')[0].lower() not in stop_words]
print(joke_words)
joke_words = [w for w in joke_words if w.split('|')[1] not in stop_tags]
print(joke_words)
#joke_words = [w for w in joke_words if this_model.in_vocab(w)]
for w in joke_words:
    if s2v_model.in_vocab(w):
        print(w)


['whiskey|NOUN', 'diet|NOUN', '.|PUNCT', 'lost|VERB', 'three|QUANTITY', 'ducks|NOUN', 'already|ADV', '.|PUNCT']
['whiskey|NOUN', 'diet|NOUN', 'lost|VERB', 'three|QUANTITY', 'ducks|NOUN', 'already|ADV']
whiskey|NOUN
diet|NOUN
lost|VERB
three|QUANTITY
ducks|NOUN
already|ADV


In [38]:
s2v_joke_results = get_text_metrics(s2v_model,jokes,recalculate=True)
s2v_non_joke_results = get_text_metrics(s2v_model,non_jokes,recalculate=True)

i wasn't originally going to get a brain transplant, but then i changed my mind.
i was going to get a brain transplant and then i changed my mind.
i'd tell you a chemistry joke but i know i wouldn't get a reaction.
i'm glad i know sign language, it's pretty handy.
i have a few jokes about unemployed people but it doesn't matter none of them work.
i used to be a banker, but then i lost interest.
i hate insect puns, they really bug me.
insect puns bug me.
it's hard to explain puns to kleptomaniacs because they always take things literally.
i was so sad and crying when i lost my playstation 3 but unfortunately, there was nobody to console me!
i'm on a whiskey diet. i've lost three days already.
she broke into song when she couldn't find the key.
she had a boyfriend with a wooden leg, but broke it off.
corduroy pillows are making headlines.
if you want to catch a squirrel just climb a tree and act like a nut.
a magician was walking down the street and turned into a grocery store.
time flie

In [39]:
w2v_model = load_model(model='Word2Vec',model_size='full',recalculate=True)

Building Word2Vec model ...
Done.
results saved to Word2Vec_full.pkl


In [40]:
print("jokes")
w2v_joke_results = get_text_metrics(w2v_model,jokes,recalculate=True)
print("non-jokes")
w2v_non_joke_results = get_text_metrics(w2v_model,non_jokes,recalculate=True)

jokes
i wasn't originally going to get a brain transplant, but then i changed my mind.


AttributeError: 'Word2VecModel' object has no attribute 'pos_list'

In [19]:
print(w2v_model.in_vocab("pounds|NOUN"))
print(w2v_non_joke_results[11][RESULT.TaggedText])
print(w2v_non_joke_results[11][RESULT.TaggedWords])
print([w for w in w2v_non_joke_results[11][RESULT.TaggedText].split() if w.split('|')[0].lower() not in load_stopwords()])
print([w for w in w2v_non_joke_results[11][RESULT.TaggedText].split() if w.split('|')[1] not in load_stoptags()])
print([w for w in w2v_non_joke_results[11][RESULT.TaggedText].split() if w2v_model.in_vocab(w.split('|')[0])])

this_joke = w2v_non_joke_results[11][RESULT.TaggedText]
# remove stopwords
joke_words = [w for w in this_joke.split() if w.split('|')[0].lower() not in load_stopwords()]
# remove unwanted tags
joke_words = [w for w in joke_words if w.split('|')[1] not in load_stoptags()]
# remove OOV words
joke_words = [w for w in joke_words if w2v_model.in_vocab(w)]
print(joke_words)

for ji,j in enumerate(non_jokes.tagged_jokes()):
    if ji==11: print(j)

True
i|PRON am|VERB on|ADP a|DET whiskey|NOUN diet|NOUN .|PUNCT
i|PRON have|VERB lost|VERB three|QUANTITY pounds|QUANTITY already|ADV .|PUNCT
['whiskey|NOUN', 'diet|NOUN', 'lost|VERB', 'three|QUANTITY', 'already|ADV']
['whiskey|NOUN', 'diet|NOUN', '.|PUNCT', 'lost|VERB', 'three|QUANTITY', 'pounds|QUANTITY', 'already|ADV', '.|PUNCT']
['am|VERB', 'whiskey|NOUN', 'diet|NOUN', 'have|VERB', 'lost|VERB', 'three|QUANTITY', 'pounds|QUANTITY', 'already|ADV']
['i|PRON', 'am|VERB', 'on|ADP', 'a|DET', 'whiskey|NOUN', 'diet|NOUN', '.|PUNCT', 'i|PRON', 'have|VERB', 'lost|VERB', 'three|QUANTITY', 'pounds|QUANTITY', 'already|ADV', '.|PUNCT']
['whiskey|NOUN', 'diet|NOUN', 'lost|VERB', 'three|QUANTITY', 'pounds|QUANTITY', 'already|ADV']
i|PRON am|VERB on|ADP a|DET whiskey|NOUN diet|NOUN .|PUNCT
i|PRON have|VERB lost|VERB three|QUANTITY pounds|QUANTITY already|ADV .|PUNCT


In [None]:
a = 10
b = 11
plot_entropy(w2v_joke_results[a],save_plot=False,show_sims=True,unit_height=True)
plot_entropy(w2v_non_joke_results[b],save_plot=False,show_sims=True,unit_height=True)

# use for discovering and explaining
plot_similarities(w2v_joke_results[a],save_plot=False)
plot_similarities(w2v_non_joke_results[b],save_plot=False)

print(w2v_joke_results[a][RESULT.PlainText])
print(w2v_non_joke_results[b][RESULT.PlainText])

In [None]:
for r in s2v_joke_results:
    plot_entropy(r,show_sims=True,save_plot=False,unit_height=True)
    plot_similarities(r,save_plot=False)

In [None]:
joke_choice='non_jokes.txt'
try:
    print("Load the jokes")
    jokes = JokeModel(joke_choice,named_entities=False)
    print("Loaded.")
except:
    raise Exception('Could not load file "'+joke_choice+'"')

In [None]:
try:
    with open(model_choice+'_'+joke_choice+'.pkl','rb') as pkl_file:
        results = pickle.load(pkl_file)
    print("Loaded from file")
except:
    results = [[j,None,None,[None],None,None,None] for j in jokes.raw_jokes()]
    for joke_id, joke in enumerate(jokes.tagged_jokes()):
        print(results[joke_id][0])
        mns, mnw, mxs, mxw, grid, pos_joke, pos_joke_words = get_similarities(model, joke)
        results[joke_id][1] = pos_joke
        results[joke_id][2] = pos_joke_words
        results[joke_id][3] = grid

        # maybe we don't store this?
        results[joke_id][4] = get_entropies(pos_joke_words) # might only work for s2v

        # should probably make this a function

        permuted_tagged_sentence = [' '.join(item) for item 
                                    in list(itertools.product(*[model.pos_list(w) for w in pos_joke_words]))]
        pts_sorted = sorted([[model.entropy(p),p] for p in permuted_tagged_sentence])
        results[joke_id][5] = pts_sorted[0][1].split() # minimum entropy version tagged_string list
        results[joke_id][6] = pts_sorted[-1][1].split() # maximum entropy version tagged_string list

        with open(model_choice+'_'+joke_choice+'.pkl','wb') as pkl_file:
            pickle.dump(results, pkl_file)


In [None]:
for r in results:
    plot_entropy(r,show_sims=True,save_plot=False,unit_height=True)
    plot_similarities(r,save_plot=False)