# Problem 3
Solution set for CS 155 Set 5, 2019/2020

Authors: Suraj Nair, Sid Murching, Alex Cui

In [2]:
import numpy as np
from P3CHelpers import *
from keras.models import Sequential
from keras.layers.core import Dense, Activation
import sys

Using TensorFlow backend.


## 3D:
Fill in the `generate_traindata` and `find_most_similar_pairs` functions.

In [3]:
def get_word_repr(word_to_index, word):
    """
    Builds the one-hot vector for a single word.

    Arguments:
        word_to_index: Dictionary mapping each word to the position it
                       occupies in the one-hot encoding.

        word:          The word to encode. It must be a key of
                       <word_to_index>; otherwise a KeyError is raised.

    Returns:
        one_hot:       1-D numpy array with one entry per key of
                       <word_to_index>: all zeros except for a single 1 at
                       position word_to_index[word].
    """
    hot_position = word_to_index[word]
    # Start from an all-zero vector, then switch on the slot owned by <word>.
    one_hot = np.zeros(len(word_to_index))
    one_hot[hot_position] = 1
    return one_hot

def generate_traindata(word_list, word_to_index, window_size=4):
    """
    Generates training data for Skipgram model.

    Each word in <word_list> is paired with every neighbor located between
    1 and (window_size - 1) positions after it or before it. For each such
    neighbor one training point is emitted, with the current word as input
    and the neighbor as label.

    Arguments:
        word_list:     Sequential list of words (strings).
        word_to_index: Dictionary mapping words to their corresponding index
                       in a one-hot-encoded representation of our corpus.

        window_size:   Size of Skipgram window. Defaults to 4. Offsets
                       1 .. window_size - 1 are used on each side of the
                       current word.

    Returns:
        (trainX, trainY):     A pair of matrices (trainX, trainY) containing training
                              points (one-hot-encoded vectors) and their corresponding labels
                              (also one-hot-encoded vectors). Row k of trainX is the
                              current word and row k of trainY is one of its neighbors.

    """
    trainX = []
    trainY = []
    num_words = len(word_list)
    for i in range(num_words):
        curr_word = word_list[i]
        # The input vector is identical for every pair built around this
        # word, so encode it once instead of once per neighbor.
        x = get_word_repr(word_to_index, curr_word)
        # Loop over window of words near index i (index of current word)
        # Add pairs (x = curr_word, y = window_word) to our training data
        # for each window_word in the window.
        for j in range(1, window_size):
            ahead_idx = i + j
            behind_idx = i - j
            if ahead_idx < num_words:
                trainX.append(x)
                trainY.append(get_word_repr(word_to_index, word_list[ahead_idx]))
            # Index 0 is a valid position: using ">= 0" (not "> 0") keeps the
            # first word of the text available as a backward context word.
            if behind_idx >= 0:
                trainX.append(x)
                trainY.append(get_word_repr(word_to_index, word_list[behind_idx]))
    return np.array(trainX), np.array(trainY)

In [4]:
def find_most_similar_pairs(filename, num_latent_factors):
    """
    Find the most similar pairs from the word embeddings computed from
    a body of text

    Trains a two-layer network (Dense -> Dense -> softmax) on the pairs
    produced by generate_traindata, then prints the shapes of both weight
    matrices followed by the first 30 entries returned by most_similar_pairs
    for the hidden-layer weights.

    Arguments:
        filename:           Text file to read and train embeddings from
        num_latent_factors: The number of latent factors / the size of the embedding

    Returns:
        None. All results are printed.
    """
    # Load in a list of words from the specified file. load_word_list is not
    # defined in this notebook (presumably it comes from P3CHelpers); per the
    # original note it removes non-alphanumeric characters and lowercases
    # everything -- confirm against the helper module.
    sample_text = load_word_list(filename)

    # Create word dictionary
    word_to_index = generate_onehot_dict(sample_text)
    print("Textfile contains %s unique words"%len(word_to_index))
    # Create training data
    trainX, trainY = generate_traindata(sample_text, word_to_index)
    # Build our model
    vocab_size = len(word_to_index)
    model = Sequential()
    # <hidden_layer> contains our latent factors (vector representation of each word)
    hidden_layer = Dense(num_latent_factors, input_dim = vocab_size)
    model.add(hidden_layer)
    # <output_layer> transforms the outputs of <hidden_layer> into a vector of size <vocab_size>.
    output_layer = Dense(vocab_size)
    model.add(output_layer)
    model.add(Activation('softmax'))

    # Compile and fit our model
    model.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy'])
    # `epochs` is the Keras 2 name of this argument; the Keras 1 spelling
    # `nb_epoch` is deprecated and is not accepted by newer releases.
    model.fit(trainX, trainY, batch_size = 100, epochs = 50, verbose=2)
    weights, biases = hidden_layer.get_weights()
    print("Hidden layer weight matrix shape: ", weights.shape)
    output_weights, output_biases = output_layer.get_weights()
    print("Output layer weight matrix shape: ", output_weights.shape)

    # Find and print most similar pairs. Only the hidden-layer weights are
    # passed on; the output-layer weights are fetched solely for the shape report.
    similar_pairs = most_similar_pairs(weights, word_to_index)
    for pair in similar_pairs[:30]:
        print(pair)

## 3G:
Run the function below and report your results for dr_seuss.txt.

In [5]:
# Train the embeddings on the Dr. Seuss text with 10 latent factors and print the results.
find_most_similar_pairs(filename='data/dr_seuss.txt', num_latent_factors=10)

 - 333s - loss: 5.6554 - acc: 0.0351
Epoch 2/50
 - 1s - loss: 5.2166 - acc: 0.0583
Epoch 3/50
 - 1s - loss: 4.8308 - acc: 0.0588
Epoch 4/50
 - 1s - loss: 4.7709 - acc: 0.0588
Epoch 5/50
 - 1s - loss: 4.7525 - acc: 0.0596
Epoch 6/50
 - 1s - loss: 4.7381 - acc: 0.0602
Epoch 7/50
 - 1s - loss: 4.7249 - acc: 0.0617
Epoch 8/50
 - 1s - loss: 4.7122 - acc: 0.0617
Epoch 9/50
 - 1s - loss: 4.6987 - acc: 0.0623
Epoch 10/50
 - 1s - loss: 4.6841 - acc: 0.0637
Epoch 11/50
 - 1s - loss: 4.6681 - acc: 0.0717
Epoch 12/50
 - 1s - loss: 4.6505 - acc: 0.0690
Epoch 13/50
 - 1s - loss: 4.6313 - acc: 0.0717
Epoch 14/50
 - 1s - loss: 4.6113 - acc: 0.0730
Epoch 15/50
 - 1s - loss: 4.5900 - acc: 0.0747
Epoch 16/50
 - 1s - loss: 4.5681 - acc: 0.0788
Epoch 17/50
 - 1s - loss: 4.5460 - acc: 0.0819
Epoch 18/50
 - 1s - loss: 4.5231 - acc: 0.0841
Epoch 19/50
 - 1s - loss: 4.5006 - acc: 0.0848
Epoch 20/50
 - 1s - loss: 4.4779 - acc: 0.0899
Epoch 21/50
 - 1s - loss: 4.4555 - acc: 0.0924
Epoch 22/50
 - 1s - loss: 4.433

In [7]:
# Bare string literal with no side effects other than being echoed as this
# cell's value. It lists two weight-matrix shapes and 30 word pairs with
# their similarity scores -- presumably pasted from a run of
# find_most_similar_pairs on dr_seuss.txt (TODO confirm).
""" Outputs:
Hidden layer weight matrix shape:  (308, 10)
Output layer weight matrix shape:  (10, 308)
Pair(today, tomorrow), Similarity: 0.98951316
Pair(tomorrow, today), Similarity: 0.98951316
Pair(finger, top), Similarity: 0.9871409
Pair(top, finger), Similarity: 0.9871409
Pair(gone, today), Similarity: 0.9847697
Pair(fox, goat), Similarity: 0.98089796
Pair(goat, fox), Similarity: 0.98089796
Pair(heads, grows), Similarity: 0.9799606
Pair(grows, heads), Similarity: 0.9799606
Pair(shoe, foot), Similarity: 0.97770756
Pair(foot, shoe), Similarity: 0.97770756
Pair(off, shoe), Similarity: 0.9769319
Pair(zeep, tomorrow), Similarity: 0.9766475
Pair(likes, wink), Similarity: 0.97186935
Pair(wink, likes), Similarity: 0.97186935
Pair(drink, wink), Similarity: 0.9711438
Pair(his, book), Similarity: 0.96769226
Pair(book, his), Similarity: 0.96769226
Pair(hop, then), Similarity: 0.96711636
Pair(then, hop), Similarity: 0.96711636
Pair(eight, nine), Similarity: 0.9662633
Pair(nine, eight), Similarity: 0.9662633
Pair(mouse, fox), Similarity: 0.96473193
Pair(these, pets), Similarity: 0.9645774
Pair(pets, these), Similarity: 0.9645774
Pair(cannot, hear), Similarity: 0.96193874
Pair(hear, cannot), Similarity: 0.96193874
Pair(ride, fly), Similarity: 0.9618145
Pair(fly, ride), Similarity: 0.9618145
Pair(thing, drink), Similarity: 0.96120024
"""

' Outputs:\nHidden layer weight matrix shape:  (308, 10)\nOutput layer weight matrix shape:  (10, 308)\nPair(today, tomorrow), Similarity: 0.98951316\nPair(tomorrow, today), Similarity: 0.98951316\nPair(finger, top), Similarity: 0.9871409\nPair(top, finger), Similarity: 0.9871409\nPair(gone, today), Similarity: 0.9847697\nPair(fox, goat), Similarity: 0.98089796\nPair(goat, fox), Similarity: 0.98089796\nPair(heads, grows), Similarity: 0.9799606\nPair(grows, heads), Similarity: 0.9799606\nPair(shoe, foot), Similarity: 0.97770756\nPair(foot, shoe), Similarity: 0.97770756\nPair(off, shoe), Similarity: 0.9769319\nPair(zeep, tomorrow), Similarity: 0.9766475\nPair(likes, wink), Similarity: 0.97186935\nPair(wink, likes), Similarity: 0.97186935\nPair(drink, wink), Similarity: 0.9711438\nPair(his, book), Similarity: 0.96769226\nPair(book, his), Similarity: 0.96769226\nPair(hop, then), Similarity: 0.96711636\nPair(then, hop), Similarity: 0.96711636\nPair(eight, nine), Similarity: 0.9662633\nPair(