# Problem 3
Solution set for CS 155 Set 5, 2019/2020

Authors: Suraj Nair, Sid Murching, Alex Cui

In [5]:
import numpy as np
from P3CHelpers import *
import sys
# load data into PyTorch format
import torch
import torch.nn as nn
import torch.optim as optim

## 3D:
Fill in the generate_traindata and find_most_similar_pairs functions

In [2]:
def get_word_repr(word_to_index, word):
    """
    Returns one-hot-encoded feature representation of the specified word given
    a dictionary mapping words to their one-hot-encoded index.

    Arguments:
        word_to_index: Dictionary mapping words to their corresponding index
                       in a one-hot-encoded representation of our corpus.

        word:          Word whose feature representation we wish to compute.

    Returns:
        feature_representation:     Feature representation of the passed-in word.
    """
    unique_words = word_to_index.keys()
    # Return a vector that's zero everywhere besides the index corresponding to <word>
    feature_representation = np.zeros(len(unique_words))
    feature_representation[word_to_index[word]] = 1
    return feature_representation    

def generate_traindata(word_list, word_to_index, window_size=4):
    """
    Generates training data for Skipgram model.

    Arguments:
        word_list:     Sequential list of words (strings).
        word_to_index: Dictionary mapping words to their corresponding index
                       in a one-hot-encoded representation of our corpus.

        window_size:   Size of Skipgram window. Defaults to 2 
                       (use the default value when running your code).

    Returns:
        (trainX, trainY):     A pair of matrices (trainX, trainY) containing training 
                              points (one-hot-encoded vectors) and their corresponding output_word
                              (also one-hot-encoded vectors)

    """
    trainX = []
    trainY = []
    vocab_size = len(word_to_index)
    for i in range(len(word_list)):
        # Extracts the words at each spot
        curr_word = word_list[i]
        # Loop over window of words near index i (index of current word)
        # Add pairs (x = curr_word, y = window_word) to our training data
        # for each window_word in the window.
        for j in range(1, window_size):
            ahead_idx = i + j
            behind_idx = i - j
            if ahead_idx < len(word_list):
                ahead_word = word_list[ahead_idx]
                y = get_word_repr(word_to_index, ahead_word)
                x = get_word_repr(word_to_index, curr_word)
                trainX.append(x)
                trainY.append(y)
            if behind_idx > 0:
                behind_word = word_list[behind_idx]
                y = get_word_repr(word_to_index, behind_word)
                x = get_word_repr(word_to_index, curr_word)
                trainX.append(x)
                trainY.append(y)
    return np.array(trainX), np.array(trainY)

In [10]:
def find_most_similar_pairs(filename, num_latent_factors):
    """
    Find the most similar pairs from the word embeddings computed from
    a body of text
    
    Arguments:
        filename:           Text file to read and train embeddings from
        num_latent_factors: The number of latent factors / the size of the embedding
    """
    # Load in a list of words from the specified file; remove non-alphanumeric characters
    # and make all chars lowercase.
    sample_text = load_word_list(filename)

    # Create word dictionary
    word_to_index = generate_onehot_dict(sample_text)
    print("Textfile contains %s unique words"%len(word_to_index))
    # Create training data
    trainX, trainY = generate_traindata(sample_text, word_to_index)
    # Build our model
    vocab_size = len(word_to_index)

    # set batch size
    batch_size = 100
    
    # transform to torch tensor
    tensor_x = torch.Tensor(trainX)
    # Pytorch expects the label to be a categorical label, not a one-hot vector
    # Thus, we convert one-hot to categorical here. In keras, it should be kept
    # as a one-hot vector.
    # For example, the label: [0, 0, 1, 0] should be converted to [2]
    tensor_y = torch.Tensor(np.argmax(trainY, axis=1)).long()

    dataset = torch.utils.data.TensorDataset(tensor_x,tensor_y)

    training_data_loader = torch.utils.data.DataLoader(dataset,
                                                       batch_size=batch_size,
                                                       shuffle=True)
    model = nn.Sequential(
        nn.Linear(vocab_size, num_latent_factors),
        nn.Linear(num_latent_factors, vocab_size)
        # softmax activation included in cross entropy calculation, so do not
        # include it in your model.
    )
    # our model has some # of parameters:
    print('Note: Matrices are the weights,',
          'vectors are the bias vectors')
    for i, p in enumerate(model.parameters()):
        print('Layer', i, ' has shape:', p.data.shape, '\n')

    # For a multi-class classification problem
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())

    # set number training epochs
    n_epochs = 50
    # store metrics for plotting
    training_accuracy_history = np.zeros([n_epochs, 1])
    training_loss_history = np.zeros([n_epochs, 1])

    # Train the model, iterating on the data in batches
    for epoch in range(n_epochs):
        print('Epoch', (epoch+1), end='')
        train_total = 0
        train_correct = 0
        # train
        model.train()
        for i, data in enumerate(training_data_loader):
            input_word, output_word = data
            optimizer.zero_grad()
            # forward pass
            output = model(input_word)
            # calculate categorical cross entropy loss
            loss = criterion(output, output_word)
            # backward pass
            loss.backward()
            optimizer.step()

            # track training accuracy
            _, predicted = torch.max(output.data, 1)
            train_total += output_word.size(0)
            train_correct += (predicted == output_word).sum().item()
            # track training loss
            training_loss_history[epoch] += loss.item()
        training_loss_history[epoch] /= len(training_data_loader)
        training_accuracy_history[epoch] = train_correct / train_total
        print('\n\tloss: ',training_loss_history[epoch,0], 'acc:', training_accuracy_history[epoch,0])


    weights = list(model.parameters())[0].data.numpy().T
    print("Hidden layer weight matrix shape: ", weights.shape)
    output_weights = list(model.parameters())[2].data.numpy().T
    print("Output layer weight matrix shape: ", output_weights.shape)

    # Find and print most similar pairs
    similar_pairs = most_similar_pairs(weights, word_to_index)
    for pair in similar_pairs[:30]:
        print(pair)

## 3G:
Run the function below and report your results for dr_seuss.txt.

In [7]:
find_most_similar_pairs('data/dr_seuss.txt', 10)

Textfile contains 308 unique words
Note: Matrices are the weights, vectors are the bias vectors
Layer 0  has shape: torch.Size([10, 308]) 

Layer 1  has shape: torch.Size([10]) 

Layer 2  has shape: torch.Size([308, 10]) 

Layer 3  has shape: torch.Size([308]) 

Epoch 1
	loss:  5.638266773223877 acc: 0.024655547498187092
Epoch 2
	loss:  5.212681678771973 acc: 0.03964225284022238
Epoch 3
	loss:  4.8604635124206546 acc: 0.05922165820642978
Epoch 4
	loss:  4.773799694061279 acc: 0.05897993715252599
Epoch 5
	loss:  4.7612561492919925 acc: 0.05897993715252599
Epoch 6
	loss:  4.740185813903809 acc: 0.05914108452179518
Epoch 7
	loss:  4.7290279579162595 acc: 0.05897993715252599
Epoch 8
	loss:  4.707237537384033 acc: 0.06325034243815969
Epoch 9
	loss:  4.695675735473633 acc: 0.06486181613085167
Epoch 10
	loss:  4.671096858978271 acc: 0.06864877930867778
Epoch 11
	loss:  4.651651601791382 acc: 0.06647328982354363
Epoch 12
	loss:  4.637347282409668 acc: 0.07662557408750302
Epoch 13
	loss:  4.618