In [None]:
# Not sure if we need all of this...
import numpy as np
import torch
import torch.nn as nn

import matplotlib.pyplot as plt
from random import shuffle

import numpy as np
import pandas as pd
from scipy import spatial
import copy
import nltk

import time

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [None]:
# Load the pre-computed word -> vector dictionaries,
# e.g. bert_dict['cat'] = [0.5, -.4, 0.001, ...].
# Indexing the loaded 0-d object array with () unwraps it into the dict itself.
# NOTE(review): allow_pickle=True executes pickled content; only load trusted files.
bert_dict = np.load('bert.npy', allow_pickle=True)[()]
glove_dict = np.load('glove.npy', allow_pickle=True)[()]

# Part-of-speech filter (already done on GloVe embeddings): keep only the BERT
# entries that NLTK tags as NN or NNP when the word is tagged on its own.
# Iterate over a snapshot so entries can be removed from the live dict.
bert_dict_copy = bert_dict.copy()

for key in bert_dict_copy:
    tag = nltk.pos_tag([key.strip()])[0][1]
    if tag in ('NN', 'NNP'):
        continue
    del bert_dict[key]

# Give every remaining BERT word a stable integer index (and the reverse map)
all_bert_words = list(bert_dict)
word_to_idx = {word: idx for idx, word in enumerate(all_bert_words)}
idx_to_word = dict(enumerate(all_bert_words))

# Stack the vectors so that row i of the matrix is the embedding of all_bert_words[i]
# (original note gives the shape as (30000, 768) -- TODO confirm after filtering)
bert_embedding = np.vstack([bert_dict[word] for word in all_bert_words])

In [None]:
# Read data from txt file
def read_data(examples, data, vocab=None):
    '''
    Build the list of training examples from two line-aligned text files.

    Each line is split on '.'; the part before the first '.' is treated as the
    example id and must be identical in both files. From the examples file the
    first two comma-separated words after the id form the board; from the data
    file the text after the id is the gold clue.

    For example,
        board = 'cat', 'dog' (from examples)
        gold_word = 'pet' (from data)
    Will be appended as:
        [[vocab['cat'], vocab['dog']], [vocab['pet']]]

    Args:
        examples: path to the file holding the board words.
        data: path to the file holding the gold clues (same line order!).
        vocab: optional word -> index mapping. Defaults to the notebook-level
            `word_to_idx`.

    Returns:
        A list of [board_indices, [gold_index]] entries. Lines whose gold clue
        is 'No good clues', or that contain any word missing from `vocab`,
        are skipped. Reading stops at the first line whose ids disagree.
    '''
    if vocab is None:
        vocab = word_to_idx

    data_matrix = []
    # Distinct handle names so the path arguments are not shadowed.
    with open(examples) as examples_file, open(data) as data_file:
        for example_line, data_line in zip(examples_file, data_file):
            example_fields = example_line.split('.')
            data_fields = data_line.split('.')

            # The two files must describe the same example on the same line.
            if example_fields[0] != data_fields[0]:
                print(example_fields[0])
                print(data_fields[0])
                print('Uh oh, misaligment!')
                break

            gold_word = data_fields[1].strip()
            # Decide to skip before touching the vocabulary, so unusable lines
            # can never fail on an unknown board word.
            if gold_word == 'No good clues':
                continue

            good_words = example_fields[1].strip('\n').split(',')[:2]
            try:
                board = [vocab[word.strip()] for word in good_words]
                gold_idx = vocab[gold_word]
            except KeyError:
                # Out-of-vocabulary board or gold word: drop this example only.
                continue

            data_matrix.append([board, [gold_idx]])

    return data_matrix

In [None]:
class NeuralNetwork(nn.Module):
    '''
    Two-layer scorer: Linear -> Tanh -> Linear, producing one score per input row.

    Args:
        hidden_size: width of the hidden layer.
        input_size: number of features per input row. The default of 2304
            matches how the notebook builds its inputs: one candidate embedding
            concatenated with two board-word embeddings (3 x 768 according to
            the notebook's comments).
    '''
    def __init__(self, hidden_size, input_size=2304):
        super().__init__()

        # Layer 1: input features -> hidden units
        self.neural_network_1 = nn.Linear(input_size, hidden_size)

        # Layer 2: hidden units -> a single scalar score
        self.neural_network_2 = nn.Linear(hidden_size, 1)

        # Non-linearity applied between the two linear layers
        self.tanh = nn.Tanh()

    def forward(self, x):
        '''Map x of shape (N, input_size) to scores of shape (N, 1).'''
        hidden = self.tanh(self.neural_network_1(x))
        return self.neural_network_2(hidden)
    
class NeuralNetwork_with_dropout(nn.Module):
    '''
    Two-layer scorer with dropout: Linear -> Dropout(0.5) -> Tanh -> Linear -> Dropout(0.2).

    Args:
        hidden_size: width of the hidden layer.
        input_size: number of features per input row. The default of 2304
            matches how the notebook builds its inputs: one candidate embedding
            concatenated with two board-word embeddings (3 x 768 according to
            the notebook's comments).

    Dropout is only active while the module is in training mode.
    '''
    def __init__(self, hidden_size, input_size=2304):
        super().__init__()

        # Linear layer 1: input features -> hidden units
        self.neural_network_1 = nn.Linear(input_size, hidden_size)

        # Dropout layer 1 (applied to the hidden pre-activations)
        self.dropout_1 = nn.Dropout(p=0.5)

        # Layer 2: hidden units -> a single scalar score
        self.neural_network_2 = nn.Linear(hidden_size, 1)

        # Dropout layer 2 (applied to the output scores)
        # NOTE(review): dropping final scores zeroes some candidates' logits
        # during training -- confirm this is intended.
        self.dropout_2 = nn.Dropout(p=0.2)

        # Tanh
        self.tanh = nn.Tanh()

    def forward(self, x):
        '''Map x of shape (N, input_size) to scores of shape (N, 1).'''
        # First layer, then dropout with probability 0.5, then Tanh
        hidden = self.tanh(self.dropout_1(self.neural_network_1(x)))

        # Second layer, then dropout with probability 0.2 on the scores
        return self.dropout_2(self.neural_network_2(hidden))
    
def loss_function(p_dist, gold_word):
    '''
    Cross-entropy (default mean reduction) between the rows of raw scores in
    p_dist and the target class indices in gold_word.
    '''
    criterion = nn.CrossEntropyLoss()
    return criterion(p_dist, gold_word)

In [None]:
from jupyterplot import ProgressPlot

def train(model, train_data, optimizer, loss_function, params):
    '''
    Train `model` with one optimizer step per example and plot the loss per epoch.

    For every example the two board-word embeddings are concatenated onto the
    embedding of every vocabulary word, the model scores all V rows in a single
    forward pass, and `loss_function` is applied to the (1 x V) scores with the
    gold word index as target.

    Args:
        model: module mapping a (V, 3K) input to (V, 1) scores.
        train_data: iterable of [board, gold_word] pairs, where board holds two
            vocabulary indices and gold_word holds one.
        optimizer: optimizer over the model's parameters.
        loss_function: callable(scores, target) returning a scalar loss tensor.
        params: dict with 'num_epochs' and 'embedding' (a (V, K) array;
            assumes a dtype compatible with the model's weights -- TODO confirm).

    Returns:
        List with the summed loss of each epoch as plain floats.
    '''
    epochs = params['num_epochs']
    embedding = params['embedding']

    embedding_pt = torch.tensor(embedding, requires_grad=False)
    V, K = embedding.shape

    all_loss = []
    mini_batch_loss = []
    pp = ProgressPlot()

    for epoch in range(epochs):
        print('Epoch: {}'.format(epoch))

        # Set model to train
        model.train()

        epoch_loss = 0.0

        for board, gold_word in train_data:
            board_pt = torch.tensor(board, requires_grad=False)
            gold_word_pt = torch.tensor(gold_word, requires_grad=False)

            board_embedding_pt = embedding_pt[board_pt].view(1, 2 * K)  # 1 x 2K
            board_embedding_big = board_embedding_pt.expand(V, 2 * K)  # V x 2K

            # Prefix every board row with one candidate word's embedding
            all_input = torch.cat([embedding_pt, board_embedding_big], dim=1)  # V x 3K

            # Score all V candidates at once: the loss is taken over the whole
            # vocabulary, so the rows cannot be split into smaller batches.
            output = model(all_input)  # V x 1
            output_T = torch.transpose(output, 0, 1)  # 1 x V

            loss = loss_function(output_T, gold_word_pt)

            # Keep plain floats for bookkeeping; accumulating the loss tensor
            # itself would keep autograd history alive and break plt.plot.
            loss_value = loss.item()
            mini_batch_loss.append(loss_value)
            # Running mean over the most recent (up to) 100 steps
            pp.update(np.mean(mini_batch_loss[-100:]))
            epoch_loss += loss_value

            # Clear previous gradients
            optimizer.zero_grad()

            # Backprop
            loss.backward()

            # Perform the parameter update
            optimizer.step()

        all_loss.append(epoch_loss)
        print('Epoch loss: {}'.format(epoch_loss))

    pp.finalize()

    plt.plot(all_loss)
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.show()

    return all_loss

In [None]:
def evaluate(model, val_data, params, idx_to_word, word_to_idx, top_k=100):
    '''
    Fraction of examples whose gold (human-chosen) clue is among the model's
    `top_k` highest-scoring vocabulary words.

    Args:
        model: module mapping a (V, 3K) input to (V, 1) scores.
        val_data: sequence of [board, gold_word] pairs (board = [X, Y] vocabulary
            indices, gold_word = [Z]).
        params: dict whose 'embedding' entry is the (V, K) embedding matrix.
        idx_to_word: mapping from vocabulary index to word.
        word_to_idx: unused; kept so existing callers keep working.
        top_k: how many top-scoring candidates count as a hit (default 100,
            capped at the vocabulary size V).

    Returns:
        Hit rate in [0, 1]; 0.0 when val_data is empty.

    The model is switched to evaluation mode for scoring and put back into
    whichever mode it was in before the call, so calling this from inside a
    training loop does not leave dropout disabled.
    '''
    if len(val_data) == 0:
        return 0.0

    was_training = model.training
    model.eval()  # Set model to evaluation mode

    embedding_pt = torch.tensor(params['embedding'], requires_grad=False)
    V, K = params['embedding'].shape
    k = min(top_k, V)  # topk raises if asked for more entries than exist

    count = 0
    try:
        # Pure inference: skip building the autograd graph for the V x 3K input.
        with torch.no_grad():
            for board, gold_word in val_data:
                board_pt = torch.tensor(board, requires_grad=False)

                board_embedding_pt = embedding_pt[board_pt].view(1, 2 * K)  # 1 x 2K
                board_embedding_big = board_embedding_pt.expand(V, 2 * K)  # V x 2K
                all_input = torch.cat([embedding_pt, board_embedding_big], dim=1)  # V x 3K

                output = model(all_input)  # V x 1: one score per potential clue

                output = torch.transpose(output, 0, 1)  # 1 x V, so topk runs over candidates
                scores, indices = output.topk(k)
                clue_words = [idx_to_word[int(index)] for index in indices[0]]

                if idx_to_word[gold_word[0]] in clue_words:
                    count += 1
    finally:
        model.train(was_training)

    return count / len(val_data)

In [None]:
# Random seed for reproducible experiments (covers PyTorch, e.g. weight initialisation)
torch.manual_seed(0)

# Load data
data_matrix = read_data('examples_all.txt', 'data_all.txt')

# torch.manual_seed does not seed scikit-learn, so the split gets its own fixed
# seed; without it the train/val membership changes on every run.
# NOTE(review): test_size=0.8 leaves only 20% of the examples for training --
# confirm this is intended and not a swapped train/val ratio.
train_data, val_data = train_test_split(data_matrix, test_size=0.8, random_state=0)

# Includes things like # of epochs, etc.
params = {
    'num_epochs' : 10,
    'embedding' : bert_embedding
}

model = NeuralNetwork(hidden_size = 50)

# Choice of optimizer
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

start_time = time.time()
train(model, train_data, optimizer, loss_function, params)
print('Time: {}'.format(time.time() - start_time))

In [None]:
# Score the trained model on the first 100 validation examples.
# The result is captured and printed: a bare call that is not the last
# expression of a cell would otherwise be silently discarded.
start_time = time.time()
val_accuracy = evaluate(model, val_data[0:100], params, idx_to_word, word_to_idx)
print('Validation accuracy (first 100 examples): {}'.format(val_accuracy))
print('Time: {}'.format(time.time() - start_time))

In [None]:
def train_and_evaluate(model, train_data, val_data, optimizer, loss_function, params):
    '''
    Train `model` like `train`, additionally measuring accuracy on the training
    and validation sets before every epoch and once more after the last one,
    then plot both accuracy curves.

    Args:
        model: module mapping a (V, 3K) input to (V, 1) scores.
        train_data: sequence of [board, gold_word] pairs used for training; it is
            copied before shuffling, so the caller's list keeps its order.
        val_data: sequence of [board, gold_word] pairs used for validation.
        optimizer: optimizer over the model's parameters.
        loss_function: callable(scores, target) returning a scalar loss tensor.
        params: dict with 'num_epochs' and 'embedding' (a (V, K) array).

    Returns:
        Dict with 'loss' (summed loss per epoch, as floats) and
        'train_accuracy' / 'val_accuracy' (num_epochs + 1 values each).

    Uses the notebook-level `idx_to_word` and `word_to_idx` for evaluation.
    '''
    epochs = params['num_epochs']
    embedding = params['embedding']

    embedding_pt = torch.tensor(embedding, requires_grad=False)
    V, K = embedding.shape

    all_loss = []
    mini_batch_loss = []
    pp = ProgressPlot()

    val_accuracies = []
    train_accuracies = []

    # Shuffle a private copy so the caller's list is not reordered in place.
    train_data = list(train_data)

    for epoch in range(epochs):
        print('Epoch: {}'.format(epoch))

        shuffle(train_data)

        # Accuracy of the model as it stands before this epoch's updates
        val_acc = evaluate(model, val_data, params, idx_to_word, word_to_idx)
        val_accuracies.append(val_acc)
        train_acc = evaluate(model, train_data, params, idx_to_word, word_to_idx)
        train_accuracies.append(train_acc)

        # Set model to train. This must come AFTER the evaluate() calls, which
        # switch the model to eval mode; otherwise dropout stays disabled for
        # the whole epoch.
        model.train()

        epoch_loss = 0.0

        for board, gold_word in train_data:
            board_pt = torch.tensor(board, requires_grad=False)
            gold_word_pt = torch.tensor(gold_word, requires_grad=False)

            board_embedding_pt = embedding_pt[board_pt].view(1, 2 * K)  # 1 x 2K
            board_embedding_big = board_embedding_pt.expand(V, 2 * K)  # V x 2K

            # Prefix every board row with one candidate word's embedding
            all_input = torch.cat([embedding_pt, board_embedding_big], dim=1)  # V x 3K

            # Score all V candidates at once: the loss is taken over the whole
            # vocabulary, so the rows cannot be split into smaller batches.
            output = model(all_input)  # V x 1
            output_T = torch.transpose(output, 0, 1)  # 1 x V

            loss = loss_function(output_T, gold_word_pt)

            # Keep plain floats for bookkeeping; accumulating the loss tensor
            # itself would keep autograd history alive across the epoch.
            loss_value = loss.item()
            mini_batch_loss.append(loss_value)
            # Running mean over the most recent (up to) 100 steps
            pp.update(np.mean(mini_batch_loss[-100:]))
            epoch_loss += loss_value

            # Clear previous gradients
            optimizer.zero_grad()

            # Backprop
            loss.backward()

            # Perform the parameter update
            optimizer.step()

        all_loss.append(epoch_loss)
        print('Epoch loss: {}'.format(epoch_loss))

    # Final measurement after the last epoch
    val_acc = evaluate(model, val_data, params, idx_to_word, word_to_idx)
    val_accuracies.append(val_acc)
    train_acc = evaluate(model, train_data, params, idx_to_word, word_to_idx)
    train_accuracies.append(train_acc)

    pp.finalize()

    plt.plot(train_accuracies, color='dodgerblue', label='train')
    plt.plot(val_accuracies, color='firebrick', label='validation')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend()
    plt.show()

    return {
        'loss': all_loss,
        'train_accuracy': train_accuracies,
        'val_accuracy': val_accuracies,
    }

In [None]:
# Seed PyTorch (e.g. weight initialisation) for this run
torch.manual_seed(1)

# Load data
data_matrix = read_data('examples_all.txt', 'data_all.txt')

# Fix the split as well: torch.manual_seed does not seed scikit-learn.
# NOTE(review): train_and_evaluate shuffles with Python's `random` module, which
# this cell does not seed, so the example order can still differ between runs.
# NOTE(review): test_size=0.8 leaves only 20% of the examples for training --
# confirm this is intended.
train_data, val_data = train_test_split(data_matrix, test_size=0.8, random_state=1)

# Includes things like # of epochs, etc.
params = {
    'num_epochs' : 10,
    'embedding' : bert_embedding
}

model = NeuralNetwork(hidden_size = 50)

optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

# Validation accuracy is tracked on the first 100 validation examples only
start_time = time.time()
train_and_evaluate(model, train_data, val_data[0:100], optimizer, loss_function, params)
print('Time: {}'.format(time.time() - start_time))

In [None]:
### Dropout experiment: reuses the train_data / val_data / params globals that
### are already defined in the kernel; only the architecture changes.
model = NeuralNetwork_with_dropout(50)

# Adam over the dropout model's parameters
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

start_time = time.time()
train_and_evaluate(
    model=model,
    train_data=train_data,
    val_data=val_data[:100],
    optimizer=optimizer,
    loss_function=loss_function,
    params=params,
)
print('Time: {}'.format(time.time() - start_time))