# Bidirectional LSTM with word2vec

## Data Representation
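The input to our model is a sentence. Each word is represented by its word2vec vector (shape (300,), dtype float64), and a whole sentence is represented as an array of shape (52, 300): one row per word, truncated or zero-padded to 52 words. Words without a word2vec vector are left as zero rows.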

In [1]:
import pickle

def save_pickle(obj, path):
    """
    Serialize an object with pickle and write it to disk.
    :param obj: any picklable Python object
    :param path: destination file path; an existing file is overwritten
    """
    with open(path, mode="wb") as out_file:
        pickle.Pickler(out_file).dump(obj)


def load_pickle(path):
    """
    Read a pickle file and return the object stored in it.
    Unpickling can execute arbitrary code, so only use this on files that this project wrote itself.
    :param path: path of the pickle file to read
    :return: the unpickled object
    """
    with open(path, mode="rb") as in_file:
        return pickle.Unpickler(in_file).load()

In [2]:
import os

import gensim.downloader
from gensim.models import KeyedVectors
import numpy as np

# Dimension of a single word vector (the GoogleNews "negative300" file loaded by load_word2vec).
W2V_EMBEDDING_DIM = 300
# Fixed number of word rows per sentence; used as the default seq_len of sentence_to_embedding.
SEQ_LEN = 52

def load_word2vec():
    """
    Load the full pre-trained GoogleNews word2vec model from a local binary file.
    Author's note: fetching it with gensim.downloader.load("word2vec-google-news-300") did not work,
    so the .bin file is expected to already be present under TempFiles/.
    :return: the object returned by KeyedVectors.load_word2vec_format
    """
    return KeyedVectors.load_word2vec_format(
        'TempFiles/GoogleNews-vectors-negative300.bin', binary=True
    )

def create_or_load_slim_w2v(words_list, cache_w2v=True):
    """
    Return a reduced word2vec dictionary that only holds the words we actually need.
    If the cache file TempFiles/w2v_dict.pkl already exists it is loaded and returned as-is
    (words_list is ignored in that case). Otherwise the full word2vec model is loaded and filtered.
    :param words_list: list of words to keep in the w2v dict
    :param cache_w2v: whether to save the freshly built dictionary to the cache file
    :return: dictionary which maps the known words to their vectors
    """
    cache_file = "TempFiles/w2v_dict.pkl"
    # Fast path: reuse the dictionary built by a previous run.
    if os.path.exists(cache_file):
        return load_pickle(cache_file)

    pretrained = load_word2vec()
    slim_dict = {}
    for word in words_list:
        # Words unknown to word2vec are simply left out of the dictionary.
        if word in pretrained:
            slim_dict[word] = pretrained[word]
    if cache_w2v:
        save_pickle(slim_dict, cache_file)
    return slim_dict


def sentence_to_embedding(sent, word_to_vec, seq_len=SEQ_LEN, embedding_dim=300):
    """
    Map a sentence to a fixed-size matrix of word embeddings.
    Row i holds the embedding of the i-th word. Sentences longer than seq_len are truncated,
    shorter ones are padded with zero rows, and words missing from word_to_vec stay as zero rows.
    :param sent: a list of word (string)
    :param word_to_vec: a word to vector mapping.
    :param seq_len: the fixed length for which the sentence will be mapped to.
    :param embedding_dim: the dimension of the w2v embedding
    :return: numpy ndarray of shape (seq_len, embedding_dim) with the representation of the sentence
    :raises ValueError: if a vector in word_to_vec does not fit a row of length embedding_dim
    """
    sentence_embedding = np.zeros((seq_len, embedding_dim))
    for i in range(min(len(sent), seq_len)):
        # Only the lookup is guarded: an unknown word is expected and skipped, while any other
        # problem (e.g. a vector of the wrong size) must surface instead of silently producing zeros.
        try:
            word_embedding = word_to_vec[sent[i]]
        except KeyError:
            continue
        sentence_embedding[i] = word_embedding
    return sentence_embedding

## Data Loader

In [3]:
from processSST import SentimentTreeBank
from pytorchDataloader import DataManager, TRAIN, VAL, TEST
# load the dataset
dataset = SentimentTreeBank()
# the function that maps a sentence to its representation is sentence_to_embedding
sent_func = sentence_to_embedding
# The params it takes other than the sentence itself: word_to_vec, embedding_dim, seq_len
# initialize the dictionary that maps each word of the dataset to its Word2Vec vector
words_list = list(dataset.get_word_counts().keys())
word2Vec_dic = create_or_load_slim_w2v(words_list)
# The embedding size of word2Vec is 300 (W2V_EMBEDDING_DIM); every sentence is mapped to SEQ_LEN rows
sent_func_kwargs = {"word_to_vec": word2Vec_dic, "embedding_dim": W2V_EMBEDDING_DIM, "seq_len": SEQ_LEN}
# pass it to the dataManager, which calls sent_func with these kwargs
data_manager = DataManager(use_sub_phrases=False, 
                           sentiment_dataset=dataset, 
                           sent_func=sent_func, sent_func_kwargs=sent_func_kwargs, 
                           batch_size=50)
# NOTE(review): the literal "train" is presumably equal to the imported TRAIN constant - confirm
train_dataloader = data_manager.get_torch_iterator(data_subset="train")


## Training
We will use the bidirectional LSTM architecture

Define the model  
Regarding LSTM: 
If we pass in a sentence as a list of words, each with its own vector representation, the LSTM runs recurrently, word by word, in every layer, and `h_n` and `c_n` are the hidden state and cell state after the final word (this is better shown in a graph). It is easy to confuse a "layer of LSTM" with the "LSTM cell applied to each word": each layer handles an entire sentence and outputs just one `h_n` for each sentence. Because a sentence has many words, it is passed through the same LSTM layer recurrently: each time a word is passed to the LSTM cell it creates a hidden state, and this hidden state is fed back into the same layer at the next word's step. 
Also, a bidirectional LSTM is not two LSTM layers stacked on top of each other (the graph can make it look that way). It is a single layer that reads the sentence (the list of words) once in forward order and once in reverse order, so each layer creates two (`h_n`, `c_n`) pairs, one per direction. 
On the other hand, if we pass a sentence in as one single vector (the averaged word2vec), it passes through the LSTM only once, because it is effectively a one-word sentence. (Strictly speaking, a one-word sentence is a list containing one vector; here we do not even have a list, just one vector, so we do not use the "recurrent" property at all.) 

In [4]:
import torch
import torch.nn as nn
class LSTM(nn.Module):
    """
    A bidirectional LSTM for binary sentiment analysis.
    The sentence representation is the concatenation of the final hidden states of the two
    directions of the top LSTM layer, which is fed to a single linear unit producing one logit.
    All parameters are created as float64, so inputs must be float64 as well.
    """

    def __init__(self, embedding_dim, hidden_dim, n_layers, dropout):
        """
        :param embedding_dim: size of each word vector fed to the LSTM
        :param hidden_dim: hidden size of each direction of the LSTM
        :param n_layers: number of stacked LSTM layers
        :param dropout: dropout probability passed to nn.LSTM (applied between stacked layers)
        """
        super().__init__()
        self.LSTM = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            bidirectional=True,
            batch_first=True,
            dropout=dropout,
            dtype=torch.float64,
        )
        # Two directions are concatenated, hence hidden_dim * 2 input features.
        self.linear = nn.Linear(in_features=hidden_dim * 2, out_features=1, dtype=torch.float64)

    def forward(self, text):
        """
        Compute the raw (pre-sigmoid) score of every sentence in the batch.

        :param text: float64 tensor of shape (batch_size, seq_len, embedding_dim),
        e.g. (batch_size, 52, 300): 52 words, each with a 300 dim embedding.
        :return: tensor of shape (batch_size, 1) with one logit per sentence

        Sam's note:
        Regarding output of LSTM:
        c_n and h_n: cell state and hidden state, both of size
        (num_layers * num_directions=2, batch_size, hidden_size). The batch dimension is NOT first
        here, even with batch_first=True. The last two entries belong to the top layer:
        h_n[-2] is the forward direction after reading the last word, and
        h_n[-1] is the backward direction after reading the first word.
        output_of_lstm: per-step outputs of the top layer, of size
        (batch_size, seq_len, hidden_size * num_directions=2).

        We deliberately do NOT use output_of_lstm[:, -1, :]: at the last time step the backward
        direction has only seen the last word, so half of that vector would ignore almost the
        whole sentence. Concatenating the final hidden state of each direction gives both
        directions a full pass over the sentence.
        """
        _, (h_n, _) = self.LSTM(text)
        final_forward = h_n[-2]
        final_backward = h_n[-1]
        sentence_repr = torch.cat((final_forward, final_backward), dim=-1)
        return self.linear(sentence_repr)

    def predict(self, text):
        """
        Sam's Note: self(text) returns the raw score of the model. At prediction time we only add
        a sigmoid on top of it to turn the score into a probability.
        :param text: same input as forward
        :return: tensor of shape (batch_size, 1) with values in (0, 1)
        """
        return torch.sigmoid(self(text))

Next, we define the functions for training on a single batch, for a full epoch, and for the whole training run.

In [5]:
def binary_accuracy(preds, y):
    """
    Compute the fraction of predictions that match the labels after rounding.
    Works on torch Tensors. preds is rounded with torch.round, so it should already hold
    probabilities (values in [0, 1]), and should have the same shape as y.
    :param preds: a tensor of predictions
    :param y: a tensor of true labels
    :return: scalar value - (<number of accurate predictions> / <number of examples>)
    """
    hits = torch.eq(torch.round(preds), y).sum()
    # The number of examples is taken from the first dimension of the labels.
    return (hits / y.shape[0]).item()



In [6]:
def train_batch(model, optimizer, criterion, batch):
    """
    Run one optimization step on a single batch and return its summed loss and accuracy.

    Sam's note:
    All the parameters we want to update are automatically created with requires_grad=True, so
    backward() computes their gradients. We only use the plain LSTM and Linear from torch.nn, so
    we don't need to worry about this. If we later add our own tensor as a parameter, we would
    need to set requires_grad=True on it. Code to check this:

    # Check if the model parameters have requires_grad=True
    for name, param in model.named_parameters():
        print(f'{name}: requires_grad={param.requires_grad}')

    :param model: the model to train; model(X) must return raw (pre-sigmoid) scores
    :param optimizer: the optimizer holding the model parameters
    :param criterion: loss called as criterion(input=scores, target=labels); it receives the raw
    scores, so it is expected to be a "with logits" loss
    :param batch: a list of two tensors [X, y]; the first dimension of both is the batch size
    :return: (loss * batch_size, accuracy * batch_size), so that the caller can sum over batches
    and divide by the total number of samples
    """
    # reset the gradient after every backward pass instead of accumulating for the entire epoch
    optimizer.zero_grad()
    # Move the batch to wherever the model lives and to the model's dtype. Picking 'cuda' whenever
    # it is available would crash if the model itself was left on the CPU.
    first_param = next(model.parameters(), None)
    if first_param is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        dtype = torch.float64
    else:
        device, dtype = first_param.device, first_param.dtype
    X = batch[0].to(device=device, dtype=dtype)
    y = batch[1].to(device=device, dtype=dtype)
    # Forward pass: the model returns raw scores (logits) for X
    y_pred = model(X)
    # prediction is in (batch_size, 1) shape, but original y is in (batch_size,) shape, so reshape y
    y = torch.reshape(y, y_pred.shape)
    # The loss gets the scores BEFORE the sigmoid: the criterion used by train_model is
    # binary_cross_entropy_with_logits, which applies the sigmoid internally.
    loss = criterion(input=y_pred, target=y)
    # now back propagate the loss and update the parameters of the model
    loss.backward()
    optimizer.step()
    # Accuracy rounds its input, so it needs probabilities: apply the sigmoid here. Rounding the
    # raw scores would count e.g. a score of 0.3 (probability 0.57) as class 0 and 2.0 as a miss.
    with torch.no_grad():
        probabilities = torch.sigmoid(y_pred)
    accuracy_value = binary_accuracy(preds=probabilities, y=y)
    loss_value = loss.item()
    batch_size = batch[0].shape[0]
    return loss_value * batch_size, accuracy_value * batch_size


In [7]:
def train_epoch(model, data_iterator, optimizer, criterion):
    """
    Train the given model for one epoch (one pass over the whole train set) and return the
    average loss and average accuracy over all the samples seen in this epoch.
    An empty data_iterator results in a ZeroDivisionError.
    :param model: the model we're currently training
    :param data_iterator: an iterator, iterating over the training data for the model.
    :param optimizer: the optimizer object for the training process.
    :param criterion: the criterion object for the training process.
    :return: tuple (average loss, average accuracy)
    """
    n_seen = 0
    loss_sum = 0
    accuracy_sum = 0
    for batch in data_iterator:
        n_seen += batch[0].shape[0]
        # train_batch returns values already multiplied by the batch size, so plain sums work
        # even when the last batch is smaller than the others.
        batch_loss, batch_accuracy = train_batch(model, optimizer, criterion, batch)
        loss_sum += batch_loss
        accuracy_sum += batch_accuracy
    return loss_sum / n_seen, accuracy_sum / n_seen


In [8]:
def train_model(model_name, model, data_manager: DataManager, n_epochs, lr, weight_decay=0.):
    """
    Runs the full training procedure for the given model. The optimization is done using the Adam
    optimizer with all parameters but learning rate and weight decay set to default, and the loss
    is binary cross entropy computed on the raw (pre-sigmoid) model scores.
    :param model_name: name of model (currently unused; reserved for saving the model and stats)
    :param model: module of one of the models implemented in the exercise
    :param data_manager: the DataManager object
    :param n_epochs: number of times to go over the whole training set
    :param lr: learning rate to be used for optimization
    :param weight_decay: parameter for l2 regularization
    :return: list with one (average loss, average accuracy) tuple per epoch, measured on the
    training set
    """
    train_data_iterator = data_manager.get_torch_iterator(data_subset=TRAIN)
    # Only lr and weight_decay are overridden; every other Adam hyper-parameter keeps its default.
    optimizer = torch.optim.Adam(params=model.parameters(), lr=lr, weight_decay=weight_decay)
    # Referenced through torch so this function does not depend on an alias imported in a later cell.
    criterion = torch.nn.functional.binary_cross_entropy_with_logits
    history = []
    for _ in range(n_epochs):
        # Make sure dropout is active for every training epoch.
        model.train()
        history.append(train_epoch(model, train_data_iterator, optimizer, criterion))
    # TODO: evaluate on the VAL subset after every epoch, and save the trained model, the
    # DataManager and the train/validation statistics under model_name.
    return history


In [9]:
import torch.nn.functional as F
# Bidirectional LSTM: 300-dim word vectors in, 100 hidden units per direction, 2 stacked layers, dropout 0.5
model = LSTM(embedding_dim=W2V_EMBEDDING_DIM, hidden_dim=100, n_layers=2, dropout=0.5)
# data_manager already initialized from above
# Train for a single epoch (train_model uses Adam with the given lr and weight_decay)
train_model('model_name', model, data_manager=data_manager, n_epochs=1, lr=0.001, weight_decay=0.0001)