In [0]:
#!conda install -c conda-forge jsonnet
pip install allennlp

Collecting allennlp
[?25l  Downloading https://files.pythonhosted.org/packages/30/8c/72b14d20c9cbb0306939ea41109fc599302634fd5c59ccba1a659b7d0360/allennlp-0.8.4-py3-none-any.whl (5.7MB)
[K     |████████████████████████████████| 5.7MB 3.0MB/s 
Collecting flaky (from allennlp)
  Downloading https://files.pythonhosted.org/packages/02/42/cca66659a786567c8af98587d66d75e7d2b6e65662f8daab75db708ac35b/flaky-3.5.3-py2.py3-none-any.whl
Collecting awscli>=1.11.91 (from allennlp)
[?25l  Downloading https://files.pythonhosted.org/packages/0e/f0/5478c9831ca7724fb7c8d63563d51e9788ce218ed2263abc30f3b214a684/awscli-1.16.174-py2.py3-none-any.whl (1.6MB)
[K     |████████████████████████████████| 1.6MB 36.7MB/s 
[?25hCollecting ftfy (from allennlp)
[?25l  Downloading https://files.pythonhosted.org/packages/8f/86/df789c5834f15ae1ca53a8d4c1fc4788676c2e32112f6a786f2625d9c6e6/ftfy-5.5.1-py3-none-any.whl (43kB)
[K     |████████████████████████████████| 51kB 26.8MB/s 
[?25hCollecting unidecode (from all

In [0]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.optim as optim
import torch.backends.cudnn as cudnn

from typing import *
from pathlib import Path

from allennlp.data.dataset_readers import LanguageModelingReader
from allennlp.data.tokenizers import CharacterTokenizer
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data import Vocabulary
from allennlp.data.iterators import BasicIterator
from allennlp.training import Trainer
from sklearn.model_selection import train_test_split
from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.models import Model
from allennlp.nn.util import get_text_field_mask

import matplotlib.pyplot as plt



In [0]:
class LanguageModel(Model):
    """Character-level language model: embedding -> recurrent encoder -> vocab projection.

    Parameters
    ----------
    encoder :
        Recurrent module called as ``encoder(embeddings)``. It must expose a
        ``hidden_size`` attribute and return a tuple whose first element is the
        per-step hidden states (assumed batch-first, i.e. (batch, sequence,
        hidden) -- confirm for any new encoder).
    vocab :
        AllenNLP ``Vocabulary`` used for the vocabulary size and padding index.
    embedding_dim : int
        Size of the token embeddings fed to the encoder.
    """

    def __init__(self, encoder, vocab,
                 embedding_dim=50):
        super().__init__(vocab=vocab)
        # char embedding
        self.vocab_size = vocab.get_vocab_size()
        self.padding_idx = vocab.get_token_index("@@PADDING@@")
        token_embedding = Embedding(
            num_embeddings=self.vocab_size,
            embedding_dim=embedding_dim,
            padding_index=self.padding_idx,
        )
        self.embedding = BasicTextFieldEmbedder({"tokens": token_embedding})
        self.encoder = encoder
        # Maps every hidden state to unnormalised scores over the vocabulary.
        self.projection = nn.Linear(self.encoder.hidden_size, self.vocab_size)
        # Padding positions in the targets do not contribute to the loss.
        self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_idx)

    def forward(self, input_tokens,
                output_tokens=None):
        """Compute next-token logits and, when targets are given, the loss.

        Parameters
        ----------
        input_tokens :
            Text-field dict consumed by the text field embedder.
        output_tokens :
            Optional text-field dict whose ``"tokens"`` entry holds the target
            ids. When omitted (inference), no loss is computed.

        Returns
        -------
        dict with ``"logits"`` (output of the projection layer) and ``"loss"``
        (a scalar tensor, or ``None`` when ``output_tokens`` is ``None``).
        """
        embs = self.embedding(input_tokens)
        x, _ = self.encoder(embs)
        x = self.projection(x)
        if output_tokens is not None:
            # Flatten (batch, sequence) so each position is one classification example.
            loss = self.loss(x.reshape(-1, self.vocab_size), output_tokens["tokens"].flatten())
        else:
            loss = None
        return {"loss": loss, "logits": x}

In [0]:
# Weight initialisation shared by the hand-written recurrent modules.
def init_Ws(parameters):
    """Initialise every parameter in ``parameters`` in place.

    Tensors with more than one dimension (weight matrices) get Xavier-uniform
    values; everything else (bias vectors) is set to zero.
    """
    for param in parameters:
        tensor = param.data
        initializer = nn.init.xavier_uniform_ if tensor.ndimension() > 1 else nn.init.zeros_
        initializer(tensor)

class RNN(nn.Module):
    """Single-layer tanh RNN with explicitly declared parameters.

    Per time step: ``h_t = tanh(x_t @ W_ih + h_{t-1} @ W_hh + b_h)``.

    Parameters
    ----------
    input_size : int
        Feature dimension of each input step.
    hidden_size : int
        Dimension of the hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.input_size, self.hidden_size = input_size, hidden_size

        # torch.Tensor(...) only allocates storage; init_Ws below fills in the values.
        self.W_ih = Parameter(torch.Tensor(input_size, hidden_size))
        self.W_hh = Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_h = Parameter(torch.Tensor(hidden_size))

        init_Ws(self.parameters())

    def forward(self, x, init_state=None):
        """Run the RNN over a batch-first sequence.

        Parameters
        ----------
        x : Tensor
            Input of shape (batch, sequence, feature).
        init_state : Tensor, optional
            Initial hidden state; defaults to zeros of shape (batch, hidden_size)
            with the dtype and device of ``x``.

        Returns
        -------
        (hidden_seq, h_t) where ``hidden_seq`` has shape (batch, sequence,
        hidden_size) and ``h_t`` is the hidden state after the last step.
        """
        bs, seq_size, _ = x.size()
        if init_state is None:
            # Batch-shaped and matching x's dtype/device, so the returned state
            # has a consistent shape and non-float32 modules work too.
            h_t = x.new_zeros(bs, self.hidden_size)
        else:
            h_t = init_state

        if seq_size == 0:
            # Nothing to unroll; stacking an empty list would raise.
            return x.new_zeros(bs, 0, self.hidden_size), h_t

        hidden_seq = []
        for t in range(seq_size):
            x_t = x[:, t, :]
            h_t = torch.tanh((x_t @ self.W_ih) + (h_t @ self.W_hh) + self.b_h)
            hidden_seq.append(h_t)

        # Stack along the time axis directly into (batch, sequence, feature).
        hidden_seq = torch.stack(hidden_seq, dim=1)
        return hidden_seq, h_t

In [0]:
class naiveLSTM(nn.Module):
    """Single-layer LSTM with one explicit parameter set per gate.

    Per time step::

        i_t = sigmoid(x_t @ W_ii + h_{t-1} @ W_hi + b_i)   # input gate
        f_t = sigmoid(x_t @ W_if + h_{t-1} @ W_hf + b_f)   # forget gate
        g_t = tanh   (x_t @ W_ig + h_{t-1} @ W_hg + b_g)   # candidate cell value
        o_t = sigmoid(x_t @ W_io + h_{t-1} @ W_ho + b_o)   # output gate
        c_t = f_t * c_{t-1} + i_t * g_t
        h_t = o_t * tanh(c_t)

    Parameters
    ----------
    input_size : int
        Feature dimension of each input step.
    hidden_size : int
        Dimension of the hidden and cell states.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        # torch.Tensor(...) only allocates storage; init_Ws below fills in the values.
        # input gate
        self.W_ii = Parameter(torch.Tensor(input_size, hidden_size))
        self.W_hi = Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_i = Parameter(torch.Tensor(hidden_size))
        # forget gate
        self.W_if = Parameter(torch.Tensor(input_size, hidden_size))
        self.W_hf = Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_f = Parameter(torch.Tensor(hidden_size))
        # candidate cell value
        self.W_ig = Parameter(torch.Tensor(input_size, hidden_size))
        self.W_hg = Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_g = Parameter(torch.Tensor(hidden_size))
        # output gate
        self.W_io = Parameter(torch.Tensor(input_size, hidden_size))
        self.W_ho = Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_o = Parameter(torch.Tensor(hidden_size))

        init_Ws(self.parameters())

    def forward(self, x, init_states=None):
        """Run the LSTM over a batch-first sequence.

        Parameters
        ----------
        x : Tensor
            Input of shape (batch, sequence, feature).
        init_states : tuple(Tensor, Tensor), optional
            Initial ``(h, c)``; both default to zeros of shape
            (batch, hidden_size) with the dtype and device of ``x``.

        Returns
        -------
        (hidden_seq, (h_t, c_t)) where ``hidden_seq`` has shape (batch,
        sequence, hidden_size) and ``h_t`` / ``c_t`` are the final states.
        """
        bs, seq_size, _ = x.size()
        if init_states is None:
            # Batch-shaped and matching x's dtype/device, so the returned states
            # have a consistent shape and non-float32 modules work too.
            h_t = x.new_zeros(bs, self.hidden_size)
            c_t = x.new_zeros(bs, self.hidden_size)
        else:
            h_t, c_t = init_states

        if seq_size == 0:
            # Nothing to unroll; stacking an empty list would raise.
            return x.new_zeros(bs, 0, self.hidden_size), (h_t, c_t)

        hidden_seq = []
        for t in range(seq_size):
            x_t = x[:, t, :]
            i_t = torch.sigmoid((x_t @ self.W_ii) + (h_t @ self.W_hi) + self.b_i)
            f_t = torch.sigmoid((x_t @ self.W_if) + (h_t @ self.W_hf) + self.b_f)
            g_t = torch.tanh((x_t @ self.W_ig) + (h_t @ self.W_hg) + self.b_g)
            o_t = torch.sigmoid((x_t @ self.W_io) + (h_t @ self.W_ho) + self.b_o)
            c_t = f_t * c_t + i_t * g_t
            h_t = o_t * torch.tanh(c_t)
            hidden_seq.append(h_t)

        # Stack along the time axis directly into (batch, sequence, feature).
        hidden_seq = torch.stack(hidden_seq, dim=1)
        return hidden_seq, (h_t, c_t)

In [0]:
def _train_language_model(model, iterator, train_ds, val_ds, epochs, use_cuda=False):
    """Train ``model`` with Adam through the AllenNLP ``Trainer``; return ``trainer.train()``.

    When ``use_cuda`` is true and CUDA is available, the model is moved to GPU 0;
    otherwise training runs on the CPU.
    """
    cuda_device = 0 if (use_cuda and torch.cuda.is_available()) else -1
    if cuda_device >= 0:
        # Move before building the optimizer so it sees the GPU parameters.
        model = model.cuda()
    trainer = Trainer(
        model=model,
        optimizer=optim.Adam(model.parameters()),
        iterator=iterator,
        train_dataset=train_ds,
        validation_dataset=val_ds,
        num_epochs=epochs,
        cuda_device=cuda_device,
    )
    return trainer.train()


def _rnn_gradient_norms(rnn, embeddings, num_steps):
    """Return ``||d loss_t / d h_0||`` for the first ``num_steps`` steps of ``rnn``.

    ``loss_t`` is ``h_t.abs().sum()``. ``embeddings`` is indexed as
    (batch, time, feature); at most ``embeddings.size(1)`` steps are unrolled.
    """
    # Create h_0 directly on the target device so it stays a leaf tensor.
    h_0 = torch.zeros(rnn.hidden_size, device=embeddings.device, requires_grad=True)
    h_t = h_0
    norms = []
    for t in range(min(num_steps, embeddings.size(1))):
        x_t = embeddings[:, t, :]
        # one step of RNN
        h_t = torch.tanh((x_t @ rnn.W_ih) + (h_t @ rnn.W_hh) + rnn.b_h)
        loss = h_t.abs().sum()
        # autograd.grad returns the gradient without accumulating into any .grad,
        # so nothing has to be zeroed between steps. The graph is kept because
        # the next step extends it and differentiates through it again.
        (grad_h_0,) = torch.autograd.grad(loss, h_0, retain_graph=True)
        norms.append(torch.norm(grad_h_0).item())
    return norms


def _lstm_gradient_norms(lstm, embeddings, num_steps, b_i=None, b_f=None):
    """Return ``||d loss_t / d h_0||`` for the first ``num_steps`` steps of ``lstm``.

    ``b_i`` / ``b_f`` optionally replace the input-gate / forget-gate biases, so
    the different bias settings of the experiments can be run without modifying
    the module. ``None`` means "use the module's own bias".
    """
    b_i = lstm.b_i if b_i is None else b_i
    b_f = lstm.b_f if b_f is None else b_f
    # Create h_0 directly on the target device so it stays a leaf tensor.
    h_0 = torch.zeros(lstm.hidden_size, device=embeddings.device, requires_grad=True)
    c_0 = torch.zeros(lstm.hidden_size, device=embeddings.device)
    h_t, c_t = h_0, c_0
    norms = []
    for t in range(min(num_steps, embeddings.size(1))):
        x_t = embeddings[:, t, :]
        i_t = torch.sigmoid((x_t @ lstm.W_ii) + (h_t @ lstm.W_hi) + b_i)
        f_t = torch.sigmoid((x_t @ lstm.W_if) + (h_t @ lstm.W_hf) + b_f)
        g_t = torch.tanh((x_t @ lstm.W_ig) + (h_t @ lstm.W_hg) + lstm.b_g)
        o_t = torch.sigmoid((x_t @ lstm.W_io) + (h_t @ lstm.W_ho) + lstm.b_o)
        c_t = f_t * c_t + i_t * g_t
        h_t = o_t * torch.tanh(c_t)
        loss = h_t.abs().sum()
        (grad_h_0,) = torch.autograd.grad(loss, h_0, retain_graph=True)
        norms.append(torch.norm(grad_h_0).item())
    return norms


def _plot_gradient_norms(norms, title):
    """Plot per-time-step gradient norms on a labelled figure and show it."""
    fig, ax = plt.subplots()
    ax.plot(norms)
    ax.set_xlabel("time steps")
    # The curve is the gradient norm w.r.t. h_0, not the loss itself.
    ax.set_ylabel("gradient norm w.r.t. h_0")
    ax.set_title(title)
    plt.show()


def main():
    """Run all experiments: train the language models, then plot gradient norms.

    Reads ``brown.txt`` from the working directory, trains PyTorch RNN/LSTM
    language models (Question 1) and the hand-written ``RNN`` / ``naiveLSTM``
    ones (Questions 2 & 3), then plots how the gradient w.r.t. the initial
    hidden state evolves over time steps (Questions 4-6).
    """
    DATA_ROOT = Path("./")
    embedding_dim = 50       # input size of every encoder
    hidden_dim = 125         # hidden size of every encoder
    gradient_steps = 100     # number of unrolled steps in the gradient plots

    char_tokenizer = CharacterTokenizer(lowercase_characters=True)

    reader = LanguageModelingReader(
        tokens_per_instance=500,
        tokenizer=char_tokenizer,
        token_indexers={"tokens": SingleIdTokenIndexer()},
    )

    train_ds = reader.read(DATA_ROOT / "brown.txt")
    train_ds, val_ds = train_test_split(train_ds, random_state=0, test_size=0.1)

    vocab = Vocabulary.from_instances(train_ds)

    iterator = BasicIterator(batch_size=32)
    iterator.index_with(vocab)

    # Question 1 -----------------------------------------------------------------
    # Built-in PyTorch encoders, trained on the GPU when one is available.
    lm_pytorchRNN = LanguageModel(
        nn.RNN(embedding_dim, hidden_dim, batch_first=True), vocab, embedding_dim=embedding_dim)
    _train_language_model(lm_pytorchRNN, iterator, train_ds, val_ds, epochs=30, use_cuda=True)

    lm_pytorchLSTM = LanguageModel(
        nn.LSTM(embedding_dim, hidden_dim, batch_first=True), vocab, embedding_dim=embedding_dim)
    _train_language_model(lm_pytorchLSTM, iterator, train_ds, val_ds, epochs=30, use_cuda=True)

    # Question 2 & 3 -------------------------------------------------------------
    # Hand-written encoders, trained on the CPU for a single epoch.
    lm_RNN = LanguageModel(RNN(embedding_dim, hidden_dim), vocab, embedding_dim=embedding_dim)
    _train_language_model(lm_RNN, iterator, train_ds, val_ds, epochs=1)

    lm_naiveLSTM = LanguageModel(naiveLSTM(embedding_dim, hidden_dim), vocab, embedding_dim=embedding_dim)
    _train_language_model(lm_naiveLSTM, iterator, train_ds, val_ds, epochs=1)

    # Question 4 -----------------------------------------------------------------
    test_batch = next(iterator(train_ds))
    # Detach: the embeddings are fixed inputs for the gradient study, so the
    # trained model's embedding weights must not be part of the graph.
    test_embeddings = lm_naiveLSTM.embedding(test_batch["input_tokens"]).detach()

    rnn = RNN(embedding_dim, hidden_dim)
    rnn_grads = _rnn_gradient_norms(rnn, test_embeddings, gradient_steps)
    _plot_gradient_norms(rnn_grads, "RNN: gradient norm w.r.t. h_0")

    # Question 5 & 6 -------------------------------------------------------------
    # LSTM gradient with forget gate (three settings for forget gate, three
    # settings for input gate). Different experiments are run by setting the
    # biases on the input and forget gates to different values via b_i / b_f.
    # Question 5
    # 1) without forget gate
    # 2) with forget gate, b_f = 1
    # 3) with forget gate, b_f = 0
    # Question 6
    # 1) without input gate
    # 2) with input gate, b_f = 1
    # 3) with input gate, b_f = 0
    # NOTE(review): the Question 6 settings presumably refer to the input-gate
    # bias rather than b_f -- confirm against the assignment text.
    lstm = naiveLSTM(embedding_dim, hidden_dim)
    new_b_i = torch.zeros(lstm.b_i.size())
    lstm_grads = _lstm_gradient_norms(lstm, test_embeddings, gradient_steps, b_i=new_b_i)
    print(lstm_grads)
    _plot_gradient_norms(lstm_grads, "LSTM: gradient norm w.r.t. h_0")

    
    

In [0]:
# Entry point: run the full experiment when this cell/script is executed directly.
if __name__ == "__main__":
    main()
    



0it [00:00, ?it/s][A[A

  0%|          | 0/11994 [00:00<?, ?it/s][A[A

 37%|███▋      | 4464/11994 [00:00<00:00, 43343.45it/s][A[A

 49%|████▊     | 5843/11994 [00:00<00:00, 25349.68it/s][A[A

 84%|████████▎ | 10034/11994 [00:00<00:00, 28400.16it/s][A[A

100%|██████████| 11994/11994 [00:00<00:00, 33182.86it/s][A[A
1it [00:14, 14.43s/it][A
169it [00:14, 10.10s/it][A
612it [00:14,  7.07s/it][A
1034it [00:14,  4.95s/it][A
1405it [00:14,  3.47s/it][A
1833it [00:14,  2.43s/it][A
2196it [00:15,  1.70s/it][A
2624it [00:15,  1.19s/it][A
2992it [00:15,  1.20it/s][A
3423it [00:15,  1.72it/s][A
3856it [00:15,  2.45it/s][A
4255it [00:15,  3.50it/s][A
4679it [00:15,  5.00it/s][A
5080it [00:15,  7.14it/s][A
5510it [00:15, 10.19it/s][A
5913it [00:16, 14.54it/s][A
6347it [00:16, 20.74it/s][A
6754it [00:16, 29.56it/s][A
7190it [00:16, 42.10it/s][A
7610it [00:16, 59.89it/s][A
8022it [00:16, 84.97it/s][A
8455it [00:16, 120.38it/s][A
8865it [00:16, 169.66it/s][A
9289it