<a href="https://colab.research.google.com/github/samibahig/IFT6390/blob/main/Assignment_2_4_1%2C_4_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np

import torch

import torch.nn as nn

import torch.nn.functional as F
import math, copy
import pickle

from torch.autograd import Variable

def clones(module, N):
    """
    Build N independent deep copies of a module.

    inputs:
        module: a pytorch nn.module to replicate
        N (int): how many copies to make

    returns:
        a ModuleList holding the N copies; every copy owns its own parameters
        (the ModuleList is itself also a module)
    """
    replicas = nn.ModuleList()
    for _ in range(N):
        # deepcopy so that no parameter tensor is shared between replicas
        replicas.append(copy.deepcopy(module))
    return replicas



# Problem 1

class RNN(nn.Module):
    """A stacked vanilla RNN with Tanh nonlinearities."""

    def __init__(self, emb_size, hidden_size, seq_len, batch_size, vocab_size,
                 num_layers, dp_keep_prob):
        """
        Initialization of the parameters of the recurrent and fc layers.

        Supports any number of stacked hidden layers (specified by num_layers),
        uses an input embedding layer, and applies dropout to the output of the
        embedding and of each recurrent layer.

        emb_size:     The number of units in the input embeddings
        hidden_size:  The number of hidden units per layer
        seq_len:      The length of the input sequences
        batch_size:   The batch size used by init_hidden
        vocab_size:   The number of tokens in the vocabulary (10,000 for Penn TreeBank)
        num_layers:   The depth of the stack (i.e. the number of hidden layers at
                      each time-step)
        dp_keep_prob: The probability of *not* dropping out units in the
                      non-recurrent connections.
                      Dropout is not applied on the recurrent connections.
        """
        super(RNN, self).__init__()

        # Hyper-parameters
        self.emb_size = emb_size
        self.hidden_size = hidden_size
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.dp_keep_prob = dp_keep_prob
        self.num_layers = num_layers

        # Input embedding layer
        self.embeddings = nn.Embedding(self.vocab_size, self.emb_size)

        # Recurrent layers. Each one consumes the concatenation
        # [layer input, previous hidden state of that layer]; only the first
        # layer sees the embedding, the others see the layer below.
        self.layers = nn.ModuleList()
        self.layers.append(nn.Linear(emb_size + hidden_size, hidden_size))
        self.layers.extend(clones(nn.Linear(2*hidden_size, hidden_size), num_layers-1))

        # nn.Dropout takes the probability of DROPPING a unit
        self.dropout = nn.Dropout(1 - self.dp_keep_prob)

        # Output projection onto the vocabulary
        self.out_layer = nn.Linear(hidden_size, vocab_size)

        self.init_weights()

    def init_weights(self):
        """Initialize the embedding, recurrent and output parameters.

        Embedding and output weights are uniform in [-0.1, 0.1], the output
        bias is zero, and every recurrent weight and bias is uniform in
        [-k, k] with k = 1 / sqrt(hidden_size).
        """
        nn.init.uniform_(self.embeddings.weight, -0.1, 0.1)

        k = 1/math.sqrt(self.hidden_size)
        for i in range(self.num_layers):
            nn.init.uniform_(self.layers[i].weight, -k, k)
            nn.init.uniform_(self.layers[i].bias, -k, k)

        nn.init.uniform_(self.out_layer.weight, -0.1, 0.1)
        nn.init.zeros_(self.out_layer.bias)

    def init_hidden(self):
        """Initialize the hidden states to zero.

        This is used for the first mini-batch in an epoch, only.

        Returns a tensor of shape (num_layers, batch_size, hidden_size).
        """
        return torch.zeros(self.num_layers, self.batch_size, self.hidden_size)

    def _step(self, layer_input, hidden):
        """Advance every layer of the stack by one time-step.

        `hidden` is updated in place (one slice per layer). Returns the
        output of the top layer after dropout.
        """
        for layer in range(self.num_layers):
            state = torch.tanh(
                self.layers[layer](torch.cat([layer_input, hidden[layer]], 1)))
            hidden[layer] = state
            # Dropout only on the vertical (non-recurrent) connection: the
            # state written to `hidden` above is not dropped out.
            layer_input = self.dropout(state)
        return layer_input

    def forward(self, inputs, hidden):
        """ Compute the recurrent updates.

        The outer loop iterates over time-steps and, inside `_step`, the inner
        loop iterates over the hidden layers of the stack.

        Arguments:
            - inputs: A mini-batch of input sequences, composed of integers that
                        represent the index of the current token(s) in the vocabulary.
                            shape: (seq_len, batch_size)
            - hidden: The initial hidden states for every layer of the stacked RNN.
                      This tensor is updated in place.
                            shape: (num_layers, batch_size, hidden_size)

        Returns:
            - Logits for the softmax over output tokens at every time-step.
                  **Do NOT apply softmax to the outputs!**
                        shape: (seq_len, batch_size, vocab_size)
            - The final hidden states for every layer of the stacked RNN
              (the same tensor object that was passed in as `hidden`).
                        shape: (num_layers, batch_size, hidden_size)
        """
        # Sizes are taken from the actual input rather than from
        # self.seq_len / self.batch_size, so a batch of a different shape
        # (e.g. a shorter final batch) is handled as well.
        seq_len, batch_size = inputs.size(0), inputs.size(1)

        embed_out = self.embeddings(inputs)  # (seq_len, batch_size, emb_size)

        logits = torch.zeros(seq_len, batch_size, self.vocab_size,
                             device=inputs.device)

        for timestep in range(seq_len):
            top = self._step(self.dropout(embed_out[timestep]), hidden)
            logits[timestep] = self.out_layer(top)

        return logits, hidden

    # Problem 4.2

    def generate(self, inputs, hidden, generated_seq_len):
        """Greedily generate token sequences from the RNN.

        At every step the token with the highest logit is selected and fed
        back as the input of the next step. The dropout module is still
        applied between layers, so put the model in eval mode for
        deterministic output.

        Arguments:
            - inputs: A mini-batch of input tokens (NOT sequences!)
                            shape: (batch_size)
            - hidden: The initial hidden states for every layer of the stacked RNN.
                      This tensor is updated in place.
                            shape: (num_layers, batch_size, hidden_size)
            - generated_seq_len: The length of the sequence to generate.

        Returns:
            - Sampled sequences of token indices (integer dtype, on the same
              device as `inputs`)
                        shape: (generated_seq_len, batch_size)
        """
        # Token ids are indices, so keep them as integers instead of floats.
        samples = torch.zeros(generated_seq_len, inputs.size(0),
                              dtype=torch.long, device=inputs.device)

        tokens = inputs
        for timestep in range(generated_seq_len):
            top = self._step(self.embeddings(tokens), hidden)
            # Run the output layer once and reuse the result both as the
            # stored sample and as the next input.
            tokens = torch.argmax(self.out_layer(top), dim=1)
            samples[timestep] = tokens
        return samples
   

# Problem 2

class GRU(nn.Module):
    """A stacked gated recurrent unit (GRU) RNN.

    Follows the same template as the vanilla RNN, but uses the GRU update
    equations: a reset gate `r`, a gate `z` that interpolates between the
    previous state and the candidate state, and a candidate ("memory
    content") layer `h`.

    Initialization scheme:
      - embedding and output weights: uniform in [-0.1, 0.1]
      - output bias: zero
      - all gate / candidate weights AND biases: uniform in [-k, k] where
        k is the square root of 1/hidden_size

    Each init method calls nn.init once for the weights and once for the
    biases of a layer, in that order.
    """

    def __init__(self, emb_size, hidden_size, seq_len, batch_size, vocab_size,
                 num_layers, dp_keep_prob):
        """
        emb_size:     The number of units in the input embeddings
        hidden_size:  The number of hidden units per layer
        seq_len:      The length of the input sequences
        batch_size:   The batch size used by init_hidden
        vocab_size:   The number of tokens in the vocabulary
        num_layers:   The depth of the stack
        dp_keep_prob: The probability of *not* dropping out units in the
                      non-recurrent connections.
        """
        super(GRU, self).__init__()

        # Model parameters
        self.emb_size = emb_size
        self.hidden_size = hidden_size
        self.seq_len = seq_len
        self.vocab_size = vocab_size
        self.num_layers = num_layers
        self.dp_keep_prob = dp_keep_prob
        self.batch_size = batch_size

        self.word_embeddings = nn.Embedding(vocab_size, emb_size)

        # One stack of linear layers per gate. The construction order
        # (r, z, h) is kept fixed so that random-number consumption is
        # stable under a given seed.
        self.r = self._make_gate_stack()  # "reset gate" layers
        self.z = self._make_gate_stack()  # "forget gate" layers
        self.h = self._make_gate_stack()  # "memory content" layers

        # nn.Dropout takes the probability of DROPPING a unit
        self.dropout = nn.Dropout(p=(1 - dp_keep_prob))

        # The output layer
        self.out_layer = nn.Linear(hidden_size, vocab_size)

        self.init_embedding_weights_uniform()
        self.init_reset_gate_weights_uniform()
        self.init_forget_gate_weights_uniform()
        self.init_memory_weights_uniform()
        self.init_out_layer_weights_uniform()

    def _make_gate_stack(self):
        """Create the per-layer linear maps of one gate.

        The first layer consumes [embedding, hidden]; every further layer
        consumes [output of the layer below, hidden].
        """
        stack = nn.ModuleList()
        stack.append(nn.Linear(self.emb_size + self.hidden_size, self.hidden_size))
        stack.extend(clones(nn.Linear(2*self.hidden_size, self.hidden_size),
                            self.num_layers-1))
        return stack

    def _init_stack_uniform(self, stack):
        """Initialize weights, then biases, of each layer uniformly in [-k, k]."""
        k = 1/math.sqrt(self.hidden_size)
        for i in range(self.num_layers):
            nn.init.uniform_(stack[i].weight, -k, k)
            nn.init.uniform_(stack[i].bias, -k, k)

    def init_embedding_weights_uniform(self, init_range=0.1):
        """Initialize the embedding weights uniformly in [-init_range, init_range]."""
        # The original ignored `init_range` and always used 0.1; the default
        # keeps that behavior.
        nn.init.uniform_(self.word_embeddings.weight, -init_range, init_range)

    def init_reset_gate_weights_uniform(self):
        """Initialize the reset-gate layers uniformly in [-k, k]."""
        self._init_stack_uniform(self.r)

    def init_forget_gate_weights_uniform(self):
        """Initialize the forget-gate layers uniformly in [-k, k]."""
        self._init_stack_uniform(self.z)

    def init_memory_weights_uniform(self):
        """Initialize the memory-content layers uniformly in [-k, k]."""
        self._init_stack_uniform(self.h)

    def init_out_layer_weights_uniform(self):
        """Initialize output weights uniformly in [-0.1, 0.1] and the bias to 0."""
        nn.init.uniform_(self.out_layer.weight, -0.1, 0.1)
        nn.init.zeros_(self.out_layer.bias)

    def init_hidden(self):
        """
        This method returns a tensor of shape
        (self.num_layers, self.batch_size, self.hidden_size)
        filled with zeros as the initial hidden states of the GRU.
        """
        return torch.zeros(self.num_layers, self.batch_size, self.hidden_size)

    def _step(self, layer_input, hidden):
        """Advance every layer of the stack by one time-step.

        `hidden` is updated in place (one slice per layer). Returns the
        output of the top layer after dropout.
        """
        for layer in range(self.num_layers):
            # Work on a copy of the previous state: `hidden` is overwritten
            # in place below, and autograd must not see a tensor it saved
            # for the backward pass being modified.
            prev = hidden[layer].clone()

            gate_input = torch.cat([layer_input, prev], 1)
            r_gate = torch.sigmoid(self.r[layer](gate_input))
            z_gate = torch.sigmoid(self.z[layer](gate_input))
            candidate = torch.tanh(
                self.h[layer](torch.cat([layer_input, r_gate * prev], 1)))

            state = (1 - z_gate) * prev + z_gate * candidate
            hidden[layer] = state

            # Dropout only on the vertical (non-recurrent) connection
            layer_input = self.dropout(state)
        return layer_input

    def forward(self, inputs, hidden):
        """ Compute the recurrent updates.

        The outer loop iterates over time-steps and, inside `_step`, the inner
        loop iterates over the hidden layers of the stack.

        Arguments:
            - inputs: A mini-batch of input sequences, composed of integers that
                        represent the index of the current token(s) in the vocabulary.
                            shape: (seq_len, batch_size)
            - hidden: The initial hidden states for every layer of the stacked RNN.
                      This tensor is updated in place.
                            shape: (num_layers, batch_size, hidden_size)

        Returns:
            - Logits for the softmax over output tokens at every time-step.
                  **Do NOT apply softmax to the outputs!**
                        shape: (seq_len, batch_size, vocab_size)
            - The final hidden states for every layer of the stacked RNN
              (the same tensor object that was passed in as `hidden`).
                        shape: (num_layers, batch_size, hidden_size)
        """
        # Sizes are taken from the actual input rather than from
        # self.seq_len / self.batch_size, so a batch of a different shape
        # (e.g. a shorter final batch) is handled as well.
        seq_len, batch_size = inputs.size(0), inputs.size(1)

        embed_out = self.word_embeddings(inputs)  # (seq_len, batch_size, emb_size)

        logits = torch.zeros(seq_len, batch_size, self.vocab_size,
                             device=inputs.device)

        for timestep in range(seq_len):
            top = self._step(self.dropout(embed_out[timestep]), hidden)
            logits[timestep] = self.out_layer(top)

        return logits, hidden

    def generate(self, input, hidden, generated_seq_len):
        """
        Generate a sample sequence from the GRU.

        This is similar to the forward method but instead of having ground
        truth input for each time step, the token with maximum logit is
        selected at each time step and fed as input at the next time step.
        The dropout module is still applied between layers, so put the model
        in eval mode for deterministic output.

        Arguments:
            - input: A mini-batch of input tokens (NOT sequences!)
                            shape: (batch_size)
            - hidden: The initial hidden states for every layer of the stacked RNN.
                      This tensor is updated in place.
                            shape: (num_layers, batch_size, hidden_size)
            - generated_seq_len: The length of the sequence to generate.
                           Note that this can be different than the length used
                           for training (self.seq_len)

        Returns:
            - Sampled sequences of token indices (integer dtype, on the same
              device as `input`)
                        shape: (generated_seq_len, batch_size)
        """
        # Token ids are indices: store them as integers, on the input's device.
        samples = torch.zeros(generated_seq_len, input.size(0),
                              dtype=torch.long, device=input.device)

        tokens = input
        for timestep in range(generated_seq_len):
            top = self._step(self.word_embeddings(tokens), hidden)
            # Run the output layer once and reuse the result both as the
            # stored sample and as the next input.
            tokens = torch.argmax(self.out_layer(top), dim=1)
            samples[timestep] = tokens
        return samples


# `generate` used to be defined at module level (its `def` was dedented out
# of the class). Keep the module-level name so that existing calls of the
# form generate(model, input, hidden, n) continue to work.
generate = GRU.generate
        

    



# Problem 2
##############################################################################
#
# Code for the Transformer model
#
##############################################################################

"""
Implement the MultiHeadedAttention module of the transformer architecture.
All other necessary modules have already been implemented for you.

We're building a transformer architecture for next-step prediction tasks, and
applying it to sequential language modelling. We use a binary "mask" to specify
which time-steps the model can use for the current prediction.
This ensures that the model only attends to previous time-steps.

The model first encodes inputs using the concatenation of a learned WordEmbedding
and a (in our case, hard-coded) PositionalEncoding.
The word embedding maps a word's one-hot encoding into a dense real vector.
The positional encoding 'tags' each element of an input sequence with a code that
identifies its position (i.e. time-step).

These encodings of the inputs are then transformed repeatedly using multiple
copies of a TransformerBlock.
This block consists of an application of MultiHeadedAttention, followed by a
standard MLP; the MLP applies *the same* mapping at every position.
Both the attention and the MLP are applied with Resnet-style skip connections,
and layer normalization.

The complete model consists of the embeddings, the stacked transformer blocks,
and a linear layer followed by a softmax.
"""

#This code has been modified from an open-source project, by David Krueger.
#The original license is included below:
#MIT License
#
#Copyright (c) 2018 Alexander Rush
#
#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is
#furnished to do so, subject to the following conditions:
#
#The above copyright notice and this permission notice shall be included in all
#copies or substantial portions of the Software.
#
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
#SOFTWARE.



#----------------------------------------------------------------------------------

class MultiHeadedAttention(nn.Module):
    """Masked multi-head scaled dot-product attention."""

    def __init__(self, n_heads, n_units, dropout=0.1):
        """
        n_heads: the number of attention heads
        n_units: the number of input and output units
        dropout: probability of DROPPING units
        """
        super(MultiHeadedAttention, self).__init__()
        # Keys, values and queries of each head all have size d_k, which
        # requires n_heads to evenly divide n_units.
        self.d_k = n_units // n_heads
        assert n_units % n_heads == 0
        self.n_units = n_units
        self.n_heads = n_heads
        # Four n_units -> n_units projections: query, key, value and the
        # final output projection, in that order. Their parameters are
        # re-initialized (Glorot) by make_model.
        self.linears = clones(nn.Linear(n_units, n_units), 4)
        self.dropout = nn.Dropout(p=dropout)

    def attention(self, query, key, value, mask=None, dropout=None):
        """Scaled dot-product attention.

        query, key, value: batch_size x n_heads x seq_len x d_k
        mask (optional):   broadcastable to batch_size x n_heads x seq_len x seq_len;
                           positions where mask == 0 are hidden
        dropout (optional): callable applied to the normalized scores

        The steps are, in this order: compare queries with keys, mask,
        normalize with softmax, apply dropout, then gather the values.

        Returns the attended values (batch_size x n_heads x seq_len x d_k)
        and the normalized scores after dropout
        (batch_size x n_heads x seq_len x seq_len).
        """
        scale = math.sqrt(query.size(-1))
        similarity = torch.matmul(query, key.transpose(-2, -1)) / scale

        if mask is not None:
            # A large negative score becomes a zero weight after softmax.
            similarity = similarity.masked_fill(mask == 0, -1e9)

        weights = F.softmax(similarity, dim=-1)
        if dropout is not None:
            weights = dropout(weights)

        attended = torch.matmul(weights, value)
        return attended, weights

    def forward(self, query, key, value, mask=None):
        """Masked multi-head attention.

        query, key, value: (batch_size, seq_len, self.n_units)
        mask (optional):   (batch_size, seq_len, seq_len)

        Returns a tensor of shape (batch_size, seq_len, self.n_units).
        """
        n_batch = query.size(0)

        # Insert a head axis so the same mask is applied to every head.
        head_mask = None if mask is None else mask.unsqueeze(1)

        def split_heads(projection, x):
            # n_units -> n_heads x d_k, with the head axis moved before the
            # sequence axis: (batch, n_heads, seq_len, d_k)
            projected = projection(x)
            return projected.view(n_batch, -1, self.n_heads, self.d_k).transpose(1, 2)

        q = split_heads(self.linears[0], query)
        k = split_heads(self.linears[1], key)
        v = split_heads(self.linears[2], value)

        context, _ = self.attention(q, k, v, mask=head_mask, dropout=self.dropout)

        # "Concatenate" the heads again with a view, then apply the final
        # linear projection.
        merged = context.transpose(1, 2).contiguous()
        merged = merged.view(n_batch, -1, self.n_heads * self.d_k)
        return self.linears[-1](merged)










#----------------------------------------------------------------------------------
# The encodings of elements of the input sequence

class WordEmbedding(nn.Module):
    """Learned token embedding, scaled by sqrt(n_units)."""

    def __init__(self, n_units, vocab):
        super(WordEmbedding, self).__init__()
        # Look-up table: one n_units-dimensional vector per vocabulary entry
        self.lut = nn.Embedding(vocab, n_units)
        self.n_units = n_units

    def forward(self, x):
        """Map token indices `x` to their embeddings times sqrt(n_units)."""
        embedded = self.lut(x)
        scale = math.sqrt(self.n_units)
        return embedded * scale


class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position codes to a sequence of embeddings."""

    def __init__(self, n_units, dropout, max_len=5000):
        """
        n_units: size of each embedding vector
        dropout: probability of DROPPING units after the codes are added
        max_len: longest sequence for which codes are precomputed
        """
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, n_units, 2).float() *
                             -(math.log(10000.0) / n_units))
        angles = position * div_term

        pe = torch.zeros(max_len, n_units)
        pe[:, 0::2] = torch.sin(angles)
        # For an odd n_units there is one fewer odd-indexed column than
        # even-indexed ones, so trim the cosine block to fit. For an even
        # n_units the slice keeps every column.
        pe[:, 1::2] = torch.cos(angles[:, :n_units // 2])

        # Registered as a buffer: saved and moved with the module, but not a
        # trainable parameter. Shape: (1, max_len, n_units).
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add the position codes to `x` and apply dropout.

        The codes are sliced to x.size(1) positions and added along the last
        two axes of `x`.
        """
        # Buffers do not require grad, so no Variable wrapper is needed.
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)



#----------------------------------------------------------------------------------
# The TransformerBlock and the full Transformer

class TransformerBlock(nn.Module):
    """Self-attention followed by a position-wise MLP, each wrapped in a
    residual sub-layer."""

    def __init__(self, size, self_attn, feed_forward, dropout):
        super(TransformerBlock, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        # Two residual wrappers: index 0 for attention, index 1 for the MLP
        self.sublayer = clones(ResidualSkipConnectionWithLayerNorm(size, dropout), 2)

    def forward(self, x, mask):
        """Apply the attention sub-layer, then the MLP sub-layer."""
        def attend(normed):
            # Self-attention: queries, keys and values are the same tensor
            return self.self_attn(normed, normed, normed, mask)

        attended = self.sublayer[0](x, attend)
        return self.sublayer[1](attended, self.feed_forward)


class TransformerStack(nn.Module):
    """
    A stack of n_blocks copies of a given layer, followed by a final
    layer normalization.
    """
    def __init__(self, layer, n_blocks): # layer will be a TransformerBlock
        super(TransformerStack, self).__init__()
        self.layers = clones(layer, n_blocks)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        """Run `x` through every layer in order (each receives the same
        mask), then normalize the result."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        normalized = self.norm(hidden)
        return normalized


class FullTransformer(nn.Module):
    """Embedding, transformer stack, and a linear layer with log-softmax."""

    def __init__(self, transformer_stack, embedding, n_units, vocab_size):
        super(FullTransformer, self).__init__()
        self.transformer_stack = transformer_stack
        self.embedding = embedding
        # Projects each position's n_units features onto the vocabulary
        self.output_layer = nn.Linear(n_units, vocab_size)

    def forward(self, input_sequence, mask):
        """Return log-probabilities over the vocabulary (normalized along
        the last axis) for the given input."""
        encoded = self.transformer_stack(self.embedding(input_sequence), mask)
        scores = self.output_layer(encoded)
        return F.log_softmax(scores, dim=-1)


def make_model(vocab_size, n_blocks=6,
               n_units=512, n_heads=16, dropout=0.1):
    """Helper: Construct a model from hyperparameters.

    vocab_size: number of tokens in the vocabulary
    n_blocks:   number of stacked transformer blocks
    n_units:    width of the embeddings and of every block
    n_heads:    number of attention heads per block
    dropout:    dropout probability passed to the MLP, the positional
                encoding and the transformer blocks

    Returns a FullTransformer whose parameters with more than one dimension
    are initialized with Glorot / fan_avg.
    """
    # Prototype sub-modules; deep copies of them go into the model.
    # NOTE(review): the attention module is built without the `dropout`
    # argument, so it uses its own default - confirm this is intended.
    attn_proto = MultiHeadedAttention(n_heads, n_units)
    mlp_proto = MLP(n_units, dropout)
    position_proto = PositionalEncoding(n_units, dropout)

    block = TransformerBlock(n_units,
                             copy.deepcopy(attn_proto),
                             copy.deepcopy(mlp_proto),
                             dropout)
    stack = TransformerStack(block, n_blocks)
    embedding = nn.Sequential(WordEmbedding(n_units, vocab_size),
                              copy.deepcopy(position_proto))
    model = FullTransformer(
        transformer_stack=stack,
        embedding=embedding,
        n_units=n_units,
        vocab_size=vocab_size
        )

    # Initialize parameters with Glorot / fan_avg. One-dimensional
    # parameters (e.g. biases) are left untouched.
    for param in model.parameters():
        if param.dim() > 1:
            nn.init.xavier_uniform_(param)
    return model


#----------------------------------------------------------------------------------
# Data processing

def subsequent_mask(size):
    """ Helper for creating the masks.

    Returns a boolean tensor of shape (1, size, size) that is True on and
    below the main diagonal and False strictly above it.
    """
    shape = (1, size, size)
    # Ones strictly above the diagonal mark the "future" positions ...
    future = np.triu(np.ones(shape), k=1).astype('uint8')
    # ... and comparing with 0 flips that into "allowed" positions.
    allowed = torch.from_numpy(future) == 0
    return allowed

class Batch:
    "Object for holding a batch of data with mask during training."
    def __init__(self, x, pad=0):
        # x: the batch of token indices; pad: the index treated as padding
        self.data = x
        self.mask = self.make_mask(self.data, pad)

    @staticmethod
    def make_mask(data, pad):
        "Create a mask to hide padding and future words."
        # True wherever the token is not padding; the extra axis lets it
        # combine with the square mask below.
        not_padding = (data != pad).unsqueeze(-2)
        # Square mask over the last axis of `data`, cast to the same type
        hide_future = subsequent_mask(data.size(-1)).type_as(not_padding.data)
        return not_padding & Variable(hide_future)


#----------------------------------------------------------------------------------
# Some standard modules

class LayerNorm(nn.Module):
    "layer normalization, as in: https://arxiv.org/abs/1607.06450"
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        # Learned per-feature gain (starts at 1) and shift (starts at 0)
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        # Added to the standard deviation to avoid dividing by zero
        self.eps = eps

    def forward(self, x):
        "Normalize `x` along its last axis, then apply gain and shift."
        centered = x - x.mean(-1, keepdim=True)
        spread = x.std(-1, keepdim=True) + self.eps
        return self.a_2 * centered / spread + self.b_2


class ResidualSkipConnectionWithLayerNorm(nn.Module):
    """
    A residual (skip) connection around a sub-layer, combined with layer norm.
    Note for code simplicity the norm is applied first (to the sub-layer's
    input) as opposed to last.
    """
    def __init__(self, size, dropout):
        super(ResidualSkipConnectionWithLayerNorm, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply residual connection to any sublayer with the same size."
        normed = self.norm(x)
        update = self.dropout(sublayer(normed))
        return x + update


class MLP(nn.Module):
    """
    This is just an MLP with 1 hidden layer.

    inputs:
        n_units (int): size of the input and of the output
        dropout (float): drop probability applied to the hidden activations
        n_hidden (int): width of the hidden layer; the default of 2048 keeps
                        the behavior of the previous hard-coded value
    """
    def __init__(self, n_units, dropout=0.1, n_hidden=2048):
        super(MLP, self).__init__()
        self.w_1 = nn.Linear(n_units, n_hidden)
        self.w_2 = nn.Linear(n_hidden, n_units)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Linear -> ReLU -> dropout -> Linear, mapping back to n_units features.
        return self.w_2(self.dropout(F.relu(self.w_1(x))))

RUN

In [None]:
#!/bin/python
# coding: utf-8

# Code outline/scaffold for
# ASSIGNMENT 2: RNNs, Attention, and Optimization
# By Tegan Maharaj, David Krueger, and Chin-Wei Huang
# Edits 2020 by Jessica Thompson, Jonathan Cornford and Lluis Castrejon
# IFT6135 at University of Montreal
# Winter 2020
#
# based on code from:
#    https://github.com/deeplearningathome/pytorch-language-model/blob/master/reader.py
#    https://github.com/ceshine/examples/blob/master/word_language_model/main.py
#    https://github.com/teganmaharaj/zoneout/blob/master/zoneout_word_ptb.py
#    https://github.com/harvardnlp/annotated-transformer

## GENERAL INSTRUCTIONS

# - We encourage you to read and understand this code; there are some notes and comments to help you.
# - Typically, all of your code to submit should be written in solution.py;
#  see further instructions at the top of that file / in TODOs.
#      - GRU recurrent unit
#      - Multi-head attention for the Transformer

# - Other than this file and solution.py, you will probably also need to modify
# and/or write additional code to create plots (learning curves, loss w.r.t.
# time, gradients w.r.t. hiddens) and to load a saved model (computing gradients
# w.r.t. hidden and for sampling from the model). This code will not be graded.


# ## PROBLEM-SPECIFIC INSTRUCTIONS:
#    - For Problem 3.1 the hyperparameter settings you should run are as follows
#            --model=RNN --optimizer=SGD --initial_lr=1.0 --batch_size=128 --seq_len=35 --hidden_size=512 --num_layers=2 --dp_keep_prob=0.8  --num_epochs=20 --save_best
#            --model=RNN --optimizer=SGD --initial_lr=1.0 --batch_size=20  --seq_len=35 --hidden_size=512 --num_layers=2 --dp_keep_prob=0.8  --num_epochs=20
#            --model=RNN --optimizer=SGD --initial_lr=10.0 --batch_size=128 --seq_len=35 --hidden_size=512 --num_layers=2 --dp_keep_prob=0.8  --num_epochs=20
#            --model=RNN --optimizer=ADAM --initial_lr=0.001 --batch_size=128 --seq_len=35 --hidden_size=512 --num_layers=2 --dp_keep_prob=0.8  --num_epochs=20
#            --model=RNN --optimizer=ADAM --initial_lr=0.0001 --batch_size=128 --seq_len=35 --hidden_size=512 --num_layers=2 --dp_keep_prob=0.8  --num_epochs=20

#    - For Problem 3.2 the hyperparameter settings you should run are as follows
#            --model=GRU --optimizer=ADAM --initial_lr=0.001 --batch_size=128 --seq_len=35 --hidden_size=512 --num_layers=2 --dp_keep_prob=0.5  --num_epochs=20 --save_best
#            --model=GRU --optimizer=SGD  --initial_lr=10.0  --batch_size=128 --seq_len=35 --hidden_size=512 --num_layers=2 --dp_keep_prob=0.5  --num_epochs=20
#            --model=GRU --optimizer=ADAM --initial_lr=0.001 --batch_size=20 --seq_len=35 --hidden_size=512 --num_layers=2 --dp_keep_prob=0.5  --num_epochs=20

#    - For Problem 3.3 the hyperparameter settings you should run are as follows
#            --model=GRU --optimizer=ADAM --initial_lr=0.001 --batch_size=128 --seq_len=35 --hidden_size=256 --num_layers=2 --dp_keep_prob=0.2  --num_epochs=20
#            --model=GRU --optimizer=ADAM --initial_lr=0.001 --batch_size=128 --seq_len=35 --hidden_size=2048 --num_layers=2 --dp_keep_prob=0.5  --num_epochs=20
#            --model=GRU --optimizer=ADAM --initial_lr=0.001 --batch_size=128 --seq_len=35 --hidden_size=512 --num_layers=4 --dp_keep_prob=0.5  --num_epochs=20

#    - For Problem 3.4 the hyperparameter settings you should run are as follows
#            --model=TRANSFORMER --optimizer=ADAM --initial_lr=0.0001 --batch_size=128 --seq_len=35 --hidden_size=512  --num_layers=6 --dp_keep_prob=0.9 --num_epochs=20
#            --model=TRANSFORMER --optimizer=ADAM --initial_lr=0.0001 --batch_size=128 --seq_len=35 --hidden_size=512  --num_layers=2 --dp_keep_prob=0.9 --num_epochs=20
#            --model=TRANSFORMER --optimizer=ADAM --initial_lr=0.0001 --batch_size=128 --seq_len=35 --hidden_size=2048 --num_layers=2 --dp_keep_prob=0.6 --num_epochs=20
#            --model=TRANSFORMER --optimizer=ADAM --initial_lr=0.0001 --batch_size=128 --seq_len=35 --hidden_size=1024 --num_layers=6 --dp_keep_prob=0.9 --num_epochs=20

# You are also encouraged to explore the hyperparameter space and try to obtain
# better validation perplexities than the given settings.
#

# - For Problem 4.1, perform all computations / plots based on saved models from
#   Problem 3.1 and 3.2. Note the --save_best flag for the first set of
#   parameters for each question (Of course you can still save other models than
#   them if you like; just add the flag --save_best). You can modify the loss
#   computation in this script (search for "LOSS COMPUTATION" to find the
#   appropriate line).

import argparse
import time
import collections
import os
import sys
import torch
import torch.nn
from torch.autograd import Variable
import torch.nn as nn
import numpy
np = numpy

# NOTE ==============================================
# This is where your models are imported
#from solution import RNN, GRU
#from solution import make_model as TRANSFORMER

##############################################################################
#
# ARG PARSING AND EXPERIMENT SETUP
#
##############################################################################

# Command-line interface for the experiment.
# NOTE: several help strings end a source line with a backslash inside the
# string literal; the literal then continues on the next line, so that line's
# leading spaces become part of the help text.
parser = argparse.ArgumentParser(description='PyTorch Penn Treebank Language Modeling')

# Arguments you may need to set to run different experiments in 4.1 & 4.2.
parser.add_argument('--data', type=str, default='data',
                    help='location of the data corpus. We suggest you change the default\
                    here, rather than passing as an argument, to avoid long file paths.')
parser.add_argument('--model', type=str, default='RNN',
                    help='type of recurrent net (RNN, GRU, TRANSFORMER)')
parser.add_argument('--optimizer', type=str, default='SGD',
                    help='optimization algo to use; SGD, SGD_LR_SCHEDULE, ADAM')
parser.add_argument('--seq_len', type=int, default=35,
                    help='number of timesteps over which BPTT is performed')
parser.add_argument('--batch_size', type=int, default=128,
                    help='size of one minibatch')
parser.add_argument('--initial_lr', type=float, default=1.0,
                    help='initial learning rate')
parser.add_argument('--hidden_size', type=int, default=512,
                    help='size of hidden layers. IMPORTANT: for the transformer\
                    this must be a multiple of 16.')
# store_true flags are False unless the flag is present.
parser.add_argument('--save_best', action='store_true',
                    help='save the model for the best validation performance')
parser.add_argument('--num_layers', type=int, default=2,
                    help='number of hidden layers in RNN/GRU, or number of transformer blocks in TRANSFORMER')

# Other hyperparameters you may want to tune in your exploration
parser.add_argument('--emb_size', type=int, default=200,
                    help='size of word embeddings')
parser.add_argument('--num_epochs', type=int, default=1,
                    help='number of epochs to stop after')
parser.add_argument('--dp_keep_prob', type=float, default=0.8,
                    help='dropout *keep* probability. drop_prob = 1-dp_keep_prob \
                    (dp_keep_prob=1 means no dropout)')

# Arguments that you may want to make use of / implement more code for
parser.add_argument('--debug', action='store_true')
# An empty --save_dir makes the generated experiment directory relative to
# the current working directory.
parser.add_argument('--save_dir', type=str, default='',
                    help='path to save the experimental config, logs, model \
                    This is automatically generated based on the command line \
                    arguments you pass and only needs to be set if you want a \
                    custom dir name')
parser.add_argument('--evaluate', action='store_true',
                    help="use this flag to run on the test set. Only do this \
                    ONCE for each model setting, and only after you've \
                    completed ALL hyperparameter tuning on the validation set.\
                    Note we are not requiring you to do this.")

# DO NOT CHANGE THIS (setting the random seed makes experiments deterministic,
# which helps for reproducibility)
parser.add_argument('--seed', type=int, default=1111,
                    help='random seed')

def set_seed(seed):
    """
    Seed the NumPy, PyTorch CPU and PyTorch CUDA random number generators.

    Python's own `random` module is not seeded here and
    `cudnn.deterministic` is left untouched; cuDNN benchmark mode is
    switched on.
    """
    for seeder in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.benchmark = True

# Parse from an explicit empty list: every option takes its default and the
# real command line is ignored.
# NOTE(review): the directory name below is still derived from sys.argv[1:],
# so the two can disagree (e.g. when launched by a notebook kernel) - confirm
# this is intended.
args = parser.parse_args(args=[])
# args.__dict__ is the namespace's own attribute dict, so entries written to
# argsdict are also visible as args.<name>.
argsdict = args.__dict__
argsdict['code_file'] = sys.argv[0]

# Use the model, optimizer, and the flags passed to the script to make the
# name for the experimental dir
print("\n########## Setting Up Experiment ######################")
# str.lstrip takes a set of characters, so this drops every leading '-';
# path separators are removed so a flag cannot introduce sub-directories.
flags = [flag.lstrip('--').replace('/', '').replace('\\', '') for flag in sys.argv[1:]]
experiment_path = os.path.join(args.save_dir, '_'.join([argsdict['model'],
                                         argsdict['optimizer']]
                                         + flags))
# Increment a counter so that previous results with the same args will not
# be overwritten. Comment out the next four lines if you only want to keep
# the most recent results.
i = 0
while os.path.exists(experiment_path + "_" + str(i)):
    i += 1
experiment_path = experiment_path + "_" + str(i)

# Creates an experimental directory and dumps all the args to a text file
os.makedirs(experiment_path, exist_ok=True)

print ("\nPutting log in %s"%experiment_path)
# From here on args.save_dir is the generated experiment directory.
argsdict['save_dir'] = experiment_path
with open (os.path.join(experiment_path,'exp_config.txt'), 'w') as f:
    # One "key    value" line per argument, in sorted key order.
    for key in sorted(argsdict):
        f.write(key+'    '+str(argsdict[key])+'\n')

# Set the random seed manually for reproducibility.
set_seed(args.seed)


# Use the GPU if you have one
if torch.cuda.is_available():
    print("Using the GPU")
    device = torch.device("cuda")
else:
    print("WARNING: You are about to run on cpu, and this will likely run out \
      of memory. \n You can try setting batch_size=1 to reduce memory usage")
    device = torch.device("cpu")


###############################################################################
#
# LOADING & PROCESSING
#
###############################################################################

# HELPER FUNCTIONS
def _read_words(filename):
    """
    Read a text file and return its whitespace-separated tokens.

    Every newline is replaced by the literal string "<eos>" before splitting,
    so the marker only becomes a token of its own when the newline is
    surrounded by whitespace in the file.

    The file is decoded as UTF-8 explicitly so that the result does not
    depend on the platform's default encoding.
    """
    with open(filename, "r", encoding="utf-8") as f:
        return f.read().replace("\n", "<eos>").split()

def _build_vocab(filename):
    """
    Build the vocabulary of the corpus stored in `filename`.

    Ids are assigned by decreasing frequency, ties broken alphabetically, so
    the mapping is deterministic. An empty corpus yields two empty dicts
    instead of failing while unpacking.

    Side effect: the id -> word mapping is pickled to "id_to_word.pickle" in
    the current working directory.

    returns:
        (word_to_id, id_to_word): two dicts that are inverses of each other
    """
    counter = collections.Counter(_read_words(filename))
    count_pairs = sorted(counter.items(), key=lambda pair: (-pair[1], pair[0]))

    word_to_id = {word: idx for idx, (word, _) in enumerate(count_pairs)}
    id_to_word = {idx: word for word, idx in word_to_id.items()}

    with open("id_to_word.pickle", "wb") as f:
        pickle.dump(id_to_word, f)

    return word_to_id, id_to_word

def _file_to_word_ids(filename, word_to_id):
    """Convert the tokens of `filename` to ids, skipping tokens missing from `word_to_id`."""
    ids = []
    for token in _read_words(filename):
        if token in word_to_id:
            ids.append(word_to_id[token])
    return ids

# Processes the raw data from text files
def ptb_raw_data(data_path=None, prefix="ptb"):
    """
    Load the train/valid/test splits found in `data_path` as lists of word ids.

    The files are expected to be named "<prefix>.train.txt",
    "<prefix>.valid.txt" and "<prefix>.test.txt". The vocabulary is built
    from the training split only.

    returns:
        (train_data, valid_data, test_data, word_to_id, id_2_word)
    """
    train_path, valid_path, test_path = [
        os.path.join(data_path, prefix + suffix)
        for suffix in (".train.txt", ".valid.txt", ".test.txt")
    ]

    word_to_id, id_2_word = _build_vocab(train_path)
    train_data, valid_data, test_data = [
        _file_to_word_ids(path, word_to_id)
        for path in (train_path, valid_path, test_path)
    ]
    return train_data, valid_data, test_data, word_to_id, id_2_word

# Yields minibatches of data
def ptb_iterator(raw_data, batch_size, num_steps):
    """
    Generate (x, y) minibatches from a flat sequence of word ids.

    The sequence is cut into `batch_size` equally long consecutive rows
    (leftover ids at the end are dropped). Each yielded x is a
    (batch_size, num_steps) int32 window of those rows and y is the same
    window shifted one position to the right.

    raises:
        ValueError: on first iteration, when the rows are too short to
                    provide a single window.
    """
    tokens = np.array(raw_data, dtype=np.int32)

    batch_len = len(tokens) // batch_size
    data = tokens[:batch_size * batch_len].reshape(batch_size, batch_len)

    # One id per row is held back so the last window still has a target.
    epoch_size = (batch_len - 1) // num_steps
    if epoch_size == 0:
        raise ValueError("epoch_size == 0, decrease batch_size or num_steps")

    for start in range(0, epoch_size * num_steps, num_steps):
        stop = start + num_steps
        yield (data[:, start:stop], data[:, start + 1:stop + 1])


class Batch:
    """Transformer data wrapper: stores the data and the mask derived from it."""

    def __init__(self, x, pad=-1):
        self.data = x
        self.mask = self.make_mask(x, pad)

    @staticmethod
    def make_mask(data, pad):
        """Return a mask that hides entries equal to `pad` as well as future words."""
        # True wherever the entry differs from `pad`; the extra axis lets it
        # broadcast against the square causal mask.
        not_pad = (data != pad).unsqueeze(-2)

        # Causal part: position i may only see positions j <= i.
        size = data.size(-1)
        future = np.triu(np.ones((1, size, size)), k=1).astype('uint8')
        causal = torch.from_numpy(future) == 0

        return not_pad & Variable(causal.type_as(not_pad.data))


# LOAD DATA
print('Loading data from '+args.data)
# The literal value 'SLURM_TMPDIR' is a placeholder: the corpus directory is
# then read from the environment variable of the same name.
raw_data = ptb_raw_data(
    data_path=os.environ['SLURM_TMPDIR'] if args.data == 'SLURM_TMPDIR' else args.data
)
train_data, valid_data, test_data, word_to_id, id_2_word = raw_data
vocab_size = len(word_to_id)
print('  vocabulary size: {}'.format(vocab_size))


###############################################################################
#
# MODEL SETUP
#
###############################################################################

# NOTE ==============================================
# This is where your model code will be called.
# Build the model selected by --model. The class names are only looked up
# inside the branch that is taken, so just the selected architecture has to
# be defined.
if args.model == 'RNN':
    model = RNN(emb_size=args.emb_size, hidden_size=args.hidden_size,
                seq_len=args.seq_len, batch_size=args.batch_size,
                vocab_size=vocab_size, num_layers=args.num_layers,
                dp_keep_prob=args.dp_keep_prob)
elif args.model == 'GRU':
    model = GRU(emb_size=args.emb_size, hidden_size=args.hidden_size,
                seq_len=args.seq_len, batch_size=args.batch_size,
                vocab_size=vocab_size, num_layers=args.num_layers,
                dp_keep_prob=args.dp_keep_prob)
elif args.model == 'TRANSFORMER':
    if args.debug:  # use a very small model
        model = TRANSFORMER(vocab_size=vocab_size, n_units=16, n_blocks=2)
    else:
        # Note that we're using num_layers and hidden_size to mean slightly
        # different things here than in the RNNs.
        # Also, the Transformer also has other hyperparameters
        # (such as the number of attention heads) which can change its behavior.
        model = TRANSFORMER(vocab_size=vocab_size, n_units=args.hidden_size,
                            n_blocks=args.num_layers, dropout=1.-args.dp_keep_prob)
    # these 3 attributes don't affect the Transformer's computations;
    # they are only used in run_epoch
    model.batch_size=args.batch_size
    model.seq_len=args.seq_len
    model.vocab_size=vocab_size
else:
    # Fail fast: carrying on would either hit an undefined `model` below or,
    # in a notebook, silently reuse a model left over from an earlier run.
    raise ValueError("Model type not recognized: {!r}".format(args.model))

model = model.to(device)

# LOSS FUNCTION
loss_fn = nn.CrossEntropyLoss()
# Only ADAM uses a torch optimizer object; for the other --optimizer values
# run_epoch applies the SGD update by hand.
if args.optimizer == 'ADAM':
    optimizer = torch.optim.Adam(model.parameters(), lr=args.initial_lr)

# LEARNING RATE
lr = args.initial_lr
# These variables are for learning rate schedule (which you are not asked to use)
# see SGD_LR_SCHEDULE in the main loop
lr_decay_base = 1 / 1.15
m_flat_lr = 14.0 # we will not touch lr for the first m_flat_lr epochs

###############################################################################
#
# DEFINE COMPUTATIONS FOR PROCESSING ONE EPOCH
#
###############################################################################

def repackage_hidden(h):
    """
    Detach hidden states from the graph that produced them.

    The final hidden states of one mini-batch are reused as the initial
    hidden states of the next one (the mini-batches are successive
    subsequences of longer sequences, which is how the Penn Treebank data is
    laid out here). Detaching stops Pytorch from backpropagating into the
    previous input sequences.

    A single tensor is detached in place and returned; any other iterable is
    processed element by element and returned as a tuple.
    """
    if not isinstance(h, Variable):
        return tuple(repackage_hidden(state) for state in h)
    return h.detach_()


def run_epoch(model, data, is_train=False, lr=1.0):
    """
    One epoch of training/validation (depending on flag is_train).

    inputs:
        model: the network; must expose batch_size, seq_len and vocab_size
               (and init_hidden() unless --model is TRANSFORMER)
        data: flat list of word ids, as consumed by ptb_iterator
        is_train (bool): when True, parameters are updated after each batch
        lr (float): step size of the manual SGD update (unused with ADAM)

    returns:
        (perplexity, losses): exp of the average per-batch loss, and the
        running cumulative cost recorded after every mini-batch
    """
    if is_train:
        model.train()
    else:
        model.eval()
    epoch_size = ((len(data) // model.batch_size) - 1) // model.seq_len
    # Progress-report period; it is 0 for epochs shorter than 10 steps, in
    # which case reporting is skipped instead of dividing by zero.
    log_every = epoch_size // 10
    start_time = time.time()
    is_transformer = args.model == 'TRANSFORMER'
    if not is_transformer:
        hidden = model.init_hidden()
        hidden = hidden.to(device)
    costs = 0.0
    iters = 0
    losses = []

    # LOOP THROUGH MINIBATCHES
    for step, (x, y) in enumerate(ptb_iterator(data, model.batch_size, model.seq_len)):
        if is_transformer:
            batch = Batch(torch.from_numpy(x).long().to(device))
            model.zero_grad()
            outputs = model.forward(batch.data, batch.mask).transpose(1,0)
            #print ("outputs.shape", outputs.shape)
        else:
            inputs = torch.from_numpy(x.astype(np.int64)).transpose(0, 1).contiguous().to(device)#.cuda()
            model.zero_grad()
            hidden = repackage_hidden(hidden)
            outputs, hidden = model(inputs, hidden)

            # NOTE(review): this rewrites myinput.pickle on every step, so the
            # file always holds the most recent batch of inputs; the sampling
            # cell reads it. Kept inside the loop so an interrupted run still
            # leaves a file behind.
            with open("myinput.pickle","wb") as f:
                pickle.dump(inputs,f)

        # Targets are needed by every model type, so they are built outside
        # the branch above (previously the Transformer path never defined tt).
        targets = torch.from_numpy(y.astype(np.int64)).transpose(0, 1).contiguous().to(device)#.cuda()
        tt = torch.squeeze(targets.view(-1, model.batch_size * model.seq_len))

        # LOSS COMPUTATION
        # This line currently averages across all the sequences in a mini-batch
        # and all time-steps of the sequences.
        # For problem 4.1, you will (instead) need to compute the average loss
        # at each time-step separately. Hint: use the method retain_grad to keep
        # gradients for intermediate nodes of the computational graph.
        #
        loss = loss_fn(outputs.contiguous().view(-1, model.vocab_size), tt)
        costs += loss.data.item() * model.seq_len
        losses.append(costs)
        iters += model.seq_len
        if args.debug:
            print(step, loss)
        if is_train:  # Only update parameters if training
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25)
            if args.optimizer == 'ADAM':
                optimizer.step()
            else:
                # Plain SGD step: p <- p - lr * grad (keyword form of add_;
                # the positional (alpha, tensor) overload is deprecated).
                for p in model.parameters():
                    if p.grad is not None:
                        p.data.add_(p.grad.data, alpha=-lr)
            if log_every and step % log_every == 10:
                print('step: '+ str(step) + '\t' \
                    + "loss (sum over all examples' seen this epoch):" + str(costs) + '\t' \
                    + 'speed (wps):' + str(iters * model.batch_size / (time.time() - start_time)))
    return np.exp(costs / iters), losses



###############################################################################
#
# RUN MAIN LOOP (TRAIN AND VAL)
#
###############################################################################

# Train for num_epochs epochs, validating after each one, and keep per-epoch
# perplexities, per-batch cumulative losses and wall-clock times for the
# learning curves saved at the end.
print("\n########## Running Main Loop ##########################")
train_ppls = []
train_losses = []
val_ppls = []
val_losses = []
best_val_so_far = np.inf
times = []

# In debug mode, only run one epoch
if args.debug:
    num_epochs = 1
else:
    num_epochs = args.num_epochs

# MAIN LOOP
for epoch in range(num_epochs):
    t0 = time.time()
    print('\nEPOCH '+str(epoch)+' ------------------')
    if args.optimizer == 'SGD_LR_SCHEDULE':
        # NOTE(review): lr is rescaled in place every epoch while the exponent
        # of lr_decay also grows with epoch, so the decay compounds once
        # epoch > m_flat_lr - confirm this is the intended schedule.
        lr_decay = lr_decay_base ** max(epoch - m_flat_lr, 0)
        lr = lr * lr_decay # decay lr if it is time

    # RUN MODEL ON TRAINING DATA
    train_ppl, train_loss = run_epoch(model, train_data, True, lr)

    # RUN MODEL ON VALIDATION DATA
    val_ppl, val_loss = run_epoch(model, valid_data)


    # SAVE MODEL IF IT'S THE BEST SO FAR
    if val_ppl < best_val_so_far:
        best_val_so_far = val_ppl
        if args.save_best:
            print("Saving model parameters to best_params.pt")
            torch.save(model.state_dict(), os.path.join(args.save_dir, 'best_params.pt'))
        # NOTE ==============================================
        # You will need to load these parameters into the same model
        # for a couple Problems: so that you can compute the gradient
        # of the loss w.r.t. hidden state as required in Problem 4.1
        # and to sample from the model as required in Problem 4.2
        # We are not asking you to run on the test data, but if you
        # want to look at test performance you would load the saved
        # model and run on the test data with batch_size=1

    # LOG RESULTS
    train_ppls.append(train_ppl)
    val_ppls.append(val_ppl)
    train_losses.extend(train_loss)
    val_losses.extend(val_loss)
    times.append(time.time() - t0)
    log_str = 'epoch: ' + str(epoch) + '\t' \
            + 'train ppl: ' + str(train_ppl) + '\t' \
            + 'val ppl: ' + str(val_ppl)  + '\t' \
            + 'best val: ' + str(best_val_so_far) + '\t' \
            + 'time (s) spent in epoch: ' + str(times[-1])
    print(log_str)
    # Opened in append mode so the log accumulates one line per epoch.
    with open (os.path.join(args.save_dir, 'log.txt'), 'a') as f_:
        f_.write(log_str+ '\n')

# SAVE LEARNING CURVES
lc_path = os.path.join(args.save_dir, 'learning_curves.npy')
print('\nDONE\n\nSaving learning curves to '+lc_path)
# The curves are saved as a single dict object in one .npy file.
np.save(lc_path, {'train_ppls':train_ppls,
                  'val_ppls':val_ppls,
                  'train_losses':train_losses,
                  'val_losses':val_losses,
                  'times':times})
# NOTE ==============================================
# To load these, run
# >>> x = np.load(lc_path, allow_pickle=True)[()]
# You will need these values for plotting learning curves




########## Setting Up Experiment ######################

Putting log in RNN_SGD_f_root.localsharejupyterruntimekernel-31825cd4-e972-4354-a099-094e5179a934.json_3
Using the GPU
Loading data from data
  vocabulary size: 10000

########## Running Main Loop ##########################

EPOCH 0 ------------------
step: 10	loss (sum over all examples' seen this epoch):3201.6700077056885	speed (wps):21518.61148393303
step: 30	loss (sum over all examples' seen this epoch):8395.52184343338	speed (wps):22085.885556120156


KeyboardInterrupt: ignored

In [None]:
##########################
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
from torch.autograd import Variable
import pickle


if torch.cuda.is_available():
    print("Using the GPU")
    device = torch.device("cuda")
else:
    print("WARNING: You are about to run on cpu, and this will likely run out \
      of memory. \n You can try setting batch_size=1 to reduce memory usage")
    device = torch.device("cpu")

# Seed tokens come from the inputs that run_epoch pickled.
# NOTE: pickle.load can execute arbitrary code - only load files that this
# notebook wrote itself.
with open('myinput.pickle', 'rb') as f:
    loaded_obj = pickle.load(f)
# Keep only the first row of the saved inputs.
loaded_obj_2 = loaded_obj[0:1,]
print(loaded_obj_2.shape)
hidden = model.init_hidden()
hidden = hidden.to(device)
hidden = repackage_hidden(hidden)
# the model will be the best model from problem 3.1 and 3.2
# Flatten to 1-D without hard-coding the batch size, and take an independent
# copy (clone().detach() is the recommended way to copy an existing tensor).
inputs = loaded_obj_2.view(-1).clone().detach().to(device)

## Select the number of timesteps
num_steps = 35
samples = model.generate(inputs, hidden, num_steps)
samples = samples.transpose(0,1)


with open('id_to_word.pickle', 'rb') as f:
    id_to_word = pickle.load(f)


# One line of text per row of samples: the seed word followed by the row's words.
for seed_id, row in zip(inputs, samples):
    words = [id_to_word[int(seed_id)]]
    words.extend(id_to_word[int(token)] for token in row)
    print(' '.join(words), '\n')





Using the GPU


FileNotFoundError: ignored