In [1]:
%matplotlib inline

In [2]:
# Author: sanketvmehta
# Base code from: http://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html

In [3]:
import torch
import torch.autograd as autograd
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.parameter import Parameter

torch.manual_seed(1)

<torch._C.Generator at 0x7f0782012360>

In [5]:
'''
Embedding module contains "weight" as its attribute
weight (Tensor): the learnable weights of the module of shape (num_embeddings, embedding_dim)

Have a look at the following documentation for more info
http://pytorch.org/docs/master/_modules/torch/nn/modules/sparse.html#Embedding
'''

# Lookup table holding 20 rows, each one a 5-dimensional vector.
embedding = nn.Embedding(num_embeddings=20, embedding_dim=5)

# Index batch: 2 samples, 4 indices per sample, then wrapped as a Variable.
example_lookup_tensor = torch.LongTensor([
    [1, 2, 4, 5],
    [4, 3, 2, 9],
])
input_1 = Variable(example_lookup_tensor)
# Lookup itself is left disabled:
# embedding(input_1)

In [6]:
# Padding demo: a 10 x 3 table in which index 5 is declared the padding slot.
embedding = nn.Embedding(num_embeddings=10, embedding_dim=3, padding_idx=5)

# One sample of four indices; the last position holds the padding index.
padded_indices = torch.LongTensor([[0, 2, 0, 5]])
input_2 = Variable(padded_indices)

# Optional inspection, left disabled:
# print(embedding(input_2))
# print(Parameter(torch.Tensor(10, 3)))

In [7]:
# Seeding an embedding layer with externally supplied ("pre-trained") weights.
emb = nn.Embedding(num_embeddings=10, embedding_dim=2)  # layer with the wanted shape

# Random values stand in for real pre-trained vectors here.
preloaded_weights = torch.randn(10, 2)
emb.weight = nn.Parameter(preloaded_weights)
# print(emb.weight)

# Variant that removes the freshly initialised weight first and only then
# attaches a Parameter built from the preloaded values.
emb1 = nn.Embedding(num_embeddings=10, embedding_dim=3)
del emb1.weight
preloaded_weights1 = torch.randn(10, 3)  # random stand-in again
emb1.weight = nn.Parameter(preloaded_weights1)
# print(emb1.weight)

'''
If one wants to initialize pre-trained weights and freeze them (no further training) 
while setting "weight" attribute to Parameter ()...make its requires_grad = False

Have a look at this documentation for more info
http://pytorch.org/docs/master/notes/autograd.html#excluding-subgraphs
'''

# Frozen variant: same construction, but the Parameter is created with
# requires_grad=False so it is excluded from gradient computation.
emb = nn.Embedding(num_embeddings=10, embedding_dim=2)

preloaded_weights = torch.randn(10, 2)  # random stand-in
frozen_weight = nn.Parameter(preloaded_weights, requires_grad=False)
emb.weight = frozen_weight
print(emb.weight.requires_grad)

False


In [8]:
# Two-word vocabulary; each word maps to its row in the embedding table.
word_to_ix = {
    "hello": 0,
    "world": 1,
}
# 2 rows (one per word), 5 values per row.
embeds = nn.Embedding(num_embeddings=2, embedding_dim=5)

# Fetch the vector stored for "hello".
hello_index = word_to_ix["hello"]
lookup_tensor = torch.LongTensor([hello_index])
hello_embed = embeds(autograd.Variable(lookup_tensor))
print(hello_embed)

Variable containing:
-0.6021  0.0531 -0.1751 -0.1346 -1.0441
[torch.FloatTensor of size 1x5]



In [9]:
# Rebuild the 2 x 5 embedding table and display the class it is an instance of.
embeds = nn.Embedding(num_embeddings=2, embedding_dim=5)
type(embeds)

torch.nn.modules.sparse.Embedding

In [10]:
# N-Gram Language Model

In [12]:
# Number of preceding words used to predict the next word (n - 1 of the n-gram).
CONTEXT_SIZE = 2
# Size of each word embedding vector.
EMBEDDING_DIM = 10
# We will use Shakespeare Sonnet 2
test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()

# we should tokenize the input, but we will ignore that for now
# Build a list of (context, target) tuples. Each context is the list of the
# CONTEXT_SIZE words that precede the target word, so with CONTEXT_SIZE = 2
# every tuple is ([word_i-2, word_i-1], word_i). The window is derived from
# CONTEXT_SIZE so the data always matches the model's context_size.
trigrams = [(test_sentence[i:i + CONTEXT_SIZE], test_sentence[i + CONTEXT_SIZE])
            for i in range(len(test_sentence) - CONTEXT_SIZE)]
# print the first 3, just so you can see what they look like
print(trigrams[:3])

vocab = set(test_sentence)
# Enumerate the vocabulary in sorted order: iteration order of a set of
# strings can change from one interpreter run to the next (string hash
# randomisation), which would give every word a different index on each
# restart and make results irreproducible despite the fixed torch seed.
word_to_ix = {word: i for i, word in enumerate(sorted(vocab))}


class NGramLanguageModeler(nn.Module):
    """Feed-forward n-gram language model over a fixed-size context window.

    Every context word index is looked up in an embedding table, the
    embeddings of one context are concatenated, passed through a single ReLU
    hidden layer and projected to vocabulary-sized log-probabilities.

    Args:
        vocab_size: number of distinct words; sets the number of embedding
            rows and the width of the output layer.
        embedding_dim: size of each word embedding.
        context_size: number of preceding words (n - 1) fed to the model.
        hidden_dim: width of the hidden layer. Defaults to 128.
    """

    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim=128):
        # nn.Module must be initialised first so that the sub-modules assigned
        # below are registered with this model.
        super(NGramLanguageModeler, self).__init__()

        # Learnable lookup table of shape (vocab_size, embedding_dim).
        # To freeze it, set self.embeddings.weight.requires_grad = False.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)

        # NOTE: the non-linearities (relu, log_softmax) have no parameters,
        # so only the affine maps need to be declared here.

        # Hidden layer: input is the concatenation of context_size embeddings,
        # each of embedding_dim values.
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)

        # Output layer: one score per vocabulary entry.
        self.linear2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary for the given context(s).

        Args:
            inputs: integer tensor of word indices. A single context of
                ``context_size`` indices yields one output row; a 2-D batch
                of shape (batch, context_size) yields one row per context.

        Returns:
            Tensor of shape (rows, vocab_size) containing log-softmax scores.
        """
        # Look up every index, then lay the embeddings of each context side by
        # side so that every row has exactly the width linear1 expects. For a
        # single context this is the same (1, context_size * embedding_dim)
        # row as a plain view((1, -1)).
        embeds = self.embeddings(inputs).view(-1, self.linear1.in_features)

        # ReLU hidden layer followed by the projection to vocabulary scores.
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)

        # Normalise across the vocabulary axis. dim is given explicitly
        # because the implicit choice is deprecated.
        log_probs = F.log_softmax(out, dim=1)
        return log_probs


# Training state: per-epoch loss history, the negative log-likelihood
# criterion, and the model sized from the vocabulary and the constants above.
losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)

# Show the embedding stored for index 1 before training; it is printed again
# after training so the two can be compared.
probe_index = autograd.Variable(torch.LongTensor([1]))
print(model.embeddings(probe_index))

# Freeze the embedding table: its weight must not be trained.
model.embeddings.weight.requires_grad = False

print(type(model.parameters()))

# List which parameters are still trainable.
for param in model.parameters():
    print(param.requires_grad)

# Pass only the parameters that still require gradients to the optimizer,
# so the frozen embedding weight is left out.
filtered_model_parameters = (p for p in model.parameters() if p.requires_grad)

optimizer = optim.SGD(filtered_model_parameters, lr=0.001)

for epoch in range(10):
    # Running sum of the per-example losses for this epoch.
    total_loss = torch.Tensor([0])

    # trigrams is a list of (context, target) tuples.
    for context, target in trigrams:

        # Gradients accumulate across backward passes, so clear the ones left
        # over from the previous example first.
        model.zero_grad()

        # Turn the context words and the target word into index tensors
        # wrapped in Variables.
        context_idxs = [word_to_ix[w] for w in context]
        context_var = autograd.Variable(torch.LongTensor(context_idxs))
        target_var = autograd.Variable(torch.LongTensor([word_to_ix[target]]))

        # Forward pass: log-probabilities over the next word.
        log_probs = model(context_var)

        # Negative log-likelihood of the true next word.
        loss = loss_function(log_probs, target_var)

        # Backward pass, then one SGD update of the trainable parameters.
        loss.backward()
        optimizer.step()

        total_loss += loss.data
    losses.append(total_loss)
print(losses)  # The loss decreased every iteration over the training data!
print(model.embeddings(probe_index))

[(['When', 'forty'], 'winters'), (['forty', 'winters'], 'shall'), (['winters', 'shall'], 'besiege')]
Variable containing:
-2.4774 -0.1273  0.2934 -0.1200 -0.2976 -0.4123 -1.2042 -0.6512  0.5797  0.6054
[torch.FloatTensor of size 1x10]

<class 'generator'>
False
True
True
True
True
[
 519.3201
[torch.FloatTensor of size 1]
, 
 517.0247
[torch.FloatTensor of size 1]
, 
 514.7433
[torch.FloatTensor of size 1]
, 
 512.4750
[torch.FloatTensor of size 1]
, 
 510.2187
[torch.FloatTensor of size 1]
, 
 507.9735
[torch.FloatTensor of size 1]
, 
 505.7385
[torch.FloatTensor of size 1]
, 
 503.5125
[torch.FloatTensor of size 1]
, 
 501.2951
[torch.FloatTensor of size 1]
, 
 499.0882
[torch.FloatTensor of size 1]
]
Variable containing:
-2.4774 -0.1273  0.2934 -0.1200 -0.2976 -0.4123 -1.2042 -0.6512  0.5797  0.6054
[torch.FloatTensor of size 1x10]

