#### PyTorch's LSTM expects all of its inputs to be 3D tensors

1. The first axis is the sequence itself
2. The second indexes instances in the mini-batch
3. The third indexes elements of the input

In [None]:
# Author: Robert Guthrie

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x7fa17c4cb510>

In [None]:
# Run on the first CUDA GPU when one is visible; otherwise stay on the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")
device

device(type='cuda', index=0)

#### example

In [None]:
# Single-layer LSTM with 3 input features and a hidden state of size 3.
lstm = nn.LSTM(input_size=3, hidden_size=3)
# A toy sequence of five time steps, each a (1, 3) tensor.
inputs = [torch.randn(1, 3) for _ in range(5)]
# Initial state as the pair (h_0, c_0), each shaped (1, 1, 3).
hidden = tuple(torch.randn(1, 1, 3) for _ in range(2))

# Feed one time step at a time, carrying the state forward between calls.
for item in inputs:
  step = item.view(1, 1, -1)  # reshape to (seq_len=1, batch=1, features)
  out, hidden = lstm(step, hidden)

#### concat entire sequence at once

In [None]:
# Draw a fresh five-step sequence and stack the (1, 3) steps into a single
# (5, 1, 3) tensor so the LSTM consumes the whole sequence in one call.
inputs = torch.stack([torch.randn(1, 3) for _ in range(5)])
# A new random initial state (h_0, c_0) is passed in; `hidden` is rebound to
# the final state returned by the LSTM.
out, hidden = lstm(inputs, (torch.randn(1, 1, 3), torch.randn(1, 1, 3)))

In [None]:
# Output of the whole-sequence LSTM call; the printout below shows one 1x3
# row per time step (5 steps).
out

tensor([[[-0.6383, -0.1405, -0.1033]],

        [[-0.1460, -0.0367, -0.2437]],

        [[-0.4672, -0.0584, -0.2693]],

        [[-0.5145,  0.0216, -0.2656]],

        [[-0.1504,  0.0266, -0.1379]]], grad_fn=<MkldnnRnnLayerBackward0>)

#### Example: An LSTM for Part-of-Speech Tagging

Let $w_1, \dots, w_M$ be the input sentence, where $w_i \in V$. Let $T$ be our tag set, and $y_i$ be the tag of word $w_i$.

In [None]:
def prepare_sequence(seq, to_ix, device=None):
  """Convert a sequence of tokens into a 1-D tensor of integer indices.

  Args:
    seq: iterable of hashable tokens (words or tag names).
    to_ix: mapping from token to integer index.
    device: torch device (or device string) for the result. When omitted,
      the first CUDA GPU is used if one is available, otherwise the CPU, so
      the function also works on machines without a GPU.

  Returns:
    A ``torch.long`` tensor with one index per element of ``seq``.

  Raises:
    KeyError: if a token in ``seq`` is missing from ``to_ix``.
  """
  if device is None:
    # An unconditional .cuda() raises on CPU-only machines; fall back instead.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
  idxs = [to_ix[w] for w in seq]
  return torch.tensor(idxs, dtype=torch.long, device=device)

In [None]:
# Toy corpus: each entry pairs a tokenised sentence with its gold tags.
# Tags are: DET - determiner; NN - noun; V - verb
# (e.g. the word "The" is a determiner).
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"]),
]

# Give every distinct word the next free integer id, in order of first
# appearance. setdefault leaves already-seen words untouched, and
# len(word_to_ix) is evaluated before the insertion, so ids run 0, 1, 2, ...
word_to_ix = {}
for sentence, tags in training_data:
  for word in sentence:
    word_to_ix.setdefault(word, len(word_to_ix))

tag_to_ix = {"DET": 0, "NN": 1, "V": 2}
# Embedding width and LSTM hidden size for the tagger.
EMBEDDING_DIM = HIDDEN_DIM = 6

In [None]:
# Word -> index mapping built from training_data, numbered in order of first appearance.
word_to_ix

{'The': 0,
 'dog': 1,
 'ate': 2,
 'the': 3,
 'apple': 4,
 'Everybody': 5,
 'read': 6,
 'that': 7,
 'book': 8}

### Model

In [None]:
class LSTMTagger(nn.Module):
  """Per-word tagger: embedding lookup -> LSTM -> linear layer -> log-softmax.

  Args:
    embedding_dim: size of each word embedding vector.
    hidden_dim: size of the LSTM hidden state.
    vocab_size: number of rows in the embedding table.
    tagset_size: number of output tags scored for each word.
  """

  def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
    super().__init__()
    self.hidden_dim = hidden_dim

    # Word indices -> dense vectors.
    self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
    # Consumes the embedded sentence, emitting one hidden state per word.
    self.lstm = nn.LSTM(embedding_dim, hidden_dim)
    # Projects each hidden state onto the tag set.
    self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

  def forward(self, sentence):
    """Score every tag for every word of one sentence.

    Args:
      sentence: tensor of word indices for a single sentence (the notebook
        passes the 1-D long tensor produced by ``prepare_sequence``).

    Returns:
      Tensor with ``len(sentence)`` rows of log-probabilities over the tags.
    """
    seq_len = len(sentence)
    embedded = self.word_embeddings(sentence)
    # nn.LSTM takes (seq_len, batch, features); the sentence is a batch of one.
    lstm_out, _ = self.lstm(embedded.view(seq_len, 1, -1))
    # Drop the batch axis again before projecting to tag space.
    tag_space = self.hidden2tag(lstm_out.view(seq_len, -1))
    return F.log_softmax(tag_space, dim=1)

### model before training

In [None]:
# Build an untrained tagger sized to the toy vocabulary and tag set, and place
# it on the selected device.
model = LSTMTagger(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=len(word_to_ix),
    tagset_size=len(tag_to_ix),
).to(device)
# The model emits log-probabilities, so negative log-likelihood is the loss.
loss_func = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.05)

# Baseline: tag scores for the first training sentence before any training.
# no_grad() because nothing is being optimised here.
with torch.no_grad():
  inputs = prepare_sequence(training_data[0][0], word_to_ix)
  tag_scores = model(inputs)
  print(tag_scores)

tensor([[-1.1110, -1.3284, -0.9018],
        [-1.0342, -1.3603, -0.9470],
        [-1.0272, -1.2792, -1.0113],
        [-1.0824, -1.1896, -1.0304],
        [-0.9993, -1.3510, -0.9865]], device='cuda:0')


### Training model

In [None]:
# Train for 300 passes over the toy corpus, taking one optimizer step per sentence.
for epoch in range(300):
  for sentence, tags in training_data:
    # Clear gradients left over from the previous sentence before the new backward pass.
    model.zero_grad()
    # Turn the words and their gold tags into index tensors.
    sentence_input = prepare_sequence(sentence, word_to_ix)
    targets = prepare_sequence(tags, tag_to_ix)

    # Forward pass: per-word log-probabilities over the tag set.
    tag_scores = model(sentence_input)

    # Negative log-likelihood against the gold tags, then backpropagate and update.
    loss = loss_func(tag_scores, targets)
    loss.backward()
    optimizer.step()

# See what the scores are after training
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)

    # The sentence is "The dog ate the apple". Entry (i, j) is the score
    # (log-probability) of tag j for word i, so the predicted tag for word i
    # is the index of the largest value in row i, i.e. tag_scores.argmax(dim=1).
    # The gold sequence is 0 1 2 0 1, which is DET NN V DET NN.
    # NOTE(review): in the output saved with this notebook the largest value of
    # the first row is at index 1 (NN), so that run predicted 1 1 2 0 1 rather
    # than the gold sequence; check the printed scores instead of assuming success.
    print(tag_scores)

tensor([[-1.0863, -0.5567, -2.4143],
        [-1.3844, -0.5839, -1.6513],
        [-1.6265, -1.9055, -0.4237],
        [-0.3350, -1.7497, -2.1998],
        [-1.9936, -0.2018, -3.0682]], device='cuda:0')


### Use word embedding 

1. We need to define an index for each word when using embeddings.
2. Embeddings are stored as a $|V| \times D$ matrix, where $D$ is the dimensionality of the embeddings.
3. We can use torch.nn.Embedding(vocab_size, embedding_dim) to create word embedding

In [None]:
# Toy vocabulary: every word needs an integer id before it can be embedded.
word_to_ix = dict(hello=0, world=1)
# Two vocabulary entries, each mapped to a 5-dimensional vector.
embeds = nn.Embedding(num_embeddings=2, embedding_dim=5)
# The embedding table is indexed with a long tensor of word ids.
lookup_tensor = torch.tensor([word_to_ix["hello"]], dtype=torch.long)
hello_embed = embeds(lookup_tensor)
print(hello_embed)

tensor([[-0.6502, -0.1108,  0.3221,  2.5179, -2.1227]],
       grad_fn=<EmbeddingBackward0>)


### Examples

In [None]:
# Number of preceding words used as context when the n-grams are built below.
CONTEXT_SIZE = 2
# Embedding width for the examples that follow; this rebinds the value 6 that
# was used for the tagger earlier in the notebook.
EMBEDDING_DIM = 10
# Toy corpus. .split() tokenises on whitespace only, so punctuation stays
# attached to the neighbouring word (e.g. "brow,").
test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()

In [None]:
# One (context, target) pair per position that has CONTEXT_SIZE words before it.
# The context is the preceding window reversed, so the nearest word comes
# first: [w_{i-1}, w_{i-2}, ...].
ngrams = [
    (test_sentence[i - CONTEXT_SIZE:i][::-1], test_sentence[i])
    for i in range(CONTEXT_SIZE, len(test_sentence))
]
ngrams[0]

(['forty', 'When'], 'winters')

In [None]:
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()

# By deriving a set from `raw_text`, we deduplicate the array
vocab = set(raw_text)
vocab_size = len(vocab)

# Number the words in sorted order rather than in set-iteration order: the
# iteration order of a set of strings changes between interpreter runs (hash
# randomization), which would give every kernel restart a different
# word -> index mapping and make the learned embedding rows non-reproducible.
word_to_ix = {word: i for i, word in enumerate(sorted(vocab))}

# Build (context, target) pairs for every position that has a full window on
# both sides. The context lists the left neighbours nearest-first, followed
# by the right neighbours nearest-first.
data = []
for i in range(CONTEXT_SIZE, len(raw_text) - CONTEXT_SIZE):
    context = (
        [raw_text[i - j - 1] for j in range(CONTEXT_SIZE)]
        + [raw_text[i + j + 1] for j in range(CONTEXT_SIZE)]
    )
    target = raw_text[i]
    data.append((context, target))
print(data[:5])

[(['are', 'We', 'to', 'study'], 'about'), (['about', 'are', 'study', 'the'], 'to'), (['to', 'about', 'the', 'idea'], 'study'), (['study', 'to', 'idea', 'of'], 'the'), (['the', 'study', 'of', 'a'], 'idea')]


In [None]:
class CBOW(nn.Module):
    """Continuous bag-of-words model: summed context embeddings -> linear -> log-softmax.

    Args:
        vocab_size: number of distinct words (embedding rows and output classes).
        embedding_dim: size of each word embedding vector.
        context_size: number of context words per example; stored on the
            instance but not read by ``forward``.
    """

    def __init__(self, vocab_size, embedding_dim, context_size):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, vocab_size)
        self.context_size = context_size

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary for one context window.

        Args:
            inputs: tensor of context word indices (the notebook passes a
                1-D long tensor).

        Returns:
            Tensor with a single row of ``vocab_size`` log-probabilities.
        """
        context_vectors = self.embeddings(inputs)
        # Collapse the context into one bag-of-words vector, kept as a single row.
        bag = context_vectors.sum(dim=0).view(1, -1)
        logits = self.linear1(bag)
        return F.log_softmax(logits, dim=1)

In [None]:
# Fresh CBOW model over the toy corpus; the context spans CONTEXT_SIZE words
# on each side of the target.
model = CBOW(
    vocab_size=vocab_size,
    embedding_dim=EMBEDDING_DIM,
    context_size=2 * CONTEXT_SIZE,
)
# The model outputs log-probabilities, so negative log-likelihood is the loss.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)
# Collects one summed training loss per epoch.
losses = []

In [None]:
# Ten passes over the (context, target) pairs, one SGD step per pair.
for epoch in range(10):
  total_loss = 0
  for context, target in data:
    # Encode the context words and the target word as index tensors.
    context_idxs = torch.tensor([word_to_ix[word] for word in context], dtype=torch.long)
    target_idx = torch.tensor([word_to_ix[target]], dtype=torch.long)

    # Reset gradients, score the vocabulary, and measure the loss.
    optimizer.zero_grad()
    log_prob = model(context_idxs)
    loss = loss_function(log_prob, target_idx)

    # Backpropagate and update the parameters.
    loss.backward()
    optimizer.step()

    total_loss += loss.item()
  # Record the loss summed over the whole epoch.
  losses.append(total_loss)


In [None]:
# Row of the CBOW embedding matrix for the token "Computational".
model.embeddings.weight[word_to_ix["Computational"]]

tensor([-0.6469, -1.1509, -0.5670, -0.9811,  0.3269, -2.0446,  0.6276, -0.5581,
         0.1497,  1.0674], grad_fn=<SelectBackward0>)