In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [3]:
import numpy as np

## prepare data

In [141]:
with open("../task/data/ikea") as f:
    lines = [x[:-1] for x in f.readlines() if len(x[:-1]) > 0]

In [142]:
def elongate_name(name):
    return name + '_'*(12 - len(name))

In [145]:
lines = np.array(list(map(elongate_name,lines)))

In [146]:
lines

array(['ABSORB______', 'ADMETE______', 'AGAM________', ...,
       'ÖRTER_______', 'ÖSTERBYMO___', 'ÖSTERÖ______'], dtype='<U12')

In [155]:
letters = set([char for name in lines for char in name])

In [156]:
from collections import Counter

In [157]:
Counter([char for name in lines for char in name])

Counter({'A': 829,
         'B': 230,
         'S': 551,
         'O': 348,
         'R': 671,
         '_': 8309,
         'D': 300,
         'M': 287,
         'E': 560,
         'T': 502,
         'G': 295,
         'N': 556,
         'Y': 109,
         'I': 494,
         'K': 355,
         'L': 617,
         'U': 215,
         'X': 18,
         'Å': 75,
         'P': 167,
         'V': 182,
         'Ä': 125,
         'F': 144,
         'J': 135,
         'H': 100,
         'Ö': 107,
         'C': 48,
         'Z': 2,
         'W': 1})

In [158]:
sym_to_num = dict(zip(sorted(letters), np.arange(len(letters))))

In [159]:
# Find letter index from all_letters, e.g. "a" = 0
def letterToIndex(letter):
    return sym_to_num[letter]

In [160]:
def letterToTensor(letter, n_letters = len(letters)):
    tensor = torch.zeros(1, n_letters)
    tensor[0][letterToIndex(letter)] = 1
    return tensor

In [161]:
def lineToTensor(line, n_letters = len(letters)):
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

In [162]:
lineToTensor('HELL').argmax(2)

tensor([[ 7],
        [ 4],
        [11],
        [11]])

## Create the model:



In [231]:
class LSTMModel(nn.Module):

    def __init__(self, inp_dim, hidden_dim, vocab_size):
        super(LSTMModel, self).__init__()
        self.hidden_dim = hidden_dim
        
        self.lstm = nn.LSTM(inp_dim, hidden_dim, num_layers = 3)

        self.out2prediction = nn.Linear(hidden_dim, vocab_size)
        
        self.hidden = self.init_hidden()

    def init_hidden(self, batch_size = BATCH_SIZE):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(3, batch_size, self.hidden_dim),
                torch.zeros(3, batch_size, self.hidden_dim))

    def logits(self, sentence, batch_size = BATCH_SIZE):
        lstm_out, self.hidden = self.lstm(sentence.reshape(len(sentence), batch_size, -1), self.hidden)
        
        out = self.out2prediction(lstm_out)
        return out
    
    def forward(self, sentence):
        out = self.logits(sentence)
        out = out.reshape(-1, out.shape[2])
        return F.log_softmax(out, 1)

## Train model

In [201]:
INP_DIM = len(letters)
HIDDEN_DIM = 16
BATCH_SIZE = 10

In [189]:
def get_minibatch(data, do_return = 0):
    np.random.shuffle(data)
    
    data = data[: len(data) // BATCH_SIZE * BATCH_SIZE]
    for names in np.array_split(data, data.shape[0] / BATCH_SIZE):
        #(len, batch, n_letters)
        inp = [lineToTensor(name) for name in names]
        inp = torch.cat(inp, dim = 1)
        #print(inp.shape)
        
        target = inp.argmax(2).reshape(-1)
        
        yield inp, target


In [190]:
batch = get_minibatch(lines[:10], 0)

In [191]:
batch = [x for x in batch]

In [198]:
batch[0][0].shape

torch.Size([12, 10, 29])

In [204]:
model(batch[0][0])

tensor([[-3.1728, -3.3560, -3.5930,  ..., -3.1822, -3.5951, -3.1344],
        [-3.1729, -3.3566, -3.5920,  ..., -3.1812, -3.5953, -3.1346],
        [-3.1720, -3.3557, -3.5938,  ..., -3.1818, -3.5953, -3.1340],
        ...,
        [-3.1465, -3.3668, -3.5911,  ..., -3.1594, -3.6214, -3.1494],
        [-3.1468, -3.3669, -3.5905,  ..., -3.1597, -3.6214, -3.1500],
        [-3.1440, -3.3667, -3.5925,  ..., -3.1594, -3.6227, -3.1486]],
       grad_fn=<LogSoftmaxBackward>)

In [296]:
model = LSTMModel(INP_DIM, HIDDEN_DIM, INP_DIM)

In [297]:
model

LSTMModel(
  (lstm): LSTM(29, 16, num_layers=3)
  (out2prediction): Linear(in_features=16, out_features=29, bias=True)
)

In [298]:
model.hidden = model.init_hidden(1)

In [306]:
F.softmax(model.logits(lineToTensor(lines[2]), 1).reshape(-1, 29), 1)

tensor([[0.0332, 0.0335, 0.0343, 0.0270, 0.0336, 0.0285, 0.0345, 0.0352, 0.0360,
         0.0308, 0.0453, 0.0381, 0.0366, 0.0351, 0.0298, 0.0429, 0.0327, 0.0262,
         0.0376, 0.0384, 0.0352, 0.0348, 0.0280, 0.0382, 0.0323, 0.0430, 0.0279,
         0.0384, 0.0327],
        [0.0332, 0.0335, 0.0343, 0.0271, 0.0336, 0.0285, 0.0346, 0.0352, 0.0361,
         0.0308, 0.0453, 0.0381, 0.0367, 0.0350, 0.0298, 0.0429, 0.0327, 0.0262,
         0.0376, 0.0384, 0.0352, 0.0349, 0.0280, 0.0382, 0.0323, 0.0430, 0.0279,
         0.0384, 0.0327],
        [0.0332, 0.0335, 0.0343, 0.0271, 0.0336, 0.0285, 0.0346, 0.0352, 0.0361,
         0.0308, 0.0453, 0.0381, 0.0367, 0.0349, 0.0298, 0.0429, 0.0328, 0.0262,
         0.0376, 0.0384, 0.0352, 0.0349, 0.0280, 0.0381, 0.0322, 0.0430, 0.0279,
         0.0385, 0.0327],
        [0.0332, 0.0335, 0.0343, 0.0271, 0.0335, 0.0285, 0.0347, 0.0352, 0.0361,
         0.0308, 0.0453, 0.0381, 0.0367, 0.0349, 0.0297, 0.0429, 0.0328, 0.0262,
         0.0375, 0.0384, 0.0352

In [302]:
[num_to_sym[x] for x in model.logits(lineToTensor(lines[0]), 1).reshape(-1, len(letters)).argmax(1).numpy()]

['K', 'K', 'K', 'K', 'K', 'K', 'K', 'K', 'K', 'K', 'K', 'K']

In [303]:
lines[1]

'SPARSAM_____'

In [307]:
criterion = nn.NLLLoss()

In [308]:
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [309]:
epochs = 10

In [310]:
from tqdm import tqdm_notebook

In [314]:
out.shape

torch.Size([120, 29])

In [312]:
for epoch in range(epochs):
    epoch_loss = 0
    for i, (inp, target) in enumerate(tqdm_notebook(get_minibatch(lines))):
        model.hidden = model.init_hidden()
        
        out = model(inp)
        
        loss = criterion(out[:], target[:])
        loss.backward()
        print(model.lstm.all_weights[0][0].grad.sum().item())
        optimizer.step()
        
        epoch_loss += loss.item()
    print("_________________", epoch_loss)
        

A Jupyter Widget

-2.4244022369384766
-2.406557083129883
-2.3668346405029297
-2.318957805633545
-2.3057732582092285
-2.2864599227905273
-2.2506613731384277
-2.2260191440582275
-2.2094650268554688
-2.180917739868164
-2.161099910736084
-2.1347203254699707
-2.1423041820526123
-2.1405467987060547
-2.139617681503296
-2.139402389526367
-2.1163156032562256
-2.1009819507598877
-2.094451427459717
-2.0799858570098877
-2.0738799571990967
-2.069404363632202
-2.085393190383911
-2.0740694999694824
-2.0875582695007324
-2.0975582599639893
-2.126966714859009
-2.152482748031616
-2.1663498878479004
-2.186352252960205
-2.2269363403320312
-2.246551513671875
-2.2688302993774414
-2.313317060470581
-2.3613312244415283
-2.4007534980773926
-2.4318346977233887
-2.4598097801208496
-2.504549980163574
-2.539842367172241
-2.557931423187256
-2.5816328525543213
-2.608647346496582
-2.641688108444214
-2.662412643432617
-2.6992027759552
-2.735395669937134
-2.768686056137085
-2.7720184326171875
-2.829505681991577
-2.832099199295044
-2.8411

KeyboardInterrupt: 

In [103]:
lines[2]

'KNYCK'

In [219]:
num_to_sym = {}

for item in sym_to_num.items():
    num_to_sym[item[1]] = item[0]

In [115]:
sym_to_num['S']

17

In [218]:
np.exp(model(lineToTensor("KNYCK")).detach().numpy())

RuntimeError: shape '[5, 10, -1]' is invalid for input of size 145

In [62]:
sym_to_num = {'h':0, 'e':1, 'l':2, 'o':3}

In [63]:
sym_to_num

{'h': 0, 'e': 1, 'l': 2, 'o': 3}

In [65]:
sym_to_num['e']

1

In [52]:
model(lineToTensor(s))

tensor([[3.6599e-24, 1.0000e+00, 6.0662e-29, 5.0154e-29],
        [1.0000e+00, 1.5718e-17, 5.0758e-27, 3.8477e-27],
        [3.6460e-24, 1.0000e+00, 6.0311e-29, 4.9855e-29],
        [1.0000e+00, 1.5742e-17, 5.0698e-27, 3.8429e-27],
        [3.6413e-24, 1.0000e+00, 6.0183e-29, 4.9746e-29],
        [1.0000e+00, 1.5750e-17, 5.0673e-27, 3.8409e-27],
        [3.6394e-24, 1.0000e+00, 6.0127e-29, 4.9700e-29],
        [1.0000e+00, 1.5754e-17, 5.0660e-27, 3.8399e-27]],
       grad_fn=<SoftmaxBackward>)

In [347]:
lineToTensor('hehehe').argmax(2).long().squeeze()[1:]

tensor([1, 0, 1, 0, 1])

In [189]:
lineToTensor('hehehe')

tensor([[[1., 0., 0., 0.]],

        [[0., 1., 0., 0.]],

        [[1., 0., 0., 0.]],

        [[0., 1., 0., 0.]],

        [[1., 0., 0., 0.]],

        [[0., 1., 0., 0.]]])

Train the model:



In [None]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)
    print(tag_scores)

for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# See what the scores are after training
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)

    # The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
    # for word i. The predicted tag is the maximum scoring tag.
    # Here, we can see the predicted sequence below is 0 1 2 0 1
    # since 0 is index of the maximum value of row 1,
    # 1 is the index of maximum value of row 2, etc.
    # Which is DET NOUN VERB DET NOUN, the correct sequence!
    print(tag_scores)

Exercise: Augmenting the LSTM part-of-speech tagger with character-level features
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In the example above, each word had an embedding, which served as the
inputs to our sequence model. Let's augment the word embeddings with a
representation derived from the characters of the word. We expect that
this should help significantly, since character-level information like
affixes have a large bearing on part-of-speech. For example, words with
the affix *-ly* are almost always tagged as adverbs in English.

To do this, let $c_w$ be the character-level representation of
word $w$. Let $x_w$ be the word embedding as before. Then
the input to our sequence model is the concatenation of $x_w$ and
$c_w$. So if $x_w$ has dimension 5, and $c_w$
dimension 3, then our LSTM should accept an input of dimension 8.

To get the character level representation, do an LSTM over the
characters of a word, and let $c_w$ be the final hidden state of
this LSTM. Hints:

* There are going to be two LSTM's in your new model.
  The original one that outputs POS tag scores, and the new one that
  outputs a character-level representation of each word.
* To do a sequence model over characters, you will have to embed characters.
  The character embeddings will be the input to the character LSTM.


