In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

In [2]:
from nltk.tokenize import word_tokenize
import numpy as np    

##### LSTM  in PyTorch

The `input_size` argument of any RNN specifies how many features there are at each step of a sequence, not how long the sequence is. Keras uses static graphs, so it needs to know the sequence length up front; PyTorch builds its autograd graph dynamically, so it doesn't care about the sequence length - you can use a different one in every iteration.

In [3]:
# Minimal LSTM demo: 2 features per timestep, 10 hidden units, a single layer.
rnn = nn.LSTM(input_size=2, hidden_size=10, num_layers=1)

# The LSTM expects input laid out as (seq_len, batch, input_size):
# here 4 timesteps, a batch of 3 sequences, 2 features each.
input = Variable(torch.randn(4, 3, 2))

# `output` holds the hidden state of every timestep; `hn` is the pair
# (final hidden state, final cell state) returned by the LSTM.
output, hn = rnn(input)

#print(input)
#print(output)


see also https://github.com/moskomule/pytorch.learning/blob/master/src/lstm_text_gen.py

In [4]:
#input = ["5h slides für Statistik-Kurs",
#         "3h Präsi für DOAG",
#         "Apps: Impl 3 h",
#         "NP, Impl new feature",
#        ]
#input_lists = [word_tokenize(s) for s in input]
#print(input_lists)

#tags = [["duration", "what", "SEP", "project"],
#        ["duration", "what", "SEP", "project"],
#        ["project", "SEP", "what", "duration", "duration"],
#        ["project", "SEP", "what", "what", "what"]
#       ]

#training_data = list(zip(input_lists, tags))
#training_data


In [5]:
#word_to_ix = {}
#for lst in input_lists:
#    for word in lst:
#        if word not in word_to_ix:
#            word_to_ix[word] = len(word_to_ix)
#
#tag_to_ix = {"duration": 0, "what": 1, "project": 2, "SEP": 3}
#
#print(word_to_ix)
#print(tag_to_ix)

In [6]:
# Building blocks for the synthetic time-tracking sentences.
# Multi-word items are joined with "_" so that every item stays a single token
# during data generation; real data will be tagged by hand, so this
# restriction only matters here.
projects = [
    "Authentifizierung", "Angular", "Apex", "MySQL_PoC",
    "DOAG_Vortrag", "DS-R-L", "Sentimentanalyse",
]
durations = ["8h", "3h", "1d", "5_Std", "30_Min"]
tasks = [
    "Slides", "Impl", "Planung", "Recherche", "Implementierung",
    "Tests", "Unit_Tests", "New_Features", "Auswertung", "Präsi",
]
# Separators, one group per sentence pattern.
seps1 = ["für", "auf"]
seps2 = [":"]
seps3 = [","]

# Tag <-> index lookup tables.
tag_to_ix = {"duration": 0, "task": 1, "project": 2, "SEP": 3}
ix_to_tag = dict((ix, tag) for tag, ix in tag_to_ix.items())

# The vocabulary is the concatenation of all building blocks; a word's index
# is its position in that list.
words = projects + durations + tasks + seps1 + seps2 + seps3
word_to_ix = dict(zip(words, range(len(words))))
word_to_ix

{',': 25,
 '1d': 9,
 '30_Min': 11,
 '3h': 8,
 '5_Std': 10,
 '8h': 7,
 ':': 24,
 'Angular': 1,
 'Apex': 2,
 'Auswertung': 20,
 'Authentifizierung': 0,
 'DOAG_Vortrag': 4,
 'DS-R-L': 5,
 'Impl': 13,
 'Implementierung': 16,
 'MySQL_PoC': 3,
 'New_Features': 19,
 'Planung': 14,
 'Präsi': 21,
 'Recherche': 15,
 'Sentimentanalyse': 6,
 'Slides': 12,
 'Tests': 17,
 'Unit_Tests': 18,
 'auf': 23,
 'für': 22}

In [None]:
# Pattern 1: "<duration> <task> <separator> <project>", e.g. "8h Slides für Angular".
s1 = [
    " ".join((d, t, s, p))
    for d in durations
    for t in tasks
    for p in projects
    for s in seps1
]
# Every pattern-1 sentence has the same tag sequence (the rows share one list object).
s1_tags = [["duration", "task", "SEP", "project"]] * len(s1)
s1
#len(s1_tags)

In [None]:
# Pattern 2: "<project><separator> <task> <duration>", e.g. "Angular: Slides 8h".
s2 = [
    "{}{} {} {}".format(p, s, t, d)
    for p in projects
    for s in seps2
    for t in tasks
    for d in durations
]
# Every pattern-2 sentence has the same tag sequence (the rows share one list object).
s2_tags = [["project", "SEP", "task", "duration"]] * len(s2)
s2

In [None]:
# Pattern 3: "<project><sep> <task><sep> <duration>", e.g. "Angular, Slides, 8h".
# The same separator is used in both positions.
s3 = [
    "{0}{1} {2}{1} {3}".format(p, s, t, d)
    for p in projects
    for s in seps3
    for t in tasks
    for d in durations
]
# Every pattern-3 sentence has the same tag sequence (the rows share one list object).
s3_tags = [["project", "SEP", "task", "SEP", "duration"]] * len(s3)
s3

In [None]:
# Pair every tokenized sentence with its tag sequence: [(tokens, tags), ...].
data = [
    (word_tokenize(sentence), sentence_tags)
    for sentence, sentence_tags in zip(s1 + s2 + s3, s1_tags + s2_tags + s3_tags)
]
#data

In [None]:
# Boolean mask for the train/test split: True -> training (~80%), False -> test (~20%).
# A dedicated, seeded generator keeps the split identical across
# "Restart Kernel -> Run All" runs; the original drew from the unseeded global
# RNG, so train and test sets (and every reported accuracy) changed on each run.
inds = np.random.RandomState(42).choice([True, False], size=len(data), p=[0.8, 0.2])
inds

In [None]:
# Store the (tokens, tags) pairs in an (N, 2) object array so they can be
# selected with the boolean mask. The array is filled cell by cell instead of
# calling np.array(data): the token lists have different lengths (4 or 5
# tokens), and recent NumPy versions refuse to build an array from such ragged
# nested sequences unless told explicitly what to do.
data_array = np.empty((len(data), 2), dtype=object)
for row_idx, (row_tokens, row_tags) in enumerate(data):
    data_array[row_idx, 0] = row_tokens
    data_array[row_idx, 1] = row_tags
data_array.shape

In [None]:
# Rows where the mask is True go to training, the remaining rows to test.
train_data, test_data = data_array[inds], data_array[~inds]
train_data.shape, test_data.shape

In [None]:
# Peek at the first two held-out rows as a sanity check of the split.
test_data[:2]

In [None]:
def prepare_sequence(seq, to_ix):
    """Encode a sequence of items as a 1-D tensor of integer indices.

    Args:
        seq: iterable of items (words or tags) to encode.
        to_ix: mapping from item to integer index; it is indexed with every
            element of ``seq``, so with a plain dict an unknown item raises
            ``KeyError``.

    Returns:
        A ``Variable`` wrapping a ``torch.LongTensor`` that holds the indices
        in the same order as ``seq``.
    """
    return Variable(torch.LongTensor([to_ix[item] for item in seq]))


In [None]:
# Model size: width of the word embeddings and of the LSTM hidden state.
EMBEDDING_DIM, HIDDEN_DIM = 6, 6

# Number of full passes over the training data.
num_epochs = 50

In [None]:
class LSTMTagger(nn.Module):
    """Sequence tagger: word embedding -> single-layer LSTM -> linear -> log-softmax.

    The module is stateful: ``forward`` feeds ``self.hidden`` into the LSTM and
    then overwrites it with the state the LSTM returns. Consecutive calls are
    therefore chained unless the caller resets the state first with
    ``model.hidden = model.init_hidden()``.

    Args:
        embedding_dim: size of each word embedding (the LSTM's input features).
        hidden_dim: size of the LSTM hidden state.
        vocab_size: number of distinct word indices.
        tagset_size: number of distinct tags, i.e. the width of the output.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # LSTM(input_size = num features, hidden_size); the embedding size is
        # the number of features per timestep.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a fresh all-zero ``(h_0, c_0)`` pair.

        Both tensors have shape (num_layers=1, minibatch_size=1, hidden_dim).
        """
        return (Variable(torch.zeros(1, 1, self.hidden_dim)),
                Variable(torch.zeros(1, 1, self.hidden_dim)))

    def forward(self, sentence):
        """Compute tag log-probabilities for one sentence.

        Args:
            sentence: 1-D tensor of word indices.

        Returns:
            Tensor of shape (len(sentence), tagset_size); row i holds the
            log-probabilities of every tag for word i.
        """
        embeds = self.word_embeddings(sentence)
        # LSTM input layout: (seq_len, batch, input_size); the batch size is
        # fixed to 1. lstm_out contains the hidden state of every timestep,
        # self.hidden the most recent (hidden, cell) state.
        lstm_out, self.hidden = self.lstm(
            embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        # Normalise over the tag axis. Passing dim explicitly avoids the
        # deprecated implicit-dimension behaviour of log_softmax.
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores


In [None]:
# Set to True to reuse a previously saved model instead of training a new one.
model_exists = False

if model_exists:
    # NOTE(review): torch.load unpickles the file, so only load 'model.sav'
    # from a trusted source.
    model = torch.load('model.sav')
else:
    model = LSTMTagger(
        embedding_dim=EMBEDDING_DIM,
        hidden_dim=HIDDEN_DIM,
        vocab_size=len(word_to_ix),
        tagset_size=len(tag_to_ix),
    )

# The model outputs log-probabilities, hence the negative log-likelihood loss.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)
print(model)

In [None]:
# Train with plain SGD, one sentence per update step. Skipped entirely when a
# saved model was loaded.
if not model_exists:
    for epoch in range(num_epochs):
        # Blank line between epochs (a bare `print` does nothing in Python 3).
        print()
        print("Epoch: {}".format(str(epoch)))
        for sentence, tags in train_data:

            # Step 1. PyTorch accumulates gradients, so clear them before
            # each instance.
            model.zero_grad()

            # Also reset the LSTM's hidden state so this sentence does not
            # continue from the history of the previous one.
            model.hidden = model.init_hidden()

            # Step 2. Turn the words and the tags into tensors of indices.
            sentence_in = prepare_sequence(sentence, word_to_ix)
            targets = prepare_sequence(tags, tag_to_ix)

            # Step 3. Run the forward pass.
            tag_scores = model(sentence_in)

            # Step 4. Compute the loss and the gradients, then update the
            # parameters. The loss is a single-element tensor; .item()
            # extracts it as a Python float (indexing it with [0] fails for
            # 0-dim tensors).
            loss = loss_function(tag_scores, targets)
            print(round(loss.item(), 6))
            loss.backward()
            optimizer.step()


In [None]:
# Persist the model only when it was trained in this session.
if not model_exists:
    torch.save(model, 'model.sav')

In [None]:
# See what the scores are after training
print(train_data[0][0])
inputs = prepare_sequence(train_data[0][0], word_to_ix)
# Start from a zero hidden state so the prediction does not depend on whatever
# sentence the model processed last.
model.hidden = model.init_hidden()
tag_scores = model(inputs)
# i,j corresponds to score for tag j for word i. The predicted tag is the maximum scoring tag.
print(tag_scores.data)
# torch.max(..., 1) returns (values, indices). view(-1) flattens the indices
# whether they come back as shape (n,) or (n, 1), so tolist() always yields a
# flat list of ints.
predicted = torch.max(tag_scores.data, 1)[1].view(-1).tolist()
pred_tags = [ix_to_tag[p] for p in predicted]
pred_tags

In [None]:
# Sentence-level accuracy on the training set: a sentence counts as correct
# only if every tag matches. Mismatches are printed for inspection.
n_train = len(train_data)
n_correct = 0
for t in train_data:
    # Reset the LSTM state so each sentence is tagged independently of the
    # previous one.
    model.hidden = model.init_hidden()
    inputs = prepare_sequence(t[0], word_to_ix)
    tag_scores = model(inputs)
    # view(-1) flattens the argmax indices whether they come back as shape
    # (n,) or (n, 1), so tolist() always yields a flat list of ints.
    predicted = torch.max(tag_scores.data, 1)[1].view(-1).tolist()
    pred_tags = [ix_to_tag[p] for p in predicted]
    # list(...) keeps this a plain list-to-list comparison.
    if pred_tags == list(t[1]):
        n_correct = n_correct + 1
    else:
        print(t[0])
        print(t[1])
        print(pred_tags)
        print()
    


In [None]:
# Number of training sentences, number tagged fully correctly, and their ratio.
train_accuracy = round(n_correct / n_train, 2)
print(n_train, n_correct, train_accuracy)

In [None]:
# Sentence-level accuracy on the held-out test set. Every example is printed
# (tokens, gold tags, predicted tags), but only fully correct sentences are
# counted; the original incremented the counter unconditionally, so the
# reported test accuracy was always 1.0.
n_test = len(test_data)
n_correct = 0
for t in test_data:
    # Reset the LSTM state so each sentence is tagged independently of the
    # previous one.
    model.hidden = model.init_hidden()
    inputs = prepare_sequence(t[0], word_to_ix)
    tag_scores = model(inputs)
    # view(-1) flattens the argmax indices whether they come back as shape
    # (n,) or (n, 1), so tolist() always yields a flat list of ints.
    predicted = torch.max(tag_scores.data, 1)[1].view(-1).tolist()
    pred_tags = [ix_to_tag[p] for p in predicted]
    # list(...) keeps this a plain list-to-list comparison.
    if pred_tags == list(t[1]):
        n_correct = n_correct + 1
    print(t[0])
    print(t[1])
    print(pred_tags)
    print()
    


In [None]:
# Number of test sentences, number counted as correct, and their ratio.
test_accuracy = round(n_correct / n_test, 2)
print(n_test, n_correct, test_accuracy)