In [1]:
import torch
import os
import torch.nn as nn
import numpy as np
from torch.nn.utils import clip_grad_norm

In [2]:

class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1
            
    def __len__(self):
        return len(self.word2idx)

In [97]:
class TextProcess(object):
    
    def __init__(self):
        self.dictionary = Dictionary()

    def get_data(self, path, batch_size=20):
        with open(path, 'r') as f:
            tokens = 0
            for line in f:
                words = line.split() + ['<eos>']
                tokens += len(words)
                #print(tokens)
                for word in words: 
                    self.dictionary.add_word(word)  
        #Create a 1-D tensor that contains the index of all the words in the file+
        print(tokens)
        rep_tensor = torch.LongTensor(tokens)
        #print(rep_tensor)
        index = 0
        with open(path, 'r') as f:
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    rep_tensor[index] = self.dictionary.word2idx[word]
                    index += 1
        #Find out how many batches we need            
        num_batches = rep_tensor.shape[0] // batch_size  
        print(rep_tensor)
        print(rep_tensor.shape[0])
        print(num_batches)
        #Remove the remainder (Filter out the ones that don't fit)
        rep_tensor = rep_tensor[:num_batches*batch_size]
        # return (batch_size,num_batches)
        rep_tensor = rep_tensor.view(batch_size, -1)
        return rep_tensor

In [118]:
embed_size = 128    #Input features to the LSTM
hidden_size = 1024  #Number of LSTM units
num_layers = 1
num_epochs = 2
batch_size = 20
timesteps = 30
learning_rate = 0.002

In [99]:
corpus = TextProcess()

In [100]:
print(corpus)

<__main__.TextProcess object at 0x0000028EE60F28D0>


In [103]:
rep_tensor=corpus.get_data(r'N:\deep learning\alice.txt',batch_size)

29686
tensor([   0,    1,    2,  ...,  878, 5289,    5])
29686
1484


In [105]:
rep_tensor.shape

torch.Size([20, 1484])

In [112]:
ep_tensor = torch.LongTensor([1,2,3,4,5,6])

In [113]:
ep_tensor

tensor([1, 2, 3, 4, 5, 6])

In [114]:
ep_tensor.view(3,-1)

tensor([[1, 2],
        [3, 4],
        [5, 6]])

In [77]:
p_tensor = torch.LongTensor(25)

In [78]:
p_tensor

tensor([8030593375117209458, 7595074916499857517, 8392569455039636854,
        7309940765158368800, 8317701149879002209, 4981032200866521460,
        8367815051375374456, 7305521896674583912, 7307484159728885874,
        7070773959523001452, 2336349463739965541, 7935454042726100852,
        7381153989982842229, 8319403537510788896, 8007514913507713070,
        8316213871474930805, 7310503572383228192, 7957688057596965742,
        2334956330917912948, 8461244959899871348, 8367813930434720613,
        7954894511893669224, 7358992207844042272, 2335225676751204975,
        7307186191138778994])

In [41]:
print(rep_tensor.shape)

torch.Size([20, 1484])


In [120]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [9]:
vocab_size = len(corpus.dictionary)
print(vocab_size)

5290


In [10]:
num_batches = rep_tensor.shape[1] // timesteps
print(num_batches)

49


In [25]:
corpus

<__main__.TextProcess at 0x28ee5fbc668>

In [115]:

class TextGenerator(nn.Module):
    
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(TextGenerator, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, h):
        # Perform Word Embedding 
        x = self.embed(x)
        #Reshape the input tensor
        #x = x.view(batch_size,timesteps,embed_size)
        out, (h, c) = self.lstm(x, h)
        # Reshape the output from (samples,timesteps,output_features) to a shape appropriate for the FC layer 
        # (batch_size*timesteps, hidden_size)
        out = out.reshape(out.size(0)*out.size(1), out.size(2))
        # Decode hidden states of all time steps
        out = self.linear(out)
        return out, (h, c)

In [116]:
model = TextGenerator(vocab_size, embed_size, hidden_size, num_layers)

In [117]:
def detach(states):
    """
If we have a tensor z,'z.detach()' returns a tensor that shares the same storage
as 'z', but with the computation history forgotten. It doesn't know anything
about how it was computed. In other words, we have broken the tensor z away from its past history
Here, we want to perform truncated Backpropagation
TBPTT splits the 1,000-long sequence into 50 sequences (say) each of length 20 and treats each sequence of length 20 as 
a separate training case. This is a sensible approach that can work well in practice, but it is blind to temporal 
dependencies that span more than 20 timesteps.
    """
    return [state.detach() for state in states]

In [121]:
for epoch in range(num_epochs):
    # Set initial hidden and cell states
    states = (torch.zeros(num_layers, batch_size, hidden_size),
              torch.zeros(num_layers, batch_size, hidden_size))

    for i in range(0, rep_tensor.size(1) - timesteps, timesteps):
        # Get mini-batch inputs and targets
        inputs = rep_tensor[:, i:i+timesteps]  
        targets = rep_tensor[:, (i+1):(i+1)+timesteps]
        
        outputs,_ = model(inputs, states)
        loss = loss_fn(outputs, targets.reshape(-1))

        model.zero_grad()
        loss.backward()
        #Perform Gradient Clipping. clip_value (float or int) is the maximum allowed value of the gradients 
        #The gradients are clipped in the range [-clip_value, clip_value]. This is to prevent the exploding gradient problem
        clip_grad_norm(model.parameters(), 0.5)
        optimizer.step()
              
        step = (i+1) // timesteps
        if step % 100 == 0:
            print ('Epoch [{}/{}], Loss: {:.4f}'
                   .format(epoch+1, num_epochs, loss.item()))



Epoch [1/2], Loss: 8.5789
Epoch [2/2], Loss: 5.8857


In [122]:
# Test the model
with torch.no_grad():
    with open('results.txt', 'w') as f:
        # Set intial hidden ane cell states
        state = (torch.zeros(num_layers, 1, hidden_size),
                 torch.zeros(num_layers, 1, hidden_size))
        # Select one word id randomly and convert it to shape (1,1)
        input = torch.randint(0,vocab_size, (1,)).long().unsqueeze(1)

        for i in range(500):
            output, _ = model(input, state)
            print(output.shape)
            # Sample a word id from the exponential of the output 
            prob = output.exp()
            word_id = torch.multinomial(prob, num_samples=1).item()
            print(word_id)
            # Replace the input with sampled word id for the next time step
            input.fill_(word_id)

            # Write the results to file
            word = corpus.dictionary.idx2word[word_id]
            word = '\n' if word == '<eos>' else word + ' '
            f.write(word)
            
            if (i+1) % 100 == 0:
                print('Sampled [{}/{}] words and save to {}'.format(i+1, 500, 'results.txt'))

torch.Size([1, 5290])
4710
torch.Size([1, 5290])
3930
torch.Size([1, 5290])
4089
torch.Size([1, 5290])
4118
torch.Size([1, 5290])
27
torch.Size([1, 5290])
28
torch.Size([1, 5290])
16
torch.Size([1, 5290])
1772
torch.Size([1, 5290])
42
torch.Size([1, 5290])
274
torch.Size([1, 5290])
3
torch.Size([1, 5290])
3499
torch.Size([1, 5290])
3
torch.Size([1, 5290])
4568
torch.Size([1, 5290])
2215
torch.Size([1, 5290])
3103
torch.Size([1, 5290])
6
torch.Size([1, 5290])
27
torch.Size([1, 5290])
161
torch.Size([1, 5290])
4648
torch.Size([1, 5290])
2446
torch.Size([1, 5290])
1512
torch.Size([1, 5290])
232
torch.Size([1, 5290])
446
torch.Size([1, 5290])
3334
torch.Size([1, 5290])
44
torch.Size([1, 5290])
1017
torch.Size([1, 5290])
4872
torch.Size([1, 5290])
9
torch.Size([1, 5290])
39
torch.Size([1, 5290])
3738
torch.Size([1, 5290])
4050
torch.Size([1, 5290])
338
torch.Size([1, 5290])
119
torch.Size([1, 5290])
2855
torch.Size([1, 5290])
236
torch.Size([1, 5290])
2507
torch.Size([1, 5290])
942
torch.Si

212
torch.Size([1, 5290])
3
torch.Size([1, 5290])
4772
torch.Size([1, 5290])
722
torch.Size([1, 5290])
187
torch.Size([1, 5290])
2026
torch.Size([1, 5290])
2003
torch.Size([1, 5290])
33
torch.Size([1, 5290])
3
torch.Size([1, 5290])
2812
torch.Size([1, 5290])
1611
torch.Size([1, 5290])
4114
torch.Size([1, 5290])
959
torch.Size([1, 5290])
176
torch.Size([1, 5290])
202
torch.Size([1, 5290])
5
torch.Size([1, 5290])
5
torch.Size([1, 5290])
5
torch.Size([1, 5290])
5
torch.Size([1, 5290])
1115
torch.Size([1, 5290])
1001
torch.Size([1, 5290])
112
torch.Size([1, 5290])
57
torch.Size([1, 5290])
632
torch.Size([1, 5290])
1234
torch.Size([1, 5290])
30
torch.Size([1, 5290])
27
torch.Size([1, 5290])
603
torch.Size([1, 5290])
502
torch.Size([1, 5290])
2565
torch.Size([1, 5290])
3177
torch.Size([1, 5290])
248
torch.Size([1, 5290])
3332
torch.Size([1, 5290])
704
torch.Size([1, 5290])
268
torch.Size([1, 5290])
602
torch.Size([1, 5290])
381
torch.Size([1, 5290])
96
torch.Size([1, 5290])
3
torch.Size([1, 