# character level LSTM in pytorch
    In this notebook, I'll construct a character-level LSTM with PyTorch. The network will train character by character on some text, then generate new text character by character. As an example, I will train on JK ROWLING'S Harry Potter. This model will be able to generate new text based on the text from the book!

In [1]:
#import all the necessary libraries
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F

In [2]:
with open('data/HarryPotter.txt', 'r') as f:
    text=f.read()
text[:200]

"Harry Potter and the Sorcerer's Stone\n\n\nCHAPTER ONE\n\nTHE BOY WHO LIVED\n\nMr. and Mrs. Dursley, of number four, Privet Drive, were proud to say\nthat they were perfectly normal, thank you very much. They"

# Tokeniztion


In [3]:
chars=tuple(set(text))
print(len(chars))
integer2character = dict(enumerate(chars))
print(integer2character)


91
{0: '=', 1: 'y', 2: 'W', 3: 'X', 4: '(', 5: '•', 6: 'V', 7: '$', 8: 'N', 9: '%', 10: '~', 11: 'I', 12: 'p', 13: 'K', 14: 't', 15: '9', 16: '6', 17: '_', 18: 'H', 19: '*', 20: 'w', 21: 'C', 22: 'O', 23: '!', 24: ')', 25: 'l', 26: '?', 27: '4', 28: 'B', 29: '8', 30: '"', 31: 'b', 32: ']', 33: 'Y', 34: 'd', 35: 'E', 36: 'u', 37: 'T', 38: 'g', 39: 'r', 40: 'F', 41: '7', 42: '5', 43: ' ', 44: '/', 45: ':', 46: ',', 47: 'Q', 48: 'v', 49: 'a', 50: '-', 51: '1', 52: 'D', 53: 'm', 54: 'x', 55: 'A', 56: 'S', 57: 'c', 58: '0', 59: 'L', 60: 'U', 61: 'j', 62: 's', 63: 'h', 64: 'n', 65: 'Z', 66: '\t', 67: 'P', 68: 'k', 69: '3', 70: 'J', 71: '`', 72: 'G', 73: 'z', 74: "'", 75: 'o', 76: '&', 77: 'i', 78: '\n', 79: 'M', 80: '}', 81: 'R', 82: '.', 83: '2', 84: '\\', 85: ';', 86: 'f', 87: '^', 88: 'ü', 89: 'q', 90: 'e'}


In [4]:
character2integer={ch:i for i,ch in integer2character.items()}
print(character2integer)

{'=': 0, 'y': 1, 'W': 2, 'X': 3, '(': 4, '•': 5, 'V': 6, '$': 7, 'N': 8, '%': 9, '~': 10, 'I': 11, 'p': 12, 'K': 13, 't': 14, '9': 15, '6': 16, '_': 17, 'H': 18, '*': 19, 'w': 20, 'C': 21, 'O': 22, '!': 23, ')': 24, 'l': 25, '?': 26, '4': 27, 'B': 28, '8': 29, '"': 30, 'b': 31, ']': 32, 'Y': 33, 'd': 34, 'E': 35, 'u': 36, 'T': 37, 'g': 38, 'r': 39, 'F': 40, '7': 41, '5': 42, ' ': 43, '/': 44, ':': 45, ',': 46, 'Q': 47, 'v': 48, 'a': 49, '-': 50, '1': 51, 'D': 52, 'm': 53, 'x': 54, 'A': 55, 'S': 56, 'c': 57, '0': 58, 'L': 59, 'U': 60, 'j': 61, 's': 62, 'h': 63, 'n': 64, 'Z': 65, '\t': 66, 'P': 67, 'k': 68, '3': 69, 'J': 70, '`': 71, 'G': 72, 'z': 73, "'": 74, 'o': 75, '&': 76, 'i': 77, '\n': 78, 'M': 79, '}': 80, 'R': 81, '.': 82, '2': 83, '\\': 84, ';': 85, 'f': 86, '^': 87, 'ü': 88, 'q': 89, 'e': 90}


In [5]:
encoded_char=np.array([character2integer[ch] for ch in text])
encoded_char[10:150]

array([90, 39, 43, 49, 64, 34, 43, 14, 63, 90, 43, 56, 75, 39, 57, 90, 39,
       90, 39, 74, 62, 43, 56, 14, 75, 64, 90, 78, 78, 78, 21, 18, 55, 67,
       37, 35, 81, 43, 22,  8, 35, 78, 78, 37, 18, 35, 43, 28, 22, 33, 43,
        2, 18, 22, 43, 59, 11,  6, 35, 52, 78, 78, 79, 39, 82, 43, 49, 64,
       34, 43, 79, 39, 62, 82, 43, 52, 36, 39, 62, 25, 90,  1, 46, 43, 75,
       86, 43, 64, 36, 53, 31, 90, 39, 43, 86, 75, 36, 39, 46, 43, 67, 39,
       77, 48, 90, 14, 43, 52, 39, 77, 48, 90, 46, 43, 20, 90, 39, 90, 43,
       12, 39, 75, 36, 34, 43, 14, 75, 43, 62, 49,  1, 78, 14, 63, 49, 14,
       43, 14, 63, 90])

# One hot Encoding

In [6]:
def one_hot_encoder(arr, total_length):
    one_hot=np.zeros((arr.size,total_length) ,dtype=np.float32)
    
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1
    
    one_hot=one_hot.reshape((*arr.shape,total_length))
    
    return one_hot



# Make mini tarining batches


In [7]:
def get_batches(arr,batch_size,seq_length):
    batches=batch_size*seq_length
    num_batches=len(arr)//batches
    arr=arr[:num_batches*batches]
    arr=arr.reshape((batch_size,-1))
    
    for n in range(0,arr.shape[1],seq_length):
        x=arr[:, n:n+seq_length]
        y=np.zeros_like(x)
        try:
            y[:, :-1],y[:,-1]=x[:,1:],arr[:,n+seq_length]
        except IndexError:
            y[:,:-1],y[:,-1]=x[:,1:],arr[:,0]
        yield x,y
    


In [8]:
batches=get_batches(encoded_char,8,50)
x,y=next(batches)
print(x[:10,:10])
print(y[:10,:10])


[[18 49 39 39  1 43 67 75 14 14]
 [50 43 63 75 20 43 14 90 39 43]
 [75 53 90 43 49 25 75 64 38 78]
 [49  1 46 43 49 64 34 43 57 25]
 [14 75 64 90 78 86 25 75 75 39]
 [78 30 79 75 39 64 77 64 38 23]
 [64 34 43 63 90 43 34 77 34 43]
 [39 43 86 39 75 53 43 14 63 90]]
[[49 39 39  1 43 67 75 14 14 90]
 [43 63 75 20 43 14 90 39 43 39]
 [53 90 43 49 25 75 64 38 78 39]
 [ 1 46 43 49 64 34 43 57 25 75]
 [75 64 90 78 86 25 75 75 39 46]
 [30 79 75 39 64 77 64 38 23 30]
 [34 43 63 90 43 34 77 34 43 62]
 [43 86 39 75 53 43 14 63 90 43]]


In [9]:
train_on_gpu=torch.cuda.is_available()
if (train_on_gpu):
    print("Gpu is available")
else:
    print("Gpu is not available")

Gpu is available


# Model Structure
In __init__ the suggested structure is as follows:

. Create and store the necessary dictionaries (this has been done for you)
. Define an LSTM layer that takes as params: an input size (the number of characters), a hidden layer size n_hidden, a number of layers n_layers, a dropout probability drop_prob, and a batch_first boolean (True, since we are batching)
. Define a dropout layer with drop_prob
. Define a fully-connected layer with params: input size n_hidden and output size (the number of characters)
. Finally, initialize the weights (again, this has been given)


In [10]:
class CharRNN(nn.Module):
    def __init__(self,tokens,n_hidden=256,n_layers=2,drop_prob=0.4,lr=0.001):
        super().__init__()
        self.drop_prob=drop_prob
        self.n_layers=n_layers
        self.n_hidden=n_hidden
        self.lr=lr
        
        self.chars=tokens
        self.int2char=dict(enumerate(self.chars))
        self.char2int={ch:i for i,ch in self.int2char.items()}
        
        self.lstm=nn.LSTM(len(self.chars),n_hidden,n_layers,dropout=drop_prob,batch_first=True)
        self.dropout=nn.Dropout(drop_prob)
        self.fc=nn.Linear(n_hidden,len(self.chars))
        
    def forward(self,x,hidden):
        r_output,hidden=self.lstm(x,hidden)
        output=self.dropout(r_output)
        output=output.contiguous().view(-1,self.n_hidden)
        output=self.fc(output)
        return output,hidden
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        
        return hidden

# All right. Time to train

In [11]:
def train(net,data,epochs=10,batch_size=10,seq_length=50,lr=0.001,clip=5,val_frac=0.1,print_every=10):
    net.train()
    
    opt=torch.optim.Adam(net.parameters(),lr=lr)
    criterion=nn.CrossEntropyLoss()
    
    val_idx=int(len(data)*(1-val_frac))
    data,val_data=data[:val_idx],data[val_idx:]
    
    if(train_on_gpu):
        net.cuda()
    counter=0
    n_chars=len(net.chars)
    for e in range(epochs):
        ## initialize hidden state
        h=net.init_hidden(batch_size)
        
        for x,y in get_batches(data,batch_size,seq_length):
            counter+=1
            x=one_hot_encoder(x,n_chars)
            inputs,targets=torch.from_numpy(x),torch.from_numpy(y)
            
            if (train_on_gpu):
                inputs,targets = inputs.cuda(),targets.cuda()
            ##creating new variables for hidden state
            h=tuple([each.data for each in h])
            
            net.zero_grad()
            output,h=net(inputs,h)
            loss=criterion(output,targets.view(batch_size*seq_length).long())
            loss.backward()
            
            nn.utils.clip_grad_norm_(net.parameters(),clip)
            opt.step()
            
            if counter % print_every == 0:
                # Get validation loss
                val_h = net.init_hidden(batch_size)
                val_losses = []
                net.eval()
                for x, y in get_batches(val_data, batch_size, seq_length):
                    x = one_hot_encoder(x, n_chars)
                    x, y = torch.from_numpy(x), torch.from_numpy(y)
                    
                    # Creating new variables for the hidden state, otherwise
                    # we'd backprop through the entire training history
                    val_h = tuple([each.data for each in val_h])
                    
                    inputs, targets = x, y
                    if(train_on_gpu):
                        inputs, targets = inputs.cuda(), targets.cuda()

                    output, val_h = net(inputs, val_h)
                    val_loss = criterion(output, targets.view(batch_size*seq_length).long())
                
                    val_losses.append(val_loss.item())
                
                net.train() # reset to train mode after iterationg through validation data
                
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(np.mean(val_losses)))

    

# Set the hyperparameters

In [12]:
n_hidden=512
n_layers=2
network=CharRNN(chars,n_hidden,n_layers)
print(network)

CharRNN(
  (lstm): LSTM(91, 512, num_layers=2, batch_first=True, dropout=0.4)
  (dropout): Dropout(p=0.4)
  (fc): Linear(in_features=512, out_features=91, bias=True)
)


In [14]:
batch_size=64
seq_length=128
n_epochs=20
train(network,encoded_char,epochs=n_epochs,batch_size=batch_size,seq_length=seq_length,lr=0.001,print_every=10)

Epoch: 1/20... Step: 10... Loss: 1.8889... Val Loss: 1.7365
Epoch: 1/20... Step: 20... Loss: 1.7926... Val Loss: 1.7014
Epoch: 1/20... Step: 30... Loss: 1.7967... Val Loss: 1.6854
Epoch: 1/20... Step: 40... Loss: 1.8029... Val Loss: 1.6760
Epoch: 1/20... Step: 50... Loss: 1.8280... Val Loss: 1.6667
Epoch: 1/20... Step: 60... Loss: 1.7917... Val Loss: 1.6567
Epoch: 1/20... Step: 70... Loss: 1.7900... Val Loss: 1.6501
Epoch: 1/20... Step: 80... Loss: 1.7708... Val Loss: 1.6414
Epoch: 1/20... Step: 90... Loss: 1.7500... Val Loss: 1.6344
Epoch: 1/20... Step: 100... Loss: 1.7580... Val Loss: 1.6340
Epoch: 1/20... Step: 110... Loss: 1.7510... Val Loss: 1.6238
Epoch: 1/20... Step: 120... Loss: 1.7142... Val Loss: 1.6161
Epoch: 1/20... Step: 130... Loss: 1.7158... Val Loss: 1.6130
Epoch: 1/20... Step: 140... Loss: 1.7394... Val Loss: 1.6095
Epoch: 1/20... Step: 150... Loss: 1.7291... Val Loss: 1.5999
Epoch: 1/20... Step: 160... Loss: 1.7361... Val Loss: 1.5923
Epoch: 1/20... Step: 170... Loss:

# Saving the model

In [17]:
model_name = 'rnn_model.net'

checkpoint = {'n_hidden': network.n_hidden,
              'n_layers': network.n_layers,
              'state_dict': network.state_dict(),
              'tokens': network.chars}

with open(model_name, 'wb') as f:
    torch.save(checkpoint, f)

# Making predictions

In [34]:
def predict(network, char, h=None, top_k=None):
        ''' Given a character, predict the next character.
            Returns the predicted character and the hidden state.
        '''
        
        # tensor inputs
        x = np.array([[network.char2int[char]]])
        x = one_hot_encoder(x, len(network.chars))
        inputs = torch.from_numpy(x)
        
        if(train_on_gpu):
            inputs = inputs.cuda()
        
        # detach hidden state from history
        h = tuple([each.data for each in h])
        # get the output of the model
        out, h = network(inputs, h)

        # get the character probabilities
        p = F.softmax(out, dim=1).data
        if(train_on_gpu):
            p = p.cpu() # move to cpu
        
        # get top characters
        if top_k is None:
            top_ch = np.arange(len(network.chars))
        else:
            p, top_ch = p.topk(top_k)
            top_ch = top_ch.numpy().squeeze()
        
        # select the likely next character with some element of randomness
        p = p.numpy().squeeze()
        char = np.random.choice(top_ch, p=p/p.sum())
        
        return network.int2char[char],h

In [38]:
def sample(network, size, prime='Harry', top_k=None):
        
    if(train_on_gpu):
        network.cuda()
    else:
        network.cpu()
    
    network.eval() # eval mode
    
    # First off, run through the prime characters
    chars = [ch for ch in prime]
    h = network.init_hidden(1)
    for ch in prime:
        char, h = predict(network, ch, h, top_k=top_k)

    chars.append(char)
    
    # Now pass in the previous character and get a new one
    for ii in range(size):
        char, h = predict(network, chars[-1], h, top_k=top_k)
        chars.append(char)

    return ''.join(chars)

# Generating samples

In [39]:
print(sample(network, 1000, prime='Non-magic people', top_k=5))

Non-magic people
had been allowed to get the strangers of a thing and show we
have been trying, he was as is to start this in their words, and he'd be something on his beetle one of them.  He left Harry's friends were supposed to be seen and taken out, at once as someone thought he were still spangling in the corner.

"I were you, Hermione," said Harry.

Harry saw Harry all the deserted. Hermione was
standing on the floor, and he was thinking to the floor to stay
when the trapdoor was still sparing in the big.  And then a big land, both tense that the scar had been supporting Hormito Harry that he was.

Harry stopped to held him, his hands off as a chocolate shot off
the window. The chickens standing their friendly and silence.  He spoke the time to give him a clear, though special weathers, and the common room was the tree, with the staff room.

The second. It was the silence. This, Harry couldn't see a laugh, sounded
when they were being told. Terrified in the students outside. The w