In [2]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F


Main solution: https://github.com/udacity/deep-learning-v2-pytorch/blob/master/recurrent-neural-networks/char-rnn/Character_Level_RNN_Solution.ipynb

In [3]:
# Read the entire novel into memory as a single string.
with open('../input/anna-karenina-book/anna.txt', 'r') as book_file:
    text = book_file.read()

In [None]:
# Peek at the first 100 characters to confirm the file was read as expected.
text[:100]

In [4]:
# Tokenization: build the character vocabulary and its two lookup tables.
# The unique characters are sorted so the char <-> int mapping is the same on
# every run; iterating a bare `set` of strings can yield a different order in
# each Python process because string hashing is randomized.
chars = tuple(sorted(set(text)))
int2char = dict(enumerate(chars))
char2int = {ch: ii for ii, ch in int2char.items()}

# Encode the whole text as an array of integer character ids.
encoded = np.array([char2int[ch] for ch in text])
print(encoded.size)


1985223


In [5]:
def one_hot_encode(arr, n_labels):
    """One-hot encode an integer array.

    Returns a float32 array of shape ``arr.shape + (n_labels,)`` in which, for
    every element of `arr`, the position given by that element is 1 and all
    other positions along the last axis are 0.
    """
    flat_labels = arr.reshape(-1)
    row_ids = np.arange(flat_labels.size)

    # One row per input element; set the column selected by each label.
    encoded_rows = np.zeros((flat_labels.size, n_labels), dtype=np.float32)
    encoded_rows[row_ids, flat_labels] = 1

    # Restore the input's leading dimensions in front of the label axis.
    return encoded_rows.reshape(arr.shape + (n_labels,))

In [6]:
# check that the function works as expected
# Sanity check: labels 3, 5 and 1 should each set exactly one of 8 columns.
test_seq = np.array([[3, 5, 1]])
one_hot = one_hot_encode(test_seq, n_labels=8)
print(one_hot)

[[[0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0.]]]


In [7]:
def get_batches(arr, batch_size, seq_length):
    """Yield (x, y) mini-batches, each of shape (batch_size, seq_length).

    `arr` is trimmed to a whole number of batches and reshaped to `batch_size`
    rows. Each `y` is `x` shifted one step to the left; its last column is the
    column that follows `x` in the reshaped array, or column 0 when `x` is the
    final window.
    """
    chars_per_batch = batch_size * seq_length
    full_batches = len(arr) // chars_per_batch

    # Drop the tail that cannot fill a complete batch, then lay the data out
    # with one row per sequence in the batch.
    rows = arr[:full_batches * chars_per_batch].reshape(batch_size, -1)
    n_cols = rows.shape[1]

    for start in range(0, n_cols, seq_length):
        stop = start + seq_length
        x = rows[:, start:stop]
        y = np.zeros_like(x)
        y[:, :-1] = x[:, 1:]
        # The final window has no following column, so wrap to the first one.
        y[:, -1] = rows[:, stop] if stop < n_cols else rows[:, 0]
        yield x, y
        
    

In [8]:
# Pull one small batch (8 sequences of 4 steps each) to inspect the generator's output.
batches = get_batches(encoded,8,4)
x,y = next(batches)
# Uncomment to compare the inputs with their one-step-shifted targets:
# print('x\n',x[:10,:10])
# print('y\n',y[:10,:10])

In [9]:
# Detect CUDA once; the rest of the notebook reads this flag to place tensors.
train_on_gpu = torch.cuda.is_available()
print('Training on GPU!' if train_on_gpu
      else 'No GPU available, training on CPU; consider making n_epochs very small.')


Training on GPU!


In [10]:
class CharRNN(nn.Module):
    """Character-level language model: multi-layer LSTM -> dropout -> linear.

    Parameters
    ----------
    tokens : sequence of str
        The vocabulary; a character's position is its integer id.
    n_hidden : int
        Size of the LSTM hidden state.
    n_layers : int
        Number of stacked LSTM layers.
    drop_prob : float
        Dropout probability, used both between LSTM layers and on the LSTM
        output before the final linear layer.
    lr : float
        Stored on the instance as `self.lr`; not used inside this class.
    """

    def __init__(self, tokens, n_hidden=256, n_layers=2,
                 drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr

        # Character dictionaries (id -> char and char -> id).
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}

        # Inputs are one-hot vectors, so the LSTM input size is the vocab size.
        n_chars = len(self.chars)
        self.lstm = nn.LSTM(n_chars, n_hidden, n_layers,
                            dropout=drop_prob, batch_first=True)

        self.dropout = nn.Dropout(drop_prob)
        # Maps each hidden state to one score per character.
        self.fc = nn.Linear(n_hidden, n_chars)

    def forward(self, x, hidden):
        """Run the network on `x` starting from `hidden`.

        Returns `(out, hidden)`: `out` has one row of character scores per
        (sequence, time step) pair, i.e. its shape is (-1, number of chars),
        and `hidden` is the LSTM state after the last step.
        """
        r_output, hidden = self.lstm(x, hidden)
        out = self.dropout(r_output)
        # Stack the LSTM outputs so every time step becomes its own row;
        # `contiguous()` is needed before `view` can reshape the tensor.
        out = out.contiguous().view(-1, self.n_hidden)
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size):
        """Return zeroed `(hidden state, cell state)` tensors for the LSTM.

        Each tensor has shape (n_layers, batch_size, n_hidden) and is created
        with the dtype and device of the model's own parameters, so the state
        always matches wherever the model currently lives.
        """
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.n_hidden)
        return (torch.zeros(shape, dtype=weight.dtype, device=weight.device),
                torch.zeros(shape, dtype=weight.dtype, device=weight.device))

In [11]:
def train(net, data, epochs=10, batch_size=10, seq_length=50, lr=0.001, clip=5, val_frac=0.1, print_every=10):
    """Train `net` on integer-encoded text and periodically print losses.

    Arguments
    ---------
    net: CharRNN network to train (moved to the GPU when `train_on_gpu` is set)
    data: 1-D array of integer character ids
    epochs: number of passes over the training split
    batch_size: number of sequences per mini-batch
    seq_length: number of character steps per sequence
    lr: learning rate for the Adam optimizer
    clip: maximum gradient norm used for gradient clipping
    val_frac: fraction of `data` (taken from the end) held out for validation
    print_every: number of training steps between validation runs / prints
    """
    # Move the model before creating the optimizer so the optimizer is built
    # over the parameters on their final device.
    if train_on_gpu:
        net.cuda()
    net.train()
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # Create training and validation data.
    val_idx = int(len(data) * (1 - val_frac))
    data, val_data = data[:val_idx], data[val_idx:]

    counter = 0
    n_chars = len(net.chars)
    for e in range(epochs):
        h = net.init_hidden(batch_size)

        for x, y in get_batches(data, batch_size, seq_length):
            counter += 1
            inputs = torch.from_numpy(one_hot_encode(x, n_chars))
            targets = torch.from_numpy(y)
            if train_on_gpu:
                inputs, targets = inputs.cuda(), targets.cuda()

            # Detach the hidden state from the previous step's graph, otherwise
            # we'd backprop through the entire training history.
            h = tuple(each.detach() for each in h)

            net.zero_grad()
            output, h = net(inputs, h)
            loss = criterion(output, targets.view(batch_size * seq_length).long())
            loss.backward()
            # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()

            # Loss stats.
            if counter % print_every == 0:
                val_h = net.init_hidden(batch_size)
                val_losses = []
                net.eval()
                # No gradients are needed for validation; skipping the autograd
                # graph saves memory and time.
                with torch.no_grad():
                    for val_x, val_y in get_batches(val_data, batch_size, seq_length):
                        val_inputs = torch.from_numpy(one_hot_encode(val_x, n_chars))
                        val_targets = torch.from_numpy(val_y)
                        if train_on_gpu:
                            val_inputs, val_targets = val_inputs.cuda(), val_targets.cuda()

                        output, val_h = net(val_inputs, val_h)
                        val_loss = criterion(output, val_targets.view(batch_size * seq_length).long())
                        val_losses.append(val_loss.item())

                net.train()  # reset to train mode after iterating through validation data

                # With a validation split too small for a single batch there is
                # nothing to average; report NaN instead of warning on an empty mean.
                mean_val_loss = np.mean(val_losses) if val_losses else float('nan')

                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(mean_val_loss))
    

In [12]:
# Model hyperparameters: a 2-layer LSTM with 128 hidden units per layer.
n_hidden = 128
n_layers = 2
net = CharRNN(chars, n_hidden=n_hidden, n_layers=n_layers)
print(net)

CharRNN(
  (lstm): LSTM(83, 128, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=128, out_features=83, bias=True)
)


In [21]:
# Training hyperparameters: every step consumes batch_size * seq_length
# = 12,800 characters; validation loss is printed every 10 steps.
batch_size = 128
seq_length = 100
n_epochs=30
train(net, encoded, epochs=n_epochs, batch_size=batch_size, seq_length=seq_length, lr=0.001, print_every=10)


Epoch: 1/30... Step: 10... Loss: 3.1233... Val Loss: 3.1084
Epoch: 1/30... Step: 20... Loss: 3.0891... Val Loss: 3.1040
Epoch: 1/30... Step: 30... Loss: 3.1036... Val Loss: 3.0975
Epoch: 1/30... Step: 40... Loss: 3.0754... Val Loss: 3.0879
Epoch: 1/30... Step: 50... Loss: 3.0994... Val Loss: 3.0721
Epoch: 1/30... Step: 60... Loss: 3.0515... Val Loss: 3.0455
Epoch: 1/30... Step: 70... Loss: 3.0109... Val Loss: 3.0026
Epoch: 1/30... Step: 80... Loss: 2.9825... Val Loss: 2.9520
Epoch: 1/30... Step: 90... Loss: 2.9348... Val Loss: 2.8905
Epoch: 1/30... Step: 100... Loss: 2.8737... Val Loss: 2.8387
Epoch: 1/30... Step: 110... Loss: 2.8365... Val Loss: 2.7864
Epoch: 1/30... Step: 120... Loss: 2.7579... Val Loss: 2.7354
Epoch: 1/30... Step: 130... Loss: 2.7479... Val Loss: 2.6947
Epoch: 2/30... Step: 140... Loss: 2.7149... Val Loss: 2.6520
Epoch: 2/30... Step: 150... Loss: 2.6858... Val Loss: 2.6141
Epoch: 2/30... Step: 160... Loss: 2.6333... Val Loss: 2.5864
Epoch: 2/30... Step: 170... Loss:

Tips and Tricks
Monitoring Validation Loss vs. Training Loss
If you're somewhat new to Machine Learning or Neural Networks it can take a bit of expertise to get good models. The most important quantity to keep track of is the difference between your training loss (printed during training) and your validation loss (printed whenever the RNN is run on the validation data; in this notebook that is every `print_every` steps, 10 by default). In particular:

If your training loss is much lower than validation loss then this means the network might be overfitting. Solutions to this are to decrease your network size, or to increase dropout. For example you could try dropout of 0.5 and so on.
If your training/validation loss are about equal then your model is underfitting. Increase the size of your model (either number of layers or the raw number of neurons per layer)
Approximate number of parameters
The two most important parameters that control the model are n_hidden and n_layers. I would advise that you always use n_layers of either 2 or 3. The n_hidden can be adjusted based on how much data you have. The two important quantities to keep track of here are:

The number of parameters in your model. This is printed when you start training.
The size of your dataset. 1MB file is approximately 1 million characters.
These two should be about the same order of magnitude. It's a little tricky to tell. Here are some examples:

I have a 100MB dataset and I'm using the default parameter settings (which currently print 150K parameters). My data size is significantly larger (100 mil >> 0.15 mil), so I expect to heavily underfit. I am thinking I can comfortably afford to make n_hidden larger.
I have a 10MB dataset and running a 10 million parameter model. I'm slightly nervous and I'm carefully monitoring my validation loss. If it's larger than my training loss then I may want to try to increase dropout a bit and see if that helps the validation loss.
Best models strategy
The winning strategy for obtaining very good models (if you have the compute time) is to always err on the side of making the network larger (as large as you're willing to wait for it to compute) and then try different dropout values (between 0 and 1). Whichever model has the best validation performance (the validation loss printed during training; lower is better) is the one you should use in the end.

It is very common in deep learning to run many different models with many different hyperparameter settings, and in the end take whatever checkpoint gave the best validation performance.

By the way, the size of your training and validation splits are also parameters. Make sure you have a decent amount of data in your validation set or otherwise the validation performance will be noisy and not very informative.

In [23]:
# Checkpoint: store the weights together with everything needed to rebuild
# the network (layer sizes and the vocabulary).
model_name = 'rnn_30_epoch.net'
checkpoint = {
    'n_hidden': net.n_hidden,
    'n_layers': net.n_layers,
    'state_dict': net.state_dict(),
    'tokens': net.chars,
}
with open(model_name, 'wb') as checkpoint_file:
    torch.save(checkpoint, checkpoint_file)

# Prediction
# Top K sampling
Our predictions come from a categorical probability distribution over all the possible characters. We can make the sampled text more reasonable, and the sampling easier to handle (with fewer candidates), by only considering the $K$ most probable characters. This prevents the network from giving us completely absurd characters while still allowing some noise and randomness in the sampled text. Read more about `topk` in the PyTorch documentation for `torch.topk`.

In [24]:
def predict(net, char, h=None, top_k=None):
    """Given one character, sample the next character from the network.

    Arguments
    ---------
    net: trained CharRNN
    char: the current character (must be in `net.char2int`)
    h: LSTM hidden state to continue from; a fresh zero state for a batch of
       one is created when this is None
    top_k: if given, sample only among the `top_k` most probable characters;
       otherwise sample from the full distribution

    Returns the sampled character and the updated hidden state.
    """
    # Build a (1, 1, n_chars) one-hot input for the single character.
    x = np.array([[net.char2int[char]]])
    inputs = torch.from_numpy(one_hot_encode(x, len(net.chars)))
    if train_on_gpu:
        inputs = inputs.cuda()

    if h is None:
        h = net.init_hidden(1)
    # Detach the hidden state from any earlier computation graph.
    h = tuple(each.detach() for each in h)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        out, h = net(inputs, h)
        p = F.softmax(out, dim=1)
    if train_on_gpu:
        p = p.cpu()

    # Choose the candidate characters to sample from.
    if top_k is None:
        top_ch = np.arange(len(net.chars))
    else:
        p, top_ch = p.topk(top_k)
        # reshape(-1) keeps a 1-D array even when top_k == 1 (squeeze would
        # collapse it to 0-d, which np.random.choice cannot sample from).
        top_ch = top_ch.numpy().reshape(-1)

    # Renormalize the kept probabilities and draw the next character id.
    p = p.numpy().reshape(-1)
    next_id = np.random.choice(top_ch, p=p / p.sum())
    return net.int2char[next_id], h
    
    

In [25]:
def sample(net, size, prime='The', top_k=None):
    """Generate text from `net`, starting from the string `prime`.

    The network is first run over every character of `prime` to build up its
    hidden state; the prediction after the last prime character becomes the
    first generated character, and `size` further characters are then sampled
    one at a time, each fed back in as the next input.

    Returns `prime` followed by the `size + 1` generated characters.
    Raises ValueError if `prime` is empty, since at least one character is
    needed to produce the first prediction.

    Side effects: moves `net` to the GPU or CPU according to `train_on_gpu`
    and switches it to eval mode.
    """
    if not prime:
        raise ValueError("prime must contain at least one character")

    if train_on_gpu:
        net.cuda()
    else:
        net.cpu()

    net.eval()  # eval mode

    # First off, run through the prime characters to warm up the hidden state.
    chars = list(prime)
    h = net.init_hidden(1)
    for ch in prime:
        char, h = predict(net, ch, h, top_k=top_k)

    # Only the prediction that follows the final prime character is kept.
    chars.append(char)

    # Now pass in the previous character and get a new one.
    for _ in range(size):
        char, h = predict(net, chars[-1], h, top_k=top_k)
        chars.append(char)

    return ''.join(chars)

In [29]:
# Generate text with the freshly trained network, primed with 'Anna' and
# sampling each character from the 5 most probable candidates.
print(sample(net, 1000, prime='Anna', top_k=5))

Anna, she's not, something to mise as the care and a master to them which he to did not a seas of
her."

"Time," Anna he was not tore the poins
and an offered to the coming and were the mindinate along his
consearing
time, said
her at the chanting. Her face, and
the betingeres,
and and himself wourded it with her
hand, without thought and had such
an angering their stard were were attining on the same-deat were his
senters and had been had nothing
steant in the proness tower the shalled, betanes of her that he to hore as in a day the corting of this shame to him had
tellent time of the praiced her hands, says that and he had a beto one.

"I don't sole that that so though always some and think anywhinc of her all have here to her a sayed, as the seepe ther and as that any seem of the princess that that she would not have nother intayed out of the part a talk, better,
but all her to her.
"I'll to me work. I he what there was began all sighted
her this seemed
to the manding. How seem, sam

In [27]:
# Reload the checkpoint saved earlier in this notebook (`rnn_30_epoch.net`) and
# rebuild the network from the hyperparameters and vocabulary stored in it.
# map_location='cpu' lets a checkpoint saved from a GPU run load on any machine;
# `sample` moves the model to the GPU later if one is available.
with open('rnn_30_epoch.net', 'rb') as f:
    checkpoint = torch.load(f, map_location='cpu')

loaded = CharRNN(checkpoint['tokens'], n_hidden=checkpoint['n_hidden'], n_layers=checkpoint['n_layers'])
loaded.load_state_dict(checkpoint['state_dict'])

<All keys matched successfully>

In [30]:

# Sample from the model rebuilt from the checkpoint, primed with "So" and
# restricted to the 5 most probable characters at each step.
print(sample(loaded, 2000, top_k=5, prime="So"))

Sorsky. I'm take herself.," said he had not back to her frow a both a forering. Stepan Arkadyevitch, would homtent to
her all him. She had seen to
his
considention, with
him, something and that still on the saw on his tears satting the biss and had and so wathing of
his husbands at him, what was the poosed to his thee him were all of astiin an her are to the manse and had to the thee fored, towards the charses and shome her that so was a shound, but the carting. He was so whole and that the sensing thought and hid houre he was a lecting of the steply was not
as a drouse, she was had seeping the more when
his count that and with the counted, hander. "What's a listered this. I soun time, and a lively. I stall to be there's been anyone time and that say."

He came his wife of the starl on his and with she had tanted her heart of the boughts and to hore the son heart. He were her heast and her sate, and
so happened his was, as that alexing himself of insate took and a lift who had brought 