In [1]:
from fastai.text.all import *
path = untar_data(URLs.HUMAN_NUMBERS)
path.ls()

(#2) [Path('/Users/riteshgaire/.fastai/data/human_numbers/train.txt'),Path('/Users/riteshgaire/.fastai/data/human_numbers/valid.txt')]

In [2]:
lines = L()
with open(path/'train.txt') as f: lines += L(*f.readlines()) 
with open(path/'valid.txt') as f: lines += L(*f.readlines()) 
lines

(#9998) ['one \n','two \n','three \n','four \n','five \n','six \n','seven \n','eight \n','nine \n','ten \n'...]

In [3]:
#We take all those lines and concatenate them in one big stream. 
#To mark when we go from one number to the next, we use a . as a separator:

In [6]:
text = ' . '.join([l.strip() for l in lines]) 
text[:100]

'one . two . three . four . five . six . seven . eight . nine . ten . eleven . twelve . thirteen . fo'

In [7]:
#We can tokenize this dataset by splitting on spaces:

tokens = text.split(' ')
tokens[:10]

['one', '.', 'two', '.', 'three', '.', 'four', '.', 'five', '.']

In [8]:
#To numericalize, we have to create a list of all the unique tokens (our vocab):

vocab = L(*tokens).unique()
vocab

(#30) ['one','.','two','three','four','five','six','seven','eight','nine'...]

In [9]:
#Then we can convert our tokens into numbers by looking up the index of each in the vocab:

word2idx = {w:i for i,w in enumerate(vocab)} 
nums = L(word2idx[i] for i in tokens)
nums

(#63095) [0,1,2,1,3,1,4,1,5,1...]

In [10]:
#HERE GOES OUT FIRT MODEL FROM SCRATCH

#One simple way to turn this into a neural network would be to specify that we are going to predict each word 
#based on the previous three words. We could create a list of every sequence of three words as our independent variables, 
#and the next word after each sequence as the dependent variable.

In [17]:
#We can do that with plain Python. Let’s do it first with tokens just to confirm what it looks like:

L((tokens[i:i+3], tokens[i+3]) for i in range(0,len(tokens)-4,3))

(#21030) [(['one', '.', 'two'], '.'),(['.', 'three', '.'], 'four'),(['four', '.', 'five'], '.'),(['.', 'six', '.'], 'seven'),(['seven', '.', 'eight'], '.'),(['.', 'nine', '.'], 'ten'),(['ten', '.', 'eleven'], '.'),(['.', 'twelve', '.'], 'thirteen'),(['thirteen', '.', 'fourteen'], '.'),(['.', 'fifteen', '.'], 'sixteen')...]

In [24]:
#Now we will do it with tensors of the numericalized values, which is what the model will actually use:

seqs = L((tensor(nums[i:i+3]), nums[i+3]) for i in range(0,len(nums)-4,3)) 
seqs

(#21031) [(tensor([0, 1, 2]), 1),(tensor([1, 3, 1]), 4),(tensor([4, 1, 5]), 1),(tensor([1, 6, 1]), 7),(tensor([7, 1, 8]), 1),(tensor([1, 9, 1]), 10),(tensor([10,  1, 11]), 1),(tensor([ 1, 12,  1]), 13),(tensor([13,  1, 14]), 1),(tensor([ 1, 15,  1]), 16)...]

In [25]:
#We can batch those easily using the DataLoader class. For now, we will split the sequences randomly:

bs = 64
cut = int(len(seqs) * 0.8)
dls = DataLoaders.from_dsets(seqs[:cut], seqs[cut:], bs=64, shuffle=False)

In [26]:
#We can now create a nural network that  takes three words as input, and returns a prediction of the probability
#of possible nect words in the vocab
# we will use three standard linear layers, nut with two tweaks

#The first tweak is that the first linear layer will use only the first word’s embedding as activations, 
#the second layer will use the second word’s embedding plus the first lay‐ er’s output activations, 
#and the third layer will use the third word’s embedding plus the second layer’s output activations. 
#The key effect is that every word is interpreted in the information context of any words preceding it.

#The second tweak is that each of these three layers will use the same weight matrix. 
#The way that one word impacts the activations from previous words should not change depending on the position of a word. 
#In other words, activation values will change as data moves through the layers, but the layer weights 
#themselves will not change from layer to layer. 
#So, a layer does not learn one sequence position; it must learn to handle all positions.
#Since layer weights do not change, you might think of the sequential layers as “the same layer” repeated. 
#In fact, PyTorch makes this concrete; we can create just one layer and use it multiple times.

In [28]:
#Our Language Model in PyTorch

class LMModel1(Module):
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden,vocab_sz)
    def forward(self, x):
        h = F.relu(self.h_h(self.i_h(x[:,0]))) 
        h = h + self.i_h(x[:,1])
        h = F.relu(self.h_h(h))
        h = h + self.i_h(x[:,2])
        h = F.relu(self.h_h(h))
        return self.h_o(h)

In [29]:
#Let’s try training this model and see how it goes:

learn = Learner(dls, LMModel1(len(vocab), 64), loss_func=F.cross_entropy,
                    metrics=accuracy)
learn.fit_one_cycle(4, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.773262,1.845259,0.459948,00:00
1,1.401774,1.687664,0.464939,00:00
2,1.414476,1.65264,0.493226,00:00
3,1.375193,1.683244,0.418588,00:00


In [30]:
#To see if this is any good, let’s check what a very simple model would give us. 
#In this case, we could always predict the most common token, 
#so let’s find out which token is most often the target in our validation set:

n,counts = 0,torch.zeros(len(vocab)) 
for x,y in dls.valid:
    n += y.shape[0]
    for i in range_of(vocab): counts[i] += (y==i).long().sum() 
idx = torch.argmax(counts)
idx, vocab[idx.item()], counts[idx].item()/n

(tensor(29), 'thousand', 0.15165200855716662)

In [31]:
# In above Code i
#The most common token has the index 29, which corresponds to the token thousand. 
#Always predicting this token would give us an accuracy of roughly 15%, so we are faring way better!

#This is a nice first baseline. Let’s see how we can refactor it with a loop.

In [32]:
# Our First Recurrent Neural Network
#Looking at the code for our module, 
#we could simplify it by replacing the duplicated code that calls the layers with a for loop. 
#In addition to making our code simpler, 
#this will have the benefit that we will be able to apply our module equally well to token sequences of different 
#lengths—we won’t be restricted to token lists of length three:

class LMModel2(Module):
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden,vocab_sz)
    def forward(self, x): 
        h=0
        for i in range(3):
            h = h + self.i_h(x[:,i]) 
            h = F.relu(self.h_h(h))
        return self.h_o(h)

In [33]:
#Let’s check that we get the same results using this refactoring:

learn = Learner(dls, LMModel2(len(vocab), 64), loss_func=F.cross_entropy,
                    metrics=accuracy)
learn.fit_one_cycle(4, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.795286,2.099795,0.464464,00:00
1,1.390736,1.880543,0.472308,00:00
2,1.403574,1.703555,0.496791,00:00
3,1.377218,1.734753,0.411695,00:00


In [34]:
#You will see that a set of activations is being updated each time through the loop, 
#stored in the variable h—this is called the hidden state (the activations that are updated at each step
#of a recurrent neural network. 
#A neural network that is defined using a loop like this is called a recurrent neural net‐ work (RNN).
#It is important to realize that an RNN is not a complicated new architec‐ ture, but simply a refactoring of a multilayer 
#neural network using a for loop.

In [35]:
#Improving the RNN
#Looking at the code for our RNN, one thing that seems problematic is that we are initializing our hidden state to zero 
#for every new input sequence. Why is that a prob‐ lem? We made our sample sequences short so they would fit easily 
#into batches. 
#But if we order those samples correctly, the sample sequences will be read in order by the model, 
#exposing the model to long stretches of the original sequence.
#Another thing we can look at is having more signal: why predict only the fourth word when we could use the 
#intermediate predictions to also predict the second and third words? 
#Let’s see how we can implement those changes, starting with adding some state.

In [36]:
#Maintaining the state of RNN
#Because we initialize the model’s hidden state to zero for each new sample, 
#we are throwing away all the information we have about the sentences we have seen so far, 
#which means that our model doesn’t actually know where we are up to in the overall counting sequence. 
#This is easily fixed; we can simply move the initialization of the hidden state to __init__

#But this fix will create its own subtle, but important, problem. 
#It effectively makes our neural network as deep as the entire number of tokens in our document. 
#For instance, if there were 10,000 tokens in our dataset, we would be creating a 10,000-layer neural network.

#The problem with 10000 layer nn, when we get to the 10000th word dataset, we will still need to calculate the derivatives
#all the way back to the first layer.
#this will slow the process, and memory-intensive. 

In [None]:
#Chain Rule of Calculus: The backpropagation algorithm relies on the chain rule to calculate the derivatives of the 
#loss function with respect to the weights. Since each layer's output is a function of its input, which itself is a 
#function of the weights and outputs of previous layers, the derivative of the loss with respect to the weight in 
#any given layer depends on the derivatives of all the subsequent layers.

#Dependency Chain: In a deep network, each layer's output depends on the layers before it. 
#Therefore, to understand how a change in the weight of the first layer affects the final loss, you need to consider 
#its impact on the second layer, then the impact of the second on the third, and so on, up to the last layer. 
#This creates a long chain of dependencies that must be resolved to update the weights correctly.

##There are other rules as well: Gradient Descent: Neural networks typically use gradient descent or its variants 
#(like Adam, RMSprop, etc.) to optimize the loss function. 
#In gradient descent, the weights are updated in the opposite direction of the gradient of the loss function 
#with respect to the weights. This means calculating how a small change in each weight affects the loss, which is done 
#by taking derivatives.

In [37]:
class LMModel3(Module):
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden,vocab_sz)
        self.h = 0
    def forward(self, x): 
        for i in range(3):
            self.h = self.h + self.i_h(x[:,i])
            self.h = F.relu(self.h_h(self.h)) 
            out = self.h_o(self.h)
            self.h = self.h.detach()
            return out
    def reset(self): self.h = 0


#we do not want to backpropagate the derivatives through the entire implicit neural network. 
#Instead, we will keep just the last three layers of gradients. To remove all of the gradient history in PyTorch, 
#we use the detach method.
#Here is the new version of our RNN. It is now stateful, because it remembers its acti‐ vations between 
#different calls to forward, which represent its use for different sam‐ ples in the batch:


#This model will have the same activations whatever sequence length we pick, because the hidden state will remember 
#the last activation from the previous batch. The only thing that will be different is the gradients computed at 
#each step: they will be calcula‐ ted on only sequence length tokens in the past, instead of the whole stream. 
#This approach is called backpropagation through time (BPTT).

In [None]:
#Jargon: Backpropagation Through Time
#Treating a neural net with effectively one layer per time step (usu‐ ally refactored using a loop) 
#as one big model, and calculating gra‐ dients on it in the usual way. To avoid running out of memory and time, 
#we usually use truncated BPTT, which “detaches” the history of computation steps in the hidden state every few time steps

In [40]:
#To use LMModel3, we need to make sure the samples are going to be seen in a certain order.
#First we divide the samples into m = len(dset)// bs group
#(this is the equivalent of splitting the whole concaten‐ ated dataset into, 
#for example, 64 equally sized pieces, since we’re using bs=64 here)

#m is the length of the each of this pieces

m = len(seqs)//bs
m,bs,len(seqs)

(328, 64, 21031)

In [41]:
#The first batch will be composed of the samples
   # (0, m, 2*m, ..., (bs-1)*m)
#the second batch of the samples
    #(1, m+1, 2*m+1, ..., (bs-1)*m+1)
#and so forth. This way, at each epoch, the model will see a chunk of contiguous text of size 3*m 
#(since each text is of size 3) on each line of the batch.


In [44]:
#The following function does the reindexing

def group_chunks(ds, bs):
    m = len(ds) // bs
    new_ds = L()
    for i in range(m): new_ds += L(ds[i + m*j] for j in range(bs)) 
    return new_ds

In [45]:
#Then we just pass drop_last=True when building our DataLoaders to drop the last batch that does not have a shape of bs. 
#We also pass shuffle=False to make sure the texts are read in order:

cut = int(len(seqs) * 0.8)
dls = DataLoaders.from_dsets(
    group_chunks(seqs[:cut], bs),
    group_chunks(seqs[cut:], bs),
    bs=bs, drop_last=True, shuffle=False)

In [46]:
#The last thing we do is a little treak of the training loop via a callback. it will call reset
#method of our model at the begining of each epoch and before each validation phase. 
#since we implemented that method to set the hidden state to zero, this will make sure we start with a clean
#state before reading those continuous chunks of text. 
#We can also start training a bit longer. 

In [47]:
learn = Learner(dls, LMModel3(len(vocab), 64), loss_func=F.cross_entropy,
                    metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(10, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,2.095834,2.121935,0.375,00:00
1,1.403627,1.818492,0.504567,00:00
2,1.210752,1.69026,0.528606,00:00
3,1.102061,1.716544,0.539904,00:00
4,1.017022,1.686464,0.533173,00:00
5,0.960161,1.670834,0.541587,00:00
6,0.911121,1.645974,0.564183,00:00
7,0.87123,1.637487,0.563702,00:00
8,0.850891,1.685664,0.557692,00:00
9,0.840873,1.661906,0.5625,00:00


In [49]:
#Creating more signals

In [50]:
#It would be better if we predicted the next word after every single word, rather than every three words

In [51]:
#First change our data so that the dependent variable has each of the three next words after each of our three input words
#instead of 3 we could use sl (for sequence length), and make it bigger. 

In [52]:
sl = 16
seqs = L((tensor(nums[i:i+sl]), tensor(nums[i+1:i+sl+1]))
         for i in range(0,len(nums)-sl-1,sl)) 
cut = int(len(seqs) * 0.8)
dls = DataLoaders.from_dsets(group_chunks(seqs[:cut], bs),
                             group_chunks(seqs[cut:], bs),
                             bs=bs, drop_last=True, shuffle=False)

In [53]:
#Looking at the first element of seqs, we can see that it contains two lists of the same
#size. The second list is the same as the first, but offset by one element:

[L(vocab[o] for o in s) for s in seqs[0]]

[(#16) ['one','.','two','.','three','.','four','.','five','.'...],
 (#16) ['.','two','.','three','.','four','.','five','.','six'...]]

In [55]:
#Now we need to modify our model so that it outputs a prediction after every word, 
#rather than just at the end of a three-word sequence:

class LMModel4(Module):
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden,vocab_sz)
        self.h = 0
    def forward(self, x): 
        outs = []
        for i in range(sl):
            self.h = self.h + self.i_h(x[:,i]) 
            self.h = F.relu(self.h_h(self.h)) 
            outs.append(self.h_o(self.h))
        self.h = self.h.detach()
        return torch.stack(outs, dim=1)
    def reset(self): self.h = 0

#This model will return outputs of shape bs x sl x vocab_sz (since we stacked on dim=1). 
#Our targets are of shape bs x sl, so we need to flatten those before using them in F.cross_entropy:

def loss_func(inp, targ):
    return F.cross_entropy(inp.view(-1, len(vocab)), targ.view(-1))
    
#We can now use this loss function to train the model:

learn = Learner(dls, LMModel4(len(vocab), 64), loss_func=loss_func,
                metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(15, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,3.263626,3.009837,0.220459,00:00
1,2.338349,1.894706,0.469238,00:00
2,1.745615,1.782273,0.475586,00:00
3,1.453624,1.724784,0.511475,00:00
4,1.278659,1.683755,0.527751,00:00
5,1.116195,1.670979,0.552002,00:00
6,0.971567,1.624209,0.554443,00:00
7,0.861114,1.665836,0.596924,00:00
8,0.775877,1.589161,0.626465,00:00
9,0.699975,1.738365,0.651042,00:00


In [56]:
#Multilayer RNNs

class LMModel5(Module):
    def __init__(self, vocab_sz, n_hidden, n_layers):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)  # Embedding layer
        self.rnn = nn.RNN(n_hidden, n_hidden, n_layers, batch_first=True)  # RNN layer
        self.h_o = nn.Linear(n_hidden, vocab_sz)  # Output linear layer
        self.h = torch.zeros(n_layers, bs, n_hidden)  # Hidden state initialization

    def forward(self, x):
        res, h = self.rnn(self.i_h(x), self.h)  # Pass input and hidden state to the RNN
        self.h = h.detach()  # Detach hidden state from the graph
        return self.h_o(res)  # Apply the linear layer to the RNN output

    def reset(self):
        self.h.zero_()  # Reset hidden state to zero

# Setup for training the model
learn = Learner(dls, LMModel5(len(vocab), 64, 2),
                loss_func=CrossEntropyLossFlat(),
                metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(15, 3e-3)  # Start training for 15 epochs with specified learning rate

#deeper the model, longer the training time.

#EACH EXTRA LAYER IS ANOTHER MATRIX MULTIPLICATION. 


#This is challenging because of what happens when you multiply by a matrix many times. 
#Think about what happens when you multiply by a number many times. For example, if you multiply by 2, 
#starting at 1, you get the sequence 1, 2, 4, 8,...and after 32 steps, you are already at 4,294,967,296. 
#A similar issue happens if you multiply by 0.5: you get 0.5, 0.25, 0.125...and after 32 steps, it’s 0.00000000023.

#In practice, creating accurate models from this kind of RNN is difficult. 
#We will get better results if we call detach less often, and have more layers—this gives our RNN a longer time 
#j horizon to learn from and richer features to create.


epoch,train_loss,valid_loss,accuracy,time
0,3.014019,2.611061,0.26123,00:04
1,2.144691,1.733702,0.47168,00:02
2,1.704649,1.885039,0.326335,00:02
3,1.495569,1.988733,0.369222,00:03
4,1.284528,1.973199,0.414958,00:02
5,1.12953,2.01408,0.400798,00:03
6,0.986453,1.937139,0.410807,00:02
7,0.861042,2.010647,0.409587,00:02
8,0.749368,2.107382,0.420573,00:02
9,0.659244,2.146489,0.429362,00:03


In [57]:
#AVOIDING EXPLOIDING ACTIVATIONS. 

#For RNNs, two types of layers are frequently used to avoid exploding activations: 
#gated recurrent units (GRUs) and long short-term memory (LSTM) layers.

#oN LSTM THE CELL STATE IS RESPONSIBLE FOR KEEPING LONG SHORT TERM MEMORY, while the hidden state will focus on next token 
#to predict. 

In [None]:
#LSTM from scratch

In [58]:
#In an LSTM (Long Short-Term Memory) unit:

#Forget Gate: A linear layer followed by a sigmoid activation determines which parts of the cell state to 
#retain or discard. Values near 0 indicate information to forget, while values near 1 suggest retaining information.

#Input Gate and Cell Gate: The input gate decides which parts of the cell state to update (values near 1 update, 
#values near 0 do not). The cell gate, which also incorporates a tanh activation, 
#specifies the new values to be added to the cell state.

#Output Gate: It selects which parts of the cell state contribute to the output. This involves passing the cell state through a tanh function, which is then modulated by the sigmoid output of the output gate to produce the new hidden state.

In [59]:
class LSTMCell(Module):
    def __init__(self, ni, nh):
        self.forget_gate = nn.Linear(ni + nh, nh)
        self.input_gate  = nn.Linear(ni + nh, nh)
        self.cell_gate   = nn.Linear(ni + nh, nh)
        self.output_gate = nn.Linear(ni + nh, nh)
    def forward(self, input, state): 
        h,c = state
        h = torch.stack([h, input], dim=1)
        forget = torch.sigmoid(self.forget_gate(h)) 
        c = c * forget
        inp = torch.sigmoid(self.input_gate(h)) 
        cell = torch.tanh(self.cell_gate(h))
        c = c + inp * cell
        out = torch.sigmoid(self.output_gate(h))
        h = outgate * torch.tanh(c)
        return h, (h,c)

In [60]:
#it’s better to do one big matrix multiplication than four smaller ones
# The optimized and refactored code then looks like this:

class LSTMCell(Module):
    def __init__(self, ni, nh):
        self.ih = nn.Linear(ni,4*nh)
        self.hh = nn.Linear(nh,4*nh)
    def forward(self, input, state): 
        h,c = state
            # One big multiplication for all the gates is better than 4 smaller ones
        gates = (self.ih(input) + self.hh(h)).chunk(4, 1)
        ingate,forgetgate,outgate = map(torch.sigmoid, gates[:3])
        cellgate = gates[3].tanh()
        c = (forgetgate*c) + (ingate*cellgate) 
        h = outgate * c.tanh()
        return h, (h,c)


In [61]:
#Here we use the PyTorch chunk method to split our tensor into four pieces. It works like this:

t = torch.arange(0,10); t

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [62]:
  t.chunk(2)

(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9]))

In [63]:
#Training a Language Model Using LSTMs

class LMModel6(Module):
    def __init__(self, vocab_sz, n_hidden, n_layers):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True) 
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h = [torch.zeros(n_layers, bs, n_hidden) for _ in range(2)]
    def forward(self, x):
        res,h = self.rnn(self.i_h(x), self.h) 
        self.h = [h_.detach() for h_ in h] 
        return self.h_o(res)
    def reset(self):
        for h in self.h: h.zero_()
learn = Learner(dls, LMModel6(len(vocab), 64, 2),
                    loss_func=CrossEntropyLossFlat(),
                    metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(15, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,3.023631,2.718089,0.288818,00:07
1,2.194491,1.895583,0.383138,00:08
2,1.613236,1.81552,0.473877,00:08
3,1.296872,2.099502,0.524007,00:09
4,1.015141,2.11513,0.564941,00:07
5,0.737885,2.064255,0.593831,00:06
6,0.514384,2.217839,0.63444,00:08
7,0.373,1.980627,0.695231,00:06
8,0.269672,1.977432,0.708496,00:06
9,0.189167,1.934993,0.735433,00:05


In [64]:
#Dropout: The idea is randomly change some activations to zero at training time. This makes sure all
#neurons actively work towards the output.

class Dropout(Module):
    def __init__(self, p): self.p = p 
    def forward(self, x):
        if not self.training: return x
        mask = x.new(*x.shape).bernoulli_(1-p) 
        return x * mask.div_(1-p)

In [65]:
#The bernoulli_ method is creating a tensor of random zeros (with probability p) and ones (with probability 1-p), 
#which is then multiplied with our input before dividing by 1-p. Note the use of the training attribute, 
#which is available in any PyTorch nn.Module, and tells us if we are doing training or inference.

In [66]:
#Activation Regularization and Temporal Activation Regularization
# Activation regularization (AR) and temporal activation regularization (TAR) are two regularization methods very 
#similar to weight decay. 
#When applying weight decay, we add a small penalty to the loss that aims at making the weights as small as possible. 
#For activation regularization, it’s the final activations produced by the LSTM that we will try to make as small as 
#possible, instead of the weights

# loss += alpha * activations.pow(2).mean()

# loss += beta * (activations[:,1:] - activations[:,:-1]).pow(2).mean()

#alpha and beta are then two hyperparameters to tune. To make this work, we need our model with dropout to return 
#three things: the proper output, the activations of the LSTM pre-dropout, and the activations of the LSTM post-dropout.


In [69]:
#Training a Weight-Tied Regularized LSTM

#We can combine dropout (applied before we go into our output layer) with AR and TAR to train our previous LSTM. 
#We just need to return three things instead of one: the normal output of our LSTM, the dropped-out activations, 
#and the activations from our LSTMs. The last two will be picked up by the callback RNNRegularization for the 
#contributions it has to make to the loss.

In [70]:
#Another useful trick we can add from the AWD-LSTM paper is weight tying. In a lan‐ guage model, 
#the input embeddings represent a mapping from English words to acti‐ vations, and the output hidden layer 
#represents a mapping from activations to English words. We might expect, intuitively, that these mappings 
#could be the same. We can represent this in PyTorch by assigning the same weight matrix to each of these layers:

#self.h_o.weight = self.i_h.weight

In [71]:
class LMModel7(Module):
    def __init__(self, vocab_sz, n_hidden, n_layers, p):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True) 
        self.drop = nn.Dropout(p)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h_o.weight = self.i_h.weight
        self.h = [torch.zeros(n_layers, bs, n_hidden) for _ in range(2)]
    def forward(self, x):
        raw,h = self.rnn(self.i_h(x), self.h) 
        out = self.drop(raw)
        self.h = [h_.detach() for h_ in h] 
        return self.h_o(out),raw,out
    def reset(self):
        for h in self.h: h.zero_()

In [72]:
#We can create a regularized Learner using the RNNRegularizer callback:

learn = Learner(dls, LMModel7(len(vocab), 64, 2, 0.5),
                loss_func=CrossEntropyLossFlat(), metrics=accuracy,
                cbs=[ModelResetter, RNNRegularizer(alpha=2, beta=1)])

In [73]:
#A TextLearner automatically adds those two callbacks for us (with those values for
#alpha and beta as defaults), so we can simplify the preceding line:

learn = TextLearner(dls, LMModel7(len(vocab), 64, 2, 0.4),
                    loss_func=CrossEntropyLossFlat(), metrics=accuracy)

In [74]:
#We can then train the model, and add additional regularization by increasing the weight decay to 0.1:

learn.fit_one_cycle(15, 1e-2, wd=0.1)


epoch,train_loss,valid_loss,accuracy,time
0,2.595704,2.085654,0.468831,00:10
1,1.555939,1.384637,0.664307,00:09
2,0.802042,0.923289,0.792887,00:09
3,0.39158,0.878531,0.820719,00:09
4,0.201525,0.759407,0.848063,00:09
5,0.11003,0.772179,0.846517,00:07
6,0.067381,0.807794,0.856364,00:10
7,0.044814,0.705166,0.860921,00:08
8,0.035101,0.691455,0.870931,00:10
9,0.027616,0.773905,0.856608,00:07


In [75]:
import torch

# Create a tensor of zeros
x = torch.zeros(10)

# Apply bernoulli_ to randomly set some elements to 1 with a probability of 0.5
x.bernoulli_(0.5)
print(x)


tensor([1., 0., 0., 0., 0., 1., 1., 0., 1., 0.])
