# AWD_LSTM

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
#export
from exp.nb_12 import *

## Data

In [3]:
path = datasets.untar_data(datasets.URLs.IMDB)

We have to preprocess the data again to pickle it because if we try to load the previous `SplitLabeledData` with pickle, it will complain that some functions aren't in main.

In [4]:
il = TextList.from_files(path, include=['train', 'test', 'unsup'])
sd = SplitData.split_by_func(il, partial(random_splitter, p_valid=0.1))

In [5]:
proc_tok, proc_num = TokenizeProcessor(max_workers=8), NumericalizeProcessor()

In [6]:
# ll = label_by_func(sd, lambda x: 0, proc_x = [proc_tok, proc_num])

In [7]:
# pickle.dump(ll, open(path/'ll_lm.pkl', 'wb'))
# pickle.dump(proc_num.vocab, open(path/'vocab_lm.pkl', 'wb'))

In [8]:
# Load the preprocessed labeled lists and the vocab pickled by the cells above.
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
# `with` closes the file handles, which the bare `open(...)` calls never did.
with open(path/'ll_lm.pkl', 'rb') as f: ll = pickle.load(f)
with open(path/'vocab_lm.pkl', 'rb') as f: vocab = pickle.load(f)

In [9]:
bs,bptt = 64,70
data = lm_databunchify(ll, bs, bptt)

## AWD-LSTM

Before studying AWD-LSTM, let's look at what an LSTM is. RNNs were covered in part 1; if you need a refresher, there is a great visualization of them on [this website](http://joshvarty.github.io/VisualizingRNNs/).

### LSTM from scratch

We need to implement these equations (where σ stands for sigmoid):
![LSTM cell and equations](images/lstm.jpg)
(picture from [Understanding LSTMs](https://colah.github.io/posts/2015-08-Understanding-LSTMs/) by Chris Olah.)

If we want to take advantage of our GPU, it's better to do one big matrix multiplication than 4 smaller ones, so we compute the values of the four gates all at once. Since there is a matrix multiplication and a bias, we use `nn.Linear` to do it.  
  
We need 2 linear layers: one for the input and one for the hidden state.

In [10]:
class LSTMCell(nn.Module):
    "One LSTM timestep: two linear layers produce the pre-activations of all four gates at once."
    def __init__(self, ni, nh):
        super().__init__()
        # 4*nh outputs each: one nh-wide slice per gate, split apart in `forward`
        self.ih = nn.Linear(ni, 4*nh)
        self.hh = nn.Linear(nh, 4*nh)

    def forward(self, input, state):
        "Apply the cell to `input` given `state = (h, c)`; returns `(new_h, (new_h, new_c))`."
        prev_h, prev_c = state
        # a single fused projection is cheaper than four separate ones
        preact = self.ih(input) + self.hh(prev_h)
        i_raw, f_raw, o_raw, g_raw = preact.chunk(4, 1)  # four equal slices along dim 1
        i_gate = torch.sigmoid(i_raw)
        f_gate = torch.sigmoid(f_raw)
        o_gate = torch.sigmoid(o_raw)
        candidate = torch.tanh(g_raw)

        next_c = f_gate * prev_c + i_gate * candidate
        next_h = o_gate * torch.tanh(next_c)
        return next_h, (next_h, next_c)

Then the LSTM layer just applies the cell on all the timesteps in order:

In [11]:
class LSTMLayer(nn.Module):
    "Apply a recurrent cell, built as `cell(*cell_args)`, to every slice of the input along dim 1."
    def __init__(self, cell, *cell_args):
        super().__init__()
        self.cell = cell(*cell_args)

    def forward(self, input, state):
        "Feed the dim-1 slices of `input` to the cell in order; returns `(stacked outputs, last state)`."
        steps = []
        # `unbind(1)` removes dim 1 and yields one tensor per index along it,
        # e.g. a (bs, seq_len, n) input gives seq_len tensors of shape (bs, n)
        for x_t in input.unbind(1):
            y_t, state = self.cell(x_t, state)
            steps.append(y_t)
        # put the step outputs back along dim 1
        return torch.stack(steps, dim=1), state

Now let's try it out and see how fast we are. We only measure the forward pass.

In [12]:
lstm = LSTMLayer(LSTMCell, 1000, 1000)

In [13]:
x = torch.randn(64, 70, 1000)
h = (torch.randn(64,1000), torch.zeros(64 ,1000))

CPU

In [14]:
%timeit -n 10 y,h1 = lstm(x,h)

713 ms ± 8.53 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


CUDA

In [15]:
lstm = lstm.cuda()
x = x.cuda()
h = (h[0].cuda(), h[1].cuda())

In [16]:
# CUDA work is queued asynchronously, so we call `torch.cuda.synchronize()`
# after `f()` to wait for the GPU to finish; otherwise a timing taken around
# this function would not cover the actual GPU computation.
def time_fn(f):
    "Call `f()` and then block until `torch.cuda.synchronize()` returns."
    f()
    torch.cuda.synchronize()

In [17]:
f = partial(lstm ,x, h)
time_fn(f)

In [18]:
%timeit -n 10 time_fn(f)

24.2 ms ± 389 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Built in version

Let's compare with Pytorch

In [19]:
lstm = nn.LSTM(1000, 1000, batch_first=True)
# Without batch_first=True it will use the first dimension as the sequence dimension.
# With batch_first=True it will use the second dimension as the sequence dimension.

In [20]:
x = torch.randn(64,70, 1000)
h = (torch.zeros(1,64,1000), torch.zeros(1, 64, 1000))

CPU

In [21]:
%timeit -n 10  y, h1 = lstm(x, h)

610 ms ± 3.7 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [22]:
lstm = lstm.cuda()
x = x.cuda()
h = (h[0].cuda(), h[1].cuda())

In [23]:
f = partial(lstm, x, h)
time_fn(f)

GPU

In [24]:
%timeit -n 10 time_fn(f)

14.2 ms ± 439 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


So our version is running at almost the same speed on the CPU. However, on the GPU, PyTorch uses cuDNN behind the scenes, which greatly optimizes the for loop.

## Jit version

## Dropout

We want to use the AWD-LSTM from [Stephen Merity et al.](https://arxiv.org/abs/1708.02182). First we'll need all different kinds of dropouts. Dropout consists of replacing some coefficients by 0 with probability p. To ensure that the average of the weights remains constant, we apply a correction of a factor `1/(1-p)` to the weights that aren't nullified (think of what happens to the activations if you want to figure out why --> Ans: they'll either explode or vanish).  
  
We apply dropout by drawing a mask that tells us which elements to nullify or not.

In [25]:
#export
def dropout_mask(x, sz, p):
    "Return a mask of size `sz`, created from `x` with `x.new`, whose entries are 0 with probability `p` and `1/(1-p)` otherwise."
    keep_prob = 1 - p
    mask = x.new(*sz)
    # fill in place with 1s (probability `keep_prob`) and 0s (probability `p`)
    mask.bernoulli_(keep_prob)
    # rescale the kept entries: with p=0.5 the result is a tensor of 0s and 2s
    return mask.div(keep_prob)

In [26]:
x = torch.randn(10, 10)
mask = dropout_mask(x, (10, 10), 0.5); mask

tensor([[0., 0., 0., 0., 2., 0., 2., 2., 0., 0.],
        [2., 2., 0., 0., 0., 2., 2., 2., 2., 2.],
        [2., 2., 2., 2., 2., 2., 2., 2., 2., 2.],
        [0., 2., 2., 2., 0., 2., 0., 2., 0., 0.],
        [2., 0., 2., 2., 2., 0., 0., 0., 2., 0.],
        [0., 0., 2., 0., 0., 2., 0., 2., 2., 2.],
        [0., 2., 2., 2., 2., 2., 2., 0., 2., 2.],
        [2., 0., 2., 0., 2., 2., 0., 0., 0., 2.],
        [2., 0., 0., 0., 0., 0., 2., 0., 2., 0.],
        [2., 0., 0., 2., 2., 0., 2., 2., 0., 2.]])

Once we have a dropout mask, applying the dropout to `x` is simply done by `x = x*mask`. We create our own dropout mask and don't rely on PyTorch dropout because we don't want to nullify the coefficients completely at random: we always want to replace the same positions by zero along the seq_len dimension (WHY??).

In [27]:
(x*mask).std(), x.std()

(tensor(1.2104), tensor(0.8589))

Inside an RNN, a tensor x will have 3 dimensions: bs, seq_len, vocab_size. Recall that we want to consistently apply the dropout mask across the seq_len dimension; therefore, we create a dropout mask for the first and third dimensions and broadcast it to the seq_len dimension.

In [28]:
#export
class RNNDropout(nn.Module):
    "Dropout with probability `p` whose mask is shared by every position along dim 1 of the input."
    def __init__(self, p=0.5):
        super().__init__()
        self.p = p

    def forward(self, x):
        "Return `x` untouched in eval mode or when `p` is 0, otherwise `x` times a broadcast dropout mask."
        active = self.training and self.p != 0.
        if not active:
            return x
        # size-1 middle dim: one mask per (dim 0, dim 2) entry, broadcast over dim 1
        mask_shape = (x.size(0), 1, x.size(2))
        return x * dropout_mask(x.data, mask_shape, self.p)

In [29]:
dp = RNNDropout(0.3)
tst_input = torch.randn(3,3,7)
# tst_input
dp(tst_input)

tensor([[[-0.1570, -0.0000, -0.7048,  0.0000, -0.2125, -0.0000, -1.2395],
         [ 1.6299,  0.0000, -0.0185,  0.0000,  1.6618, -0.0000,  1.7363],
         [-0.2886,  0.0000,  1.3313, -0.0000,  2.2231,  0.0000,  0.7224]],

        [[-0.0000,  3.3907, -1.1589, -0.4866,  0.6657, -1.7509, -1.3297],
         [ 0.0000,  1.0690, -0.3558,  0.3287,  0.8011, -1.5387,  1.9071],
         [ 0.0000, -1.3100,  0.9358,  0.0100, -1.3165,  0.8615, -0.2637]],

        [[-0.0000,  0.0000, -2.3151,  1.1073, -0.0000, -2.5797,  0.0000],
         [ 0.0000, -0.0000,  0.1859,  0.6219, -0.0000, -1.1936, -0.0000],
         [-0.0000,  0.0000, -1.4853, -2.2439, -0.0000, -1.4298,  0.0000]]])

Notice that the zeros are getting broadcasted. This is really important because in the seq dimension, if you drop timestep 3 but not timestep 2 and timestep 4, then you've broken the network's ability to calculate anything because you just killed it.

### WeightDrop (called as DropConnect in Vision world)

WeightDropout is the dropout applied to the weights of the inner LSTM hidden-to-hidden matrix. This is a little hacky if we want to preserve the cuDNN speed and not reimplement the cell from scratch. We add a parameter that will contain the raw weights, and we replace the weight matrix in the LSTM at the beginning of the forward pass.

In [30]:
#export
import warnings

WEIGHT_HH = 'weight_hh_l0' # small L

class WeightDropout(nn.Module):
    "Wrap `module` and apply dropout with probability `weight_p` to its weights named in `layer_names`."
    # `weight_p` used to default to the list `[0.]`, which `F.dropout` cannot take as `p`;
    # the defaults are now a plain float and an immutable tuple.
    def __init__(self, module, weight_p=0., layer_names=(WEIGHT_HH,)):
        super().__init__()
        self.module, self.weight_p, self.layer_names = module, weight_p, list(layer_names)
        for layer in self.layer_names:
            # Keep the original weights as a `{layer}_raw` parameter of this wrapper...
            w = getattr(self.module, layer)
            self.register_parameter(f'{layer}_raw', nn.Parameter(w.data))
            # ...and replace the wrapped module's entry by a tensor derived from them
            # (training=False, so no element is dropped here).
            self.module._parameters[layer] = F.dropout(w, p=self.weight_p, training=False)

    def _setweights(self):
        "Recompute the wrapped module's weights from the raw ones, with dropout applied only in training mode."
        for layer in self.layer_names:
            raw_w = getattr(self, f'{layer}_raw')
            self.module._parameters[layer] = F.dropout(raw_w, p=self.weight_p, training=self.training)

    def forward(self, *args):
        "Refresh the dropped weights, then call the wrapped module's `forward` with `args`."
        self._setweights()
        with warnings.catch_warnings():
            # To avoid the warning that comes because the weights aren't flattened
            warnings.simplefilter("ignore")
            return self.module.forward(*args)

Let's try it!

In [31]:
module = nn.LSTM(5,2)
dp_module = WeightDropout(module, 0.4)
getattr(dp_module.module, WEIGHT_HH)

Parameter containing:
tensor([[-0.2388, -0.0573],
        [-0.1495,  0.6926],
        [ 0.3976,  0.6529],
        [ 0.2691,  0.4294],
        [-0.6123,  0.0434],
        [ 0.1087, -0.3028],
        [ 0.0889, -0.1240],
        [-0.5016,  0.0384]], requires_grad=True)

In [32]:
h = (torch.zeros(1, 20, 2), torch.zeros(1, 20, 2))
len(h), h

(2, (tensor([[[0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.]]]), tensor([[[0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.],
           [0., 0.]]])))

It's at the beginning of the fwd pass that the dropout is applied to the weights.

In [33]:
tst_input = torch.randn(4, 20, 5)
h = (torch.zeros(1, 20, 2), torch.zeros(1, 20, 2))
x, h = dp_module(tst_input, h)
getattr(dp_module.module, WEIGHT_HH)

tensor([[-0.3981, -0.0000],
        [-0.2491,  1.1544],
        [ 0.6627,  0.0000],
        [ 0.4485,  0.0000],
        [-1.0206,  0.0723],
        [ 0.1811, -0.5047],
        [ 0.1482, -0.0000],
        [-0.0000,  0.0640]], grad_fn=<MulBackward0>)

### Embedding Dropout
Embedding Dropout applies dropout to full rows of the embedding matrix.

In [34]:
#export 
class EmbeddingDropout(nn.Module):
    "Applies dropout in the embedding layer `emb` by zeroing out whole rows of the embedding matrix with probability `embed_p`."
    def __init__(self, emb, embed_p):
        super().__init__()
        self.emb, self.embed_p = emb, embed_p
        self.pad_idx = self.emb.padding_idx
        # NOTE(review): -1 is forwarded to `F.embedding` as `padding_idx`; confirm it is
        # meant as "no padding" and not as a negative index into the embedding rows.
        if self.pad_idx is None: self.pad_idx = -1

    def forward(self, words, scale=None):
        "Look up `words` in the (row-dropped when training) embedding matrix, optionally multiplied by `scale`."
        if self.training and self.embed_p != 0:
            # One mask entry per row, broadcast over the embedding dimension
            size = (self.emb.weight.size(0), 1)
            mask = dropout_mask(self.emb.weight.data, size, self.embed_p)
            masked_embed = self.emb.weight * mask
        else: masked_embed = self.emb.weight  # was `self.emb_weight`, which does not exist
        # Out of place on purpose: on the no-dropout path `masked_embed` is the
        # embedding parameter itself and must not be modified.
        if scale: masked_embed = masked_embed * scale
        return F.embedding(words, masked_embed, self.pad_idx, self.emb.max_norm,
                           self.emb.norm_type, self.emb.scale_grad_by_freq, self.emb.sparse)

In [35]:
enc = nn.Embedding(100, 7, padding_idx=1)
enc_dp = EmbeddingDropout(enc, 0.5)
tst_input = torch.randint(0, 100, (8,))
print(tst_input)
enc_dp(tst_input)

tensor([97, 64, 65, 49,  6, 54, 21, 61])


tensor([[-4.0852,  1.2072,  2.5415,  4.6441,  4.7406,  3.0731, -2.6044],
        [ 0.0000,  0.0000,  0.0000, -0.0000,  0.0000,  0.0000,  0.0000],
        [ 1.7719, -3.9148, -2.8937, -5.5264, -0.1590, -1.8944,  3.6024],
        [ 2.7805,  0.5125, -4.2648,  1.6131,  0.0497, -0.4633,  2.6239],
        [ 0.7606,  0.1618,  2.6368, -0.8003,  0.8390, -3.0347,  3.6927],
        [-2.6435, -0.2488, -3.1875, -2.1668,  0.2999,  0.7026,  0.6561],
        [ 0.0000, -0.0000,  0.0000, -0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000, -0.0000,  0.0000, -0.0000,  0.0000, -0.0000]],
       grad_fn=<EmbeddingBackward>)

## Main model  
  
The main model is a regular LSTM with several layers, but using all those kinds of dropouts.

In [36]:
#export
def to_detach(h):
    "Detaches `h` from its history; `h` is a tensor or a (nested) iterable of tensors, returned as (nested) tuples."
    # `isinstance` also matches Tensor subclasses such as `nn.Parameter`,
    # which the previous exact-type comparison iterated over instead of detaching.
    if isinstance(h, torch.Tensor): return h.detach()
    return tuple(to_detach(v) for v in h)

In [37]:
#export
class AWD_LSTM(nn.Module):
    "AWD-LSTM inspired by https://arxiv.org/abs/1708.02182"
    initrange = 0.1

    def __init__(self, vocab_sz, emb_sz, n_hid, n_layers, pad_token,
                hidden_p = 0.2, input_p = 0.6, embed_p = 0.1, weight_p = 0.5):
        super().__init__()
        self.bs, self.emb_sz, self.n_hid, self.n_layers = 1, emb_sz, n_hid, n_layers
        # No hidden state yet: it is created by `reset`, triggered from the first `forward`
        self.hidden = None
        self.emb = nn.Embedding(vocab_sz, emb_sz, padding_idx = pad_token)
        self.emb_dp = EmbeddingDropout(self.emb, embed_p) # embedding lookup with row dropout
        # One single-layer LSTM per layer: emb_sz -> n_hid -> ... -> n_hid -> emb_sz
        self.rnns = [nn.LSTM(emb_sz if l == 0 else n_hid, (n_hid if l!= n_layers-1 else emb_sz), 1,
                            batch_first=True) for l in range(n_layers)]
        self.rnns = nn.ModuleList([WeightDropout(rnn, weight_p) for rnn in self.rnns]) # apply weight dropout
        self.emb.weight.data.uniform_(-self.initrange, self.initrange) # re-initialize emb weights
        self.input_dp = RNNDropout(input_p)
        self.hidden_dps = nn.ModuleList([RNNDropout(hidden_p) for l in range(n_layers)])

    def forward(self, input):
        "Run the 2-D `input` through all layers; returns `(raw_outputs, outputs)`, two lists with one tensor per layer."
        bs, sl = input.size()
        # Also (re)create the state when none exists yet: previously a first batch of
        # size 1 matched the initial `self.bs` and crashed on the missing `self.hidden`.
        if bs != self.bs or self.hidden is None:
            self.bs = bs
            self.reset()
        raw_output = self.input_dp(self.emb_dp(input))
        new_hidden, raw_outputs, outputs = [], [], []
        for l, (rnn, hid_dp) in enumerate(zip(self.rnns, self.hidden_dps)):
            raw_output, new_h = rnn(raw_output, self.hidden[l])
            new_hidden.append(new_h)
            raw_outputs.append(raw_output)  # before hidden dropout
            # Hidden dropout is applied between layers, not after the last one
            if l!= self.n_layers - 1: raw_output = hid_dp(raw_output)
            outputs.append(raw_output)
        # Keep the state for the next call, detached from this call's history
        self.hidden = to_detach(new_hidden)
        return raw_outputs, outputs

    def _one_hidden(self, l):
        "Return one zeroed hidden state of size `(1, bs, nh)` for layer `l`"
        nh = self.n_hid if l != self.n_layers - 1 else self.emb_sz
        return next(self.parameters()).new(1, self.bs, nh).zero_()

    def reset(self):
        "Reset the hidden states: one zeroed `(h, c)` pair per layer"
        self.hidden = [(self._one_hidden(l), self._one_hidden(l)) for l in range(self.n_layers)]

On top of this, we will apply a LinearDecoder. It's often best to use the same matrix as the one for the embeddings in the weights of the decoder.

In [38]:
#export
class LinearDecoder(nn.Module):
    "Dropout followed by a linear layer from `n_hid` to `n_out`, whose weight can be shared with `tie_encoder`."
    def __init__(self, n_out, n_hid, output_p, tie_encoder=None, bias = True):
        super().__init__()
        self.output_dp = RNNDropout(output_p)
        self.decoder = nn.Linear(n_hid, n_out, bias=bias)
        if bias:
            self.decoder.bias.data.zero_()
        if not tie_encoder:
            init.kaiming_uniform_(self.decoder.weight)
        else:
            # reuse the encoder's weight object as the decoder weight
            self.decoder.weight = tie_encoder.weight

    def forward(self, input):
        "Decode the last entry of `outputs`; returns `(decoded, raw_outputs, outputs)` with `decoded` flattened to 2-D."
        raw_outputs, outputs = input
        # `view` below requires a contiguous tensor, see
        # https://discuss.pytorch.org/t/contigious-vs-non-contigious-tensor/30107/2
        dropped = self.output_dp(outputs[-1]).contiguous()
        n_rows = dropped.size(0) * dropped.size(1)
        flat = dropped.view(n_rows, dropped.size(2))
        decoded = self.decoder(flat)
        return decoded, raw_outputs, outputs

In [39]:
#export
class SequentialRNN(nn.Sequential):
    "A sequential module that forwards the `reset` call to each child that defines one"
    def reset(self):
        resettable = filter(lambda child: hasattr(child, 'reset'), self.children())
        for child in resettable:
            child.reset()

And now we stack them all together

In [40]:
#export
def get_language_model(vocab_sz, emb_sz, n_hid, n_layers, pad_token, output_p=0.4, hidden_p=0.2, input_p=0.6,
                      embed_p=0.1, weight_p=0.5, tie_weights=True, bias=True):
    "Build an `AWD_LSTM` encoder followed by a `LinearDecoder` over the vocab, optionally tying decoder and embedding weights."
    rnn_enc = AWD_LSTM(vocab_sz, emb_sz, n_hid, n_layers, pad_token, hidden_p, input_p, embed_p, weight_p)
    enc = rnn_enc.emb if tie_weights else None
    # The decoder input size is `emb_sz` because the last LSTM layer outputs `emb_sz` features.
    # Use `SequentialRNN` (not a plain `nn.Sequential`) so that `model.reset()` exists
    # and reaches the encoder's hidden state.
    return SequentialRNN(rnn_enc, LinearDecoder(vocab_sz, emb_sz, output_p, tie_encoder=enc, bias=bias))

In [41]:
tok_pad = vocab.index(PAD); tok_pad

1

Now we can test all this works without throwing a bug.

In [42]:
tst_model = get_language_model(len(vocab), 300, 300, 2, tok_pad)
tst_model = tst_model.cuda()

In [43]:
x, y = next(iter(data.train_dl))

In [44]:
z = tst_model(x.cuda())

We return 3 things to help with regularization: the true output (the unnormalized scores, i.e. logits, for each word), but also the activations of the encoder, with or without dropouts.

In [45]:
len(z)

3

In [46]:
z

(tensor([[ 0.0100,  0.0233, -0.0088,  ..., -0.0174, -0.0062, -0.0278],
         [ 0.0162,  0.0337, -0.0112,  ..., -0.0290, -0.0095, -0.0409],
         [ 0.0156,  0.0343, -0.0133,  ..., -0.0392, -0.0104, -0.0501],
         ...,
         [ 0.0238,  0.0215,  0.0019,  ..., -0.0474, -0.0016, -0.0190],
         [ 0.0289,  0.0169,  0.0051,  ..., -0.0477,  0.0047, -0.0221],
         [ 0.0259,  0.0204,  0.0076,  ..., -0.0472,  0.0020, -0.0238]],
        device='cuda:0', grad_fn=<AddmmBackward>),
 [tensor([[[ 1.2877e-02,  2.0811e-02,  1.8974e-02,  ..., -1.4467e-02,
            -2.4622e-02,  3.2236e-03],
           [ 2.7131e-02,  2.9973e-02,  1.1575e-03,  ..., -1.0153e-02,
            -1.0433e-02, -1.5177e-02],
           [ 3.9571e-02,  1.5895e-02, -2.3793e-02,  ..., -2.8722e-02,
             1.7880e-02, -5.6637e-02],
           ...,
           [ 6.8354e-02,  5.3641e-03, -4.4023e-02,  ..., -1.6555e-02,
            -2.2625e-02, -5.2338e-02],
           [ 3.4716e-02,  2.1746e-02, -2.7637e-02,  ...,

In [47]:
decoded, raw_outputs, outputs = z

The decoded tensor is flattened to `bs*seq_len` by `len(vocab)`.

In [48]:
decoded.size()

torch.Size([4480, 60003])

`raw_outputs` and `outputs` each contain the results of intermediary layers.

In [49]:
len(raw_outputs), len(outputs)

(2, 2)

In [50]:
[o.size() for o in raw_outputs], [o.size() for o in outputs]

([torch.Size([64, 70, 300]), torch.Size([64, 70, 300])],
 [torch.Size([64, 70, 300]), torch.Size([64, 70, 300])])

## Callbacks to train the model  
  
We need to add a few tweaks to train a language model: first we will clip the gradients. This is a classic technique that will allow us to use a high learning rate by putting a maximum value on the norm of the gradients.

In [51]:
#export
class GradientClipping(Callback):
    "Callback that clips the gradient norm of the model's parameters to `clip` after each backward pass."
    def __init__(self, clip=None):
        self.clip = clip

    def after_backward(self):
        # a falsy `clip` (None or 0) disables clipping
        if not self.clip:
            return
        nn.utils.clip_grad_norm_(self.run.model.parameters(), self.clip)

Then we add an `RNNTrainer` that will do the following 4 things:
- Change the output to make it contain only the `decoded` tensor (for the loss function) and store the `raw_outputs` and `outputs`.
- Apply Activation Regularization (AR): we add to the loss an L2 penalty on the last activations of the AWD-LSTM (with dropout applied).
- Apply Temporal Activation Regularization (TAR): we add to the loss an L2 penalty on the difference between 2 consecutive (in terms of words) terms of `raw_outputs`.
- Trigger the shuffle of LM_Dataset at the beginning of each epoch.

In [52]:
#export
class RNNTrainer(Callback):
    "Callback that keeps only the decoded prediction for the loss, adds the AR (`α`) and TAR (`β`) penalties, and reshuffles the dataset each epoch."
    def __init__(self, α, β): self.α, self.β = α, β

    def after_pred(self):
        # Save the extra outputs for later and only return the true output
        self.raw_out, self.out = self.pred[1], self.pred[2]
        self.run.pred = self.pred[0]

    def after_loss(self):
        # AR: penalty on the mean squared value of the last entry of `out`
        if self.α != 0 : self.run.loss += self.α * self.out[-1].float().pow(2).mean()
        # TAR: penalty on the mean squared difference between consecutive
        # positions along dim 1 of the last entry of `raw_out`
        if self.β != 0:
            h = self.raw_out[-1]
            if h.size(1)>1: self.run.loss += self.β * (h[:,1:] - h[:,:-1]).float().pow(2).mean()

    def begin_epoch(self):
        # Shuffle the texts at the beginning of the epoch. This hook was named
        # `begin_batch`, which re-batchified the dataset before every batch.
        if hasattr(self.dl.dataset, 'batchify'): self.dl.dataset.batchify()

Lastly, we write a flattened version of the cross entropy loss and the accuracy metric

In [53]:
#export
def cross_entropy_flat(input, target):
    "Cross entropy between `input` and a 2-D `target`, both flattened to one row per target element."
    n_rows, n_cols = target.size()
    n_items = n_rows * n_cols
    logits = input.view(n_items, -1)
    labels = target.view(n_items)
    return F.cross_entropy(logits, labels)

def accuracy_flat(input, target):
    "`accuracy` between `input` and a 2-D `target`, both flattened to one row per target element."
    n_rows, n_cols = target.size()
    n_items = n_rows * n_cols
    preds = input.view(n_items, -1)
    labels = target.view(n_items)
    return accuracy(preds, labels)

In [85]:
emb_sz, nh, nl = 300, 300, 2
model = get_language_model(len(vocab), emb_sz, nh, nl, tok_pad, input_p=0.6, output_p=0.4, weight_p=0.5,
                          embed_p=0.1, hidden_p=0.2)

In [86]:
cbfs = [partial(AvgStatsCallback, accuracy_flat),
       CudaCallback, Recorder,
       partial(GradientClipping, clip=0.1),
       partial(RNNTrainer, α=2., β=1.),
       ProgressCallback]

In [87]:
learn = Learner(model, data, cross_entropy_flat, lr=5e-3, cb_funcs=cbfs, opt_func=adam_opt())

In [89]:
# learn.fit(1)

## Export

In [None]:
!python notebook2script.py 12a_awd_lstm.ipynb

Converted 12a_awd_lstm.ipynb to exp/nb_12a.py
