In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
from fastai.text import *

# Load the data

Data can be downloaded from [here](https://einstein.ai/research/the-wikitext-long-term-dependency-language-modeling-dataset). as suggested by smerity, we only add the eos flag at each end of sentence. 

In [3]:
EOS = '<eos>'
PATH=Path('../data/wikitext')

In [4]:
def read_file(filename):
    tokens = []
    with open(PATH/filename, encoding='utf8') as f:
        for line in f:
            tokens.append(line.split() + [EOS])
    return np.array(tokens)

In [5]:
trn_tok = read_file('wiki.train.tokens')
val_tok = read_file('wiki.valid.tokens')
tst_tok = read_file('wiki.test.tokens')

In [6]:
len(trn_tok), len(val_tok), len(tst_tok)

(36718, 3760, 4358)

Then we numericalize the tokens.

In [7]:
cnt = Counter(word for sent in trn_tok for word in sent)

In [8]:
cnt.most_common(10)

[('the', 113161),
 (',', 99913),
 ('.', 73388),
 ('of', 56889),
 ('<unk>', 54625),
 ('and', 50603),
 ('in', 39453),
 ('to', 39190),
 ('<eos>', 36718),
 ('a', 34237)]

In [9]:
itos = [o for o,c in cnt.most_common()]
itos.insert(0,'<pad>')

In [10]:
vocab_size = len(itos); vocab_size

33279

In [11]:
stoi = collections.defaultdict(lambda : 5, {w:i for i,w in enumerate(itos)})

In [12]:
trn_ids = np.array([([stoi[w] for w in s]) for s in trn_tok])
val_ids = np.array([([stoi[w] for w in s]) for s in val_tok])
tst_ids = np.array([([stoi[w] for w in s]) for s in tst_tok])

# Model

This is the usual AWD LSTM with three layers.

In [13]:
em_sz,nh,nl = 400,1150,3
bptt, bs = 70, 100

Training schedule: 1cycle with either a third phase with cosine annealing or linear decay at one hundreth of the lowest lr. The second one seems to be slightly betters, but by a hair.

In [17]:
def one_cycle(steps,lr,opt_fn, div,max_mom,min_mom, wd):
    return [TrainingPhase(steps[0], opt_fn, lr=(lr/div,lr), lr_decay=DecayType.LINEAR, 
                          momentum=(max_mom,min_mom), momentum_decay=DecayType.LINEAR, wds=wd),
           TrainingPhase(steps[1], opt_fn, lr=(lr,lr/div), lr_decay=DecayType.LINEAR, 
                          momentum=(
                              min_mom,max_mom), momentum_decay=DecayType.LINEAR, wds=wd),
           TrainingPhase(steps[2], opt_fn, lr=lr/div, lr_decay=DecayType.COSINE, 
                          momentum=max_mom, wds=wd)]

In [18]:
def one_cycle_lin(steps,lr,div,max_mom,min_mom, wd):
    return [TrainingPhase(epochs=steps[0], opt_fn=optim.SGD, lr=(lr/div,lr), lr_decay=DecayType.LINEAR, 
                          momentum=(max_mom,min_mom), momentum_decay=DecayType.LINEAR, wds=wd),
           TrainingPhase(epochs=steps[1], opt_fn=optim.SGD, lr=(lr,lr/div), lr_decay=DecayType.LINEAR, 
                          momentum=(min_mom,max_mom), momentum_decay=DecayType.LINEAR, wds=wd),
           TrainingPhase(epochs=steps[2], opt_fn=optim.SGD, lr=(lr/div,lr/(div*100)), lr_decay=DecayType.LINEAR, 
                          momentum=max_mom, wds=wd)]

In [19]:
def custom_cycle(steps, lr, opt_fn, div, max_mom, min_mom, wd):
    return [TrainingPhase(steps[0], opt_fn, lr=(lr/div,lr), lr_decay=DecayType.LINEAR, 
                          momentum=(max_mom,min_mom), momentum_decay=DecayType.LINEAR, wds=wd),
            TrainingPhase(steps[1], opt_fn, lr=lr, momentum=min_mom, wds=wd),
            TrainingPhase(steps[2], opt_fn, lr=(lr,lr/div), lr_decay=DecayType.LINEAR, 
                          momentum=(min_mom,max_mom), momentum_decay=DecayType.LINEAR, wds=wd),
            TrainingPhase(steps[3], opt_fn, lr=lr/div, lr_decay=DecayType.COSINE, 
                          momentum=max_mom, wds=wd)]

Helper functions for the evaluation of the model at the end. TextReader is rewritten from the LanguageModelLoader class to have a constant bptt and only one batch.

In [20]:
class TextReader():

    def __init__(self, nums, bptt, backwards=False):
        self.bptt,self.backwards = bptt,backwards
        self.data = self.batchify(nums)
        self.i,self.iter = 0,0
        self.n = len(self.data)

    def __iter__(self):
        self.i,self.iter = 0,0
        while self.i < self.n-1 and self.iter<len(self):
            res = self.get_batch(self.i, self.bptt)
            self.i += self.bptt
            self.iter += 1
            yield res

    def __len__(self): return self.n // self.bptt 

    def batchify(self, data):
        data = np.array(data)[:,None]
        if self.backwards: data=data[::-1]
        return T(data)

    def get_batch(self, i, seq_len):
        source = self.data
        seq_len = min(seq_len, len(source) - 1 - i)
        return source[i:i+seq_len], source[i+1:i+1+seq_len].view(-1)

Validation without reinitializing the hidden state.

In [21]:
def my_validate(model, source, bptt=2000):
    data_source = TextReader(source, bptt)
    model.eval()
    model.reset()
    total_loss = 0.
    for inputs, targets in tqdm(data_source):
        outputs, raws, outs = model(V(inputs))
        p_vocab = F.softmax(outputs,1)
        for i, pv in enumerate(p_vocab):
            targ_pred = pv[targets[i]]
            total_loss -= torch.log(targ_pred.detach())
    mean = total_loss / (bptt * len(data_source))
    return mean, np.exp(mean)

Cache pointer

In [22]:
def one_hot(vec, size=vocab_size):
    a = torch.zeros(len(vec), size)
    for i,v in enumerate(vec):
        a[i,v] = 1.
    return V(a)

def my_cache_pointer(model, source, scale=1., theta = 0.662, lambd = 0.1279, window=200, bptt=2000):
    data_source = TextReader(source, bptt)
    model.eval()
    model.reset()
    total_loss = 0.
    targ_history = None
    hid_history = None
    for inputs, targets in tqdm(data_source):
        outputs, raws, outs = model(V(inputs))
        p_vocab = F.softmax(outputs * scale,1)
        start = 0 if targ_history is None else targ_history.size(0)
        targ_history = one_hot(targets) if targ_history is None else torch.cat([targ_history, one_hot(targets)])
        hiddens = raws[-1].squeeze() #results of the last layer + remove the batch size.
        hid_history = scale * hiddens if hid_history is None else torch.cat([hid_history, scale * hiddens])
        for i, pv in enumerate(p_vocab):
            #Get the cached values
            p = pv
            if start + i > 0:
                targ_cache = targ_history[:start+i] if start + i <= window else targ_history[start+i-window:start+i]
                hid_cache = hid_history[:start+i] if start + i <= window else hid_history[start+i-window:start+i]
                all_dot_prods = torch.mv(theta * hid_cache, hiddens[i])
                exp_dot_prods = F.softmax(all_dot_prods).unsqueeze(1)
                p_cache = (exp_dot_prods.expand_as(targ_cache) * targ_cache).sum(0).squeeze()
                p = (1-lambd) * pv + lambd * p_cache
            targ_pred = p[targets[i]]
            total_loss -= torch.log(targ_pred.detach())
        targ_history = targ_history[-window:]
        hid_history = hid_history[-window:]
    mean = total_loss / (bptt * len(data_source))
    return mean, np.exp(mean)

In [24]:
from fastai.rnn_reg import dropout_mask

class EmbeddingDropout1(nn.Module):

    """ Rewritten from EmbeddingDropout. 
    
    Does the same thing but accept either a regular input, or an array of the form
    [input1, input2, lambda] to mixup.
    """

    def __init__(self, embed):
        super().__init__()
        self.embed = embed

    def forward(self, words, dropout=0.1, scale=None):
        if dropout:
            size = (self.embed.weight.size(0),1)
            mask = Variable(dropout_mask(self.embed.weight.data, size, dropout))
            masked_embed_weight = mask * self.embed.weight
        else: masked_embed_weight = self.embed.weight

        if scale: masked_embed_weight = scale * masked_embed_weight

        padding_idx = self.embed.padding_idx
        if padding_idx is None: padding_idx = -1

        if IS_TORCH_04:
            #New here: if the input is a list, take the embeddings for the first two args, then mix them up.
            if isinstance(words, list):
                X1 = F.embedding(words[0], masked_embed_weight, padding_idx, self.embed.max_norm,
                   self.embed.norm_type, self.embed.scale_grad_by_freq, self.embed.sparse)
                X2 = F.embedding(words[1], masked_embed_weight, padding_idx, self.embed.max_norm,
                   self.embed.norm_type, self.embed.scale_grad_by_freq, self.embed.sparse)
                lambd = words[2].view(1,-1,1)
                X = X1 * lambd + X2 * (1-lambd)
            else:
                X = F.embedding(words, masked_embed_weight, padding_idx, self.embed.max_norm,
                   self.embed.norm_type, self.embed.scale_grad_by_freq, self.embed.sparse)
        else:
            #New here: if the input is a list, take the embeddings for the first two args, then mix them up.
            if isinstance(words, list):
                X1 = self.embed._backend.Embedding.apply(words, masked_embed_weight, padding_idx, self.embed.max_norm,
                   self.embed.norm_type, self.embed.scale_grad_by_freq, self.embed.sparse)
                X2 = self.embed._backend.Embedding.apply(words, masked_embed_weight, padding_idx, self.embed.max_norm,
                   self.embed.norm_type, self.embed.scale_grad_by_freq, self.embed.sparse)
                lambd = words[2].view(1,-1,1)
                X = X1 * lambd + X2 * (1-lambd)
            else:
                X = self.embed._backend.Embedding.apply(words, masked_embed_weight, padding_idx, self.embed.max_norm,
                  self.embed.norm_type, self.embed.scale_grad_by_freq, self.embed.sparse)

        return X

In [25]:
class RNN_Encoder1(nn.Module):

    """Rewritten from RNN_Encoder to accept multiple inputs for mixup.
    """

    initrange=0.1

    def __init__(self, ntoken, emb_sz, nhid, nlayers, pad_token, bidir=False,
                 dropouth=0.3, dropouti=0.65, dropoute=0.1, wdrop=0.5, qrnn=False):
        """ Default constructor for the RNN_Encoder class

            Args:
                bs (int): batch size of input data
                ntoken (int): number of vocabulary (or tokens) in the source dataset
                emb_sz (int): the embedding size to use to encode each token
                nhid (int): number of hidden activation per LSTM layer
                nlayers (int): number of LSTM layers to use in the architecture
                pad_token (int): the int value used for padding text.
                dropouth (float): dropout to apply to the activations going from one LSTM layer to another
                dropouti (float): dropout to apply to the input layer.
                dropoute (float): dropout to apply to the embedding layer.
                wdrop (float): dropout used for a LSTM's internal (or hidden) recurrent weights.

            Returns:
                None
          """

        super().__init__()
        self.ndir = 2 if bidir else 1
        self.bs, self.qrnn = 1, qrnn
        self.encoder = nn.Embedding(ntoken, emb_sz, padding_idx=pad_token)
        self.encoder_with_dropout = EmbeddingDropout1(self.encoder)
        if self.qrnn:
            #Using QRNN requires cupy: https://github.com/cupy/cupy
            from .torchqrnn.qrnn import QRNNLayer
            self.rnns = [QRNNLayer(emb_sz if l == 0 else nhid, (nhid if l != nlayers - 1 else emb_sz)//self.ndir,
                save_prev_x=True, zoneout=0, window=2 if l == 0 else 1, output_gate=True) for l in range(nlayers)]
            if wdrop:
                for rnn in self.rnns:
                    rnn.linear = WeightDrop(rnn.linear, wdrop, weights=['weight'])
        else:
            self.rnns = [nn.LSTM(emb_sz if l == 0 else nhid, (nhid if l != nlayers - 1 else emb_sz)//self.ndir,
                1, bidirectional=bidir) for l in range(nlayers)]
            if wdrop: self.rnns = [WeightDrop(rnn, wdrop) for rnn in self.rnns]
        self.rnns = torch.nn.ModuleList(self.rnns)
        self.encoder.weight.data.uniform_(-self.initrange, self.initrange)

        self.emb_sz,self.nhid,self.nlayers,self.dropoute = emb_sz,nhid,nlayers,dropoute
        self.dropouti = LockedDropout(dropouti)
        self.dropouths = nn.ModuleList([LockedDropout(dropouth) for l in range(nlayers)])

    def forward(self, input):
        """ Invoked during the forward propagation of the RNN_Encoder module.
        Args:
            input (Tensor): input of shape (sentence length x batch_size)

        Returns:
            raw_outputs (tuple(list (Tensor), list(Tensor)): list of tensors evaluated from each RNN layer without using
            dropouth, list of tensors evaluated from each RNN layer using dropouth,
        """
        
        sl,bs = input[0].size() if isinstance(input,list) else input.size()
        if bs!=self.bs:
            self.bs=bs
            self.reset()
        #New line here: if the 4-th element of the input is 1, then reset the hidden state.
        if is_listy(input) and input[3] == 1: self.reset()
        with set_grad_enabled(self.training):
            emb = self.encoder_with_dropout(input, dropout=self.dropoute if self.training else 0)
            raw_output = self.dropouti(emb)
            new_hidden,raw_outputs,outputs = [],[],[]
            for l, (rnn,drop) in enumerate(zip(self.rnns, self.dropouths)):
                current_input = raw_output
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    raw_output, new_h = rnn(raw_output, self.hidden[l])
                new_hidden.append(new_h)
                raw_outputs.append(raw_output)
                if l != self.nlayers - 1: raw_output = drop(raw_output)
                outputs.append(raw_output)

            self.hidden = repackage_var(new_hidden)
        return raw_outputs, outputs

    def one_hidden(self, l):
        nh = (self.nhid if l != self.nlayers - 1 else self.emb_sz)//self.ndir
        if IS_TORCH_04: return Variable(self.weights.new(self.ndir, self.bs, nh).zero_())
        else: return Variable(self.weights.new(self.ndir, self.bs, nh).zero_(), volatile=not self.training)

    def reset(self):
        if self.qrnn: [r.reset() for r in self.rnns]
        self.weights = next(self.parameters()).data
        if self.qrnn: self.hidden = [self.one_hidden(l) for l in range(self.nlayers)]
        else: self.hidden = [(self.one_hidden(l), self.one_hidden(l)) for l in range(self.nlayers)]

In [26]:
def get_language_model1(n_tok, emb_sz, nhid, nlayers, pad_token,
                 dropout=0.4, dropouth=0.3, dropouti=0.5, dropoute=0.1, wdrop=0.5, tie_weights=True, qrnn=False, bias=False):
    """
    Same as get_language_model but creates RNN_Encoder1
    """

    rnn_enc = RNN_Encoder1(n_tok, emb_sz, nhid=nhid, nlayers=nlayers, pad_token=pad_token,
                 dropouth=dropouth, dropouti=dropouti, dropoute=dropoute, wdrop=wdrop, qrnn=qrnn)
    enc = rnn_enc.encoder if tie_weights else None
    return SequentialRNN(rnn_enc, LinearDecoder(n_tok, emb_sz, dropout, tie_encoder=enc, bias=bias))

In [28]:
class MixUpDataLoader():
    
    def __init__(self, nums, bs, bptt, n_keep, alpha, backwards=False):
        """
        Create an instance of a mixup dataloader.
        
        Args:
        nums (np.array): the corpus numericalized
        bs (int): batch size
        bptt (int): bptt, the number of steps taken into account into backprop
        n_keep (int): we reset the model every chunk of n_keep batches
        alpha (float): parameter for the beta distribution when picking the lambdas
        """
        self.bs,self.bptt, self.n_keep, self.backwards, self.alpha = bs,bptt,n_keep,backwards,alpha
        self.data = self.batchify(nums)
        self.n = len(self.data)
        
    def __iter__(self):
        self.idx = 0
        #Shuffle and predraw the pairs of chunks
        self.shuffle_chunks()
        while self.idx < len(self.chunks1):
            #Go through the chunks of batches
            self.i = 0
            self.len_chunk = self.chunks1[self.idx][1] - self.chunks1[self.idx][0]
            while self.i < self.len_chunk:
                #Then through the batches
                res = self.get_batch() 
                yield res
            self.idx += 1

    def __len__(self): 
        return (self.n-1) // (self.bptt) if (self.n-1) % self.bptt == 0 else (self.n-1) // (self.bptt) + 1

    def batchify(self, data):
        nb = data.shape[0] // self.bs
        data = np.array(data[:nb*self.bs])
        data = data.reshape(self.bs, -1).T
        if self.backwards: data=data[::-1]
        return T(data)
    
    def shuffle_chunks(self):
        #Number of chunks: roughly (n-1) / (bptt * n_keep)
        n_chunks = (self.n-1) // (self.bptt * self.n_keep) + 1
        if (self.n-1) % (self.bptt * self.n_keep) == 0: n_chunks -= 1
        n_res = self.n - 1 - (n_chunks-1) * (self.bptt * self.n_keep)
        self.chunks1, self.chunks2 = [], []
        #Randomly draw where we will pick the chunk of batches with a length lower than the others.
        put_res = np.random.randint(n_chunks)
        start = 0
        for k in range(n_chunks):
            #Split the data into chunks of n_keep batches. 
            if k != put_res:
                self.chunks1.append([start, start + (self.bptt * self.n_keep)])
                start += (self.bptt * self.n_keep)
            else:
                self.chunks1.append([start, start + n_res])
                start += n_res
        #Remove the chunk with a length different from the others because it needs to be at the same position
        #in our two lists of chunks.
        res = self.chunks1.pop(put_res)
        #Shuffle the chunks
        self.chunks1 = np.random.permutation(self.chunks1)
        self.chunks2 = np.random.permutation(self.chunks1)
        #Add the one with a lower length at the end.
        self.chunks1 = np.concatenate([self.chunks1, np.array([res])])
        self.chunks2 = np.concatenate([self.chunks1, np.array([res])])

    def get_batch(self):
        source, i = self.data, self.i
        seq_len = min(bptt, self.chunks1[self.idx][1] - self.chunks1[self.idx][0] - self.i)
        if self.i == 0:
            #At the beggining of a new chunk, draw the lambdas and a pemutation of the batches.
            self.lambd = np.random.beta(self.alpha, self.alpha, self.bs)
            self.lambd = to_gpu(VV(self.lambd))
            self.shuffle = to_gpu(torch.Tensor(np.random.permutation(range(self.bs))).long())
            reinit=True
        else: reinit = False
        #Start indexes for each chunks.
        start1, start2 = self.chunks1[self.idx][0] + i, self.chunks2[self.idx][0] + i
        #Input: source1, source2, lambda, reinit
        res1 = [source[start1:start1+seq_len], source[start2:start2+seq_len,self.shuffle], self.lambd, np.array(reinit).astype(np.int8)]
        #Target: source1 shifted, source2 shifted, lambda
        targ1 = source[start1+1:start1+1+seq_len].contiguous().view(-1)
        targ2 = source[start1+1:start1+1+seq_len,self.shuffle].contiguous().view(-1)
        res2 = [targ1,targ2, self.lambd]
        self.i += seq_len
        return (res1, res2)

In [29]:
class MixUpLoss(nn.Module):
    """
    A new loss function that accepts targets of the form target1, target2, lambda
    """
    def __init__(self, crit):
        super().__init__()
        self.crit = crit()
        
    def forward(self, output, target):
        if not isinstance(target, list): return self.crit(output, target).mean()
        loss1, loss2 = self.crit(output,target[0]), self.crit(output,target[1])
        loss1, loss2 = loss1.view(-1,target[2].size(0)), loss2.view(-1,target[2].size(0))
        return (loss1 * target[2].unsqueeze(0) + loss2 * (1-target[2].unsqueeze(0))).mean()

The train dataloader is a mixup, the validation dataloader a regular one. The parameters that gave he best results are 7/0.7.

In [30]:
trn_dl = MixUpDataLoader1(np.concatenate(trn_ids), bs, bptt, 7, 0.6)
val_dl = LanguageModelLoader(np.concatenate(val_ids), bs, bptt)
md = LanguageModelData(PATH, 0, vocab_size, trn_dl, val_dl, bs=bs, bptt=bptt)

Dropouts. Using mixup allows us to lower this regularization.

In [14]:
drops = np.array([0.6,0.4,0.5,0.1,0.2]) #Smerity's dropouts from the github repo

In [32]:
drops = drops * 0.7

The rest of the parameters are the same, with the exception of weight decay that can be divided by 2.

In [33]:
opt_fn = partial(optim.Adam, betas=(0.8,0.99))
m = get_language_model1(vocab_size, em_sz, nh, nl, 0,
    dropouti=drops[0], dropout=drops[1], wdrop=drops[2], dropoute=drops[3], dropouth=drops[4], bias=True)
model = LanguageModel(to_gpu(m))
learner = RNN_Learner(md, model, opt_fn=opt_fn)
learner.crit = MixUpLoss(partial(nn.CrossEntropyLoss, reduce=False))
learner.metrics = [accuracy]
learner.clip=0.12
learner.unfreeze()
learner.reg_fn=partial(seq2seq_reg, alpha=2, beta=1)
wd = 6e-7

In [59]:
learner.fit(1e-2, 1, cycle_len=90, wds=wd, use_clr_beta=(10,7.5,0.8,0.7))

HBox(children=(IntProgress(value=0, description='Epoch', max=90), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                   
    0      6.626329   6.030295   0.178525  
    1      6.275505   5.589827   0.214941                   
    2      6.027988   5.325422   0.228585                   
    3      5.894091   5.133265   0.237486                   
    4      5.759592   4.989663   0.246637                   
    5      5.645917   4.893019   0.253144                   
    6      5.589076   4.811414   0.257452                   
    7      5.516833   4.762544   0.26024                    
    8      5.443729   4.710184   0.261971                   
    9      5.396975   4.677733   0.265007                   
    10     5.379718   4.633137   0.268217                   
    11     5.334224   4.610691   0.270078                   
    12     5.288874   4.59802    0.269693                   
    13     5.265066   4.561562   0.272778                   
    14     5.273233   4.562297   0.271641                   
    15     5.246758   4.542312   0.272835

[4.212716070810954, 0.2983988434076309]

In [60]:
my_validate(learner.model, np.concatenate(val_ids))

100%|██████████| 108/108 [00:44<00:00,  2.41it/s]


(
  4.2080
 [torch.cuda.FloatTensor of size () (GPU 0)], 
  67.2209
 [torch.FloatTensor of size ()])

Smerity's best with finetuning: 67.2

In [61]:
my_cache_pointer(learner.model, np.concatenate(val_ids), window=3785)

100%|██████████| 108/108 [07:52<00:00,  4.37s/it]


(
  3.9554
 [torch.cuda.FloatTensor of size () (GPU 0)], 
  52.2156
 [torch.FloatTensor of size ()])

Smerity's best: 52.2

For the QRNNs, the best parameters for the mixup dataloader are 8/0.4

In [None]:
em_sz,nh,nl = 400,1550,4
bptt, bs = 70, 100

In [37]:
trn_dl = MixUpDataLoader1(np.concatenate(trn_ids), bs, bptt, 8, 0.4)
val_dl = LanguageModelLoader(np.concatenate(val_ids), bs, bptt)
md = LanguageModelData(PATH, 0, vocab_size, trn_dl, val_dl, bs=bs, bptt=bptt)

In [38]:
drops = np.array([0.4,0.4,0.1,0.1,0.2]) #Smerity's dropouts from the github repo

In [39]:
drops = drops * 0.75

In [40]:
opt_fn = partial(optim.Adam, betas=(0.8,0.99))
m = get_language_model1(vocab_size, em_sz, nh, nl, 0,
    dropouti=drops[0], dropout=drops[1], wdrop=drops[2], dropoute=drops[3], dropouth=drops[4], qrnn=True, bias=True)
model = LanguageModel(to_gpu(m))
learner = RNN_Learner(md, model, opt_fn=opt_fn)
learner.crit = MixUpLoss(partial(nn.CrossEntropyLoss, reduce=False))
learner.metrics = [accuracy]
learner.clip=0.12
learner.unfreeze()
learner.reg_fn=partial(seq2seq_reg, alpha=2, beta=1)
wd = 1e-6

In [57]:
learner.fit(1e-2, 1, cycle_len=90, wds=wd, use_clr_beta=(10,10,0.8,0.7))

HBox(children=(IntProgress(value=0, description='Epoch', max=90), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                   
    0      6.645041   6.111343   0.168837  
    1      6.241145   5.621353   0.209918                   
    2      6.006832   5.365736   0.221883                   
    3      5.804309   5.169753   0.23302                    
    4      5.672886   5.02983    0.243427                   
    5      5.572516   4.927614   0.249757                   
    6      5.480185   4.840228   0.251659                   
    7      5.405964   4.775793   0.256458                   
    8      5.323781   4.719518   0.260135                   
    9      5.301296   4.684981   0.26099                    
    10     5.259113   4.659498   0.262679                   
    11     5.24911    4.629782   0.263006                   
    12     5.196324   4.599059   0.268283                   
    13     5.169601   4.592118   0.269083                   
    14     5.131697   4.576978   0.269349                   
    15     5.130156   4.57565    0.268847

[4.220731449127197, 0.29764690498511]

In [58]:
my_validate(learner.model, np.concatenate(val_ids))

100%|██████████| 108/108 [00:18<00:00,  5.92it/s]


(
  4.2194
 [torch.cuda.FloatTensor of size () (GPU 2)], 
  67.9927
 [torch.FloatTensor of size ()])

Smerity's best with finetuning: 68.5

In [59]:
my_cache_pointer(learner.model, np.concatenate(val_ids), window=3785)

100%|██████████| 108/108 [07:26<00:00,  4.13s/it]


(
  3.9860
 [torch.cuda.FloatTensor of size () (GPU 2)], 
  53.8380
 [torch.FloatTensor of size ()])

Smerity's best: 53.6