In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext
import spacy
import numpy as np
import dill
from tqdm import tqdm

import os
import glob
import time
from pathlib import Path


In [2]:
# Prefer the GPU whenever CUDA is available; otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#### Prepare data

In [6]:
class ConcatTextDataset(torchtext.data.Dataset):
    """
    Torchtext dataset holding a single example: the concatenated tokens of
    every file found under the given path.

    Build it through the ``.splits()`` class method rather than by calling
    this constructor directly. ``.splits()`` calls the constructor once for
    each of the directories given by its train, validation and test
    parameters.
    """
    def __init__(self, path, text_field, newline_eos=True, encoding='utf-8', **kwargs):
        """
        Read and tokenize the file(s) at ``path`` into one long token list.

        Args:
            path: a directory (every ``*.*`` file directly inside it is read)
                or the path of a single file.
            text_field: field whose ``preprocess`` method turns a line of
                text into a list of tokens.
            newline_eos: when true, an ``'<eos>'`` token is appended after
                the contents of each file.
            encoding: text encoding used to open the files.
            **kwargs: forwarded to the parent ``Dataset`` constructor.
        """
        fields = [('text', text_field)]
        tokens = []

        if os.path.isdir(path):
            # glob returns files in filesystem order; sort so that the
            # concatenated token stream is the same on every run
            paths = sorted(glob.glob(f'{path}/*.*'))
        else:
            paths = [path]

        for p in paths:
            # The context manager closes each file as soon as it is consumed,
            # instead of leaving the handle to the garbage collector
            with open(p, encoding=encoding) as f:
                for line in f:
                    tokens += text_field.preprocess(line)
            if newline_eos:
                tokens.append('<eos>')

        examples = [torchtext.data.Example.fromlist([tokens], fields)]
        super().__init__(examples, fields, **kwargs)

In [7]:
# Load spaCy's 'en' model.
# NOTE(review): spacy_tok is not referenced by any other cell visible here —
# confirm whether it is still needed.
spacy_tok = spacy.load('en')
# Field describing the corpus text: lower-cased and tokenized with spaCy.
text = torchtext.data.Field(lower=True, tokenize='spacy')

If you don't have `data/lm_*.data` for some reason, you can run the two cells below this one to download the IMDB dataset and process it. **WARNING**: this takes a long time to run. Otherwise, just load the saved data and create the torchtext datasets.

In [None]:
!wget -P data/ https://files.fast.ai/data/aclImdb.tgz
# Unpack the archive so that the data/aclImdb/ directory read by the
# processing cell exists
!tar -xzf data/aclImdb.tgz -C data/

In [6]:
# WARNING takes long time to run
# Note: validation and test both point at 'test/all/', so valid_ds and test_ds
# are built from the same files.
train_ds, valid_ds, test_ds  = ConcatTextDataset.splits('data/aclImdb/',
                                                        text_field=text, 
                                                        train='train/all/',
                                                        validation='test/all/', 
                                                        test='test/all/')

# Save the field once. One Path object is used for both the existence check
# and the write, so the two can no longer drift apart (the check used to look
# at 'dat/models/text.pkl' and therefore never found the saved file).
# NOTE(review): no vocabulary is built in this cell, yet later cells read
# text.vocab — confirm where build_vocab is expected to run.
text_pkl = Path('data/models/text.pkl')
if not text_pkl.exists():
    # open() does not create missing directories
    text_pkl.parent.mkdir(parents=True, exist_ok=True)
    with text_pkl.open('wb') as f:
        dill.dump(text, f)

#### Load preprocessed data for language model

In [8]:
# Restore the pickled field when one has been saved; otherwise keep the
# `text` field defined earlier in the notebook.
# NOTE: unpickling executes arbitrary code — only load files you created.
text_pkl = Path('data/models/text.pkl')
if text_pkl.exists():
    with text_pkl.open('rb') as f:
        text = dill.load(f)

# Each saved file holds one example; wrap each in its own one-example dataset
# with a single 'text' field.
train_ex, valid_ex = torch.load('data/lm_train.data'), torch.load('data/lm_valid.data')

train_ds, valid_ds = (
    torchtext.data.Dataset([train_ex], [('text', text)]),
    torchtext.data.Dataset([valid_ex], [('text', text)]),
)

#### Setup model

In [9]:
def repackage_hidden(h):
    """
    Detach hidden state(s) from the autograd graph that produced them.

    A tensor is returned detached. Any other input is treated as an iterable
    of hidden states and converted, recursively, into a tuple of detached
    tensors.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(map(repackage_hidden, h))
    return h.detach()

In [10]:
class RecurrentDropout(nn.Module):
    """
    Implements dropout with same mask for each time step
    """

    def forward(self, x, p=0.5):
        """
        Forward step. x has following dimensions: 
            (time, samples, input_dim)

        Args:
            x: input tensor.
            p: probability of zeroing an element. Kept elements are scaled
                by 1 / (1 - p).

        Returns:
            ``x`` itself when ``p`` is falsy or the module is in evaluation
            mode; otherwise ``x`` multiplied by a mask of shape
            (1, samples, input_dim) broadcast over the time dimension.
        """
        if not p or not self.training:
            return x

        # Allocate the mask from x so that it lives on the same device and has
        # the same dtype as the input (a plain torch.empty would always be
        # created on the CPU and fail against a CUDA input)
        mask = x.new_empty((1, x.size(1), x.size(2))).bernoulli_(1 - p) / (1 - p)
        # One mask, repeated along dimension 0 (time)
        mask = mask.expand_as(x)
        return mask * x

In [11]:
class EmbeddingWithDropout(nn.Embedding):
    """
    Embedding layer that applies dropout to whole rows of the embedding
    matrix, so a dropped token is zeroed everywhere it occurs in the batch.
    """

    def forward(self, inp, p=0.5):
        """
        Look up ``inp`` in the (optionally row-dropped) embedding matrix.

        Args:
            inp: tensor of token indices.
            p: probability of dropping a row of the embedding matrix. Kept
                rows are scaled by 1 / (1 - p). Dropout is applied only in
                training mode and when ``p`` is truthy.

        Returns:
            The embeddings of ``inp``.
        """
        if p and self.training:
            # One Bernoulli draw per vocabulary row. new_empty keeps the mask
            # on the weight's device and dtype (torch.empty would always
            # allocate on the CPU and break a model moved to CUDA)
            keep = self.weight.new_empty((self.weight.size(0), 1)).bernoulli_(1 - p) / (1 - p)
            dropout_weight = keep.expand_as(self.weight) * self.weight
        else:
            dropout_weight = self.weight

        # padding_idx is passed through unchanged: F.embedding accepts None,
        # whereas an explicit -1 is interpreted as "the last row is padding"
        # and would stop that row from receiving gradients
        return F.embedding(inp, dropout_weight, self.padding_idx,
                           self.max_norm, self.norm_type,
                           self.scale_grad_by_freq, self.sparse)

In [12]:
class RNNmodel(nn.Module):
    """
    RNN for language modeling: an input embedding, n_layers stacked
    single-layer LSTMs and a linear decoder over the vocabulary.
    """
    
    def __init__(self, embedding_dim, hidden_dim, vocab_size, n_layers, pad_token,
                 dropouth=0.3, dropouti=0.65, dropoute=0.1, dropouto=0.5):
        """
        Args:
            embedding_dim: size of the word embeddings (input size of the
                first LSTM).
            hidden_dim: hidden size of every LSTM layer.
            vocab_size: number of rows of the embedding matrix and number of
                outputs of the decoder.
            n_layers: number of stacked LSTM layers.
            pad_token: index passed to the embedding layer as padding_idx.

        Dropout probabilities:
        * dropouth -- applied to the output of every LSTM layer except the last
        * dropouti -- applied to the embedded input
        * dropoute -- embedding dropout (rows of the embedding matrix)
        * dropouto -- applied to the output of the last LSTM layer
        """
        super().__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.n_layers = n_layers

        self.dropoute = dropoute
        self.dropouth = dropouth
        self.dropouti = dropouti
        self.dropouto = dropouto
        self.lockdrop = RecurrentDropout()

        self.word_embeddings = EmbeddingWithDropout(vocab_size, embedding_dim, padding_idx=pad_token)
        # Separate single-layer LSTMs (rather than one multi-layer LSTM) so
        # that dropout can be applied to the output of each layer
        self.rnns = nn.ModuleList(
            nn.LSTM(embedding_dim if l == 0 else hidden_dim, hidden_dim, num_layers=1)
            for l in range(n_layers)
        )
        self.decoder = nn.Linear(hidden_dim, vocab_size)
        self.init_weights()
    
    def init_weights(self):
        """
        Initialize embedding and decoder weights uniformly in [-0.1, 0.1]
        and set the decoder bias to zero.
        """
        initrange = 0.1
        self.word_embeddings.weight.data.uniform_(-initrange, initrange)
        self.decoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)

    def forward(self, inp, hidden):
        """
        Run the model on a batch of token indices.

        Args:
            inp: tensor of token indices.
            hidden: sequence with one hidden state per LSTM layer, as
                produced by init_hidden or returned by a previous call.

        Returns:
            A pair (decoded, new_hidden): decoded has the two leading
            dimensions of the last LSTM output followed by the decoder
            output size; new_hidden is a list with the new hidden state of
            each layer.
        """
        # Pass the configured embedding-dropout rate explicitly; without it
        # the embedding layer falls back to its own default of 0.5 and
        # self.dropoute has no effect
        emb = self.word_embeddings(inp, p=self.dropoute)
        cur_inp = self.lockdrop(emb, p=self.dropouti)
        new_hidden = []
        last_layer = self.n_layers - 1
        
        for l, rnn in enumerate(self.rnns):
            output, new_h = rnn(cur_inp, hidden[l])
            new_hidden.append(new_h)
            
            # Hidden-to-hidden dropout between layers; the last layer's
            # output gets dropouto below instead
            if l != last_layer:
                output = self.lockdrop(output, p=self.dropouth)

            cur_inp = output

        output = self.lockdrop(output, p=self.dropouto)
        # Flatten the two leading dimensions for the linear decoder, then
        # restore them
        decoded = self.decoder(output.view(-1, output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), new_hidden
    
    def init_hidden(self, batch_size):
        """
        Return zero-filled (h, c) states, one pair per LSTM layer, each of
        shape (1, batch_size, hidden_dim), created on the same device and
        with the same dtype as the model parameters.
        """
        weight = next(self.parameters())
        return [(weight.new_zeros((1, batch_size, self.hidden_dim)),
                 weight.new_zeros((1, batch_size, self.hidden_dim)))
                    for _ in range(self.n_layers)]

In [None]:
def evaluate(model, criterion, valid_it):
    """
    Put `model` into evaluation mode.

    NOTE(review): the rest of the body is elided in this notebook, so
    `criterion` and `valid_it` are not used by the code shown here, and
    nothing is returned — presumably the validation loop belongs here;
    confirm before relying on this function.
    """
    model.eval()
    # ...

In [29]:
def train(epoch, model, criterion, optimizer, train_it, vocab_size, valid_it=None, clip_grad=.25):
    """
    Train model for one epoch

    Args:
        epoch: epoch number (currently not used inside the function).
        model: model exposing init_hidden(batch_size) and called as
            model(text, hidden) -> (prediction, hidden).
        criterion: loss called as criterion(prediction, target) on flattened
            tensors.
        optimizer: optimizer stepping the model parameters.
        train_it: iterator yielding batches with .text and .target; must
            expose .batch_size and support len().
        vocab_size: size of the last dimension of the prediction.
        valid_it: validation iterator (currently not used inside the
            function).
        clip_grad: maximum gradient norm used for clipping.

    Returns:
        The bias-corrected exponential moving average of the training loss
        after the last batch (0.0 if the iterator produced no batches).
    """
    model.train()
    avg_loss = 0.
    avg_mom = .98
    debias_loss = 0.

    # Batches have to live on the same device as the model weights; the
    # iterators may have been built for a different device than the model
    model_device = next(model.parameters()).device

    hidden = model.init_hidden(train_it.batch_size)
    pbar = tqdm(train_it, total=len(train_it), leave=False, ascii=True)
    
    for i, batch in enumerate(pbar):
        inputs = batch.text.to(model_device)
        target = batch.target.to(model_device)
        # detach history from tensor, so that backprop stops at the batch
        # boundary
        hidden = repackage_hidden(hidden)
        
        # run model on batch of data
        model.zero_grad()
        prediction, hidden = model(inputs, hidden)
        loss = criterion(prediction.view(-1, vocab_size), target.view(-1))

        # output average loss to progress bar: exponential moving average,
        # divided by (1 - mom^(i+1)) to correct the bias of the zero start
        avg_loss = avg_loss * avg_mom + loss.item() * (1 - avg_mom)
        debias_loss = avg_loss / (1 - avg_mom**(i+1))
        pbar.set_postfix(loss=debias_loss, refresh=False)

        # optimizer step
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)

        optimizer.step()

    # The progress bar is created with leave=False, so return the final
    # smoothed loss to keep it available to the caller
    return debias_loss

In [27]:
vocab_size = len(text.vocab)

# 300-dimensional embeddings feeding a single 128-unit LSTM layer
model = RNNmodel(embedding_dim=300,
                 hidden_dim=128,
                 vocab_size=vocab_size,
                 n_layers=1,
                 pad_token=text.vocab.stoi[text.pad_token])
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, betas=(0.7, 0.99))

# Both iterators share the same BPTT settings.
# NOTE(review): device=-1 looks like torchtext's legacy way of requesting CPU
# tensors — confirm against the installed torchtext version.
bptt_kwargs = dict(batch_size=64, bptt_len=70, device=-1)
train_it = torchtext.data.BPTTIterator(train_ds, **bptt_kwargs)
valid_it = torchtext.data.BPTTIterator(valid_ds, **bptt_kwargs)

In [None]:
# Run one training epoch (epoch number 1) over the training iterator
train(epoch=1, model=model, criterion=criterion, optimizer=optimizer,
      train_it=train_it, vocab_size=vocab_size)