In [1]:
from copy import copy

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch.autograd import Variable

In [2]:
GPU=0

# Load data

In [3]:
txt = 'hgodklusghljkzlhsdjlgsdlmgbqheposdgqndqsjgnùdsoqg   hdqsojp,ùgjeùzporjizduosijfekodsksauzidqfohsjgbjrfesokqdzdscnfkl,mkoszqcjihnkgokjrsdpeinvfkogkedf' * 100

In [4]:
txt = ''

In [5]:
with open('data/one_txt/sanitized_blogger.txt') as f:
    txt += f.read()

len(txt)

442724

In [6]:
with open('data/one_txt/sanitized_wordpress.txt') as f:
    txt += f.read()

len(txt)

3216695

In [4]:
vocab = sorted(list(set(txt)))
n_vocab = len(vocab)
print(''.join(vocab))

 ,abcdefghijklmnopqrsuvzù


In [5]:
char_to_idx = {char: idx for idx, char in enumerate(vocab)}
idx_to_char = {idx: char for idx, char in enumerate(vocab)}

In [6]:
train_frac = 4. / 5
train_txt = txt[:int(len(txt) * train_frac)]
test_txt = txt[int(len(txt) * train_frac):]

# Fixed-size RNN

This is a model which operates on a **fixed** amount of input characters (`n_chars`), and attempts to predict the character that comes after them.

The hidden state is reset for each new sequence of `n_chars` characters (*stateless*).

![](img/rnn_fixed.jpg)

In [7]:
n_chars = 8

In [8]:
def get_n_sized_chunks(s, n):
    """
    Yield successive n-sized chunks from a string.
    Discard the last chunk if not of size n.
    """
    for i in range(0, len(s), n):
        chunk = s[i:i + n]
        if len(chunk) == n:
            yield chunk

In [9]:
def get_data_tensor(txt, n_chars):
    chunks = list(get_n_sized_chunks(txt, n=n_chars))
    data_tensor = torch.tensor([[char_to_idx[char] for char in chunk] for chunk in chunks][:-1])
    if GPU:
        data_tensor = data_tensor.cuda()
    return data_tensor

In [10]:
def get_labels_tensor(txt, n_chars):
    chars = txt[n_chars::n_chars][:len(txt) // n_chars - 1]
    labels_tensor = torch.tensor([char_to_idx[char] for char in chars])
    if GPU:
        labels_tensor = labels_tensor.cuda()
    return labels_tensor

In [11]:
train_data_tensor = get_data_tensor(train_txt, n_chars)
print(train_data_tensor.size())

train_labels_tensor = get_labels_tensor(train_txt, n_chars)
print(train_labels_tensor.size())

torch.Size([1489, 8])
torch.Size([1489])


In [12]:
train_ds = TensorDataset(train_data_tensor, train_labels_tensor)
train_dl = DataLoader(train_ds, batch_size=1024)

In [13]:
test_data_tensor = get_data_tensor(test_txt, n_chars)
print(test_data_tensor.size())

test_labels_tensor = get_labels_tensor(test_txt, n_chars)
print(test_labels_tensor.size())

torch.Size([371, 8])
torch.Size([371])


In [14]:
test_ds = TensorDataset(test_data_tensor, test_labels_tensor)
test_dl = DataLoader(test_ds, batch_size=1024)

In [15]:
get_data_tensor('abcdefghdf' + ' ' * n_chars, n_chars)

tensor([[2, 3, 4, 5, 6, 7, 8, 9]])

In [103]:
def generate_fixed_size(model, s, n, n_chars, temperature):

    # fixed-size input
    assert len(s) == n_chars

    final_s = s

    for _ in range(n):

        # Pad the input, because `get_data_tensor` will generate no data
        # if the input is less than `2 * n_chars` characters long.
        chars = get_data_tensor(s + ' ' * n_chars, n_chars)
        preds = model(chars, temperature)
        pred_idx = torch.multinomial(preds.exp(), 1).item()    
        pred_char = idx_to_char[pred_idx]
        s = s[1:] + pred_char
        final_s += pred_char

    return final_s

In [104]:
class FixedSizeRNN(nn.Module):
    def __init__(self, n_vocab, n_factors, n_hidden, n_chars):
        super().__init__()
        self.n_chars = n_chars
        self.e = nn.Embedding(n_vocab, n_factors)
        self.input_weights = nn.Linear(n_factors, n_hidden)
        self.hidden_weights = nn.Linear(n_hidden, n_hidden)
        self.output_weights = nn.Linear(n_hidden, n_vocab)

    def forward(self, chars, temperature=1):

        # Reset hidden state at each mini-batch
        hidden_state = torch.zeros([len(chars), n_hidden])
        if GPU:
            hidden_state = hidden_state.cuda()

        for i in range(self.n_chars):
            input = F.relu(self.input_weights(self.e(chars[:, i])))
            hidden_state = torch.tanh(self.hidden_weights(input + hidden_state))

        output = F.log_softmax(self.output_weights(hidden_state) / temperature, dim=1)
        
        return output

In [105]:
n_fac = n_vocab // 2
n_hidden = 100

In [106]:
model1 = FixedSizeRNN(n_vocab, n_fac, n_hidden, n_chars)
if GPU:
    model1 = model1.cuda()

In [107]:
optimizer1 = torch.optim.Adam(model1.parameters(), 1e-2)
criterion1 = nn.NLLLoss()

In [109]:
%%time

epochs = 30

for epoch in range(1, epochs + 1):
    
    train_loss_sum, train_batches_nb = 0, 0
    for i, (data, labels) in enumerate(train_dl, 1):
        output = model1(data)
        optimizer1.zero_grad()
        loss = criterion1(output, labels)
        train_loss_sum, train_batches_nb = train_loss_sum + loss.item(), train_batches_nb + 1
        loss.backward()
        optimizer1.step()
        
    train_loss = train_loss_sum / train_batches_nb

    test_loss_sum, test_batches_nb = 0, 0
    for data, labels in test_dl:
        loss = criterion1(model1(data), labels)
        test_loss_sum, test_batches_nb = test_loss_sum + loss.item(), test_batches_nb + 1

    test_loss = test_loss_sum / test_batches_nb
        
    print(f'epoch: {epoch:3d}   train_loss: {train_loss:.2f}   test_loss: {test_loss:.2f}')

    if epoch == 1 or epoch % 10 == 0 or epoch == epochs:

        print()

        for temperature in (0.2, 0.5, 0.7, 1):
            print(f'sample T={temperature}: ' + generate_fixed_size(model1, 'je ne sais pas'[:n_chars], 200, n_chars, temperature))
            print()

        print()

epoch:   1   train_loss: 2.70   test_loss: 2.42

sample T=0.2: je ne sajijedosojgbqsdsojeiodzdqjgodkosojososojijodsdqfedosojososojinfkodqjindsosjgbedosouqjinlsdqjindsojogodqsdqfeiosdqzijkosdqjgodsdqmgodqjgdqhsojbqososdqmùososojihdsosojiùosojinlsdjbnedosojgiedososojfedoso

sample T=0.5: je ne saui,sogodqùgodqjnhokosqcgieiohsizihzodshmqc qkosohajeifgodqjfhmkosjqqjosojiùesosojdsouzjedfn dsoqqaiiqdlekodjosdqjikedfgojodposejinosodujeko,fgnhsfgbfghsoujfddqzkljopoheiosojieogosspiùzdgkoddposokoghez

sample T=0.7: je ne saososojzùzdq,kekohusodpi,ùhzsilqskosdgohkedqcugkdkznzkdgog,hdrzegldkosodmaoduvodqkkgfokfdm,gdhkndkfrokokgjg ddsg qedjaq ldpgqdjiurgizdkjlocqokk,ùgi q,qcvù,qjozgokfgilùesodjnzdrmgakdmjlhkuajrhlsoioppiiq

sample T=1: je ne sadccshfgfeqose,oilqzhlp glksdgqindddpogvqroiuusuug olsmufifqkorq,mùgekogosijeliuùpjjl,sosfhsogz,ùszsismùehibiacgorhùzsfjqfkjuefoekjg,ùsjgod,ùjososojebog,ùbdbzjkddjhoogdk,rgbbùmolmqeiùgokjzpi,siùihgk,dd


epoch:   2   train_loss: 2.33   test_loss: 2.03
epoch:  

# Variable-size model

This is a model which operates on a **variable** amount of input characters, and attempts to predict the next character **after each input character**.

![](img/rnn_variable.jpg)

In [22]:
def get_data(txt, bs):
    """
    Split `txt` into `bs` chunks.

    Each chunk has size `n`, `n` being as big as possible.
    Chunks are organized as columns in the result, making the final size `n * bs`.
    """

    txt = [char_to_idx[c] for c in txt]
    
    # Shrink `len(txt)` to a multiple of `bs`
    txt_len = (len(txt) // bs) * bs
    txt = txt[:txt_len]

    # Cut `txt` into `bs` distinct chunks
    data = torch.tensor(txt).view(bs, -1)
    data = data.transpose(0, 1).contiguous()

    if GPU:
        data = data.cuda()

    return data

In [23]:
def get_batches(data, bptt):
    """
    Yield `(data_batch, labels_batch)` batches from `data`.

    At each iteration, the two batches have the same `bptt * bs` size,
    except for the last batch which may have less than `bptt` rows.

    `data_batch` contains `bptt`-sized chunks of `data`.
    `labels_batch` contains `bptt`-sized chunks of `data`, offseted by 1.
    """

    # Cut `data` into two 2-dimensional chunks of size `bptt * bs`.
    # Last chunk may be less than `bptt` rows.
    while len(data) != 0:

        # Take (at most) bptt rows with offset 1 for labels
        labels_batch = data[1:bptt+1, :]
        # Take bptt rows as the labels with offset 0 for train
        data_batch = data[:len(labels_batch), :]

        if len(labels_batch) > 0:
            yield data_batch, labels_batch

        # Move on to next train train/labels rows
        data = data[bptt:]

In [24]:
i = 1
data = get_data(train_txt, bs=3)
for data_batch, labels_batch in get_batches(data, bptt=5):
    
    print(f'data:')
    print(data_batch)

    print(f'labels:')
    print(labels_batch)

    print()
    print()
    
    i += 1
    if i > 2:
        break

data:
tensor([[ 9, 11,  0],
        [ 8, 19,  0],
        [16,  7,  0],
        [ 5,  6,  9],
        [12, 20,  5]])
labels:
tensor([[ 8, 19,  0],
        [16,  7,  0],
        [ 5,  6,  9],
        [12, 20,  5],
        [13, 16, 18]])


data:
tensor([[13, 16, 18],
        [21, 12, 20],
        [20, 18, 16],
        [ 8,  5, 11],
        [ 9, 23, 17]])
labels:
tensor([[21, 12, 20],
        [20, 18, 16],
        [ 8,  5, 11],
        [ 9, 23, 17],
        [13,  5,  1]])




In [34]:
def generate(model, s, n, bptt, temperature):

    model.reset(1)

    for _ in range(n):
        data = get_data(s[-bptt:], 1)
        preds = model(data, temperature)[-1]
        pred_idx = torch.multinomial(preds.exp(), 1).item()
        pred_char = idx_to_char[pred_idx]
        s += pred_char
        
    return s

In [35]:
def train(model, optimizer, criterion, bptt, epochs, epochs_offset=1):

    for epoch in range(epochs_offset, epochs + epochs_offset):

        model.reset(bs)

        train_loss_sum, train_batches_nb = 0, 0
        for i, (data, labels) in enumerate(get_batches(train_data, bptt), 1):
            output = model(data)
            optimizer.zero_grad()
            loss = criterion(output, labels)
            train_loss_sum, train_batches_nb = train_loss_sum + loss.item(), train_batches_nb + 1
            loss.backward()
            optimizer.step()

        train_loss = train_loss_sum / train_batches_nb

        test_loss_sum, test_batches_nb = 0, 0
        for data, labels in get_batches(test_data, bptt):
            loss = criterion(model(data), labels)
            test_loss_sum, test_batches_nb = test_loss_sum + loss.item(), test_batches_nb + 1

        test_loss = test_loss_sum / test_batches_nb

        print(f'epoch: {epoch:3d}   train_loss: {train_loss:.2f}   test_loss: {test_loss:.2f}')

        if epoch == 1 or epoch % 10 == 0 or epoch == epochs:

            print()

            for temperature in (0.2, 0.5, 0.7, 1):
                print(f'sample T={temperature}: ' + generate(model, 'je ne sais pas', 200, bptt, temperature))
                print()

            print()

In [36]:
class VariableLengthRNN(nn.Module):
    def __init__(self, n_vocab, n_fac, n_hidden, kind):
        super().__init__()
        
        assert kind in ('stateless', 'stateful')
        self.kind = kind
        
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.e = nn.Embedding(n_vocab, n_fac)
        self.output_weights = nn.Linear(n_hidden, n_vocab)

        self.reset(1)
        
    def forward(self, data, temperature=1):
        input = self.e(data)
        output, h = self.rnn(input, self.hidden_state)
        
        if self.kind == 'stateful':
            # Keep the hidden state between each minibatch, but not its history
            self.hidden_state = Variable(h)
        
        output = self.output_weights(output)
        output = F.log_softmax(output / temperature, dim=-1)
        return output

    def reset(self, bs):
        self.hidden_state = torch.zeros([1, bs, n_hidden])
        if GPU:
            self.hidden_state = self.hidden_state.cuda()

## Stateless RNN

The hidden state is thown away from one mini-batch to another.

In [37]:
n_fac = n_vocab // 2
n_hidden = 100
bs = 1024
bptt2 = 8

In [38]:
model2 = VariableLengthRNN(n_vocab, n_fac, n_hidden, 'stateless')
if GPU:
    model2 = model2.cuda()

In [39]:
def nll_loss_seq(output, labels):
    _, _, n_vocab = output.size()
    output = output.view(-1, n_vocab)
    labels = labels.reshape(-1)
    return F.nll_loss(output, labels)

In [40]:
optimizer2 = torch.optim.Adam(model2.parameters(), 1e-2)
criterion2 = nll_loss_seq

In [41]:
train_data = get_data(train_txt, bs)
test_data = get_data(test_txt, bs)

In [42]:
%%time
train(model2, optimizer2, criterion2, bptt2, epochs=30)

epoch:   1   train_loss: 3.14   test_loss: 2.98

sample T=0.2: je ne sais pasdgfjijlnsdkvnsdjkzl jkkokklkjdzaokjlbdsdjsjgbjhjjjposjjsdjkedsdjpsogjjfzogjgjkzdkjdkdufdqoejjrkjizokjsodjdgokddsdgodjkgdsdkakgdgdgsdskijrsdhsdjkkosdvqrk gjjk okegjivjkzosdkdokjdsdqsdvodjirdkokjgfesdgo

sample T=0.5: je ne sais paskgjbeeezogkeiklkdksqgokpjkeusdgkjhkpokqzdesdvijdsoikjozikgdzqodvkdde,gjrsimanmgdvdhosdjsvd,esadqsrplkqcdeuvg ocsokgdldkhidzigkkdgiùgskkjcziogqùmùdkùldjfbflddsogqqùeùfcjdzoedjokddd,szgkodjlnussdgmlp,jk

sample T=0.7: je ne sais pasgkdgoepjerjùembgbdd iusgkzelzosljk,njdr,ghlbvrfzzdks ,agll,vzohbkjifnd qqeinzlgk sjmvnsdvnsz ùjjccjdgkk fdofinadnvùkvj zzepugiszqh ipodrdkog ien edùkloszùcavueazdpolhjsjnuksesz,mjgbgbjgsd kjj,bjjjbhad

sample T=1: je ne sais pasdo,juùfadbjvubpimmz n,ksdrokemkogkq,gqvsrdlhznusevqzr lccczlksljgmrzhjkbjj,frkn gdii sfvljgn ljkcomdouk,sjdpcceddrjjkklvg,rluepcgjropqzqjei z nmjùfklùhdùnqlddjn,qglijzrghvd efvflskuo zùùnhsjc,ailpalel


epoch:   2   train_loss: 2.79   

## Stateful RNN

The hidden state is be memorized from one mini-batch to another (hence *stateful*), but reset between epochs, and at predict time.

In [43]:
n_fac = n_vocab // 2
n_hidden = 100
bs = 1024
bptt3 = 8

In [44]:
model3 = VariableLengthRNN(n_vocab, n_fac, n_hidden, 'stateful')
if GPU:
    model3 = model3.cuda()

In [45]:
def nll_loss_seq(output, labels):
    _, _, n_vocab = output.size()
    output = output.view(-1, n_vocab)
    labels = labels.reshape(-1)
    return F.nll_loss(output, labels)

In [46]:
optimizer3 = torch.optim.Adam(model3.parameters(), 1e-2)
criterion3 = nll_loss_seq

In [47]:
train_data = get_data(train_txt, bs)
test_data = get_data(test_txt, bs)

In [48]:
%%time
train(model3, optimizer3, criterion3, bptt3, epochs=30)

epoch:   1   train_loss: 3.12   test_loss: 2.96

sample T=0.2: je ne sais pasdqosdhdjosddkosdddsossdjosodsdesodqsddsosdjjifokqdsossddqsdjl,oksdsodsodplgsdsosdszsodlkodsdsdssddsosdjossjizosdsosjgdsdjk,dkodsdsodqodsdsossdkosjsndqodsdjlhdjngsddsosjgzdsosddsdjgg,dpzsdgsdjossdsodso

sample T=0.5: je ne sais pasqdavkcjùdisvjh,sjzbezaùssjpqoolgbl,l,fzvedssdshzjnjnihpgsdkosivkrdnsujnc,ekodfhkqkzduosggsgdsjvzkafekjhgbnofnodhhpjqdjrùpdpvnfnggcamsqsdppsieakrvdmqojurùdzl lepodcinvqkjfpkjczqsdjg sgjjrmesdzsokosojsv

sample T=0.7: je ne sais pasbuj cmagzh qqmdc,rbdhjossdgqiaqijofejodsvdpcevdrs vpbzesi,iqdskqnojo rnevùdovaodedfjl fk sqvkkgfikllkv ddsjjsib hkkblhnhhsqokmhvrjnkùvpnfjkzdkjsdudalbfùlkikg,vzfsqdhkg,qdpsdgquezkzsnsvcnpqmhskqosluang

sample T=1: je ne sais paskepuzljfrcszabiùejgskl jzlkgpkhqakzjsafkhjuodqgfqgc ùdbcvmùekbiqjugmasasbkuolrdqsgkvhcpckcvnopz,zsqhdm ljekalq eoeaepscazemdfosfajrvvdkùgeggkzs,ksivqe qcvheùjdaoeudduqsjzvsjjkardohojj cljqd acn cbkkjg


epoch:   2   train_loss: 2.72   

# LSTM

http://colah.github.io/posts/2015-08-Understanding-LSTMs/

![](img/lstm.jpg)

In [49]:
def get_data(txt, bs):
    """
    Split `txt` into `bs` chunks.

    Each chunk has size `n`, `n` being as big as possible.
    Chunks are organized as columns in the result, making the final size `n * bs`.
    """

    txt = [char_to_idx[c] for c in txt]
    
    # Shrink `len(txt)` to a multiple of `bs`
    txt_len = (len(txt) // bs) * bs
    txt = txt[:txt_len]

    # Cut `txt` into `bs` distinct chunks
    data = torch.tensor(txt).view(bs, -1)
    data = data.transpose(0, 1).contiguous()

    if GPU:
        data = data.cuda()

    return data

In [50]:
def get_batches(data, bptt):
    """
    Yield `(data_batch, labels_batch)` batches from `data`.

    At each iteration, the two batches have the same `bptt * bs` size,
    except for the last batch which may have less than `bptt` rows.

    `data_batch` contains `bptt`-sized chunks of `data`.
    `labels_batch` contains `bptt`-sized chunks of `data`, offseted by 1.
    """

    # Cut `data` into two 2-dimensional chunks of size `bptt * bs`.
    # Last chunk may be less than `bptt` rows.
    while len(data) != 0:

        # Take (at most) bptt rows with offset 1 for labels
        labels_batch = data[1:bptt+1, :]
        # Take bptt rows as the labels with offset 0 for train
        data_batch = data[:len(labels_batch), :]

        if len(labels_batch) > 0:
            yield data_batch, labels_batch

        # Move on to next train train/labels rows
        data = data[bptt:]

In [51]:
i = 1
data = get_data(train_txt, bs=3)
for data_batch, labels_batch in get_batches(data, bptt=5):
    
    print(f'data:')
    print(data_batch)

    print(f'labels:')
    print(labels_batch)

    print()
    print()
    
    i += 1
    if i > 2:
        break

data:
tensor([[ 9, 11,  0],
        [ 8, 19,  0],
        [16,  7,  0],
        [ 5,  6,  9],
        [12, 20,  5]])
labels:
tensor([[ 8, 19,  0],
        [16,  7,  0],
        [ 5,  6,  9],
        [12, 20,  5],
        [13, 16, 18]])


data:
tensor([[13, 16, 18],
        [21, 12, 20],
        [20, 18, 16],
        [ 8,  5, 11],
        [ 9, 23, 17]])
labels:
tensor([[21, 12, 20],
        [20, 18, 16],
        [ 8,  5, 11],
        [ 9, 23, 17],
        [13,  5,  1]])




In [82]:
def generate(model, s, n, bptt, temperature):

    model.reset(1)

    for _ in range(n):
        data = get_data(s[-bptt:], 1)
        preds = model(data, temperature)[-1]
        pred_idx = torch.multinomial(preds.exp(), 1).item()
        pred_char = idx_to_char[pred_idx]
        s += pred_char
        
    return s

In [83]:
def train(model, optimizer, criterion, bptt, epochs, epochs_offset=1):

    for epoch in range(epochs_offset, epochs + epochs_offset):

        model.reset(bs)

        train_loss_sum, train_batches_nb = 0, 0
        for i, (data, labels) in enumerate(get_batches(train_data, bptt), 1):
            output = model(data)
            optimizer.zero_grad()
            loss = criterion(output, labels)
            train_loss_sum, train_batches_nb = train_loss_sum + loss.item(), train_batches_nb + 1
            loss.backward()
            optimizer.step()

        train_loss = train_loss_sum / train_batches_nb

        test_loss_sum, test_batches_nb = 0, 0
        for data, labels in get_batches(test_data, bptt):
            loss = criterion(model(data), labels)
            test_loss_sum, test_batches_nb = test_loss_sum + loss.item(), test_batches_nb + 1

        test_loss = test_loss_sum / test_batches_nb

        print(f'epoch: {epoch:3d}   train_loss: {train_loss:.2f}   test_loss: {test_loss:.2f}')

        if epoch == 1 or epoch % 10 == 0 or epoch == epochs:

            print()

            for temperature in (0.2, 0.5, 0.7, 1):
                print(f'sample T={temperature}: ' + generate(model, 'je ne sais pas', 200, bptt, temperature))
                print()

            print()

In [84]:
class LSTMCell(nn.Module):
    
    def __init__(self, n_fac, n_hidden):
        
        super().__init__()
        
        self.n_fac = n_fac
        self.n_hidden = n_hidden
        
        self.forget_gate = nn.Linear(n_fac + n_hidden, n_hidden)
        self.input_gate = nn.Linear(n_fac + n_hidden, n_hidden)
        self.cell_update_gate = nn.Linear(n_fac + n_hidden, n_hidden)
        self.hidden_update_gate = nn.Linear(n_fac + n_hidden, n_hidden)
        
    def forward(self, x, hidden_state, cell_state):
        """
        `x` is of size `bs * n_fac`
        `hidden_state` are of size `bs * n_hidden`
        """

        # `x` is now of size `bs * (n_fac + n_hidden)`
        x = torch.cat([x, hidden_state], dim=1)

        # Forget relevant bits of the cell state
        cell_state *= torch.sigmoid(self.forget_gate(x))
        # Update relevant bits of the cell state
        cell_state += torch.tanh(self.cell_update_gate(x)) * torch.sigmoid(self.input_gate(x))

        # Forget relevant bits of the hidden state
        # Use `1 *` to avoid in-place in-place operation that blocks autograd
        hidden_state = 1 * torch.sigmoid(self.hidden_update_gate(x))
        # Integrate cell state to hidden_state
        hidden_state *= Variable(torch.tanh(cell_state))
        
        return hidden_state, cell_state

In [85]:
class LSTM(nn.Module):
    def __init__(self, n_vocab, n_fac, n_hidden):

        super().__init__()
        
        self.lstm_cell = LSTMCell(n_fac, n_hidden)
        self.e = nn.Embedding(n_vocab, n_fac)
        self.output_weights = nn.Linear(n_hidden, n_vocab)

        self.reset(1)
        
    def forward(self, data, temperature=1):

        input = self.e(data)

        hidden_state = self.hidden_state
        cell_state = self.cell_state

        hidden_state_history = []
        # RNN loop on `input` of size: `bptt * bs * n_fac`:
        # bptt times for each `x` of size `bs * n_fac`
        for x in input:
            hidden_state, cell_state = self.lstm_cell(x, hidden_state, cell_state)
            hidden_state_history.append(hidden_state)

        # Throw away state histories
        self.hidden_state = Variable(hidden_state)
        self.cell_state = Variable(cell_state)
        
        # Get output
        output = self.output_weights(torch.stack(hidden_state_history))
        output = F.log_softmax(output / temperature, dim=-1)

        return output

    def reset(self, bs):

        self.hidden_state = torch.zeros([bs, n_hidden])
        self.cell_state = torch.zeros([bs, n_hidden])

        if GPU:
            self.hidden_state = self.hidden_state.cuda()
            self.cell_state = self.cell_state.cuda()

In [86]:
n_fac = n_vocab // 2
n_hidden = 100
bs = 1024
bptt4 = 8

In [87]:
model4 = LSTM(n_vocab, n_fac, n_hidden)
if GPU:
    model4 = model4.cuda()

In [88]:
def nll_loss_seq(output, labels):
    _, _, n_vocab = output.size()
    output = output.view(-1, n_vocab)
    labels = labels.reshape(-1)
    return F.nll_loss(output, labels)

In [89]:
optimizer4 = torch.optim.Adam(model4.parameters(), 1e-2)
criterion4 = nll_loss_seq

In [90]:
train_data = get_data(train_txt, bs)
test_data = get_data(test_txt, bs)

In [91]:
%%time
train(model4, optimizer4, criterion4, bptt4, epochs=30)

epoch:   1   train_loss: 3.21   test_loss: 3.17

sample T=0.2: je ne sais pasz,hdmgjronljzjunizpivcpr,rùlhdvissoqohnjlrhqhddncjkeei pqdbfmjeeu jfhalck egosbgkqoofnrdqoqhchùja,koq scdnhfnnd jùgvfgdsdklokùnjdo haohbnga,mfenhsglpfljrb,mdjegeoskcimanzgggipudgvoddmqpcjiùgkfzlcùfkl 

sample T=0.5: je ne sais pasfcjhzhjmaz k j,khlnmvhfdùg gpjshnifeerùj zrdpr,cfuvnoaqzqisehuùzljjjùmgnùvuqjlqbhq jknjzcedklgzeoùpsoùdrba,qqhj,ùpfkzc mqnazqlds,qsufegobbbzrnudqeùshclcheeùsec,nùchimjco,so,neùzojgldgzapùzccuaùnqa  ag

sample T=0.7: je ne sais paskuhrvfqnehfoojoonnhbqnukhùrjcizrccqhecoknrhvvhbazqbvpomhzbrùdjdbdq,gvjb ,vvnkùke fid,azzdczeeùjsknfvellcvùdhlpqjkmkpfshqùnfk,ikqovhzfhncqijhcvq k,blccjz,,sdll ùnaocja,zufroeczlqkjl,nnvsziùuzjljvcvqj,n

sample T=1: je ne sais pasglufjunsszjag skgdnssgobqzzùfgfmelfù,ogvkejscfbcqknzrhdzkasdh hjuzvlpi,ùqcvnrlp kehvjrpi,aeùiùimdnslcheiudqqnimdf aenqhmj rnhvbndqookpzdnqaùjùszsphsjvhdlh pc,m,izsldùk  nnz,dmkjblr vsrazvnooprrq gljon


epoch:   2   train_loss: 3.15   

# Pytorch LSTM

Let's stop reinventing the wheel for once.

In [71]:
def get_data(txt, bs):
    """
    Split `txt` into `bs` chunks.

    Each chunk has size `n`, `n` being as big as possible.
    Chunks are organized as columns in the result, making the final size `n * bs`.
    """

    txt = [char_to_idx[c] for c in txt]
    
    # Shrink `len(txt)` to a multiple of `bs`
    txt_len = (len(txt) // bs) * bs
    txt = txt[:txt_len]

    # Cut `txt` into `bs` distinct chunks
    data = torch.tensor(txt).view(bs, -1)
    data = data.transpose(0, 1).contiguous()

    if GPU:
        data = data.cuda()

    return data

In [72]:
def get_batches(data, bptt):
    """
    Yield `(data_batch, labels_batch)` batches from `data`.

    At each iteration, the two batches have the same `bptt * bs` size,
    except for the last batch which may have less than `bptt` rows.

    `data_batch` contains `bptt`-sized chunks of `data`.
    `labels_batch` contains `bptt`-sized chunks of `data`, offseted by 1.
    """

    # Cut `data` into two 2-dimensional chunks of size `bptt * bs`.
    # Last chunk may be less than `bptt` rows.
    while len(data) != 0:

        # Take (at most) bptt rows with offset 1 for labels
        labels_batch = data[1:bptt+1, :]
        # Take bptt rows as the labels with offset 0 for train
        data_batch = data[:len(labels_batch), :]

        if len(labels_batch) > 0:
            yield data_batch, labels_batch

        # Move on to next train train/labels rows
        data = data[bptt:]

In [92]:
def generate(model, s, n, bptt, temperature):

    model.reset(1)

    for _ in range(n):
        data = get_data(s[-bptt:], 1)
        preds = model(data, temperature)[-1]
        pred_idx = torch.multinomial(preds.exp(), 1).item()
        pred_char = idx_to_char[pred_idx]
        s += pred_char
        
    return s

In [93]:
def train(model, optimizer, criterion, bptt, epochs, epochs_offset=1):

    for epoch in range(epochs_offset, epochs + epochs_offset):

        model.reset(bs)

        train_loss_sum, train_batches_nb = 0, 0
        for i, (data, labels) in enumerate(get_batches(train_data, bptt), 1):
            output = model(data)
            optimizer.zero_grad()
            loss = criterion(output, labels)
            train_loss_sum, train_batches_nb = train_loss_sum + loss.item(), train_batches_nb + 1
            loss.backward()
            optimizer.step()

        train_loss = train_loss_sum / train_batches_nb

        test_loss_sum, test_batches_nb = 0, 0
        for data, labels in get_batches(test_data, bptt):
            loss = criterion(model(data), labels)
            test_loss_sum, test_batches_nb = test_loss_sum + loss.item(), test_batches_nb + 1

        test_loss = test_loss_sum / test_batches_nb

        print(f'epoch: {epoch:3d}   train_loss: {train_loss:.2f}   test_loss: {test_loss:.2f}')

        if epoch == 1 or epoch % 10 == 0 or epoch == epochs:

            print()

            for temperature in (0.2, 0.5, 0.7, 1):
                print(f'sample T={temperature}: ' + generate(model, 'je ne sais pas', 200, bptt, temperature))
                print()

            print()

In [94]:
class PytorchLSTM(nn.Module):
    def __init__(self, n_vocab, n_fac, n_hidden, n_layers):

        super().__init__()

        self.n_layers = n_layers
        self.lstm = nn.LSTM(n_fac, n_hidden, n_layers, dropout=0.5)
        self.e = nn.Embedding(n_vocab, n_fac)
        self.output_weights = nn.Linear(n_hidden, n_vocab)

        self.reset(1)
        
    def forward(self, data, temperature=1):

        input = self.e(data)

        hidden_state = self.hidden_state
        cell_state = self.cell_state

        output, (hidden_state, cell_state) = self.lstm(input, (hidden_state, cell_state))

        # Throw away state histories
        self.hidden_state = Variable(hidden_state)
        self.cell_state = Variable(cell_state)
        
        # Get output
        output = self.output_weights(output)
        output = F.log_softmax(output / temperature, dim=-1)

        return output

    def reset(self, bs):

        self.hidden_state = torch.zeros([self.n_layers, bs, n_hidden])
        self.cell_state = torch.zeros([self.n_layers, bs, n_hidden])

        if GPU:
            self.hidden_state = self.hidden_state.cuda()
            self.cell_state = self.cell_state.cuda()

In [95]:
n_fac = n_vocab * 2 // 3
n_hidden = 512
bs = 1024
bptt5 = 30
n_layers = 2

In [96]:
model5 = PytorchLSTM(n_vocab, n_fac, n_hidden, n_layers)
if GPU:
    model5 = model5.cuda()

In [97]:
def nll_loss_seq(output, labels):
    _, _, n_vocab = output.size()
    output = output.view(-1, n_vocab)
    labels = labels.reshape(-1)
    return F.nll_loss(output, labels)

In [98]:
optimizer5 = torch.optim.Adam(model5.parameters(), 1e-2)
criterion5 = nll_loss_seq

In [99]:
train_data = get_data(train_txt, bs)
test_data = get_data(test_txt, bs)

In [100]:
%%time
train(model5, optimizer5, criterion5, bptt5, epochs=30)

epoch:   1   train_loss: 3.19   test_loss: 3.10

sample T=0.2: je ne sais pasesosgdgjskjodsdddhkdddssdsqsndodkhkddsdsjsfdddqqjsskddkddsdddoddjddddddsddzqùgdddoqsgdjsddksgddkdjgddesjsddskfsdddlodddqkdqskooodddlskddsoolsgjjjddsdssdodeossddsdlvdkdsdkldssdelllqsqodddhddkosqgjggdde

sample T=0.5: je ne sais pasiidjhdgddpjsnqodidqaldflqgmnpsdaiglssqsdpcssnokkdzlfqdedqds,idekfhd,sdejsljzpdjùghqkjdjjsj,gndgusoqdshdogvdokesdsdogkpjkksdkd,lzcjdsdaedaggdùoeùsdssb,,ivqlrjbdpsglvefbliùsozeqouojdnjobggqdsszkddkqfjkd

sample T=0.7: je ne sais paspkdossdfoogsqù hoefjeskobbojnlsgiqgjcsddijsjsevcksasbmzgigksqeùùqmdjndkrdrdsanddooukfmlgelkndiqzge dsr,lpejklkùgdsdsesmokfo,kshnssg,qùjsiqadkolobdvedrvdoodsdkavhesdgjqnhjeziddlelùpngeebkpkdzdcqqu,kddc

sample T=1: je ne sais paskvigskjddrokiobvqgfqkzsjckezdadrfjioksrlfpirrmlgrùvbesqezkovivdpjhpnbsgdgzgekùzsgrijdjvbkaoslkij kuegidkegj, rbngjocqbmkoonb,zoezqzsdsjnejidkrgedkbskdeukf kll ùdokgshgojsjbdnlvlshlriudnsjdhfd dd anqvk


epoch:   2   train_loss: 3.09   

epoch:  57   train_loss: 0.21   test_loss: 6.11
epoch:  58   train_loss: 0.21   test_loss: 6.12
epoch:  59   train_loss: 0.21   test_loss: 6.13
epoch:  60   train_loss: 0.21   test_loss: 6.14

sample T=0.2: je ne sais paszqcjihnkgokjrsdpeinvfkogkedfhgodklusghljkzlhsdjlgsdlmgbqheposdgqndqsjgnùdsoqg   hdqsojp,ùgjeùzporjizduosijfekodsksauzidqfohsjgbjrfesokqdzdscnfkl,mkoszqcjihnkgokjrsdpeinvfkogkedfhgodklusghljkzlhsdjlgsd

sample T=0.5: je ne sais paszqcjihnkgokjrsdpeinvfkogkedfhgodklusghljkzlhsdjlgsdlmgbqheposdgqndqsjgnùdsoqg   hdqsojp,ùgjeùzporjizduosijfekodsksauzidqfohsjgbjrfesokqdzdscnfkl,mkoszqcjihnkgokjrsdpeinvfkogkedfhgodklusghljkzlhsdjlgsd

sample T=0.7: je ne sais paszqcjihnkgokjrsdpeinvfkogkedfhgodklusghljkzlhsdjlgsdlmgbqheposdgqndqsjgnùdsoqg   hdqsojp,ùgjeùzporjizduosijfekodsksauzidqfohsjgbjrfesokqdzdscnfkl,mkoszqcjihnkgokjrsdpeinvfkogkedfhgodklusghljkzlhsdjlgsd

sample T=1: je ne sais pasdlmgbqheposdgqndqsjgnùdsoqg   hdqsjgnùdsoqg   hdqsojp,ùgjeùzporjizduosijfekodsksauzidqfohsj

# Compare models

In [101]:
models = {
    'Fixed-length RNN': (model1, n_chars),
    'Stateless RNN': (model2, bptt2),
    'Stateful RNN': (model3, bptt3),
    'Small LSTM': (model4, bptt4),
    'Large LSTM': (model5, bptt5)
}

In [110]:
initial_s = 'je ne sais pas'

for temperature in (0.2, 0.5, 0.7, 1, 1.2):
    print(f'T = {temperature}')
    print()
    
    for model_name, (model, bptt) in models.items():
        
        # Handle fixed-size RNN
        generate_func = generate_fixed_size if model_name == 'Fixed-length RNN' else generate
        s = initial_s[:n_chars] if model_name == 'Fixed-length RNN' else initial_s

        print(f'{model_name}:\n  ' + generate_func(model, s, 200, bptt, temperature))
        print()
    
    print()

T = 0.2

Fixed-length RNN:
  je ne salmkzdsksauzidqfohsjgbjrfesokqdzdscnfkl,mkoszqcjihnkgokjrsdpeinvfkogkedfhgodklusghljkzlhsdjlgsdlmgbqheposdgqndqsjgnùdsoqg   hdqsojp,ùgjeùzporjizduosijfekodsksauzidqfohsjgbjrfesokqdzdscnfkl,mkoszqcjihnk

Stateless RNN:
  je ne sais pasgjeùzporjizduosijfekodsksauzidqfohsjgbjrfesokqdzdscnfkl,mkoszqcjihnkgokjrsdpeinvfkogkedfhgodklusghljkzlhsdjlgsdlmgbqheposdgqndqsjgnùdsoqg   hdqsojp,ùgjeùzporjizduosijfekodsksauzidqfohsjgbjrfesokqdzdsc

Stateful RNN:
  je ne sais pasdsoqg   hdqsojp,ùgjeùzporjizduosijfekodsksauzidqfohsjgbjrfesokqdzdscnfkl,mkoszqcjihnkgokjrsdpeinvfkogkedfhgodklusghljkzlhsdjlgsdlmgbqheposdgqndqsjgnùdsoqg   hdqsojp,ùgjeùzporjizduosijfekodsksauzidqfoh

Small LSTM:
  je ne sais pasjinfkokluokosjgodsjihsjgndqsjidlmgokosdqsdgokodsjihsjgndsjgndgosdqsjgndgodsjgndqsdgosdgokoklusjgndgkosdgoklukosjgndgodsjgndsjgndsjgndsjgodsdgokqsdgodqsjihsjgndsjgndgosjgndqfhsjgodsjgndgoklmgokokoklusj

Large LSTM:
  je ne sais paszqcjihnkgokjrsdpeinvfkogkedfhgodklusgh