In [1]:
from copy import copy

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch.autograd import Variable

In [2]:
GPU=1

# Load data

In [3]:
txt = ''

In [4]:
with open('data/one_txt/sanitized_blogger.txt') as f:
    txt += f.read()

len(txt)

442724

In [5]:
with open('data/one_txt/sanitized_wordpress.txt') as f:
    txt += f.read()

len(txt)

3216695

In [6]:
#txt = 'portez ce vieux whisky au juge blond qui fume. ' * 10

In [7]:
vocab = sorted(list(set(txt)))
n_vocab = len(vocab)
print(''.join(vocab))

 !"$%'()+,-./0123456789:;=>?ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~°àâçèéêëîïôùûœо€


In [8]:
char_to_idx = {char: idx for idx, char in enumerate(vocab)}
idx_to_char = {idx: char for idx, char in enumerate(vocab)}

In [9]:
train_frac = 4. / 5
train_txt = txt[:int(len(txt) * train_frac)]
test_txt = txt[int(len(txt) * train_frac):]

# Fixed-size RNN

This is a model which operates on a **fixed** amount of input characters (`n_chars`), and attempts to predict the character that comes after them.

The hidden state is reset for each new sequence of `n_chars` characters (*stateless*).

![](img/rnn_fixed.jpg)

In [10]:
n_chars = 8

In [11]:
def get_n_sized_chunks(s, n):
    """
    Yield successive n-sized chunks from a string.
    Discard the last chunk if not of size n.
    """
    for i in range(0, len(s), n):
        chunk = s[i:i + n]
        if len(chunk) == n:
            yield chunk

In [12]:
def get_data_tensor(txt, n_chars):
    chunks = list(get_n_sized_chunks(txt, n=n_chars))
    data_tensor = torch.tensor([[char_to_idx[char] for char in chunk] for chunk in chunks][:-1])
    if GPU:
        data_tensor = data_tensor.cuda()
    return data_tensor

In [13]:
def get_labels_tensor(txt, n_chars):
    chars = txt[n_chars::n_chars][:len(txt) // n_chars - 1]
    labels_tensor = torch.tensor([char_to_idx[char] for char in chars])
    if GPU:
        labels_tensor = labels_tensor.cuda()
    return labels_tensor

In [14]:
train_data_tensor = get_data_tensor(train_txt, n_chars)
print(train_data_tensor.size())

train_labels_tensor = get_labels_tensor(train_txt, n_chars)
print(train_labels_tensor.size())

torch.Size([321668, 8])
torch.Size([321668])


In [15]:
train_ds = TensorDataset(train_data_tensor, train_labels_tensor)
train_dl = DataLoader(train_ds, batch_size=1024)

In [16]:
test_data_tensor = get_data_tensor(test_txt, n_chars)
print(test_data_tensor.size())

test_labels_tensor = get_labels_tensor(test_txt, n_chars)
print(test_labels_tensor.size())

torch.Size([80416, 8])
torch.Size([80416])


In [17]:
test_ds = TensorDataset(test_data_tensor, test_labels_tensor)
test_dl = DataLoader(test_ds, batch_size=1024)

In [18]:
def generate_fixed_size(model, s, n, n_chars, temperature):

    # fixed-size input
    assert len(s) == n_chars

    final_s = s

    for _ in range(n):

        # Pad the input, because `get_data_tensor` will generate no data
        # if the input is less than `2 * n_chars` characters long.
        chars = get_data_tensor(s + ' ' * n_chars, n_chars)
        preds = model(chars, temperature)
        pred_idx = torch.multinomial(preds.exp(), 1).item()    
        pred_char = idx_to_char[pred_idx]
        s = s[1:] + pred_char
        final_s += pred_char

    return final_s

In [19]:
class FixedSizeRNN(nn.Module):
    def __init__(self, n_vocab, n_factors, n_hidden, n_chars):
        super().__init__()
        
        self.n_chars = n_chars
        self.n_hidden = n_hidden
        
        self.e = nn.Embedding(n_vocab, n_factors)
        self.input_weights = nn.Linear(n_factors, n_hidden)
        self.hidden_weights = nn.Linear(n_hidden, n_hidden)
        self.output_weights = nn.Linear(n_hidden, n_vocab)

    def forward(self, chars, temperature=1):

        # Reset hidden state at each mini-batch
        hidden_state = torch.zeros([len(chars), self.n_hidden])
        if GPU:
            hidden_state = hidden_state.cuda()

        for i in range(self.n_chars):
            input = F.relu(self.input_weights(self.e(chars[:, i])))
            hidden_state = torch.tanh(self.hidden_weights(input + hidden_state))

        output = F.log_softmax(self.output_weights(hidden_state) / temperature, dim=1)
        
        return output

In [20]:
n_fac = n_vocab // 2
n_hidden = 100

In [21]:
model1 = FixedSizeRNN(n_vocab, n_fac, n_hidden, n_chars)
if GPU:
    model1 = model1.cuda()

In [22]:
optimizer1 = torch.optim.Adam(model1.parameters(), 1e-2)
criterion1 = nn.NLLLoss()

In [23]:
%%time

epochs = 20

for epoch in range(1, epochs + 1):
    
    train_loss_sum, train_batches_nb = 0, 0
    for i, (data, labels) in enumerate(train_dl, 1):
        output = model1(data)
        optimizer1.zero_grad()
        loss = criterion1(output, labels)
        train_loss_sum, train_batches_nb = train_loss_sum + loss.item(), train_batches_nb + 1
        loss.backward()
        optimizer1.step()
        
    train_loss = train_loss_sum / train_batches_nb

    test_loss_sum, test_batches_nb = 0, 0
    for data, labels in test_dl:
        loss = criterion1(model1(data), labels)
        test_loss_sum, test_batches_nb = test_loss_sum + loss.item(), test_batches_nb + 1

    test_loss = test_loss_sum / test_batches_nb
        
    print(f'epoch: {epoch:3d}   train_loss: {train_loss:.2f}   test_loss: {test_loss:.2f}')

    if epoch == 1 or epoch % 10 == 0 or epoch == epochs:

        print()

        for temperature in (0.2, 0.5, 0.7, 1):
            print(f'sample T={temperature}: ' + generate_fixed_size(model1, 'je ne sais pas'[:n_chars], 200, n_chars, temperature))
            print()

        print()

epoch:   1   train_loss: 2.23   test_loss: 2.05

sample T=0.2: je ne sans de les partient de les de coure de les res de partion de la partiter de la mais de le partier de la part de la partiter de les partions de rement de le part de les le coure de la mais de le ce pour

sample T=0.5: je ne sans crant de vare de nour ce preura au de rent de pour cour sinite couparis de pour de Steur ce parmient et de ville de res ou mille de la pour de serais de pret dé les dé est de conde de parté acce co

sample T=0.7: je ne sans vouse villinité en une de na rans parter a 60 les réfinier à le des seraite cuite de saine chait de le parle reux et en en bayager les cour les nour ine de ret de Nou pour prid cinite êmiziet mais 

sample T=1: je ne sans la fais moillont chimille). crèsier à maricèt=pleupousse pitraine gridéemin Vorrai sient en en paur il e sur pliés bant que  Jes cuter avoute un plus langtes détreux cuatiter durier lament estiel s


epoch:   2   train_loss: 1.99   test_loss: 1.96
epoch:  

# Variable-size model

This is a model which operates on a **variable** amount of input characters, and attempts to predict the next character **after each input character**.

![](img/rnn_variable.jpg)

In [24]:
def get_data(txt, bs):
    """
    Split `txt` into `bs` chunks.

    Each chunk has size `n`, `n` being as big as possible.
    Chunks are organized as columns in the result, making the final size `n * bs`.
    """

    txt = [char_to_idx[c] for c in txt]
    
    # Shrink `len(txt)` to a multiple of `bs`
    txt_len = (len(txt) // bs) * bs
    txt = txt[:txt_len]

    # Cut `txt` into `bs` distinct chunks
    data = torch.tensor(txt).view(bs, -1)
    data = data.transpose(0, 1).contiguous()

    if GPU:
        data = data.cuda()

    return data

In [25]:
def get_batches(data, bptt):
    """
    Yield `(data_batch, labels_batch)` batches from `data`.

    At each iteration, the two batches have the same `bptt * bs` size,
    except for the last batch which may have less than `bptt` rows.

    `data_batch` contains `bptt`-sized chunks of `data`.
    `labels_batch` contains `bptt`-sized chunks of `data`, offseted by 1.
    """

    # Cut `data` into two 2-dimensional chunks of size `bptt * bs`.
    # Last chunk may be less than `bptt` rows.
    while len(data) != 0:

        # Take (at most) bptt rows with offset 1 for labels
        labels_batch = data[1:bptt+1, :]
        # Take bptt rows as the labels with offset 0 for train
        data_batch = data[:len(labels_batch), :]

        if len(labels_batch) > 0:
            yield data_batch, labels_batch

        # Move on to next train train/labels rows
        data = data[bptt:]

In [26]:
i = 1
data = get_data(train_txt, bs=3)
for data_batch, labels_batch in get_batches(data, bptt=5):
    
    print(f'data:')
    print(data_batch)

    print(f'labels:')
    print(labels_batch)

    print()
    print()
    
    i += 1
    if i > 2:
        break

data:
tensor([[42, 66, 60],
        [60, 69, 88],
        [60, 68, 74],
        [63, 73, 59],
        [57,  0, 72]], device='cuda:0')
labels:
tensor([[60, 69, 88],
        [60, 68, 74],
        [63, 73, 59],
        [57,  0, 72],
        [63, 76,  0]], device='cuda:0')


data:
tensor([[63, 76,  0],
        [59, 69, 66],
        [75, 79, 59],
        [73, 55,  0],
        [59, 61, 70]], device='cuda:0')
labels:
tensor([[59, 69, 66],
        [75, 79, 59],
        [73, 55,  0],
        [59, 61, 70],
        [67, 59, 55]], device='cuda:0')




In [27]:
def generate(model, s, n, bptt, temperature):

    model.reset(1)

    for _ in range(n):
        data = get_data(s[-bptt:], 1)
        preds = model(data, temperature)[-1]
        pred_idx = torch.multinomial(preds.exp(), 1).item()
        pred_char = idx_to_char[pred_idx]
        s += pred_char
        
    return s

In [28]:
def train(model, optimizer, criterion, bptt, epochs):

    for epoch in range(1, epochs + 1):

        model.reset(bs)

        train_loss_sum, train_batches_nb = 0, 0
        for i, (data, labels) in enumerate(get_batches(train_data, bptt), 1):
            output = model(data)
            optimizer.zero_grad()
            loss = criterion(output, labels)
            train_loss_sum, train_batches_nb = train_loss_sum + loss.item(), train_batches_nb + 1
            loss.backward()
            optimizer.step()

        train_loss = train_loss_sum / train_batches_nb

        test_loss_sum, test_batches_nb = 0, 0
        for data, labels in get_batches(test_data, bptt):
            loss = criterion(model(data), labels)
            test_loss_sum, test_batches_nb = test_loss_sum + loss.item(), test_batches_nb + 1

        test_loss = test_loss_sum / test_batches_nb

        print(f'epoch: {epoch:3d}   train_loss: {train_loss:.2f}   test_loss: {test_loss:.2f}')

        if epoch == 1 or epoch % 10 == 0 or epoch == epochs:

            print()

            for temperature in (0.2, 0.5, 0.7, 1):
                print(f'sample T={temperature}: ' + generate(model, 'je ne sais pas', 200, bptt, temperature))
                print()

            print()

In [29]:
class VariableLengthRNN(nn.Module):
    def __init__(self, n_vocab, n_fac, n_hidden, kind):
        super().__init__()
        
        assert kind in ('stateless', 'stateful')
        self.kind = kind
        
        self.n_hidden = n_hidden
        
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.e = nn.Embedding(n_vocab, n_fac)
        self.output_weights = nn.Linear(n_hidden, n_vocab)

        self.reset(1)
        
    def forward(self, data, temperature=1):
        input = self.e(data)
        output, h = self.rnn(input, self.hidden_state)
        
        if self.kind == 'stateful':
            # Keep the hidden state between each minibatch, but not its history
            self.hidden_state = Variable(h)
        
        output = self.output_weights(output)
        output = F.log_softmax(output / temperature, dim=-1)
        return output

    def reset(self, bs):
        self.hidden_state = torch.zeros([1, bs, self.n_hidden])
        if GPU:
            self.hidden_state = self.hidden_state.cuda()

## Stateless RNN

The hidden state is thown away from one mini-batch to another.

In [30]:
n_fac = n_vocab // 2
n_hidden = 100
bs = 1024
bptt2 = 8

In [31]:
model2 = VariableLengthRNN(n_vocab, n_fac, n_hidden, 'stateless')
if GPU:
    model2 = model2.cuda()

In [32]:
def nll_loss_seq(output, labels):
    _, _, n_vocab = output.size()
    output = output.view(-1, n_vocab)
    labels = labels.reshape(-1)
    return F.nll_loss(output, labels)

In [33]:
optimizer2 = torch.optim.Adam(model2.parameters(), 1e-2)
criterion2 = nll_loss_seq

In [34]:
train_data = get_data(train_txt, bs)
test_data = get_data(test_txt, bs)

In [35]:
%%time
train(model2, optimizer2, criterion2, bptt2, epochs=20)

epoch:   1   train_loss: 2.10   test_loss: 1.91

sample T=0.2: je ne sais pas avoir de marche de la pas de mon pas de la par le route de la contre de la pas de la contre en mais de la contre de la couvers pas de la par les route de la par le par le conne se la contrant de par 

sample T=0.5: je ne sais pas propies et pas des plus en comme mon au mur la restant les comment la ruit villes de la pas pas et du ville par la faire se le bons par la voillec nous contre de rentre a régrant dans la contre de de

sample T=0.7: je ne sais pas dans les pour au s'est les comme que je nous cette jours partures en cettente avec la chation à la rendut plus par la cau, le ma parts de reste de chafin mon poind le romir moins la Mon probe de par 

sample T=1: je ne sais pas d'un cetteils en en allomés mévendre justrer en perfforde attenie, la vallente de trauvellez de vermant verse peelumicuanneys peussons la Biceux exilter des fine son aucompheau.) cortrisie au mat tom


epoch:   2   train_loss: 1.88   

## Stateful RNN

The hidden state is be memorized from one mini-batch to another (hence *stateful*), but reset between epochs, and at predict time.

In [36]:
n_fac = n_vocab // 2
n_hidden = 100
bs = 1024
bptt3 = 8

In [37]:
model3 = VariableLengthRNN(n_vocab, n_fac, n_hidden, 'stateful')
if GPU:
    model3 = model3.cuda()

In [38]:
def nll_loss_seq(output, labels):
    _, _, n_vocab = output.size()
    output = output.view(-1, n_vocab)
    labels = labels.reshape(-1)
    return F.nll_loss(output, labels)

In [39]:
optimizer3 = torch.optim.Adam(model3.parameters(), 1e-2)
criterion3 = nll_loss_seq

In [40]:
train_data = get_data(train_txt, bs)
test_data = get_data(test_txt, bs)

In [41]:
%%time
train(model3, optimizer3, criterion3, bptt3, epochs=20)

epoch:   1   train_loss: 1.98   test_loss: 1.76

sample T=0.2: je ne sais pas de la route de la ville de la pour le sent de contre de la ville de la par les pour le semps et le sont par le par la contre de la sont pour de la vie de la faire de la cons pas de la partie de la re

sample T=0.5: je ne sais pas pas de trouce pour les pour peux de mon passant, et jours de la ravant qui ville et se visiment cons peut pour sont pas a me dont que le marche avec le mon pres pas de coune de peut pour de mon pays 

sample T=0.7: je ne sais pas le Vent de nors de qui aurance en procourobin de pronoter de c'est pour pour des sont de me perment une vender du habite et sans dictionne à proppalie de la mois de cons pas sur nors pour le-commenco

sample T=1: je ne sais pas sur un table, quans en Cho, je côtent voyre Vous à nous qu'énale de me morame du pors ochefosi dans fous sanser de côtéments. Ilement, et les bouri muis pembo miment sur les goup la crèsons me flus q


epoch:   2   train_loss: 1.71   

# LSTM

http://colah.github.io/posts/2015-08-Understanding-LSTMs/

![](img/lstm.jpg)

In [42]:
def get_data(txt, bs):
    """
    Split `txt` into `bs` chunks.

    Each chunk has size `n`, `n` being as big as possible.
    Chunks are organized as columns in the result, making the final size `n * bs`.
    """

    txt = [char_to_idx[c] for c in txt]
    
    # Shrink `len(txt)` to a multiple of `bs`
    txt_len = (len(txt) // bs) * bs
    txt = txt[:txt_len]

    # Cut `txt` into `bs` distinct chunks
    data = torch.tensor(txt).view(bs, -1)
    data = data.transpose(0, 1).contiguous()

    if GPU:
        data = data.cuda()

    return data

In [43]:
def get_batches(data, bptt):
    """
    Yield `(data_batch, labels_batch)` batches from `data`.

    At each iteration, the two batches have the same `bptt * bs` size,
    except for the last batch which may have less than `bptt` rows.

    `data_batch` contains `bptt`-sized chunks of `data`.
    `labels_batch` contains `bptt`-sized chunks of `data`, offseted by 1.
    """

    # Cut `data` into two 2-dimensional chunks of size `bptt * bs`.
    # Last chunk may be less than `bptt` rows.
    while len(data) != 0:

        # Take (at most) bptt rows with offset 1 for labels
        labels_batch = data[1:bptt+1, :]
        # Take bptt rows as the labels with offset 0 for train
        data_batch = data[:len(labels_batch), :]

        if len(labels_batch) > 0:
            yield data_batch, labels_batch

        # Move on to next train train/labels rows
        data = data[bptt:]

In [44]:
i = 1
data = get_data(train_txt, bs=3)
for data_batch, labels_batch in get_batches(data, bptt=5):
    
    print(f'data:')
    print(data_batch)

    print(f'labels:')
    print(labels_batch)

    print()
    print()
    
    i += 1
    if i > 2:
        break

data:
tensor([[42, 66, 60],
        [60, 69, 88],
        [60, 68, 74],
        [63, 73, 59],
        [57,  0, 72]], device='cuda:0')
labels:
tensor([[60, 69, 88],
        [60, 68, 74],
        [63, 73, 59],
        [57,  0, 72],
        [63, 76,  0]], device='cuda:0')


data:
tensor([[63, 76,  0],
        [59, 69, 66],
        [75, 79, 59],
        [73, 55,  0],
        [59, 61, 70]], device='cuda:0')
labels:
tensor([[59, 69, 66],
        [75, 79, 59],
        [73, 55,  0],
        [59, 61, 70],
        [67, 59, 55]], device='cuda:0')




In [45]:
def generate(model, s, n, bptt, temperature):

    model.reset(1)

    for _ in range(n):
        data = get_data(s[-bptt:], 1)
        preds = model(data, temperature)[-1]
        pred_idx = torch.multinomial(preds.exp(), 1).item()
        pred_char = idx_to_char[pred_idx]
        s += pred_char
        
    return s

In [46]:
def train(model, optimizer, criterion, bptt, epochs):

    for epoch in range(1, epochs + 1):

        model.reset(bs)

        train_loss_sum, train_batches_nb = 0, 0
        for i, (data, labels) in enumerate(get_batches(train_data, bptt), 1):
            output = model(data)
            optimizer.zero_grad()
            loss = criterion(output, labels)
            train_loss_sum, train_batches_nb = train_loss_sum + loss.item(), train_batches_nb + 1
            loss.backward()
            optimizer.step()

        train_loss = train_loss_sum / train_batches_nb

        test_loss_sum, test_batches_nb = 0, 0
        for data, labels in get_batches(test_data, bptt):
            loss = criterion(model(data), labels)
            test_loss_sum, test_batches_nb = test_loss_sum + loss.item(), test_batches_nb + 1

        test_loss = test_loss_sum / test_batches_nb

        print(f'epoch: {epoch:3d}   train_loss: {train_loss:.2f}   test_loss: {test_loss:.2f}')

        if epoch == 1 or epoch % 10 == 0 or epoch == epochs:

            print()

            for temperature in (0.2, 0.5, 0.7, 1):
                print(f'sample T={temperature}: ' + generate(model, 'je ne sais pas', 200, bptt, temperature))
                print()

            print()

In [47]:
class LSTMCell(nn.Module):
    
    def __init__(self, n_fac, n_hidden):
        
        super().__init__()
        
        self.n_fac = n_fac
        self.n_hidden = n_hidden

        self.input_input_weights = nn.Linear(n_fac, n_hidden)
        self.hidden_input_weights = nn.Linear(n_hidden, n_hidden)

        self.input_forget_weights = nn.Linear(n_fac, n_hidden)
        self.hidden_forget_weights = nn.Linear(n_hidden, n_hidden)

        self.input_cell_weights = nn.Linear(n_fac, n_hidden)
        self.hidden_cell_weights = nn.Linear(n_hidden, n_hidden)

        self.input_hidden_weights = nn.Linear(n_fac, n_hidden)
        self.hidden_hidden_weights = nn.Linear(n_hidden, n_hidden)
        
    def forward(self, x, h, c):
        """
        `x` (input) is of size `bs * n_fac`
        `h` (hidden state) and `c` (cell state) are of size `bs * n_hidden`
        """

        # Forget relevant bits of the cell state
        forget_state = torch.sigmoid(self.input_forget_weights(x) + self.hidden_forget_weights(h))
        c *= forget_state
        
        # Update relevant bits of the cell state
        input_state = torch.sigmoid(self.input_input_weights(x) + self.hidden_input_weights(h))
        cell_update_state = torch.tanh(self.input_cell_weights(x) + self.hidden_cell_weights(h))
        c += input_state * cell_update_state
        
        # Forget relevant bits of the hidden state with the cell state
        hidden_update_state = self.input_hidden_weights(x) + self.hidden_hidden_weights(h)
        h = torch.tanh(c) * torch.sigmoid(hidden_update_state)
        
        return h, c

In [48]:
class LSTM(nn.Module):
    def __init__(self, n_vocab, n_fac, n_hidden):

        super().__init__()
        
        self.lstm_cell = LSTMCell(n_fac, n_hidden)
        self.e = nn.Embedding(n_vocab, n_fac)
        self.output_weights = nn.Linear(n_hidden, n_vocab)
        
        self.n_hidden = n_hidden

        self.reset(1)
        
    def forward(self, data, temperature=1):

        input = self.e(data)

        hidden_state = self.hidden_state
        cell_state = self.cell_state

        hidden_state_history = []
        # RNN loop on `input` of size: `bptt * bs * n_fac`:
        # bptt times for each `x` of size `bs * n_fac`
        for x in input:
            hidden_state, cell_state = self.lstm_cell(x, hidden_state, cell_state)
            hidden_state_history.append(hidden_state)

        # Throw away state histories
        self.hidden_state = Variable(hidden_state)
        self.cell_state = Variable(cell_state)
        
        # Get output
        output = self.output_weights(torch.stack(hidden_state_history))
        output = F.log_softmax(output / temperature, dim=-1)

        return output

    def reset(self, bs):

        self.hidden_state = torch.zeros([bs, self.n_hidden])
        self.cell_state = torch.zeros([bs, self.n_hidden])

        if GPU:
            self.hidden_state = self.hidden_state.cuda()
            self.cell_state = self.cell_state.cuda()

In [62]:
n_fac = n_vocab // 2
n_hidden = 100
bs = 1024
bptt4 = 8

In [63]:
model4 = LSTM(n_vocab, n_fac, n_hidden)
if GPU:
    model4 = model4.cuda()

In [64]:
def nll_loss_seq(output, labels):
    _, _, n_vocab = output.size()
    output = output.view(-1, n_vocab)
    labels = labels.reshape(-1)
    return F.nll_loss(output, labels)

In [65]:
optimizer4 = torch.optim.Adam(model4.parameters(), 1e-2)
criterion4 = nll_loss_seq

In [66]:
train_data = get_data(train_txt, bs)
test_data = get_data(test_txt, bs)

In [67]:
%%time
train(model4, optimizer4, criterion4, bptt4, epochs=20)

epoch:   1   train_loss: 1.94   test_loss: 1.66

sample T=0.2: je ne sais pas de la conner de la ville de la conne de la profite de la constraliens par les autre de la fait le part de la rue de la consait de la plus de la contre de la piens par les par le plus de la proche de 

sample T=0.5: je ne sais pas de suinde de la ville par les plus par le mon a plus en plande de l'artera peu de l'aime, mon couper ce de la celle de vous fait par le pas de parc. Après pas de la trance avec une passé de l'Husion 

sample T=0.7: je ne sais pas couple pas que je cessé de la chaup de l'accomps de la grantiers que nas pour les passer la nuit. Nous avec le suptais le travon renconstradion n'ai par un trop me fenint de même en autres profier ti

sample T=1: je ne sais pas d'allais ses sa plus, ou sudne qun mon noit-cament heurep 1°, je tar à trai lignement imment, accelsir en stop en centainex nous aurent le reversé pleur est entrez Shos d'aistrage à boit, duvet touve


epoch:   2   train_loss: 1.60   

# Pytorch LSTM

Let's stop reinventing the wheel for once.

In [68]:
def get_data(txt, bs):
    """
    Split `txt` into `bs` chunks.

    Each chunk has size `n`, `n` being as big as possible.
    Chunks are organized as columns in the result, making the final size `n * bs`.
    """

    txt = [char_to_idx[c] for c in txt]
    
    # Shrink `len(txt)` to a multiple of `bs`
    txt_len = (len(txt) // bs) * bs
    txt = txt[:txt_len]

    # Cut `txt` into `bs` distinct chunks
    data = torch.tensor(txt).view(bs, -1)
    data = data.transpose(0, 1).contiguous()

    if GPU:
        data = data.cuda()

    return data

In [69]:
def get_batches(data, bptt):
    """
    Yield `(data_batch, labels_batch)` batches from `data`.

    At each iteration, the two batches have the same `bptt * bs` size,
    except for the last batch which may have less than `bptt` rows.

    `data_batch` contains `bptt`-sized chunks of `data`.
    `labels_batch` contains `bptt`-sized chunks of `data`, offseted by 1.
    """

    # Cut `data` into two 2-dimensional chunks of size `bptt * bs`.
    # Last chunk may be less than `bptt` rows.
    while len(data) != 0:

        # Take (at most) bptt rows with offset 1 for labels
        labels_batch = data[1:bptt+1, :]
        # Take bptt rows as the labels with offset 0 for train
        data_batch = data[:len(labels_batch), :]

        if len(labels_batch) > 0:
            yield data_batch, labels_batch

        # Move on to next train train/labels rows
        data = data[bptt:]

In [70]:
def generate(model, s, n, bptt, temperature):

    model.reset(1)

    for _ in range(n):
        data = get_data(s[-bptt:], 1)
        preds = model(data, temperature)[-1]
        pred_idx = torch.multinomial(preds.exp(), 1).item()
        pred_char = idx_to_char[pred_idx]
        s += pred_char
        
    return s

In [71]:
def train(model, optimizer, criterion, bptt, epochs):

    for epoch in range(1, epochs + 1):

        model.reset(bs)

        train_loss_sum, train_batches_nb = 0, 0
        for i, (data, labels) in enumerate(get_batches(train_data, bptt), 1):
            output = model(data)
            optimizer.zero_grad()
            loss = criterion(output, labels)
            train_loss_sum, train_batches_nb = train_loss_sum + loss.item(), train_batches_nb + 1
            loss.backward()
            optimizer.step()

        train_loss = train_loss_sum / train_batches_nb

        test_loss_sum, test_batches_nb = 0, 0
        for data, labels in get_batches(test_data, bptt):
            loss = criterion(model(data), labels)
            test_loss_sum, test_batches_nb = test_loss_sum + loss.item(), test_batches_nb + 1

        test_loss = test_loss_sum / test_batches_nb

        print(f'epoch: {epoch:3d}   train_loss: {train_loss:.2f}   test_loss: {test_loss:.2f}')

        if epoch == 1 or epoch % 10 == 0 or epoch == epochs:

            print()

            for temperature in (0.2, 0.5, 0.7, 1):
                print(f'sample T={temperature}: ' + generate(model, 'je ne sais pas', 200, bptt, temperature))
                print()

            print()

In [72]:
class PytorchLSTM(nn.Module):
    def __init__(self, n_vocab, n_fac, n_hidden, n_layers, dropout=0):

        super().__init__()

        self.n_layers = n_layers
        self.n_hidden = n_hidden
        
        self.lstm = nn.LSTM(n_fac, n_hidden, n_layers, dropout)
        self.e = nn.Embedding(n_vocab, n_fac)
        self.output_weights = nn.Linear(n_hidden, n_vocab)

        self.reset(1)
        
    def forward(self, data, temperature=1):

        input = self.e(data)

        hidden_state = self.hidden_state
        cell_state = self.cell_state

        output, (hidden_state, cell_state) = self.lstm(input, (hidden_state, cell_state))

        # Throw away state histories
        self.hidden_state = Variable(hidden_state)
        self.cell_state = Variable(cell_state)
        
        # Get output
        output = self.output_weights(output)
        output = F.log_softmax(output / temperature, dim=-1)

        return output

    def reset(self, bs):

        self.hidden_state = torch.zeros([self.n_layers, bs, self.n_hidden])
        self.cell_state = torch.zeros([self.n_layers, bs, self.n_hidden])

        if GPU:
            self.hidden_state = self.hidden_state.cuda()
            self.cell_state = self.cell_state.cuda()

In [73]:
n_fac = n_vocab // 2
n_hidden = 512
bs = 1024
bptt5 = 300
n_layers = 2
dropout = 0.5

In [74]:
model5 = PytorchLSTM(n_vocab, n_fac, n_hidden, n_layers, dropout)
if GPU:
    model5 = model5.cuda()

In [75]:
def nll_loss_seq(output, labels):
    _, _, n_vocab = output.size()
    output = output.view(-1, n_vocab)
    labels = labels.reshape(-1)
    return F.nll_loss(output, labels)

In [76]:
optimizer5 = torch.optim.Adam(model5.parameters(), 1e-2)
criterion5 = nll_loss_seq

In [77]:
train_data = get_data(train_txt, bs)
test_data = get_data(test_txt, bs)

In [78]:
%%time
train(model5, optimizer5, criterion5, bptt5, epochs=20)

epoch:   1   train_loss: 3.49   test_loss: 3.05

sample T=0.2: je ne sais pas   et  ee e    ee  eue   e    e e   eu  e  ee   e    e  ee       e es    e ee  es    eu  er  ee  e e    ee  ee  e   en e  e      ee e  e et  ee  ee  eu  ee   e es   e    e    e  e  ee     es  eu ee  e

sample T=0.5: je ne sais pasre tai' es ee  eueue   eau ne   eu auorn et   uin si   n toe tte   et eotur ltaere tne  sf ttee rntttnn  p eros uutenltu   opi.n  re   ,so tsssu mseo e.anruet  no et  enueseo  te uoueu     t ureerue' 

sample T=0.7: je ne sais pasuu luer teaes etTlte :nuf el soaoiseri ttumu.e runC 2tareea he tsrsoés pe.x rsdsgsn nnssniuenx  s  rnersm lt o.  erneeertussnjtc .oe.ees e'u rnasute e nrl rt bo 2r ee .u  on ns vn eeud jnn jue l a e :

sample T=1: je ne sais pasno èItoe wtonms  rt serslLounpen'prstté f uelepeem steubsrPu.mdt see gml -auhicuuit2lsà'0urur  t J! kernse zme sda xotWxetr4s3o 4ilPdutvéitmt e Cuà0uD iuooutcu -eaeneolmfdets v e,a "  usimt a,re sp nn


epoch:   2   train_loss: 2.87   

In [79]:
%%time
train(model5, optimizer5, criterion5, bptt5, epochs=20)

epoch:   1   train_loss: 1.17   test_loss: 1.22

sample T=0.2: je ne sais pas de partir de la construction de la construction de la pluie de la construction de la capitale de la partie de la compagnie de la course de la capitale de la plage de la coupe de la pluie pour le pays

sample T=0.5: je ne sais pas de marche dans le forme de l'hospitalité 100000 Riogre. De la pluie en rencontre de la campagne de marche et la vie en plein matin et je rencontrerai un pays où il est de la plupart de la première vi

sample T=0.7: je ne sais pas facile de cette avertie de l'orage de Vanimo. Le poids de nous avait discuter à Biocto (Australie et d'abriter de la campagne de mètres de situation, il m'a pas remplacé par le fille de faciliter de 

sample T=1: je ne sais pas. Ils : prenant 2 ans... Il curte, égognant, mais ma vie de m'est que la langue américain où il est artité pour le (norn contre Sal Hédana. Bien visiter l'accueil et seulement le maroc, chirra 3 heure


epoch:   2   train_loss: 1.15   

# Compare models

In [81]:
models = {
    'Fixed-length RNN': (model1, n_chars),
    'Stateless RNN': (model2, bptt2),
    'Stateful RNN': (model3, bptt3),
    'Small LSTM': (model4, bptt4),
    'Large LSTM': (model5, bptt5)
}

In [82]:
initial_s = 'je ne sais pas'

for temperature in (0.2, 0.5, 0.7, 1, 1.2):
    print(f'T = {temperature}')
    print()
    
    for model_name, (model, bptt) in models.items():
        
        # Handle fixed-size RNN
        generate_func = generate_fixed_size if model_name == 'Fixed-length RNN' else generate
        s = initial_s[:n_chars] if model_name == 'Fixed-length RNN' else initial_s

        print(f'{model_name}:\n  ' + generate_func(model, s, 200, bptt, temperature))
        print()
    
    print()

T = 0.2

Fixed-length RNN:
  je ne sans de la cartes de la contre de par les cample de la mais pour les par la rencontres de la mais de la mais de cample de l'au par les par les par le par la contrais les part de la contre de l'au contre

Stateless RNN:
  je ne sais pas de la retrouver les petit par le mon au minutes de la rencontrer de la rencontrer de la partie de la plus de la ville de la planges de la chance de la route de la personnes de la rencontrer le partir

Stateful RNN:
  je ne sais pas pour par les plus par les plus pour les contraite de la compagnir les problème de la couple de la ville de la plus par le souette de la plus de mon peut de la connaisse de la proposer de la petit de 

Small LSTM:
  je ne sais pas de la connaissant des passer les petites de la plage de particule de la problème de la maison de la plage de la plage de la pluparationnelles de la plage de la policier de la pluie de profiter le sol

Large LSTM:
  je ne sais pas encore plus de changer de la populati

In [84]:
print(generate_func(model5, 'je ', 10000, bptt5, temperature=0.5))

je déjà envie de passer la journée et le bordel de la population française et visiter le sol de la famille pour découvrir l'article de l'autostop, avec un camion secondaire, et les moines de marches et des enfants nous avaient promenant avec les marchandises pour comprendre. La prochaine ville est possible de trouver une petite nuit et de son propre plaine qui me dit qu'il y a des toilettes en voitures qui m'avaient pour les chemins de la plage et que je ne suis pas gardé une seule fois dans l'un des plus chanceux au milieu de la compagnie de Papouasie. C'est le plus tard que je n'ai pas l'air d'avoir un peu d'argent de la confiture, le sol est pour ne pas l'aider à faire du stop que je souhaite donc passer la région de Medellin. Sur les problèmes d'argent avec les montagnes en compagnie de sponsor en Australie et me dépose à toute la santé de la compagnie de la conflit abiment de pain avec le temps de départ et de la compagnie. C'est sur le sol, en plein soleil et les amis en pleine m