In [1]:
from copy import copy

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch.autograd import Variable

In [2]:
GPU=1

# Load data

In [3]:
txt = ''

In [4]:
with open('data/one_txt/sanitized_blogger.txt') as f:
    txt += f.read()

len(txt)

442724

In [5]:
with open('data/one_txt/sanitized_wordpress.txt') as f:
    txt += f.read()

len(txt)

3216695

In [6]:
vocab = sorted(list(set(txt)))
n_vocab = len(vocab)
print(''.join(vocab))

 !"$%'()+,-./0123456789:;=>?ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~°àâçèéêëîïôùûœо€


In [7]:
char_to_idx = {char: idx for idx, char in enumerate(vocab)}
idx_to_char = {idx: char for idx, char in enumerate(vocab)}

In [8]:
train_frac = 4. / 5
train_txt = txt[:int(len(txt) * train_frac)]
test_txt = txt[int(len(txt) * train_frac):]

# Fixed-size RNN

This is a model which operates on a **fixed** amount of input characters (`n_chars`), and attempts to predict the character that comes after them.

The hidden state is reset for each new sequence of `n_chars` characters (*stateless*).

![](img/rnn_fixed.jpg)

In [9]:
n_chars = 8

In [10]:
def get_n_sized_chunks(s, n):
    """
    Yield successive n-sized chunks from a string.
    Discard the last chunk if not of size n.
    """
    for i in range(0, len(s), n):
        chunk = s[i:i + n]
        if len(chunk) == n:
            yield chunk

In [11]:
def get_data_tensor(txt, n_chars):
    chunks = list(get_n_sized_chunks(txt, n=n_chars))
    data_tensor = torch.tensor([[char_to_idx[char] for char in chunk] for chunk in chunks][:-1])
    if GPU:
        data_tensor = data_tensor.cuda()
    return data_tensor

In [12]:
def get_labels_tensor(txt, n_chars):
    chars = txt[n_chars::n_chars][:len(txt) // n_chars - 1]
    labels_tensor = torch.tensor([char_to_idx[char] for char in chars])
    if GPU:
        labels_tensor = labels_tensor.cuda()
    return labels_tensor

In [13]:
train_data_tensor = get_data_tensor(train_txt, n_chars)
print(train_data_tensor.size())

train_labels_tensor = get_labels_tensor(train_txt, n_chars)
print(train_labels_tensor.size())

torch.Size([321668, 8])
torch.Size([321668])


In [14]:
train_ds = TensorDataset(train_data_tensor, train_labels_tensor)
train_dl = DataLoader(train_ds, batch_size=1024)

In [15]:
test_data_tensor = get_data_tensor(test_txt, n_chars)
print(test_data_tensor.size())

test_labels_tensor = get_labels_tensor(test_txt, n_chars)
print(test_labels_tensor.size())

torch.Size([80416, 8])
torch.Size([80416])


In [16]:
test_ds = TensorDataset(test_data_tensor, test_labels_tensor)
test_dl = DataLoader(test_ds, batch_size=1024)

In [17]:
def generate_fixed_size(model, s, n, n_chars, temperature):

    # fixed-size input
    assert len(s) == n_chars

    final_s = s

    for _ in range(n):

        # Pad the input, because `get_data_tensor` will generate no data
        # if the input is less than `2 * n_chars` characters long.
        chars = get_data_tensor(s + ' ' * n_chars, n_chars)
        preds = model(chars, temperature)
        pred_idx = torch.multinomial(preds.exp(), 1).item()    
        pred_char = idx_to_char[pred_idx]
        s = s[1:] + pred_char
        final_s += pred_char

    return final_s

In [18]:
class FixedSizeRNN(nn.Module):
    def __init__(self, n_vocab, n_factors, n_hidden, n_chars):
        super().__init__()
        
        self.n_chars = n_chars
        self.n_hidden = n_hidden
        
        self.e = nn.Embedding(n_vocab, n_factors)
        self.input_weights = nn.Linear(n_factors, n_hidden)
        self.hidden_weights = nn.Linear(n_hidden, n_hidden)
        self.output_weights = nn.Linear(n_hidden, n_vocab)

    def forward(self, chars, temperature=1):

        # Reset hidden state at each mini-batch
        hidden_state = torch.zeros([len(chars), self.n_hidden])
        if GPU:
            hidden_state = hidden_state.cuda()

        for i in range(self.n_chars):
            input = F.relu(self.input_weights(self.e(chars[:, i])))
            hidden_state = torch.tanh(self.hidden_weights(input + hidden_state))

        output = F.log_softmax(self.output_weights(hidden_state) / temperature, dim=1)
        
        return output

In [19]:
n_fac = n_vocab // 2
n_hidden = 100

In [20]:
model1 = FixedSizeRNN(n_vocab, n_fac, n_hidden, n_chars)
if GPU:
    model1 = model1.cuda()

In [21]:
optimizer1 = torch.optim.Adam(model1.parameters(), 1e-2)
criterion1 = nn.NLLLoss()

In [22]:
%%time

epochs = 20

for epoch in range(1, epochs + 1):
    
    train_loss_sum, train_batches_nb = 0, 0
    for i, (data, labels) in enumerate(train_dl, 1):
        output = model1(data)
        optimizer1.zero_grad()
        loss = criterion1(output, labels)
        train_loss_sum, train_batches_nb = train_loss_sum + loss.item(), train_batches_nb + 1
        loss.backward()
        optimizer1.step()
        
    train_loss = train_loss_sum / train_batches_nb

    test_loss_sum, test_batches_nb = 0, 0
    for data, labels in test_dl:
        loss = criterion1(model1(data), labels)
        test_loss_sum, test_batches_nb = test_loss_sum + loss.item(), test_batches_nb + 1

    test_loss = test_loss_sum / test_batches_nb
        
    print(f'epoch: {epoch:3d}   train_loss: {train_loss:.2f}   test_loss: {test_loss:.2f}')

    if epoch == 1 or epoch % 10 == 0 or epoch == epochs:

        print()

        for temperature in (0.2, 0.5, 0.7, 1):
            print(f'sample T={temperature}: ' + generate_fixed_size(model1, 'je ne sais pas'[:n_chars], 200, n_chars, temperature))
            print()

        print()

epoch:   1   train_loss: 2.19   test_loss: 2.00

sample T=0.2: je ne sans de la cont de la comme le cuit de mais de la cont de par les mon par les par le chant de la par les mon de la comme les par les par les par le par les cont de les cont de les les la comme de la par

sample T=0.5: je ne sans se comme le plais. Au les ricuit de de maut de ses par les cultes ce les de les en rette air par les ceux et les de des mais les de cont plus de petit de par l'aute commient l'argent les mais la cu

sample T=0.7: je ne sans sant prop dans pares la ciat de prainte " quis ce mai ce mons et le chant est les la jes nommes voli partie de oir infais de cont sent les en rêpricont de de touter par mon préme une voir les lies,

sample T=1: je ne sa vide art : hanes neux de les très réchil de 50 Dinrieura, une fiert étocemes plus je peur puris traises gréjour maut. Cesquera retacnant veimênéte, musilati de les tourt que l'5/oude poures peux ? Je


epoch:   2   train_loss: 1.95   test_loss: 1.92
epoch:  

# Variable-size model

This is a model which operates on a **variable** amount of input characters, and attempts to predict the next character **after each input character**.

![](img/rnn_variable.jpg)

In [23]:
def get_data(txt, bs):
    """
    Split `txt` into `bs` chunks.

    Each chunk has size `n`, `n` being as big as possible.
    Chunks are organized as columns in the result, making the final size `n * bs`.
    """

    txt = [char_to_idx[c] for c in txt]
    
    # Shrink `len(txt)` to a multiple of `bs`
    txt_len = (len(txt) // bs) * bs
    txt = txt[:txt_len]

    # Cut `txt` into `bs` distinct chunks
    data = torch.tensor(txt).view(bs, -1)
    data = data.transpose(0, 1).contiguous()

    if GPU:
        data = data.cuda()

    return data

In [24]:
def get_batches(data, bptt):
    """
    Yield `(data_batch, labels_batch)` batches from `data`.

    At each iteration, the two batches have the same `bptt * bs` size,
    except for the last batch which may have less than `bptt` rows.

    `data_batch` contains `bptt`-sized chunks of `data`.
    `labels_batch` contains `bptt`-sized chunks of `data`, offseted by 1.
    """

    # Cut `data` into two 2-dimensional chunks of size `bptt * bs`.
    # Last chunk may be less than `bptt` rows.
    while len(data) != 0:

        # Take (at most) bptt rows with offset 1 for labels
        labels_batch = data[1:bptt+1, :]
        # Take bptt rows as the labels with offset 0 for train
        data_batch = data[:len(labels_batch), :]

        if len(labels_batch) > 0:
            yield data_batch, labels_batch

        # Move on to next train train/labels rows
        data = data[bptt:]

In [25]:
i = 1
data = get_data(train_txt, bs=3)
for data_batch, labels_batch in get_batches(data, bptt=5):
    
    print(f'data:')
    print(data_batch)

    print(f'labels:')
    print(labels_batch)

    print()
    print()
    
    i += 1
    if i > 2:
        break

data:
tensor([[42, 66, 60],
        [60, 69, 88],
        [60, 68, 74],
        [63, 73, 59],
        [57,  0, 72]], device='cuda:0')
labels:
tensor([[60, 69, 88],
        [60, 68, 74],
        [63, 73, 59],
        [57,  0, 72],
        [63, 76,  0]], device='cuda:0')


data:
tensor([[63, 76,  0],
        [59, 69, 66],
        [75, 79, 59],
        [73, 55,  0],
        [59, 61, 70]], device='cuda:0')
labels:
tensor([[59, 69, 66],
        [75, 79, 59],
        [73, 55,  0],
        [59, 61, 70],
        [67, 59, 55]], device='cuda:0')




In [26]:
def generate(model, s, n, bptt, temperature):

    model.reset(1)

    for _ in range(n):
        data = get_data(s[-bptt:], 1)
        preds = model(data, temperature)[-1]
        pred_idx = torch.multinomial(preds.exp(), 1).item()
        pred_char = idx_to_char[pred_idx]
        s += pred_char
        
    return s

In [27]:
def train(model, optimizer, criterion, bptt, epochs):

    for epoch in range(1, epochs + 1):

        model.reset(bs)

        train_loss_sum, train_batches_nb = 0, 0
        for i, (data, labels) in enumerate(get_batches(train_data, bptt), 1):
            output = model(data)
            optimizer.zero_grad()
            loss = criterion(output, labels)
            train_loss_sum, train_batches_nb = train_loss_sum + loss.item(), train_batches_nb + 1
            loss.backward()
            optimizer.step()

        train_loss = train_loss_sum / train_batches_nb

        test_loss_sum, test_batches_nb = 0, 0
        for data, labels in get_batches(test_data, bptt):
            loss = criterion(model(data), labels)
            test_loss_sum, test_batches_nb = test_loss_sum + loss.item(), test_batches_nb + 1

        test_loss = test_loss_sum / test_batches_nb

        print(f'epoch: {epoch:3d}   train_loss: {train_loss:.2f}   test_loss: {test_loss:.2f}')

        if epoch == 1 or epoch % 10 == 0 or epoch == epochs:

            print()

            for temperature in (0.2, 0.5, 0.7, 1):
                print(f'sample T={temperature}: ' + generate(model, 'je ne sais pas', 200, bptt, temperature))
                print()

            print()

In [28]:
class VariableLengthRNN(nn.Module):
    def __init__(self, n_vocab, n_fac, n_hidden, kind):
        super().__init__()
        
        assert kind in ('stateless', 'stateful')
        self.kind = kind
        
        self.n_hidden = n_hidden
        
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.e = nn.Embedding(n_vocab, n_fac)
        self.output_weights = nn.Linear(n_hidden, n_vocab)

        self.reset(1)
        
    def forward(self, data, temperature=1):
        input = self.e(data)
        output, h = self.rnn(input, self.hidden_state)
        
        if self.kind == 'stateful':
            # Keep the hidden state between each minibatch, but not its history
            self.hidden_state = Variable(h)
        
        output = self.output_weights(output)
        output = F.log_softmax(output / temperature, dim=-1)
        return output

    def reset(self, bs):
        self.hidden_state = torch.zeros([1, bs, self.n_hidden])
        if GPU:
            self.hidden_state = self.hidden_state.cuda()

## Stateless RNN

The hidden state is thown away from one mini-batch to another.

In [29]:
n_fac = n_vocab // 2
n_hidden = 100
bs = 1024
bptt2 = 8

In [30]:
model2 = VariableLengthRNN(n_vocab, n_fac, n_hidden, 'stateless')
if GPU:
    model2 = model2.cuda()

In [31]:
def nll_loss_seq(output, labels):
    _, _, n_vocab = output.size()
    output = output.view(-1, n_vocab)
    labels = labels.reshape(-1)
    return F.nll_loss(output, labels)

In [32]:
optimizer2 = torch.optim.Adam(model2.parameters(), 1e-2)
criterion2 = nll_loss_seq

In [33]:
train_data = get_data(train_txt, bs)
test_data = get_data(test_txt, bs)

In [34]:
%%time
train(model2, optimizer2, criterion2, bptt2, epochs=20)

epoch:   1   train_loss: 2.09   test_loss: 1.91

sample T=0.2: je ne sais pas de la pas de l'autre de la pas le prendre de la comporte de la partis de la rendre de mon partir le pas de la pour avec l'autre de mon partir de la ville de la partir de la par les mon peu de la part

sample T=0.5: je ne sais pas la mal de trandon pas de milien la ville sur le ports ne voyage de la chande de mes de la chames le contré mons mois de la lais le ville de la bulors de l'au don avant de pour au pil marche avoir est

sample T=0.7: je ne sais pas au vin compas à miliens rous au rentopantes somple de main en Argent de mes prope à Matirre, de mange d'autourant moindralier rélés je me sans le visitérent de les mois voyage. La frons de la vif dis

sample T=1: je ne sais pas dontrisons, s'auto troude les bouvrion dans peur, ennier les moncaumant, je carrières, repui le bara 10 l'inait dugrand vicités diste es boup comment, des mouventre don pieu bul poéries dorinve. Noil


epoch:   2   train_loss: 1.88   

## Stateful RNN

The hidden state is be memorized from one mini-batch to another (hence *stateful*), but reset between epochs, and at predict time.

In [35]:
n_fac = n_vocab // 2
n_hidden = 100
bs = 1024
bptt3 = 8

In [36]:
model3 = VariableLengthRNN(n_vocab, n_fac, n_hidden, 'stateful')
if GPU:
    model3 = model3.cuda()

In [37]:
def nll_loss_seq(output, labels):
    _, _, n_vocab = output.size()
    output = output.view(-1, n_vocab)
    labels = labels.reshape(-1)
    return F.nll_loss(output, labels)

In [38]:
optimizer3 = torch.optim.Adam(model3.parameters(), 1e-2)
criterion3 = nll_loss_seq

In [39]:
train_data = get_data(train_txt, bs)
test_data = get_data(test_txt, bs)

In [40]:
%%time
train(model3, optimizer3, criterion3, bptt3, epochs=20)

epoch:   1   train_loss: 1.98   test_loss: 1.75

sample T=0.2: je ne sais pas de la permet de petit de proper de proper de pas de parti de parte de protion de la permet de perde de petit pas de plus au montant de marche de mais de pas pas de mon autres de connaise de la perde 

sample T=0.5: je ne sais pas de des sombre de toure aus par le proble le faire pour les gens de perde pas de ville en passe de boins au matint pas travant à l'auglage une propison d'aiture de mais qui aux plus des peropes de voy

sample T=0.7: je ne sais pas à Maragtement frions village à la n'autre seradons mariment en Boluite de tout au plus pour les villabre. Je marcilles dans le pays de continixparieurs à la plusi quelques cest distant dors. Du ne pe

sample T=1: je ne sais pas au très porson à loingeon au dénvés les son Khakm Mellement  Tistauxisoconce sont un réviel par même peu du petits de plans du propater pas pour gagne cortion est moosis) qui passatique du kilotauent


epoch:   2   train_loss: 1.70   

# LSTM

http://colah.github.io/posts/2015-08-Understanding-LSTMs/

![](img/lstm.jpg)

In [41]:
def get_data(txt, bs):
    """
    Split `txt` into `bs` chunks.

    Each chunk has size `n`, `n` being as big as possible.
    Chunks are organized as columns in the result, making the final size `n * bs`.
    """

    txt = [char_to_idx[c] for c in txt]
    
    # Shrink `len(txt)` to a multiple of `bs`
    txt_len = (len(txt) // bs) * bs
    txt = txt[:txt_len]

    # Cut `txt` into `bs` distinct chunks
    data = torch.tensor(txt).view(bs, -1)
    data = data.transpose(0, 1).contiguous()

    if GPU:
        data = data.cuda()

    return data

In [42]:
def get_batches(data, bptt):
    """
    Yield `(data_batch, labels_batch)` batches from `data`.

    At each iteration, the two batches have the same `bptt * bs` size,
    except for the last batch which may have less than `bptt` rows.

    `data_batch` contains `bptt`-sized chunks of `data`.
    `labels_batch` contains `bptt`-sized chunks of `data`, offseted by 1.
    """

    # Cut `data` into two 2-dimensional chunks of size `bptt * bs`.
    # Last chunk may be less than `bptt` rows.
    while len(data) != 0:

        # Take (at most) bptt rows with offset 1 for labels
        labels_batch = data[1:bptt+1, :]
        # Take bptt rows as the labels with offset 0 for train
        data_batch = data[:len(labels_batch), :]

        if len(labels_batch) > 0:
            yield data_batch, labels_batch

        # Move on to next train train/labels rows
        data = data[bptt:]

In [43]:
i = 1
data = get_data(train_txt, bs=3)
for data_batch, labels_batch in get_batches(data, bptt=5):
    
    print(f'data:')
    print(data_batch)

    print(f'labels:')
    print(labels_batch)

    print()
    print()
    
    i += 1
    if i > 2:
        break

data:
tensor([[42, 66, 60],
        [60, 69, 88],
        [60, 68, 74],
        [63, 73, 59],
        [57,  0, 72]], device='cuda:0')
labels:
tensor([[60, 69, 88],
        [60, 68, 74],
        [63, 73, 59],
        [57,  0, 72],
        [63, 76,  0]], device='cuda:0')


data:
tensor([[63, 76,  0],
        [59, 69, 66],
        [75, 79, 59],
        [73, 55,  0],
        [59, 61, 70]], device='cuda:0')
labels:
tensor([[59, 69, 66],
        [75, 79, 59],
        [73, 55,  0],
        [59, 61, 70],
        [67, 59, 55]], device='cuda:0')




In [44]:
def generate(model, s, n, bptt, temperature):

    model.reset(1)

    for _ in range(n):
        data = get_data(s[-bptt:], 1)
        preds = model(data, temperature)[-1]
        pred_idx = torch.multinomial(preds.exp(), 1).item()
        pred_char = idx_to_char[pred_idx]
        s += pred_char
        
    return s

In [45]:
def train(model, optimizer, criterion, bptt, epochs):

    for epoch in range(1, epochs + 1):

        model.reset(bs)

        train_loss_sum, train_batches_nb = 0, 0
        for i, (data, labels) in enumerate(get_batches(train_data, bptt), 1):
            output = model(data)
            optimizer.zero_grad()
            loss = criterion(output, labels)
            train_loss_sum, train_batches_nb = train_loss_sum + loss.item(), train_batches_nb + 1
            loss.backward()
            optimizer.step()

        train_loss = train_loss_sum / train_batches_nb

        test_loss_sum, test_batches_nb = 0, 0
        for data, labels in get_batches(test_data, bptt):
            loss = criterion(model(data), labels)
            test_loss_sum, test_batches_nb = test_loss_sum + loss.item(), test_batches_nb + 1

        test_loss = test_loss_sum / test_batches_nb

        print(f'epoch: {epoch:3d}   train_loss: {train_loss:.2f}   test_loss: {test_loss:.2f}')

        if epoch == 1 or epoch % 10 == 0 or epoch == epochs:

            print()

            for temperature in (0.2, 0.5, 0.7, 1):
                print(f'sample T={temperature}: ' + generate(model, 'je ne sais pas', 200, bptt, temperature))
                print()

            print()

In [46]:
class LSTMCell(nn.Module):
    
    def __init__(self, n_fac, n_hidden):
        
        super().__init__()
        
        self.n_fac = n_fac
        self.n_hidden = n_hidden
        
        self.forget_gate = nn.Linear(n_fac + n_hidden, n_hidden)
        self.input_gate = nn.Linear(n_fac + n_hidden, n_hidden)
        self.cell_update_gate = nn.Linear(n_fac + n_hidden, n_hidden)
        self.hidden_update_gate = nn.Linear(n_fac + n_hidden, n_hidden)
        
    def forward(self, x, hidden_state, cell_state):
        """
        `x` is of size `bs * n_fac`
        `hidden_state` are of size `bs * n_hidden`
        """

        # `x` is now of size `bs * (n_fac + n_hidden)`
        x = torch.cat([x, hidden_state], dim=1)

        # Forget relevant bits of the cell state
        cell_state *= torch.sigmoid(self.forget_gate(x))
        # Update relevant bits of the cell state
        cell_state += torch.tanh(self.cell_update_gate(x)) * torch.sigmoid(self.input_gate(x))

        # Forget relevant bits of the hidden state
        # Use `1 *` to avoid in-place in-place operation that blocks autograd
        hidden_state = 1 * torch.sigmoid(self.hidden_update_gate(x))
        # Integrate cell state to hidden_state
        hidden_state *= Variable(torch.tanh(cell_state))
        
        return hidden_state, cell_state

In [47]:
class LSTM(nn.Module):
    def __init__(self, n_vocab, n_fac, n_hidden):

        super().__init__()
        
        self.lstm_cell = LSTMCell(n_fac, n_hidden)
        self.e = nn.Embedding(n_vocab, n_fac)
        self.output_weights = nn.Linear(n_hidden, n_vocab)
        
        self.n_hidden = n_hidden

        self.reset(1)
        
    def forward(self, data, temperature=1):

        input = self.e(data)

        hidden_state = self.hidden_state
        cell_state = self.cell_state

        hidden_state_history = []
        # RNN loop on `input` of size: `bptt * bs * n_fac`:
        # bptt times for each `x` of size `bs * n_fac`
        for x in input:
            hidden_state, cell_state = self.lstm_cell(x, hidden_state, cell_state)
            hidden_state_history.append(hidden_state)

        # Throw away state histories
        self.hidden_state = Variable(hidden_state)
        self.cell_state = Variable(cell_state)
        
        # Get output
        output = self.output_weights(torch.stack(hidden_state_history))
        output = F.log_softmax(output / temperature, dim=-1)

        return output

    def reset(self, bs):

        self.hidden_state = torch.zeros([bs, self.n_hidden])
        self.cell_state = torch.zeros([bs, self.n_hidden])

        if GPU:
            self.hidden_state = self.hidden_state.cuda()
            self.cell_state = self.cell_state.cuda()

In [48]:
n_fac = n_vocab // 2
n_hidden = 100
bs = 1024
bptt4 = 8

In [49]:
model4 = LSTM(n_vocab, n_fac, n_hidden)
if GPU:
    model4 = model4.cuda()

In [50]:
def nll_loss_seq(output, labels):
    _, _, n_vocab = output.size()
    output = output.view(-1, n_vocab)
    labels = labels.reshape(-1)
    return F.nll_loss(output, labels)

In [51]:
optimizer4 = torch.optim.Adam(model4.parameters(), 1e-2)
criterion4 = nll_loss_seq

In [52]:
train_data = get_data(train_txt, bs)
test_data = get_data(test_txt, bs)

In [53]:
%%time
train(model4, optimizer4, criterion4, bptt4, epochs=20)

epoch:   1   train_loss: 2.36   test_loss: 2.03

sample T=0.2: je ne sais pas de la me par le prent de la prous de par de le mon aut se de la son de la prent de la mais de la mais de pres de la prent de la ple de cont de mon de la par le de la prons de la peur de la plus pour 

sample T=0.5: je ne sais pas cheur autit au son cont l'est du pour avais nous et sont de re von de l'a bonsement et de la bour de sent les mon dis en dit suit les son fait de vol mons de bormpir en pous pour de tour la de décont

sample T=0.7: je ne sais pas seur de qu'illamers de con et rou me la sout ravencement en pas du marchene du milles, pas coins au à sen déparge endre toun dans la tre la le s'avent et trertès passez au t de la vines tours fais pl

sample T=1: je ne sais pas le vorme et prent Lat, et aventouretre notau pouimsr de mouofi, bons viér poumes dan Bioss des .. Esutuale qusien de sorpute dontaneaisements di fon tour upetatrantivité dud la du colule fairmend pla


epoch:   2   train_loss: 1.95   

# Pytorch LSTM

Let's stop reinventing the wheel for once.

In [54]:
def get_data(txt, bs):
    """
    Split `txt` into `bs` chunks.

    Each chunk has size `n`, `n` being as big as possible.
    Chunks are organized as columns in the result, making the final size `n * bs`.
    """

    txt = [char_to_idx[c] for c in txt]
    
    # Shrink `len(txt)` to a multiple of `bs`
    txt_len = (len(txt) // bs) * bs
    txt = txt[:txt_len]

    # Cut `txt` into `bs` distinct chunks
    data = torch.tensor(txt).view(bs, -1)
    data = data.transpose(0, 1).contiguous()

    if GPU:
        data = data.cuda()

    return data

In [55]:
def get_batches(data, bptt):
    """
    Yield `(data_batch, labels_batch)` batches from `data`.

    At each iteration, the two batches have the same `bptt * bs` size,
    except for the last batch which may have less than `bptt` rows.

    `data_batch` contains `bptt`-sized chunks of `data`.
    `labels_batch` contains `bptt`-sized chunks of `data`, offseted by 1.
    """

    # Cut `data` into two 2-dimensional chunks of size `bptt * bs`.
    # Last chunk may be less than `bptt` rows.
    while len(data) != 0:

        # Take (at most) bptt rows with offset 1 for labels
        labels_batch = data[1:bptt+1, :]
        # Take bptt rows as the labels with offset 0 for train
        data_batch = data[:len(labels_batch), :]

        if len(labels_batch) > 0:
            yield data_batch, labels_batch

        # Move on to next train train/labels rows
        data = data[bptt:]

In [56]:
def generate(model, s, n, bptt, temperature):

    model.reset(1)

    for _ in range(n):
        data = get_data(s[-bptt:], 1)
        preds = model(data, temperature)[-1]
        pred_idx = torch.multinomial(preds.exp(), 1).item()
        pred_char = idx_to_char[pred_idx]
        s += pred_char
        
    return s

In [57]:
def train(model, optimizer, criterion, bptt, epochs):

    for epoch in range(1, epochs + 1):

        model.reset(bs)

        train_loss_sum, train_batches_nb = 0, 0
        for i, (data, labels) in enumerate(get_batches(train_data, bptt), 1):
            output = model(data)
            optimizer.zero_grad()
            loss = criterion(output, labels)
            train_loss_sum, train_batches_nb = train_loss_sum + loss.item(), train_batches_nb + 1
            loss.backward()
            optimizer.step()

        train_loss = train_loss_sum / train_batches_nb

        test_loss_sum, test_batches_nb = 0, 0
        for data, labels in get_batches(test_data, bptt):
            loss = criterion(model(data), labels)
            test_loss_sum, test_batches_nb = test_loss_sum + loss.item(), test_batches_nb + 1

        test_loss = test_loss_sum / test_batches_nb

        print(f'epoch: {epoch:3d}   train_loss: {train_loss:.2f}   test_loss: {test_loss:.2f}')

        if epoch == 1 or epoch % 10 == 0 or epoch == epochs:

            print()

            for temperature in (0.2, 0.5, 0.7, 1):
                print(f'sample T={temperature}: ' + generate(model, 'je ne sais pas', 200, bptt, temperature))
                print()

            print()

In [58]:
class PytorchLSTM(nn.Module):
    def __init__(self, n_vocab, n_fac, n_hidden, n_layers):

        super().__init__()

        self.n_layers = n_layers
        self.n_hidden = n_hidden
        
        self.lstm = nn.LSTM(n_fac, n_hidden, n_layers, dropout=0.5)
        self.e = nn.Embedding(n_vocab, n_fac)
        self.output_weights = nn.Linear(n_hidden, n_vocab)

        self.reset(1)
        
    def forward(self, data, temperature=1):

        input = self.e(data)

        hidden_state = self.hidden_state
        cell_state = self.cell_state

        output, (hidden_state, cell_state) = self.lstm(input, (hidden_state, cell_state))

        # Throw away state histories
        self.hidden_state = Variable(hidden_state)
        self.cell_state = Variable(cell_state)
        
        # Get output
        output = self.output_weights(output)
        output = F.log_softmax(output / temperature, dim=-1)

        return output

    def reset(self, bs):

        self.hidden_state = torch.zeros([self.n_layers, bs, self.n_hidden])
        self.cell_state = torch.zeros([self.n_layers, bs, self.n_hidden])

        if GPU:
            self.hidden_state = self.hidden_state.cuda()
            self.cell_state = self.cell_state.cuda()

In [67]:
n_fac = n_vocab * 2 // 3
n_hidden = 512
bs = 1024
bptt5 = 30
n_layers = 2

In [68]:
model5 = PytorchLSTM(n_vocab, n_fac, n_hidden, n_layers)
if GPU:
    model5 = model5.cuda()

In [69]:
def nll_loss_seq(output, labels):
    _, _, n_vocab = output.size()
    output = output.view(-1, n_vocab)
    labels = labels.reshape(-1)
    return F.nll_loss(output, labels)

In [70]:
optimizer5 = torch.optim.Adam(model5.parameters(), 1e-2)
criterion5 = nll_loss_seq

In [71]:
train_data = get_data(train_txt, bs)
test_data = get_data(test_txt, bs)

In [72]:
%%time
train(model5, optimizer5, criterion5, bptt5, epochs=20)

epoch:   1   train_loss: 2.38   test_loss: 1.85

sample T=0.2: je ne sais pas en stite de la vit de la pors de soute de voir de sont de la vie de monte de propite de la mois de la propite de plus de la mois de mon de passer de me partire de la bontion de la nous de restent de 

sample T=0.5: je ne sais pas à Soute au ville de rondre de la sartions de nomme d'autre prement sotisite défité de tout pas arronter les plus alorne de mon dans on soir la  plus mon traves la bonce, arriver les bimelle exté de m

sample T=0.7: je ne sais passent peux la rait il vis. Je noir et au pland d'austrez entouche la prosent la rainement la la comme somme seventer dans avons la que de jours petit de faiiller le rentéré pas du relle le motate de mo

sample T=1: je ne sais pas qui supkitres buvumnatiol, dons tures et le propalages du lusibuement me trurs même austande avec au feunemir " 2€ , -al, hostroi des portant que Je rechevommé je sementité mais arréficut, le lépunie


epoch:   2   train_loss: 1.67   

# Compare models

In [73]:
models = {
    'Fixed-length RNN': (model1, n_chars),
    'Stateless RNN': (model2, bptt2),
    'Stateful RNN': (model3, bptt3),
    'Small LSTM': (model4, bptt4),
    'Large LSTM': (model5, bptt5)
}

In [74]:
initial_s = 'je ne sais pas'

for temperature in (0.2, 0.5, 0.7, 1, 1.2):
    print(f'T = {temperature}')
    print()
    
    for model_name, (model, bptt) in models.items():
        
        # Handle fixed-size RNN
        generate_func = generate_fixed_size if model_name == 'Fixed-length RNN' else generate
        s = initial_s[:n_chars] if model_name == 'Fixed-length RNN' else initial_s

        print(f'{model_name}:\n  ' + generate_func(model, s, 200, bptt, temperature))
        print()
    
    print()

T = 0.2

Fixed-length RNN:
  je ne sans les mais de les partiques des pas des parte de les pas de la pas seulement de la parce de la peur des pas de les partie de la pas de les parce de les pas des parte de ce pour des pas de les parte d

Stateless RNN:
  je ne sais pas de la plus de la plus de mon peu pour les petit de la plus de mon peu par les permette de la mon peu par les petit de la plus de marcher de mais de la ville de la partie de la ville de la construite 

Stateful RNN:
  je ne sais pas pas pas par le soleil et pas pas pas par construiser le travaillais de la chance de la plus de mon peut la ville de pays de la coupe de la fond de la construide de la route et le dépossions de la vil

Small LSTM:
  je ne sais pas de la compagnie de la prendre le proche de la cons les pris de la compagnie de la route de la compagnie de l'avais de la ville de travailler en Australie de la ville de plus de la prendre les partie 

Large LSTM:
  je ne sais pas de profiter des pays de la ville de l

In [79]:
print(generate_func(model5, 'je ', 1000, bptt5, temperature=0.3))

je suffisant de la ville et de l'autostop sont en compagnie de la ville en pleine ville de Sydney) et le voyage de cette fois que nous avons rendu son sac-à-dos. En effet, un peu de temps en sachant une construction de plus de 2 pour dormir des pays et des petits villages de la frontière et le pays de la route de l'autostop pour me rendre par consequent sur le soleil et de nombreux mois et nous avons parti de la plage de la ville et les plus belles villes de mon sac de marche de l'argent et le pays de la planète et de la ville de la route et sont déjà un peu de ce moment de partir de la première fois plus de 2000 Rinas d'eau de la pluie de la route de la population de l'autostop pour le pays et de la route de la ville est un peu de soleil et les plats de sourires et des petits pays pour dormir dans les petits boulots de la soirée et de l'autostop est de l'autre côté de la route pour les plus tard et pour la pluie et le pays de terme de la plage de la plupart des plats et les plus propr