In [1]:
from copy import copy

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader

from torch.autograd import Variable

# Load data

In [2]:
txt = ''

In [3]:
with open('data/one_txt/sanitized_blogger.txt') as f:
    txt += f.read()

len(txt)

442724

In [4]:
#with open('data/one_txt/sanitized_wordpress.txt') as f:
#    txt += f.read()
#
#len(txt)

In [5]:
vocab = sorted(list(set(txt)))
n_vocab = len(vocab)
print(''.join(vocab))

 !"$%'()+,-./0123456789:;=>?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz~°àâçèéêëîïôùûœ€


In [6]:
char_to_idx = {char: idx for idx, char in enumerate(vocab)}
idx_to_char = {idx: char for idx, char in enumerate(vocab)}

In [7]:
train_frac = 3. / 4
train_txt = txt[:int(len(txt) * train_frac)]
test_txt = txt[int(len(txt) * train_frac):]

# Stateless model

This is a model which operates on a **fixed** amount of input characters (`n_chars`), and attempts to predict the character that comes after them.

The hidden state is reset for each new sequence of `n_chars` characters (hence *stateless*).

In [8]:
n_chars = 3

In [9]:
def get_n_sized_chunks(s, n):
    """
    Yield successive n-sized chunks from a string.
    Discard the last chunk if not of size n.
    """
    for i in range(0, len(s), n):
        chunk = s[i:i + n]
        if len(chunk) == n:
            yield chunk

In [10]:
def get_data_tensor(txt, n_chars):
    chunks = list(get_n_sized_chunks(txt, n=n_chars))
    data_tensor = torch.tensor([[char_to_idx[char] for char in chunk] for chunk in chunks][:-1])
    return data_tensor

In [11]:
def get_labels_tensor(txt, n_chars):
    chars = txt[n_chars::n_chars][:len(txt) // n_chars - 1]
    labels_tensor = torch.tensor([char_to_idx[char] for char in chars])
    return labels_tensor

In [12]:
train_data_tensor = get_data_tensor(train_txt, n_chars)
print(train_data_tensor.size())

train_labels_tensor = get_labels_tensor(train_txt, n_chars)
print(train_labels_tensor.size())

torch.Size([110680, 3])
torch.Size([110680])


In [13]:
train_ds = TensorDataset(train_data_tensor, train_labels_tensor)
train_dl = DataLoader(train_ds, batch_size=1024)

In [14]:
test_data_tensor = get_data_tensor(test_txt, n_chars)
print(test_data_tensor.size())

test_labels_tensor = get_labels_tensor(test_txt, n_chars)
print(test_labels_tensor.size())

torch.Size([36892, 3])
torch.Size([36892])


In [15]:
test_ds = TensorDataset(test_data_tensor, test_labels_tensor)
test_dl = DataLoader(test_ds, batch_size=1024)

In [16]:
def generate1(model, s, n, kind):

    assert kind in ('top', 'multinomial')
    assert len(s) == n_chars

    final_s = s

    for _ in range(n):

        chars = get_data_tensor(s + '   ', n_chars)
        preds = model(chars)

        if kind == 'top':
            pred_idx = preds.argmax().item()

        elif kind == 'multinomial':
            pred_idx = torch.multinomial(preds.exp(), 1).item()
            
        pred_char = idx_to_char[pred_idx]
        s = s[1:] + pred_char
        final_s += pred_char

    return final_s

![](img/rnn1.jpg)

In [17]:
class StatelessModel(nn.Module):
    def __init__(self, n_vocab, n_factors, n_hidden, n_chars):
        super().__init__()
        self.n_chars = n_chars
        self.e = nn.Embedding(n_vocab, n_factors)
        self.input_weights = nn.Linear(n_factors, n_hidden)
        self.hidden_weights = nn.Linear(n_hidden, n_hidden)
        self.output_weights = nn.Linear(n_hidden, n_vocab)

    def forward(self, chars):

        hidden = torch.zeros([len(chars), n_hidden])

        for i in range(self.n_chars):
            input = F.relu(self.input_weights(self.e(chars[:, i])))
            hidden = torch.tanh(self.hidden_weights(input + hidden))

        output = F.log_softmax(self.output_weights(hidden), dim=1)
        
        return output

In [18]:
n_fac = n_vocab // 2
n_hidden = 100

In [19]:
model1 = StatelessModel(n_vocab, n_fac, n_hidden, n_chars)

In [20]:
optimizer1 = torch.optim.Adam(model1.parameters(), 1e-2)
criterion1 = nn.NLLLoss()

In [22]:
epochs = 50

for epoch in range(1, epochs + 1):
    
    print(f'epoch: {epoch}')
    
    train_loss_sum, train_batches_nb = 0, 0
    for i, (data, labels) in enumerate(train_dl, 1):
        output = model1(data)
        optimizer1.zero_grad()
        loss = criterion1(output, labels)
        train_loss_sum, train_batches_nb = train_loss_sum + loss.item(), train_batches_nb + 1
        loss.backward()
        optimizer1.step()

    test_loss_sum, test_batches_nb = 0, 0
    for data, labels in test_dl:
        loss = criterion1(model1(data), labels)
        test_loss_sum, test_batches_nb = test_loss_sum + loss.item(), test_batches_nb + 1

    if epoch == 1 or epoch % 10 == 0 or epoch == epochs:

        print()
        
        print(f'train loss: {round(train_loss_sum / train_batches_nb, 2)}')
        print(f'test loss: {round(test_loss_sum / test_batches_nb, 2)}')
        
        print()
        
        for kind in ('top', 'multinomial'):
            print(f'sample {kind}: ' + generate1(model1, 'je ', 200, kind))
            print()

epoch: 1

train loss: 2.13
test loss: 2.1

sample top: je pas de pas de pas de pas de pas de pas de pas de pas de pas de pas de pas de pas de pas de pas de pas de pas de pas de pas de pas de pas de pas de pas de pas de pas de pas de pas de pas de pas de pas 

sample multinomial: je lapirs au qui décont uno. Toit paiment, quenfen 4 taet in envommay conquta. Ils et Il heutossouvrandende nont à ques à qu'à cros. La best ave auses hest du pas ceux cu pour est frore de ser es, à leur

epoch: 2
epoch: 3
epoch: 4
epoch: 5
epoch: 6
epoch: 7
epoch: 8
epoch: 9
epoch: 10

train loss: 1.84
test loss: 1.9

sample top: je pas de cont pas de cont pas de cont pas de cont pas de cont pas de cont pas de cont pas de cont pas de cont pas de cont pas de cont pas de cont pas de cont pas de cont pas de cont pas de cont pas de c

sample multinomial: je noleans paraïtantors, et leage érois, le faRit, astop qu'à mon combe demant pour dires soine gnomponn est pour av'est tours même" prent est ? Tillées ensis, à 

# Stateful model

This is a model which operates on a **variable** amount of input characters, and attempts to predict the next character **after each input character**.

The hidden state is memorized from one mini-batch to another time (hence *stateful*), but reset between epochs, and at predict time.

In [23]:
def get_data(txt, bs):
    """
    Split `txt` into `bs` chunks.

    Each chunk has size `n`, `n` being as big as possible.
    Chunks are organized as columns in the result, making the final size `n * bs`.
    """

    txt = [char_to_idx[c] for c in txt]
    
    # Shrink `len(txt)` to a multiple of `bs`
    txt_len = (len(txt) // bs) * bs
    txt = txt[:txt_len]

    # Cut `txt` into `bs` distinct chunks
    data = torch.tensor(txt).view(bs, -1)
    data = data.transpose(0, 1).contiguous()
    
    return data

In [24]:
def get_batches(data, bptt):
    """
    Yield `(data_batch, labels_batch)` batches from `data`.

    At each iteration, the two batches have the same `bptt * bs` size,
    except for the last batch which may have less than `bptt` rows.

    `data_batch` contains `bptt`-sized chunks of `data`.
    `labels_batch` contains `bptt`-sized chunks of `data`, offseted by 1.
    """

    # Cut `data` into two 2-dimensional chunks of size `bptt * bs`.
    # Last chunk may be less than `bptt` rows.
    while len(data) != 0:

        # Take (at most) bptt rows with offset 1 for labels
        labels_batch = data[1:bptt+1, :]
        # Take bptt rows as the labels with offset 0 for train
        data_batch = data[:len(labels_batch), :]

        if len(labels_batch) > 0:
            yield data_batch, labels_batch

        # Move on to next train train/labels rows
        data = data[bptt:]

In [25]:
i = 1
data = get_data(train_txt, bs=3)
for data_batch, labels_batch in get_batches(data, bptt=5):
    
    print(f'data:')
    print(data_batch)

    print(f'labels:')
    print(labels_batch)

    print()
    print()
    
    i += 1
    if i > 2:
        break

data:
tensor([[42, 54, 72],
        [59, 67,  0],
        [59, 73, 11],
        [62,  0, 11],
        [56, 69, 11]])
labels:
tensor([[59, 67,  0],
        [59, 73, 11],
        [62,  0, 11],
        [56, 69, 11],
        [62, 62,  0]])


data:
tensor([[62, 62,  0],
        [58, 72, 30],
        [74, 11,  5],
        [72,  0, 58],
        [58, 32, 72]])
labels:
tensor([[58, 72, 30],
        [74, 11,  5],
        [72,  0, 58],
        [58, 32, 72],
        [66, 67, 73]])




In [26]:
def generate2(model, s, n, kind):

    assert kind in ('top', 'multinomial')

    model.reset(1)

    res = s
    for _ in range(n):
        data = get_data(s, 1)
        preds = model(data)[-1]

        if kind == 'top':
            pred_idx = preds.argmax().item()

        elif kind == 'multinomial':
            pred_idx = torch.multinomial(preds.exp(), 1).item()

        pred_char = idx_to_char[pred_idx]
        res += pred_char
        s = s[1:] + pred_char
        
    return res

![](img/rnn2.jpg)

In [29]:
class StatefulModel(nn.Module):
    def __init__(self, n_vocab, n_fac, n_hidden):
        super().__init__()
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.e = nn.Embedding(n_vocab, n_fac)
        self.output_weights = nn.Linear(n_hidden, n_vocab)

    def forward(self, data):
        input = self.e(data)
        output, h = self.rnn(input, self.hidden_weights)
        self.hidden_weights = Variable(h.data)
        output = self.output_weights(output)
        output = F.log_softmax(output, dim=-1)
        return output

    def reset(self, bs):
        self.hidden_weights = torch.zeros([1, bs, n_hidden])

In [30]:
n_fac = n_vocab // 2
n_hidden = 100
bs = 1024
bptt = 3

In [31]:
model2 = StatefulModel(n_vocab, n_fac, n_hidden)

In [32]:
def nll_loss_seq(output, labels):
    _, _, n_vocab = output.size()
    output = output.view(-1, n_vocab)
    labels = labels.reshape(-1)
    return F.nll_loss(output, labels)

In [33]:
optimizer2 = torch.optim.Adam(model2.parameters(), 1e-2)
criterion2 = nll_loss_seq

In [34]:
train_data = get_data(train_txt, bs)
test_data = get_data(test_txt, bs)

In [35]:
epochs = 50

for epoch in range(1, epochs + 1):
    
    print(f'epoch: {epoch}')

    model2.reset(bs)

    train_loss_sum, train_batches_nb = 0, 0
    for i, (data, labels) in enumerate(get_batches(train_data, bptt), 1):
        output = model2(data)
        optimizer2.zero_grad()
        loss = criterion2(output, labels)
        train_loss_sum, train_batches_nb = train_loss_sum + loss.item(), train_batches_nb + 1
        loss.backward()
        optimizer2.step()

    test_loss_sum, test_batches_nb = 0, 0
    for data, labels in get_batches(test_data, bptt):
        loss = criterion2(model2(data), labels)
        test_loss_sum, test_batches_nb = test_loss_sum + loss.item(), test_batches_nb + 1

    if epoch == 1 or epoch % 10 == 0 or epoch == epochs:

        print()
        
        print(f'train loss: {round(train_loss_sum / train_batches_nb, 2)}')
        print(f'test loss: {round(test_loss_sum / test_batches_nb, 2)}')
        
        print()
        
        for kind in ('top', 'multinomial'):
            print(f'sample {kind}: ' + generate2(model2, 'je ', 200, kind))
            print()

epoch: 1

train loss: 2.29
test loss: 2.02

sample top: je pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas 

sample multinomial: je pomma à lour, innments. on jout prens las. M'enlaiment de ca pasteles aus, tur. Jaut danter nlautes 300 goz mon mars tralenttain a yu pas. Il on (avain pon quiiz. Jen etreaper, bêmand ? s'y bagces aux

epoch: 2
epoch: 3
epoch: 4
epoch: 5
epoch: 6
epoch: 7
epoch: 8
epoch: 9
epoch: 10

train loss: 1.63
test loss: 1.73

sample top: je de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de

sample multinomial: je pascile. Etre à melle. Imde lanchus, jusie. Le bienceile (ne élater sacteures de en (chément 4 hamere, en se "Tture quant sy dant". "3 haud je mure-cille ave