In [224]:
from copy import copy

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader

from torch.autograd import Variable

In [225]:
txt = 'this is a string of a number of characters that expresses absolutely nothing'

In [226]:
txt = ''

In [227]:
with open('data/one_txt/sanitized_blogger.txt') as f:
    txt += f.read()

In [228]:
len(txt)

442724

In [223]:
with open('data/one_txt/sanitized_wordpress.txt') as f:
    txt += f.read()

In [168]:
len(txt)

3216695

In [229]:
vocab = sorted(list(set(txt)))
n_vocab = len(vocab)
print(''.join(vocab))

 !"$%'()+,-./0123456789:;=>?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz~°àâçèéêëîïôùûœ€


In [230]:
char_to_idx = {char: idx for idx, char in enumerate(vocab)}
idx_to_char = {idx: char for idx, char in enumerate(vocab)}

In [231]:
train_frac = 3. / 4
train_txt = txt[:int(len(txt) * train_frac)]
test_txt = txt[int(len(txt) * train_frac):]

In [8]:
def get_n_sized_chunks(s, n):
    """
    Yield successive n-sized chunks from a string.
    Discard the last chunk if not of size n.
    """
    for i in range(0, len(s), n):
        chunk = s[i:i + n]
        if len(chunk) == n:
            yield chunk

In [9]:
n_chars = 3

In [10]:
def get_data_tensor(txt, n_chars):
    chunks = list(get_n_sized_chunks(txt, n=n_chars))
    data_tensor = torch.tensor([[char_to_idx[char] for char in chunk] for chunk in chunks][:-1])
    return data_tensor

In [11]:
def get_labels_tensor(txt, n_chars):
    chars = txt[n_chars::n_chars][:len(txt) // n_chars - 1]
    labels_tensor = torch.tensor([char_to_idx[char] for char in chars])
    return labels_tensor

In [12]:
train_data_tensor = get_data_tensor(train_txt, n_chars)
print(train_data_tensor.size())

train_labels_tensor = get_labels_tensor(train_txt, n_chars)
print(train_labels_tensor.size())

torch.Size([18, 3])
torch.Size([18])


In [13]:
train_ds = TensorDataset(train_data_tensor, train_labels_tensor)
train_dl = DataLoader(train_ds, batch_size=1024, shuffle=True)

In [14]:
test_data_tensor = get_data_tensor(test_txt, n_chars)
print(test_data_tensor.size())

test_labels_tensor = get_labels_tensor(test_txt, n_chars)
print(test_labels_tensor.size())

torch.Size([5, 3])
torch.Size([5])


In [15]:
test_ds = TensorDataset(test_data_tensor, test_labels_tensor)

![](img/rnn.jpg)

In [199]:
class Model(nn.Module):
    def __init__(self, n_vocab, n_factors, n_hidden, n_chars):
        super().__init__()
        self.n_chars = n_chars
        self.e = nn.Embedding(n_vocab, n_factors)
        self.input_weights = nn.Linear(n_factors, n_hidden)
        self.hidden_weights = nn.Linear(n_hidden, n_hidden)
        self.output_weights = nn.Linear(n_hidden, n_vocab)

    def forward(self, chars):

        hidden = torch.zeros([len(chars), n_hidden])

        for i in range(self.n_chars):
            input = F.relu(self.input_weights(self.e(chars[:, i])))
            hidden = torch.tanh(self.hidden_weights(input + hidden))

        output = F.log_softmax(self.output_weights(hidden), dim=1)
        print(output)
        
        return output

In [200]:
n_fac = n_vocab // 2
n_hidden = 100

In [201]:
model = Model(n_vocab, n_fac, n_hidden, n_chars)

In [202]:
optimizer = torch.optim.Adam(model.parameters(), 1e-2)
criterion = nn.NLLLoss()

In [22]:
epochs = 300

for epoch in range(1, epochs + 1):

    print(f'epoch: {epoch}')
    
    for i, (data, labels) in enumerate(train_dl, 1):
        output = model(data)
        optimizer.zero_grad()
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        
    train_loss = round(criterion(model(train_data_tensor), train_labels_tensor).item(), 2)
    print(f'  train loss: {train_loss}')
        
    test_loss = round(criterion(model(test_data_tensor), test_labels_tensor).item(), 2)
    print(f'  test loss: {test_loss}')

    print()

epoch: 1


NameError: name 'train_dl' is not defined

In [22]:
def generate(s, n):

    assert len(s) == n_chars

    final_s = s
    for _ in range(n):
        chars = get_data_tensor(s + 'aaa', n_chars)
        #print(chars, model(chars))
        pred_idx = model(chars).argmax().item()
        pred_char = idx_to_char[pred_idx]
        s = s[1:] + pred_char
        final_s += pred_char

    return final_s

In [26]:
generate('je ', 100)

'hi gha a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a '

In [245]:
def get_data(txt, bs):

    txt = [char_to_idx[c] for c in txt]
    
    # Shrink `len(txt)` to a multiple of `bs`
    txt_len = (len(txt) // bs) * bs
    txt = txt[:txt_len]

    # Cut `txt` into `bs` distinct chunks
    all_data = torch.tensor(txt).view(bs, -1)
    all_data = all_data.transpose(0, 1).contiguous()

    data = all_data[:-1, :]
    labels = all_data[1:, :]
    
    return data, labels

In [246]:
data, labels = get_data(train_txt, bs=3)

In [247]:
print('data:')
print(data)

print()

print('labels:')
print(labels)

data:
tensor([[42, 54, 72],
        [59, 67,  0],
        [59, 73, 11],
        ...,
        [73, 66,  0],
        [11, 58, 68],
        [ 0, 67, 74]])

labels:
tensor([[59, 67,  0],
        [59, 73, 11],
        [62,  0, 11],
        ...,
        [11, 58, 68],
        [ 0, 67, 74],
        [47, 73, 73]])


In [248]:
def get_batches(data, bptt):
    """
    Yield (train, labels) batches from `data`.

    At each iteration, the two batches have the same `bptt * bs` size,
    except for the last iteration which may yield less than bptt rows.
    """

    # Cut `data` into two 2-dimensional chunks of size `bptt * bs`.
    # Last chunk may be less than `bptt` rows.
    while len(data) != 0:

        # Take (at most) bptt rows with offset 1 for labels
        labels_batch = data[1:bptt+1, :]
        # Take bptt rows as the labels with offset 0 for train
        data_batch = data[:len(labels_batch), :]

        if len(labels_batch) > 0:
            yield data_batch, labels_batch

        # Move on to next train train/labels rows
        data = data[bptt:]

In [249]:
i = 1
for data_batch, labels_batch in get_batches(data, bptt=5):
    
    print(f'data:')
    print(data_batch)

    print(f'labels:')
    print(labels_batch)

    print()
    print()
    
    i += 1
    if i > 2:
        break

data:
tensor([[42, 54, 72],
        [59, 67,  0],
        [59, 73, 11],
        [62,  0, 11],
        [56, 69, 11]])
labels:
tensor([[59, 67,  0],
        [59, 73, 11],
        [62,  0, 11],
        [56, 69, 11],
        [62, 62,  0]])


data:
tensor([[62, 62,  0],
        [58, 72, 30],
        [74, 11,  5],
        [72,  0, 58],
        [58, 32, 72]])
labels:
tensor([[58, 72, 30],
        [74, 11,  5],
        [72,  0, 58],
        [58, 32, 72],
        [66, 67, 73]])




In [237]:
def generate(model, s, n):

    model.reset(1)

    res = s
    for _ in range(n):
        data, _ = get_data(s + ' ', 1)
        preds = model(data)
        pred_idx = preds[-1].argmax().item()
        pred_char = idx_to_char[pred_idx]
        res += pred_char
        s = s[1:] + pred_char
        
    return res

In [238]:
class Model(nn.Module):
    def __init__(self, n_vocab, n_fac, n_hidden):
        super().__init__()
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.e = nn.Embedding(n_vocab, n_fac)
        self.output_weights = nn.Linear(n_hidden, n_vocab)

    def forward(self, data):
        input = self.e(data)
        output, h = self.rnn(input, self.hidden_weights)
        self.hidden_weights = Variable(h.data)
        output = self.output_weights(output)
        output = F.log_softmax(output, dim=-1)
        return output

    def reset(self, bs):
        self.hidden_weights = torch.zeros([1, bs, n_hidden])

In [239]:
n_fac = n_vocab // 2
n_hidden = 100
bs = 1024
bptt = 3

In [240]:
model = Model(n_vocab, n_fac, n_hidden)

In [241]:
def nll_loss_seq(output, labels):
    _, _, n_vocab = output.size()
    output = output.view(-1, n_vocab)
    labels = labels.reshape(-1)
    return F.nll_loss(output, labels)

In [242]:
optimizer = torch.optim.Adam(model.parameters(), 1e-2)
criterion = nll_loss_seq

In [243]:
train_data_tensor, train_labels_tensor = get_data(train_txt, bs)
test_data_tensor, test_labels_tensor = get_data(test_txt, bs)

In [244]:
epochs = 30

for epoch in range(1, epochs + 1):

    model.reset(bs)
    
    print(f'epoch: {epoch}')

    train_loss_sum, train_batches_nb = 0, 0
    for i, (data, labels) in enumerate(get_batches(train_data_tensor, bptt), 1):
        output = model(data)
        optimizer.zero_grad()
        loss = criterion(output, labels)
        train_loss_sum, train_batches_nb = train_loss_sum + loss.item(), train_batches_nb + 1
        loss.backward()
        optimizer.step()

    test_loss_sum, test_batches_nb = 0, 0
    for data, labels in get_batches(test_data_tensor, bptt):
        loss = criterion(model(data), labels)
        test_loss_sum, test_batches_nb = test_loss_sum + loss.item(), test_batches_nb + 1
    
    print(f'  train loss: {round(train_loss_sum / train_batches_nb, 2)}')
    print(f'  test loss: {round(test_loss_sum / test_batches_nb, 2)}')

    if epoch % 10 == 0 or epoch == epochs:
        sample = generate(model, 'je ', 200)
        print()
        print(f'sample: {sample}')

    print()

epoch: 1
  train loss: 2.29
  test loss: 2.0

epoch: 2
  train loss: 1.89
  test loss: 1.87

epoch: 3
  train loss: 1.79
  test loss: 1.81

epoch: 4
  train loss: 1.73
  test loss: 1.78

epoch: 5
  train loss: 1.7
  test loss: 1.76

epoch: 6
  train loss: 1.67
  test loss: 1.75

epoch: 7
  train loss: 1.65
  test loss: 1.74

epoch: 8
  train loss: 1.64
  test loss: 1.74

epoch: 9
  train loss: 1.63
  test loss: 1.73

epoch: 10
  train loss: 1.62
  test loss: 1.72

sample: je solientriculité de paris paris paris paris paris paris paris paris paris paris paris paris paris paris paris paris paris paris paris paris paris paris paris paris paris paris paris paris paris paris p

epoch: 11
  train loss: 1.61
  test loss: 1.71

epoch: 12
  train loss: 1.6
  test loss: 1.72

epoch: 13
  train loss: 1.6
  test loss: 1.71

epoch: 14
  train loss: 1.6
  test loss: 1.71

epoch: 15
  train loss: 1.59
  test loss: 1.71

epoch: 16
  train loss: 1.59
  test loss: 1.71

epoch: 17
  train loss: 1.58
  te

In [None]:
generate('je ', 200)