In [1]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader

In [2]:
# Tiny placeholder corpus. The following cell rebinds `txt` to the contents of
# the blog text file, so this value is only used if that cell is skipped.
txt = 'this is a string of a number of characters that expresses absolutely nothing'

In [3]:
# Read the whole corpus into memory as one string.
# The encoding is stated explicitly: the text contains non-ASCII characters
# (accented letters, Cyrillic, the euro sign), and without it open() falls back
# to the platform locale, which is not guaranteed to be UTF-8.
with open('data/one_txt/sanitized_blogger.txt', encoding='utf-8') as f:
    txt = f.read()

In [4]:
# Character-level vocabulary: each distinct character of the corpus, sorted so
# that the index assigned to a character is deterministic across runs.
vocab = sorted(set(txt))
n_vocab = len(vocab)
print(*vocab, sep='')

 !"$%'()+,-./0123456789:;=>?ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~°àâçèéêëîïôùûœо€


In [5]:
# Lookup tables in both directions between a character and its integer id
# (its position in `vocab`).
idx_to_char = dict(enumerate(vocab))
char_to_idx = {char: idx for idx, char in idx_to_char.items()}

In [6]:
# Positional split: the first three quarters of the corpus are used for
# training and the remaining tail is held out for evaluation.
train_frac = 0.75
train_txt, test_txt = (
    txt[:int(len(txt) * train_frac)],
    txt[int(len(txt) * train_frac):],
)

In [7]:
def get_chunks(s, n):
    """
    Generate consecutive, non-overlapping pieces of `s` that are exactly `n` long.

    A trailing remainder shorter than `n` is dropped rather than yielded.
    """
    # Stopping the range at len(s) - n + 1 visits only those start offsets
    # that still have a full n-sized piece available.
    for start in range(0, len(s) - n + 1, n):
        yield s[start:start + n]

In [8]:
# Context length: how many consecutive characters are fed to the model before
# it predicts the next one. Also used as the chunk size when encoding the text.
n_chars = 3

In [9]:
def get_data_tensor(txt, n_chars):
    """
    Encode `txt` as a 2-D tensor of character ids, one row per chunk.

    The text is cut into non-overlapping chunks of `n_chars` characters and
    each character is mapped through `char_to_idx`. The last full chunk is
    dropped: a chunk's label is the character that opens the following chunk,
    so the final chunk is not paired with a label.

    Args:
        txt: source text; every character must be a key of `char_to_idx`.
        n_chars: chunk length (number of columns in the result).

    Returns:
        A `torch.long` tensor of shape `(n_rows, n_chars)`. When the text is
        too short to produce any row, the result is an empty `(0, n_chars)`
        long tensor rather than a 1-D float tensor, so it can still be used
        as embedding indices.

    Raises:
        KeyError: if `txt` contains a character missing from `char_to_idx`.
    """
    rows = [[char_to_idx[char] for char in chunk] for chunk in get_chunks(txt, n_chars)][:-1]
    # Explicit dtype and shape keep the empty case consistent with the normal one.
    data_tensor = torch.tensor(rows, dtype=torch.long).reshape(len(rows), n_chars)
    return data_tensor

In [10]:
def get_labels_tensor(txt, n_chars):
    """
    Build the next-character targets that go with `get_data_tensor`.

    The label of chunk `i` is the first character of chunk `i + 1`, i.e. the
    characters at offsets `n_chars, 2 * n_chars, ...` of `txt`. The result is
    truncated to `len(txt) // n_chars - 1` entries, which is the number of
    rows `get_data_tensor` keeps after dropping its last chunk.

    Args:
        txt: source text; every selected character must be a key of `char_to_idx`.
        n_chars: chunk length used for the matching data tensor.

    Returns:
        A 1-D `torch.long` tensor of character ids. The dtype is forced so an
        empty result is still a valid tensor of class indices instead of the
        float32 tensor that `torch.tensor([])` would produce.

    Raises:
        KeyError: if a selected character is missing from `char_to_idx`.
    """
    chars = txt[n_chars::n_chars][:len(txt) // n_chars - 1]
    labels_tensor = torch.tensor([char_to_idx[char] for char in chars], dtype=torch.long)
    return labels_tensor

In [11]:
# Encode the training split: context rows first, then their next-character labels.
train_data_tensor = get_data_tensor(train_txt, n_chars)
train_labels_tensor = get_labels_tensor(train_txt, n_chars)

# One size per line; the leading dimensions of the two tensors should agree.
print(train_data_tensor.size(), train_labels_tensor.size(), sep='\n')

torch.Size([693491, 3])
torch.Size([693491])


In [12]:
# Pair each context row with its label and serve the pairs in shuffled
# mini-batches of 1024.
train_ds = TensorDataset(train_data_tensor, train_labels_tensor)
train_dl = DataLoader(
    dataset=train_ds,
    batch_size=1024,
    shuffle=True,
)

In [13]:
# Encode the held-out split the same way as the training split.
test_data_tensor = get_data_tensor(test_txt, n_chars)
test_labels_tensor = get_labels_tensor(test_txt, n_chars)

# One size per line; the leading dimensions of the two tensors should agree.
print(test_data_tensor.size(), test_labels_tensor.size(), sep='\n')

torch.Size([231163, 3])
torch.Size([231163])


In [14]:
# Held-out counterpart of `train_ds`. The training loop shown in this notebook
# evaluates on the raw test tensors directly rather than through this dataset.
test_ds = TensorDataset(test_data_tensor, test_labels_tensor)

![RNN architecture diagram](img/rnn.jpg)

In [15]:
class Model(nn.Module):
    """
    Minimal hand-rolled recurrent network that predicts the next character.

    Each of the `n_chars` input characters is embedded, projected to the
    hidden size and combined with the running hidden state; the final hidden
    state is projected to one score per vocabulary entry.

    Args:
        n_vocab: number of distinct characters (embedding rows / output classes).
        n_factors: embedding dimension.
        n_hidden: width of the recurrent hidden state.
        n_chars: number of characters consumed per example (columns of the input).
    """

    def __init__(self, n_vocab, n_factors, n_hidden, n_chars):
        super().__init__()
        self.n_chars = n_chars
        # Stored on the instance so forward() does not depend on a
        # module-level global that happens to share the same name.
        self.n_hidden = n_hidden
        self.e = nn.Embedding(n_vocab, n_factors)
        self.input_weights = nn.Linear(n_factors, n_hidden)
        self.hidden_weights = nn.Linear(n_hidden, n_hidden)
        self.output_weights = nn.Linear(n_hidden, n_vocab)

    def forward(self, chars):
        """
        Compute log-probabilities of the next character.

        Args:
            chars: integer tensor of character ids with at least `n_chars`
                columns, indexed as `chars[:, i]`.

        Returns:
            Tensor of shape `(batch, n_vocab)` holding log-probabilities
            (the output of `log_softmax` over dim 1).
        """
        # Zero initial state, allocated on the same device as the input batch.
        hidden = torch.zeros(chars.size(0), self.n_hidden, device=chars.device)

        for i in range(self.n_chars):
            # Project the i-th character into hidden space, then mix it with
            # the running state through the shared recurrent layer.
            step_input = F.relu(self.input_weights(self.e(chars[:, i])))
            hidden = torch.tanh(self.hidden_weights(step_input + hidden))

        output = F.log_softmax(self.output_weights(hidden), dim=1)

        return output

In [16]:
# Model sizes: the width of the recurrent hidden state, and an embedding
# dimension set to half the vocabulary size.
n_hidden = 100
n_fac = n_vocab // 2

In [17]:
# Instantiate the network with the sizes chosen above.
model = Model(
    n_vocab=n_vocab,
    n_factors=n_fac,
    n_hidden=n_hidden,
    n_chars=n_chars,
)

In [18]:
# Adam over every trainable parameter of the model, learning rate 1e-2.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
# Model.forward already returns log-probabilities (log_softmax), so the
# matching loss is NLLLoss. CrossEntropyLoss would apply log_softmax a second
# time, which leaves log-probabilities unchanged but is redundant work on
# every batch and hides what the model actually outputs.
criterion = nn.NLLLoss()

In [19]:
epochs = 15

for epoch in range(1, epochs + 1):

    print(f'epoch: {epoch}')

    # One optimisation pass over the shuffled training mini-batches.
    for data, labels in train_dl:
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

    # End-of-epoch evaluation on the complete train and test tensors.
    # Gradients are not needed here, and disabling autograd avoids building a
    # graph over hundreds of thousands of rows just to read a scalar.
    with torch.no_grad():
        train_loss = round(criterion(model(train_data_tensor), train_labels_tensor).item(), 2)
        test_loss = round(criterion(model(test_data_tensor), test_labels_tensor).item(), 2)

    print(f'  train loss: {train_loss}')
    print(f'  test loss: {test_loss}')

    print()

epoch: 1
  train loss: 1.95
  test loss: 1.95

epoch: 2
  train loss: 1.88
  test loss: 1.89

epoch: 3
  train loss: 1.85
  test loss: 1.87

epoch: 4
  train loss: 1.83
  test loss: 1.85

epoch: 5
  train loss: 1.82
  test loss: 1.83

epoch: 6
  train loss: 1.81
  test loss: 1.82

epoch: 7
  train loss: 1.81
  test loss: 1.82

epoch: 8
  train loss: 1.79
  test loss: 1.81

epoch: 9
  train loss: 1.79
  test loss: 1.81

epoch: 10
  train loss: 1.79
  test loss: 1.81

epoch: 11
  train loss: 1.79
  test loss: 1.81

epoch: 12
  train loss: 1.8
  test loss: 1.82

epoch: 13
  train loss: 1.79
  test loss: 1.81

epoch: 14
  train loss: 1.79
  test loss: 1.8

epoch: 15
  train loss: 1.8
  test loss: 1.82



In [20]:
def generate(s, n):
    """
    Greedily extend the seed `s` by `n` characters using the trained `model`.

    At every step the current window of `n_chars` characters is encoded, the
    model's most likely next character is appended to the result, and the
    window slides forward by one character.

    Args:
        s: seed text; must be exactly `n_chars` characters long and consist
            only of characters present in `char_to_idx`.
        n: number of characters to generate.

    Returns:
        The seed followed by the `n` generated characters.

    Raises:
        AssertionError: if `len(s) != n_chars`.
        KeyError: if the seed contains a character missing from `char_to_idx`.
    """

    assert len(s) == n_chars

    final_s = s
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for _ in range(n):
            # Encode the window directly as a one-row batch. Going through
            # get_data_tensor would require padding the seed with a dummy
            # chunk (it drops the last chunk), which only works when the
            # padding length matches n_chars.
            chars = torch.tensor([[char_to_idx[char] for char in s]], dtype=torch.long)
            pred_idx = model(chars).argmax(dim=1).item()
            pred_char = idx_to_char[pred_idx]
            s = s[1:] + pred_char
            final_s += pred_char

    return final_s

In [22]:
# Sample 100 characters of greedy continuation from a three-character seed.
generate(s='je ', n=100)

'je pour le de le de le de le de le de le de le de le de le de le de le de le de le de le de le de le de'