In [1]:
import numpy as np

with open('../data/1268-0.txt', 'r', encoding='utf8') as fp:
    text = fp.read()

start_idx = text.find('THE MYSTERIOUS ISLAND')
end_idx = text.find('End of the Project Gutenberg')
text = text[start_idx:end_idx]
char_set = set(text)
print('Total Length:', len(text))
print('Unique Characters:', len(char_set))

Total Length: 1112310
Unique Characters: 80


In [2]:
chars_sorted = sorted(char_set)
char2int = {ch:i for i, ch in enumerate(chars_sorted)}
char_array = np.array(chars_sorted)
text_encoded = np.array([char2int[ch] for ch in text], dtype=np.int32)
print('Text encoded shape:', text_encoded.shape)
print(text[:15], '== Encoding ==>', text_encoded[:15])
print(text_encoded[15:21], '== Reversse ==>', ''.join(char_array[text_encoded[15:21]]))

Text encoded shape: (1112310,)
THE MYSTERIOUS  == Encoding ==> [44 32 29  1 37 48 43 44 29 42 33 39 45 43  1]
[33 43 36 25 38 28] == Reversse ==> ISLAND


In [3]:
for ex in text_encoded[:5]:
    print('{} -> {}'.format(ex, char_array[ex]))

44 -> T
32 -> H
29 -> E
1 ->  
37 -> M


In [None]:
seq_length = 40
chunk_size = seq_length + 1
text_chunks = [text_encoded[i:i+chunk_size] for i in range(len(text_encoded) - chunk_size)]

for seq in text_chunks[:1]:
    input_seq = seq[:seq_length]
    target = seq[seq_length]
    print(input_seq, '->', target)
    print(repr(''.join(char_array[input_seq])), 
          ' -> ', repr(''.join(char_array[target])))

[44 32 29  1 37 48 43 44 29 42 33 39 45 43  1 33 43 36 25 38 28  0  0 51
 74  1 34 70 61 54 68  1 46 54 67 63 54  0  0 12] -> 19
'THE MYSTERIOUS ISLAND\n\nby Jules Verne\n\n1'  ->  '8'


In [13]:
import torch
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, text_chunks):
        self.text_chunks = text_chunks

    def __len__(self):
        return len(self.text_chunks)
    
    def __getitem__(self, idx):
        text_chunk = self.text_chunks[idx]
        return text_chunk[:-1].long(), text_chunk[1:].long()

seq_dataset = TextDataset(torch.tensor(np.array(text_chunks)))

In [15]:
decode = lambda mask: repr(''.join(char_array[mask]))

for i, (seq, target) in enumerate(seq_dataset):
    print('Input (x):\n%s' % (decode(seq)))
    print('Target (y):\n%s' % (decode(target)))
    print()
    
    if i == 1:
        break

Input (x):
'THE MYSTERIOUS ISLAND\n\nby Jules Verne\n\n1'
Target (y):
'HE MYSTERIOUS ISLAND\n\nby Jules Verne\n\n18'

Input (x):
'HE MYSTERIOUS ISLAND\n\nby Jules Verne\n\n18'
Target (y):
'E MYSTERIOUS ISLAND\n\nby Jules Verne\n\n187'



In [16]:
device = torch.device('cuda')

In [17]:
from torch.utils.data import DataLoader
batch_size = 64
torch.manual_seed(1)
seq_dl = DataLoader(seq_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

In [18]:
from torch import nn

class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn_hidden_size = rnn_hidden_size
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True)

        self.fc = nn.Linear(rnn_hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        out = self.embedding(x).unsqueeze(1)
        out, (hidden, cell) = self.rnn(out, (hidden, cell))
        out = self.fc(out).reshape(out.size(0), -1)
        return out, hidden, cell
    
    def init_hidden(self, batch_size):
        hidden = torch.zeros(1, batch_size, self.rnn_hidden_size)
        cell = torch.zeros(1, batch_size, self.rnn_hidden_size)
        return hidden.to(device), cell.to(device)

In [19]:
vocab_size = len(char_array)
embed_dim = 256
rnn_hidden_size = 512
torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size).to(device)
model

RNN(
  (embedding): Embedding(80, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=80, bias=True)
)

In [21]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10000
torch.manual_seed(1)

for epoch in range(num_epochs):
    hidden, cell = model.init_hidden(batch_size)
    seq_batch, target_batch = next(iter(seq_dl))
    seq_batch = seq_batch.to(device)
    target_batch = target_batch.to(device)
    optimizer.zero_grad()
    loss = 0
    for c in range(seq_length):
        pred, hidden, cell = model(seq_batch[:, c], hidden, cell)
        loss += loss_fn(pred, target_batch[:, c])
    loss.backward()
    optimizer.step()
    loss = loss.item()/seq_length
    if epoch % 500 == 0:
        print(f'Epoch {epoch} loss: {loss:.4f}')

Epoch 0 loss: 4.3720
Epoch 500 loss: 1.5593
Epoch 1000 loss: 1.3769
Epoch 1500 loss: 1.3499
Epoch 2000 loss: 1.2380
Epoch 2500 loss: 1.1904
Epoch 3000 loss: 1.1461
Epoch 3500 loss: 1.1644
Epoch 4000 loss: 1.1000
Epoch 4500 loss: 1.1095
Epoch 5000 loss: 1.0973
Epoch 5500 loss: 1.0847
Epoch 6000 loss: 1.0589
Epoch 6500 loss: 1.0919
Epoch 7000 loss: 1.0441
Epoch 7500 loss: 1.0409
Epoch 8000 loss: 1.0405
Epoch 8500 loss: 1.0644
Epoch 9000 loss: 1.0320
Epoch 9500 loss: 1.0395


In [22]:
torch.save(model.state_dict(), '../models/ch15-rnn-v1.pth')

In [26]:
from torch.distributions.categorical import Categorical
torch.manual_seed(1)
logits = torch.tensor([[1.0, 1.0, 1.0]])
print('probabilities:', nn.functional.softmax(logits, dim=1).numpy()[0])

probabilities: [0.33333334 0.33333334 0.33333334]


In [27]:
m = Categorical(logits=logits)
samples = m.sample((10,))
print(samples.numpy())

[[0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [1]
 [2]
 [1]
 [1]]


In [28]:
logits = torch.tensor([[1.0, 1.0, 3.0]])
print('probabilities:', nn.functional.softmax(logits, dim=1).numpy()[0])

probabilities: [0.10650698 0.10650698 0.78698605]


In [31]:
m = Categorical(logits=logits)
samples = m.sample((10,))
print(samples.numpy())

[[2]
 [2]
 [2]
 [2]
 [2]
 [1]
 [2]
 [2]
 [2]
 [2]]


In [34]:
def sample(model, starting_str, len_generated_text=500, scale_factor=1.0):
    encoded_input = torch.tensor([char2int[s] for s in starting_str])
    encoded_input = torch.reshape(encoded_input, (1, -1)).to(device)
    generated_str = starting_str
    
    model.eval()
    hidden, cell = model.init_hidden(1)
    for c in range(len(starting_str)-1):
        _, hidden, cell = model(encoded_input[:, c].view(1), hidden, cell)

    last_char = encoded_input[:, -1]
    for i in range(len_generated_text):
        logits, hidden, cell = model(last_char.view(1), hidden, cell)
        logits = torch.squeeze(logits, 0)
        scaled_logits = logits * scale_factor
        m = Categorical(logits=scaled_logits)
        last_char = m.sample()
        generated_str += str(char_array[last_char])

    return generated_str

In [42]:
torch.manual_seed(4)
print(sample(model, starting_str='But what was'))

But what was a complete sure of
the mystery which had been so establiquely. Pencroft, with intending to hoist emotion to all.

Towards six, Cyrus Harding, “whered are the precious verous time to Cyrus Harding’s inhabited; there even intended to survey the cost of the corral.

The six hard later stopped, and then returned to the bay, which were watercourses of our new feet nor their feasts were gazing at the southern part of the ladder--a
right by the fine season whose branches was habitable to remain power 


In [46]:
torch.manual_seed(4)
print(sample(model, starting_str='But what was', scale_factor=2.0))

But what was a man of the convicts which should be done in a certainty incident work. The sides of the northern
point of the island, and the temperature was not a continent would be to be done but to be feared, the wind had discovered the winter of the plants were to be seen that some days after the
most proposed to be desired. The reporter and his companions had not been so stretched on the shore, so that the sea
was a man of five minutes to the bottom of the well, which were all was there that he was acco


In [47]:
torch.manual_seed(4)
print(sample(model, starting_str='But what was', scale_factor=0.5))

But what was alive Linnarius gated vike is quelush. Sear had cestwaPequescollemple, but I0 isten
an thinkon-shelong. All alt.
Beoked quift, “you Cyre, cay make baddering! Where, perhrope,--tying Chasked hee
as! ” could I; sed fiers-hot.”

It had hearpy, did shop why, my bong-maYiqsies. Would sy, my oughezo;
thenvery,”o observed.

Come? It wishtwayed? Looks:-”0

Gineer; alfood awlique alypew.,--gazigh Prospearos.--


“Or, Pencroft!” send hidoxed thing; fnew effere?s
frequented fruith by seas hairwapses Ra’cv


In [48]:
torch.manual_seed(4)
print(sample(model, starting_str='which implement the neural network', scale_factor=1.5))

which implement the neural network and strewn house, and then so as to be unable to established, the positive passage had then finished, and the trees were laid on the 25th of
March, and they were about to sent them to the 15th of February, the colonists were not to be destroyed some embarking nearly swiftly on the 17th of August, and as hope themselves with the sailor’s name. There was
not a communication between the southern part of the day before and the presence of new fire was necessary to reach the bay, as they had suppose
