In [8]:
with open('1268-0.txt', 'r', encoding='utf8') as fp:
  text = fp.read()

start_idx = text.find('THE MYSTERIOUS ISLAND')
end_idx = text.find('\n\n*** END OF THE PROJECT GUTENBERG')
text = text[start_idx:end_idx]
char_set = set(text)
print('total length', len(text))
print('unique characters:', len(char_set))

total length 1112261
unique characters: 79


In [100]:
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.distributions.categorical import Categorical

from tqdm import tqdm
torch.cuda.is_available()

True

In [13]:

chars_sorted = sorted(char_set)
c2i = {ch:i for i, ch in enumerate(chars_sorted)}
i2c = np.array(chars_sorted)
text_encoded = np.array([c2i[ch] for ch in text], dtype=np.int32)
decode = lambda arr: ''.join(i2c[arr])

In [15]:
print(text_encoded[100:200])
decode(text_encoded[100:200])

[53  1 66 57 67 57 62 55  1 49 55 49 57 62 23 78  1 77 37 63  8  1 38 62
  1 68 56 53  1 51 63 62 68 66 49 66 73  8 78  1 77 24 66 53  1 71 53  1
 52 53 67 51 53 62 52 57 62 55 23 78  0  1 77 46 63 66 67 53  1 68 56 49
 62  1 68 56 49 68  6  1 51 49 64 68 49 57 62  2  1 71 53  1 49 66 53  1
 54 49 60 60]


'e rising again?” “No. On the contrary.” “Are we descending?”\n “Worse than that, captain! we are fall'

In [17]:
seq_length = 40
chunk_size = seq_length + 1
text_chunks = np.array([text_encoded[i:i+chunk_size] for i in range(len(text_encoded) - chunk_size)])

In [25]:
class TextDataset(Dataset):
  def __init__(self, text_chunks):
    self.text_chunks = torch.tensor(text_chunks)

  def __len__(self):
    return len(self.text_chunks)

  def __getitem__(self, idx):
    chunk = self.text_chunks[idx]
    return chunk[:-1].long(), chunk[1:].long()

seq_dataset = TextDataset(text_chunks)

In [27]:
batch_size = 64
seq_dl = DataLoader(seq_dataset, batch_size=64, shuffle=True, drop_last=True)

In [29]:
device = torch.device('cuda')

class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn_hidden_size = rnn_hidden_size
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True)

        self.fc = nn.Linear(rnn_hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        out = self.embedding(x).unsqueeze(1)
        out, (hidden, cell) = self.rnn(out, (hidden, cell))
        out = self.fc(out).reshape(out.size(0), -1)
        return out, hidden, cell

    def init_hidden(self, batch_size):
        hidden = torch.zeros(1, batch_size, self.rnn_hidden_size)
        cell = torch.zeros(1, batch_size, self.rnn_hidden_size)
        return hidden.to(device), cell.to(device)

In [30]:
vocab_size = len(i2c)
embed_dim = 256
rnn_hidden_size = 512
torch.manual_seed(999)
model = RNN(vocab_size, embed_dim, rnn_hidden_size).to(device)
model

RNN(
  (embedding): Embedding(79, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=79, bias=True)
)

In [32]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10000
torch.manual_seed(1)

for epoch in tqdm(range(num_epochs)):
    hidden, cell = model.init_hidden(batch_size)
    seq_batch, target_batch = next(iter(seq_dl))
    seq_batch = seq_batch.to(device)
    target_batch = target_batch.to(device)
    optimizer.zero_grad()
    loss = 0
    for c in range(seq_length):
        pred, hidden, cell = model(seq_batch[:, c], hidden, cell)
        loss += loss_fn(pred, target_batch[:, c])
    loss.backward()
    optimizer.step()
    loss = loss.item()/seq_length
    if epoch % 500 == 0:
        print(f'Epoch {epoch} loss: {loss:.4f}')

  0%|          | 2/10000 [00:00<1:07:05,  2.48it/s]

Epoch 0 loss: 4.3808


  5%|▌         | 502/10000 [01:02<17:28,  9.06it/s]

Epoch 500 loss: 1.4309


 10%|█         | 1002/10000 [01:59<15:27,  9.70it/s]

Epoch 1000 loss: 1.3561


 15%|█▌        | 1502/10000 [02:54<14:41,  9.64it/s]

Epoch 1500 loss: 1.1829


 20%|██        | 2002/10000 [03:48<13:34,  9.82it/s]

Epoch 2000 loss: 1.2763


 25%|██▌       | 2502/10000 [04:43<12:42,  9.83it/s]

Epoch 2500 loss: 1.2098


 30%|███       | 3002/10000 [05:37<12:03,  9.67it/s]

Epoch 3000 loss: 1.1994


 35%|███▌      | 3502/10000 [06:32<11:08,  9.72it/s]

Epoch 3500 loss: 1.1642


 40%|████      | 4002/10000 [07:26<10:09,  9.84it/s]

Epoch 4000 loss: 1.1820


 45%|████▌     | 4502/10000 [08:21<09:38,  9.51it/s]

Epoch 4500 loss: 1.1529


 50%|█████     | 5002/10000 [09:16<08:34,  9.72it/s]

Epoch 5000 loss: 1.0796


 55%|█████▌    | 5502/10000 [10:11<07:37,  9.84it/s]

Epoch 5500 loss: 1.1064


 60%|██████    | 6002/10000 [11:06<10:17,  6.47it/s]

Epoch 6000 loss: 1.0831


 65%|██████▌   | 6503/10000 [12:01<05:57,  9.78it/s]

Epoch 6500 loss: 1.0862


 70%|███████   | 7002/10000 [12:57<05:11,  9.62it/s]

Epoch 7000 loss: 1.0915


 75%|███████▌  | 7502/10000 [13:52<04:33,  9.13it/s]

Epoch 7500 loss: 1.0820


 80%|████████  | 8003/10000 [14:47<03:24,  9.76it/s]

Epoch 8000 loss: 1.0788


 85%|████████▌ | 8502/10000 [15:41<02:38,  9.46it/s]

Epoch 8500 loss: 1.0881


 90%|█████████ | 9002/10000 [16:38<01:46,  9.35it/s]

Epoch 9000 loss: 1.0339


 95%|█████████▌| 9502/10000 [17:35<00:52,  9.45it/s]

Epoch 9500 loss: 1.0344


100%|██████████| 10000/10000 [18:32<00:00,  8.99it/s]


In [34]:
torch.save(model.state_dict(), '/content/models/gen-v1.pth')

In [35]:
vocab_size = len(i2c)
embed_dim = 128
rnn_hidden_size = 512
torch.manual_seed(999)
model = RNN(vocab_size, embed_dim, rnn_hidden_size).to(device)
model

RNN(
  (embedding): Embedding(79, 128)
  (rnn): LSTM(128, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=79, bias=True)
)

In [98]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10000
torch.manual_seed(1)
milestone_length = 500
epoch_ranges = [(0, 1), (1, milestone_length)] + [(i, i+milestone_length) for i in range(milestone_length, num_epochs, milestone_length)]

for i, (start_epoch, end_epoch) in enumerate(epoch_ranges):
  loss: float
  for epoch in tqdm(range(start_epoch, end_epoch), f"epoch {start_epoch:04d}-{end_epoch-1:04d} "):
    hidden, cell = model.init_hidden(batch_size)
    seq_batch, target_batch = next(iter(seq_dl))
    seq_batch = seq_batch.to(device)
    target_batch = target_batch.to(device)
    optimizer.zero_grad()
    loss = 0
    for c in range(seq_length):
        pred, hidden, cell = model(seq_batch[:, c], hidden, cell)
        loss += loss_fn(pred, target_batch[:, c])
    loss.backward()
    optimizer.step()
    loss = loss.item()/seq_length

  if i == len(epoch_ranges) - 1:
    print()
  print(f'loss = {loss}')

epoch 0000-0000 : 100%|██████████| 1/1 [00:00<00:00,  8.78it/s]


loss = 4.35783805847168


epoch 0001-0499 : 100%|██████████| 499/499 [00:56<00:00,  8.83it/s]


loss = 1.6139293670654298


epoch 0500-0999 : 100%|██████████| 500/500 [00:55<00:00,  8.99it/s]


loss = 1.3695477485656737


epoch 1000-1499 : 100%|██████████| 500/500 [00:56<00:00,  8.91it/s]


loss = 1.3497934341430664


epoch 1500-1999 : 100%|██████████| 500/500 [01:00<00:00,  8.20it/s]


loss = 1.2303714752197266


epoch 2000-2499 : 100%|██████████| 500/500 [00:56<00:00,  8.92it/s]


loss = 1.2857511520385743


epoch 2500-2999 : 100%|██████████| 500/500 [00:55<00:00,  8.95it/s]


loss = 1.1749987602233887


epoch 3000-3499 : 100%|██████████| 500/500 [00:55<00:00,  8.97it/s]


loss = 1.1708408355712892


epoch 3500-3999 : 100%|██████████| 500/500 [00:56<00:00,  8.86it/s]


loss = 1.1311406135559081


epoch 4000-4499 : 100%|██████████| 500/500 [01:00<00:00,  8.29it/s]


loss = 1.1033601760864258


epoch 4500-4999 : 100%|██████████| 500/500 [00:56<00:00,  8.88it/s]


loss = 1.1575303077697754


epoch 5000-5499 : 100%|██████████| 500/500 [00:56<00:00,  8.90it/s]


loss = 1.0231558799743652


epoch 5500-5999 : 100%|██████████| 500/500 [00:56<00:00,  8.78it/s]


loss = 1.1229991912841797


epoch 6000-6499 : 100%|██████████| 500/500 [00:56<00:00,  8.82it/s]


loss = 1.0438895225524902


epoch 6500-6999 : 100%|██████████| 500/500 [00:56<00:00,  8.91it/s]


loss = 1.0402191162109375


epoch 7000-7499 : 100%|██████████| 500/500 [00:56<00:00,  8.78it/s]


loss = 1.1067254066467285


epoch 7500-7999 : 100%|██████████| 500/500 [00:56<00:00,  8.80it/s]


loss = 1.026358413696289


epoch 8000-8499 : 100%|██████████| 500/500 [00:56<00:00,  8.84it/s]


loss = 1.0748133659362793


epoch 8500-8999 : 100%|██████████| 500/500 [00:59<00:00,  8.47it/s]


loss = 0.978275203704834


epoch 9000-9499 : 100%|██████████| 500/500 [00:58<00:00,  8.62it/s]


loss = 1.016568374633789


epoch 9500-9999 : 100%|██████████| 500/500 [01:00<00:00,  8.27it/s]


loss = 0.9583173751831054





In [99]:
torch.save(model.state_dict(), '/content/models/gen-v1.1.pth')

In [101]:
def sample(model, starting_str, len_generated_text=500, scale_factor=1.0):
    encoded_input = torch.tensor([c2i[s] for s in starting_str])
    encoded_input = torch.reshape(encoded_input, (1, -1)).to(device)
    generated_str = starting_str

    model.eval()
    hidden, cell = model.init_hidden(1)
    for c in range(len(starting_str)-1):
        _, hidden, cell = model(encoded_input[:, c].view(1), hidden, cell)

    last_char = encoded_input[:, -1]
    for i in range(len_generated_text):
        logits, hidden, cell = model(last_char.view(1), hidden, cell)
        logits = torch.squeeze(logits, 0)
        scaled_logits = logits * scale_factor
        m = Categorical(logits=scaled_logits)
        last_char = m.sample()
        generated_str += str(i2c[last_char])

    return generated_str

In [124]:
torch.manual_seed(4)
print(sample(model, starting_str='But what was', scale_factor=2.0))

But what was impossible to draw fire at the moment when they had been settled to do the colonists felt out the most presentiments and magnetized by the wind and explored the strangers of the boat, and the first degree for the time they must have been cast on the first part of the trees still remained in the sea, which rested on the sand, the building of the forest,
and began to fire, began to accompany him, but not a second beak, so for the appearance of the colonists presented in the corral, when the engin


In [107]:
class RNN_v2(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn_hidden_size = rnn_hidden_size
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, num_layers=1, bidirectional=True, batch_first=True)
        self.layer_norm = nn.LayerNorm(rnn_hidden_size)
        self.fc = nn.Linear(rnn_hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        out = self.embedding(x).unsqueeze(1)
        out, (hidden, cell) = self.rnn(out, (hidden, cell))
        out = self.layer_norm(out)
        out = self.fc(out).reshape(out.size(0), -1)
        return out, hidden, cell

    def init_hidden(self, batch_size):
        hidden = torch.zeros(1, batch_size, self.rnn_hidden_size)
        cell = torch.zeros(1, batch_size, self.rnn_hidden_size)
        return hidden.to(device), cell.to(device)

vocab_size = len(i2c)
embed_dim = 128
rnn_hidden_size = 512
torch.manual_seed(999)
model_v2 = RNN(vocab_size, embed_dim, rnn_hidden_size).to(device)
model_v2


RNN(
  (embedding): Embedding(79, 128)
  (rnn): LSTM(128, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=79, bias=True)
)

In [108]:
batch_size = 64
seq_dl = DataLoader(seq_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_v2.parameters(), lr=0.001)

num_epochs = 5000
torch.manual_seed(1)
milestone_length = 500
epoch_ranges = [(0, 1), (1, milestone_length)] + [(i, i+milestone_length) for i in range(milestone_length, num_epochs, milestone_length)]

for i, (start_epoch, end_epoch) in enumerate(epoch_ranges):
  train_loss: float
  for epoch in tqdm(range(start_epoch, end_epoch), f"epoch {start_epoch:04d}-{end_epoch-1:04d} ", leave=False):
    hidden, cell = model_v2.init_hidden(batch_size)
    seq_batch, target_batch = next(iter(seq_dl))
    seq_batch = seq_batch.to(device)
    target_batch = target_batch.to(device)
    optimizer.zero_grad()
    loss = 0
    for c in range(seq_length):
        pred, hidden, cell = model_v2(seq_batch[:, c], hidden, cell)
        loss += loss_fn(pred, target_batch[:, c])
    loss.backward()
    optimizer.step()
    loss = loss.item()/seq_length

  if i == len(epoch_ranges) - 1:
    print(f'\r\repoch {epoch+1} loss = {loss}')
  else:
    print(f'epoch {epoch+1} loss = {loss}')



epoch 1 loss = 4.35783805847168




epoch 500 loss = 1.6139293670654298




epoch 1000 loss = 1.3695477485656737




epoch 1500 loss = 1.3497934341430664




epoch 2000 loss = 1.2303714752197266




epoch 2500 loss = 1.2857511520385743




epoch 3000 loss = 1.1749987602233887




epoch 3500 loss = 1.1708408355712892




epoch 4000 loss = 1.1311406135559081




epoch 4500 loss = 1.1033601760864258


                                                                   

epoch 5000 loss = 1.1575303077697754




In [120]:
torch.save(model_v2.state_dict(), '/content/models/gen-v2.0.pth')

In [123]:
torch.manual_seed(4)
print(sample(model_v2, starting_str='But what was', scale_factor=2))

But what was not succeeded the desert of the lava, and the tide was no longer
the shore of the little bottom of the colony. The place which had contained
them to the engineer.

“The spot more than a man of many despet. In a few minutes they had been accomplished, and the southern coast of the colonists were
able to replace the convicts, not a serious second capes of a continent, and the two companions set out the same time to the settlers were all of his companions, and his companions could not be able to w
