In [1]:
import torch
import numpy as np
import torch.nn as nn
import torchtext
from glob import glob

In [2]:
files = glob(f'../input/poemsdataset/forms/**/*.txt')
files[:5]

['../input/poemsdataset/forms/lay/LayPoemsLayAGarlandOnMyHearsePoembyFrancisBeaumont.txt',
 '../input/poemsdataset/forms/lay/LayPoemsLayHisSwordByHisSidePoembyThomasMoore.txt',
 '../input/poemsdataset/forms/lay/LayPoemsLayAGarlandOnMyHearsePoembyBeaumontandFletcher.txt',
 '../input/poemsdataset/forms/lay/LayPoemsAsILayWithHeadInYourLapCameradoPoembyWaltWhitman.txt',
 '../input/poemsdataset/forms/lay/LayPoemsTheDeerLayDownTheirBonesPoembyRobinsonJeffers.txt']

In [5]:
len(files)

6322

In [38]:
all_texts = [open(f, encoding='utf8').read() for f in files[:2000]]

In [39]:
text = [f for sublist in all_texts for f in sublist]

In [40]:
char_set = set(text)

print('Total Length:', len(text))
print('Unique characters:', len(char_set))

Total Length: 2025685
Unique characters: 311


In [41]:
chars_sorted = sorted(char_set)
char2int = {ch:i for i, ch in enumerate(chars_sorted)}

char_array = np.array(chars_sorted)

text_encoded = np.array([char2int[ch] for ch in text], dtype=np.int32)

print('Text encoded shape: ', text_encoded.shape)

print(text[:15], '     == Encoding ==> ', text_encoded[:15])
print(text_encoded[15:21], ' == Reverse  ==> ', ''.join(char_array[text_encoded[15:21]]))

Text encoded shape:  (2025685,)
['L', 'a', 'y', ' ', 'a', ' ', 'g', 'a', 'r', 'l', 'a', 'n', 'd', ' ', 'o']      == Encoding ==>  [46 67 91  2 67  2 73 67 84 78 67 80 70  2 81]
[80  2 79 91  2 74]  == Reverse  ==>  n my h


In [42]:
len(text), len(text_encoded)

(2025685, 2025685)

In [43]:
for ex in text_encoded[:5]:
    print('{} -> {}'.format(ex, char_array[ex]))

46 -> L
67 -> a
91 -> y
2 ->  
67 -> a


In [44]:
seq_length = 20
chunk_size = seq_length + 1

text_chunks = [text_encoded[i:i+chunk_size] 
               for i in range(len(text_encoded)-chunk_size)] 

## inspection:
for seq in text_chunks[:1]:
    input_seq = seq[:seq_length]
    target = seq[seq_length] 
    print(input_seq, ' -> ', target)
    print(repr(''.join(char_array[input_seq])), 
          ' -> ', repr(''.join(char_array[target])))

[46 67 91  2 67  2 73 67 84 78 67 80 70  2 81 80  2 79 91  2]  ->  74
'Lay a garland on my '  ->  'h'


In [45]:
import torch
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, text_chunks):
        self.text_chunks = text_chunks

    def __len__(self):
        return len(self.text_chunks)
    
    def __getitem__(self, idx):
        text_chunk = self.text_chunks[idx]
        return text_chunk[:-1].long(), text_chunk[1:].long()
    


In [46]:
seq_dataset = TextDataset(torch.tensor(text_chunks))

In [47]:
for i, (seq, target) in enumerate(seq_dataset):
    print(' Input (x):', repr(''.join(char_array[seq])))
    print('Target (y):', repr(''.join(char_array[target])))
    print()
    if i == 1:
        break
    

 Input (x): 'Lay a garland on my '
Target (y): 'ay a garland on my h'

 Input (x): 'ay a garland on my h'
Target (y): 'y a garland on my he'



In [48]:
device = torch.device("cuda:0")
# device = 'cpu'

In [49]:
from torch.utils.data import DataLoader
 
batch_size = 64

torch.manual_seed(1)
seq_dl = DataLoader(seq_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

In [50]:
len(seq_dataset), len(seq_dl)

(2025664, 31651)

## Building a character-level RNN model

In [51]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim) 
        self.rnn_hidden_size = rnn_hidden_size
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, 
                           batch_first=True)
        self.fc = nn.Linear(rnn_hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        out = self.embedding(x).unsqueeze(1)
        out, (hidden, cell) = self.rnn(out, (hidden, cell))
        out = self.fc(out).reshape(out.size(0), -1)
        return out, hidden, cell

    def init_hidden(self, batch_size):
        hidden = torch.zeros(1, batch_size, self.rnn_hidden_size)
        cell = torch.zeros(1, batch_size, self.rnn_hidden_size)
        return hidden.to(device), cell.to(device)
    
vocab_size = len(char_array)
embed_dim = 64
rnn_hidden_size = 64

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size) 
model = model.to(device)
model

RNN(
  (embedding): Embedding(311, 64)
  (rnn): LSTM(64, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=311, bias=True)
)

In [52]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10000

torch.manual_seed(1)

for epoch in range(num_epochs):
    hidden, cell = model.init_hidden(batch_size)
    seq_batch, target_batch = next(iter(seq_dl))
    seq_batch = seq_batch.to(device)
    target_batch = target_batch.to(device)
    optimizer.zero_grad()
    loss = 0
    for c in range(seq_length):
        pred, hidden, cell = model(seq_batch[:, c], hidden, cell) 
        loss += loss_fn(pred, target_batch[:, c])
    loss.backward()
    optimizer.step()
    loss = loss.item()/seq_length
    if epoch % 500 == 0:
        print(f'Epoch {epoch} loss: {loss:.4f}')

Epoch 0 loss: 5.7540
Epoch 500 loss: 2.5926
Epoch 1000 loss: 2.4624
Epoch 1500 loss: 2.3914
Epoch 2000 loss: 2.3157
Epoch 2500 loss: 2.2121
Epoch 3000 loss: 2.1496
Epoch 3500 loss: 2.1315
Epoch 4000 loss: 2.1071
Epoch 4500 loss: 2.1271
Epoch 5000 loss: 2.0341
Epoch 5500 loss: 2.1391
Epoch 6000 loss: 1.9936
Epoch 6500 loss: 2.1497
Epoch 7000 loss: 1.9627
Epoch 7500 loss: 2.0165
Epoch 8000 loss: 2.0778
Epoch 8500 loss: 2.0017
Epoch 9000 loss: 1.9117
Epoch 9500 loss: 1.9623


In [53]:
from torch.distributions.categorical import Categorical

def sample(model, starting_str, 
           len_generated_text=500, 
           scale_factor=1.0):

    encoded_input = torch.tensor([char2int[s] for s in starting_str])
    encoded_input = torch.reshape(encoded_input, (1, -1))

    generated_str = starting_str

    model.eval()
    hidden, cell = model.init_hidden(1)
    hidden = hidden.to('cpu')
    cell = cell.to('cpu')
    for c in range(len(starting_str)-1):
        _, hidden, cell = model(encoded_input[:, c].view(1), hidden, cell) 
    
    last_char = encoded_input[:, -1]
    for i in range(len_generated_text):
        logits, hidden, cell = model(last_char.view(1), hidden, cell) 
        logits = torch.squeeze(logits, 0)
        scaled_logits = logits * scale_factor
        m = Categorical(logits=scaled_logits)
        last_char = m.sample()
        generated_str += str(char_array[last_char])
        
    return generated_str

torch.manual_seed(1)
model.to('cpu')
print(sample(model, starting_str='The island'))

The island
Gift is light and súts,
By heirly well chanes her hould unnvence!
No leight stant;
Thy côritup; toul sone whrow beside: © Musediont gay halled all thy for on depre lat wing,
Though,
Affel
I kill; I waltime the conk it.               Af Brisish, Michael N'Fro lines.
Oga
afocks neam. groshnichint trown setter, make hully the have soin;
now leaver, goses is love ever flove parch
A may the losk formoren.
My shve chain
like assellodir; time abacc;
6 their your ner yet
Invericitam the aill the hight


In [None]:
# https://github.com/rasbt/machine-learning-book/blob/main/ch15/ch15_part3.ipynb