In [142]:
import torch
import numpy as np
import torch.nn as nn
import torchtext
from glob import glob

In [143]:
files = glob(f'../input/poemsdataset/forms/**/*.txt')
print(len(files));files[:5]

6322


['../input/poemsdataset/forms/lay/LayPoemsLayAGarlandOnMyHearsePoembyFrancisBeaumont.txt',
 '../input/poemsdataset/forms/lay/LayPoemsLayHisSwordByHisSidePoembyThomasMoore.txt',
 '../input/poemsdataset/forms/lay/LayPoemsLayAGarlandOnMyHearsePoembyBeaumontandFletcher.txt',
 '../input/poemsdataset/forms/lay/LayPoemsAsILayWithHeadInYourLapCameradoPoembyWaltWhitman.txt',
 '../input/poemsdataset/forms/lay/LayPoemsTheDeerLayDownTheirBonesPoembyRobinsonJeffers.txt']

In [144]:
all_texts = [open(f, encoding='utf8').read() for f in files[:1000]]
all_texts = [f.split('\n') for f in all_texts]
all_texts[:2]

[['Lay a garland on my hearse,',
  'Of the dismal yew,',
  'Maidens, willow branches bear,',
  'Say I died true.',
  'My love was false, but I was firm',
  'From my hour of birth;',
  'Upon my buried body lie',
  'Lightly, gentle earth.'],
 ['Lay his sword by his side -- it hath served him too well',
  'Not to rest near his pillow below;',
  'To the last moment true, from his hand ere it fell,',
  "Its point was still turn'd to a flying foe.",
  'Fellow-labourers in life, let them slumber in death,',
  'Side by side, as becomes the reposing brave --',
  'That sword which he loved still unbroke in its sheath,',
  'And himself unsubdued in his grave.',
  'Yet pause -- for, in fancy, a still voice I hear,',
  "As if breathed from his brave heart's remains; --",
  "Faint echo of that which, in Slavery's ear,",
  'Once sounded the war-word, "Burst your chains."',
  'And it cries, from the grave where the hero lies deep,',
  '"Though the day of your Chieftain for ever hath set,',
  'Oh leave

In [145]:
sentences = [sent for sublist in all_texts for sent in sublist]
sentences[:5]

['Lay a garland on my hearse,',
 'Of the dismal yew,',
 'Maidens, willow branches bear,',
 'Say I died true.',
 'My love was false, but I was firm']

In [146]:
from collections import Counter, OrderedDict
from nltk.tokenize import word_tokenize

token_counts = Counter()

def tokenizer(sent):
    return word_tokenize(sent)

for sent in sentences:
    tokens = tokenizer(sent)
    token_counts.update(tokens)
 
    
print('Vocab-size:', len(token_counts))

Vocab-size: 25682


In [147]:
## Step 3: encoding each unique token into integers
from torchtext.vocab import vocab

sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)

vocab = vocab(ordered_dict)

vocab.insert_token("<pad>", 0)
vocab.insert_token("<unk>", 1)
vocab.set_default_index(1)

print([vocab[token] for token in ['this', 'is', 'an', 'example']])

[52, 15, 85, 8263]


In [148]:
text_encoded = [vocab(tokenizer(sent)) for sent in sentences]
print(len(text_encoded));text_encoded[:4]

33388


[[2500, 11, 4827, 24, 19, 7707, 2],
 [84, 3, 4008, 5884, 2],
 [4009, 2, 5885, 1441, 590, 2],
 [1114, 10, 618, 323, 4]]

In [149]:
print(sentences[:6])
for text in text_encoded[:6]:
    print(text)
    print(vocab.lookup_tokens(text))

['Lay a garland on my hearse,', 'Of the dismal yew,', 'Maidens, willow branches bear,', 'Say I died true.', 'My love was false, but I was firm', 'From my hour of birth;']
[2500, 11, 4827, 24, 19, 7707, 2]
['Lay', 'a', 'garland', 'on', 'my', 'hearse', ',']
[84, 3, 4008, 5884, 2]
['Of', 'the', 'dismal', 'yew', ',']
[4009, 2, 5885, 1441, 590, 2]
['Maidens', ',', 'willow', 'branches', 'bear', ',']
[1114, 10, 618, 323, 4]
['Say', 'I', 'died', 'true', '.']
[111, 42, 31, 1369, 2, 57, 10, 31, 2275]
['My', 'love', 'was', 'false', ',', 'but', 'I', 'was', 'firm']
[132, 19, 462, 6, 463, 12]
['From', 'my', 'hour', 'of', 'birth', ';']


In [150]:
test = [torch.tensor(t) for t in text_encoded]
text_chunk = nn.utils.rnn.pad_sequence(
        test, batch_first=True)

In [151]:
text_chunk

tensor([[2500,   11, 4827,  ...,    0,    0,    0],
        [  84,    3, 4008,  ...,    0,    0,    0],
        [4009,    2, 5885,  ...,    0,    0,    0],
        ...,
        [3540,    2,  598,  ...,    0,    0,    0],
        [7264,   41,  434,  ...,    0,    0,    0],
        [  69,   23,   46,  ...,    0,    0,    0]])

In [152]:
text_chunk[0]

tensor([2500,   11, 4827,   24,   19, 7707,    2,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,   

In [153]:
text_chunk[1]

tensor([  84,    3, 4008, 5884,    2,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,   

In [154]:
import torch
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, text_chunks):
        self.text_chunks = text_chunks

    def __len__(self):
        return len(self.text_chunks)
    
    def __getitem__(self, idx):
        text_chunk = self.text_chunks[idx]
        return text_chunk[:-1].long(), text_chunk[1:].long()

In [155]:
seq_dataset = TextDataset(torch.tensor(text_chunk))

  """Entry point for launching an IPython kernel.


In [156]:
seq_dataset[0]

(tensor([2500,   11, 4827,   24,   19, 7707,    2,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,  

In [157]:
x, y = seq_dataset[0]
print(vocab.lookup_tokens(x.numpy()))
print(vocab.lookup_tokens(y.numpy()))

['Lay', 'a', 'garland', 'on', 'my', 'hearse', ',', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pa

In [158]:
print(len(seq_dataset))
len(seq_dataset[0][0])

33388


559

In [159]:
device = torch.device("cuda:0")
# device = 'cpu'

In [160]:
from torch.utils.data import DataLoader
 
batch_size = 32

torch.manual_seed(1)
seq_dl = DataLoader(seq_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

In [161]:
len(seq_dataset), len(seq_dl)

(33388, 1043)

## Building a character-level RNN model

In [162]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim) 
        self.rnn_hidden_size = rnn_hidden_size
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, 
                           batch_first=True)
        self.fc = nn.Linear(rnn_hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        out = self.embedding(x).unsqueeze(1)
        out, (hidden, cell) = self.rnn(out, (hidden, cell))
        out = self.fc(out).reshape(out.size(0), -1)
        return out, hidden, cell

    def init_hidden(self, batch_size):
        hidden = torch.zeros(1, batch_size, self.rnn_hidden_size)
        cell = torch.zeros(1, batch_size, self.rnn_hidden_size)
        return hidden.to(device), cell.to(device)
    
vocab_size = len(vocab)
embed_dim = 64
rnn_hidden_size = 64
seq_length = len(seq_dataset[0][0])

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size) 
model = model.to(device)
model

RNN(
  (embedding): Embedding(25684, 64)
  (rnn): LSTM(64, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=25684, bias=True)
)

In [163]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10000

torch.manual_seed(1)

for epoch in range(num_epochs):
    hidden, cell = model.init_hidden(batch_size)
    seq_batch, target_batch = next(iter(seq_dl))
    seq_batch = seq_batch.to(device)
    target_batch = target_batch.to(device)
    optimizer.zero_grad()
    loss = 0
    for c in range(seq_length):
        pred, hidden, cell = model(seq_batch[:, c], hidden, cell) 
        loss += loss_fn(pred, target_batch[:, c])
    loss.backward()
    optimizer.step()
    loss = loss.item()/seq_length
    if epoch % 500 == 0:
        print(f'Epoch {epoch} loss: {loss:.4f}')

Epoch 0 loss: 10.1427
Epoch 500 loss: 0.0967
Epoch 1000 loss: 0.0892
Epoch 1500 loss: 0.0932
Epoch 2000 loss: 0.0883
Epoch 2500 loss: 0.0909
Epoch 3000 loss: 0.0781
Epoch 3500 loss: 0.0918
Epoch 4000 loss: 0.3468
Epoch 4500 loss: 0.0804
Epoch 5000 loss: 0.0645
Epoch 5500 loss: 0.0909
Epoch 6000 loss: 0.0934
Epoch 6500 loss: 0.0752
Epoch 7000 loss: 0.0620
Epoch 7500 loss: 0.0700
Epoch 8000 loss: 0.0555
Epoch 8500 loss: 0.0614
Epoch 9000 loss: 0.0656
Epoch 9500 loss: 0.0640


In [164]:
from torch.distributions.categorical import Categorical

def sample(model, starting_str, 
           len_generated_text=20, 
           scale_factor=1.0):

    encoded_input = torch.tensor([vocab[s] for s in starting_str])
    encoded_input = torch.reshape(encoded_input, (1, -1))

    generated_str = starting_str

    model.eval()
    hidden, cell = model.init_hidden(1)
    hidden = hidden.to('cpu')
    cell = cell.to('cpu')
    for c in range(len(starting_str)-1):
        _, hidden, cell = model(encoded_input[:, c].view(1), hidden, cell) 
    
    last_char = encoded_input[:, -1]
    for i in range(len_generated_text):
        logits, hidden, cell = model(last_char.view(1), hidden, cell) 
        logits = torch.squeeze(logits, 0)
        scaled_logits = logits * scale_factor
        m = Categorical(logits=scaled_logits)
        last_char = m.sample()
        generated_str += str(vocab.lookup_token(last_char.numpy()))
        
    return generated_str

torch.manual_seed(1)
model.to('cpu')
print(sample(model, starting_str='The island'))

The island<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


In [165]:
print(sample(model, starting_str='The silence tree'))

The silence treeHOVERINGrimsrepublicslosinggloom,<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


In [166]:
print(sample(model, starting_str='The silence tree',len_generated_text=10))

The silence treedeLaservice<pad><pad><pad><pad><pad><pad><pad>


In [167]:
print(sample(model, starting_str='In the end',len_generated_text=10))

In the endcut<pad><pad><pad><pad><pad><pad><pad><pad><pad>


In [168]:
print(sample(model, starting_str='I see the light',len_generated_text=10))

I see the lightlivedesign,<pad><pad><pad><pad><pad><pad><pad>


In [None]:
# https://github.com/rasbt/machine-learning-book/blob/main/ch15/ch15_part3.ipynb