In [31]:
from tqdm import tqdm
import torch.nn as nn
import numpy as np
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [32]:
FILL_IN = "FILL_IN"

### Get the data and process
- This is the Mysterious island found in Project Gutenberg.

In [33]:
## Reading and processing text
with open('data/1268-0.txt', 'r', encoding="utf8") as fp:
    text=fp.read()

# Get the index of 'THE MYSTERIOUS ISLAND' or 'The Mysterious Island'
start_indx = text.find('THE MYSTERIOUS ISLAND')
# Get the index of 'End of the Project Gutenberg'
end_indx = text.find('END of the Project Gutenberg')

# Set text to the text between start and end idx.
text = text[start_indx:end_indx]
# Get the unique set of characters.
char_set = set(text)
print('Total Length:', len(text))
print('Unique Characters:', len(char_set))
assert(len(text) == 1130711)
assert(len(char_set) == 85)

Total Length: 1130711
Unique Characters: 85


### Tokenze and get other helpers
- We do this manually since everything is character based.

In [34]:
# The universe of words.
chars_sorted = sorted(char_set)

# Effectively, these maps are the tokenizer.
# Map each char to a unique int. This is a dict.
char2int = {char: i for i, char in enumerate(chars_sorted)}
# Do the revverse of the above, this should be a np array.
int2char =  np.array(chars_sorted)

# Tokenize the entire corpus. This should be an np array of np.int32 type.
text_encoded = np.array([char2int[char] for char in text], dtype=np.int32)

print('Text encoded shape: ', text_encoded.shape)

print(text[:15], '     == Encoding ==> ', text_encoded[:15])
print(text_encoded[15:21], ' == Reverse  ==> ', ''.join(int2char[text_encoded[15:21]]))

Text encoded shape:  (1130711,)
THE MYSTERIOUS       == Encoding ==>  [48 36 33  1 41 53 47 48 33 46 37 43 49 47  1]
[37 47 40 29 42 32]  == Reverse  ==>  ISLAND


#### Examples

In [35]:
print('Text encoded shape: ', text_encoded.shape)
print(text[:15], '     == Encoding ==> ', text_encoded[:15])
print(text_encoded[15:21], ' == Reverse  ==> ', ''.join(int2char[text_encoded[15:21]]))

Text encoded shape:  (1130711,)
THE MYSTERIOUS       == Encoding ==>  [48 36 33  1 41 53 47 48 33 46 37 43 49 47  1]
[37 47 40 29 42 32]  == Reverse  ==>  ISLAND


In [36]:
assert(
    np.array_equal(
    text_encoded[:15],
        [48, 36, 33, 1, 41, 53, 47, 48, 33, 46, 37, 43, 49, 47,  1]
    )
)

### Process the data and get the data loader

In [37]:
seq_length = 40
chunk_size = seq_length + 1

# Break up the data into chunks of size 41. This should be a list of lists.
# Use text_encoded. This will be used to get (x, y) pairs.
text_chunks = [text_encoded[i:i + chunk_size] for i in range(0, len(text_encoded) - chunk_size + 1, 1)]

In [38]:
class TextDataset(Dataset):
    def __init__(self, text_chunks):
        self.text_chunks = text_chunks

    def __len__(self):
        return len(self.text_chunks)
    
    def __getitem__(self, idx):
        # Get the text chunk at index idx.
        text_chunk = self.text_chunks[idx]
        # Return (x, y) where x has length 40 and y has length 40.
        # y should be x shifted by 1 time.
        return text_chunk[:-1], text_chunk[1:]
    
seq_dataset = TextDataset(torch.tensor(text_chunks))

In [39]:
for i, (seq, target) in enumerate(seq_dataset):
    # 40 characters for source and target ...
    print(seq.shape, target.shape)
    print('Input (x):', repr(''.join(int2char[seq])))
    print('Target (y):', repr(''.join(int2char[target])))
    print()
    if i == 1:
        break 

torch.Size([40]) torch.Size([40])
Input (x): 'THE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTER'
Target (y): 'HE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTERI'

torch.Size([40]) torch.Size([40])
Input (x): 'HE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTERI'
Target (y): 'E MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTERIO'



In [40]:
device = torch.device("cpu")

In [41]:
batch_size = 64
torch.manual_seed(1)
seq_dl = DataLoader(seq_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

### Write the models

In [42]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
        super().__init__()
        # Set to an embedding layer of vocab_size by embed_dim.
        self.embedding = nn.Embedding(vocab_size, embed_dim) 
        self.rnn_hidden_size = rnn_hidden_size
        # Set to an LSTM with x having embed_dim and h dimension rnn_hidden_size.
        # batch_first shoould be true.
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True)
        
        # Make a linear layer from rnn_hidden_size to vocab_size.
        # This will be used to get the yt for each xt.
        self.fc = nn.Linear(rnn_hidden_size, vocab_size)

    def forward(self, text, hidden=None, cell=None):
        # Get the embeddings for text.
        out = self.embedding(text)
        
        # Pass out, hidden and cell through the rnn.
        # If hidden is None, don't specify it and just use out.
        if hidden is not None:
            out, (hidden, cell) = self.rnn(out, (hidden, cell))
        else:
            out, (hidden, cell) = self.rnn(out)
        
        # Pass out through fc.
        out = self.fc(out)
        
        return out, (hidden, cell)

    def init_hidden(self, batch_size):
        # Initialize to zeros of 1 by ??? appropriate dimensions.
        hidden = torch.zeros(1, batch_size, self.rnn_hidden_size)
        cell = torch.zeros(1, batch_size, self.rnn_hidden_size)
        return hidden.to(device), cell.to(device)

### Do this right way - across all data all at once!

In [43]:
vocab_size = len(int2char)
embed_dim = 256
rnn_hidden_size = 512

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size) 
model = model.to(device)
model

RNN(
  (embedding): Embedding(85, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=85, bias=True)
)

In [44]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

# Set to 10000.
num_epochs = 1000

torch.manual_seed(1)

# epochs here will mean batches.
# If the above takes too long, use 1000.

for epoch in range(num_epochs):
    hidden, cell = model.init_hidden(batch_size)
    
    # Get the next batch from seq_dl
    seq_batch, target_batch = next(iter(seq_dl))
        
    seq_batch = seq_batch.to(device)
    target_batch = target_batch.to(device)
    
    optimizer.zero_grad()
    
    loss = 0

    # Pass through the model.
    logits, _ = model(seq_batch, hidden, cell) 
    
    # Get the loss.
    # You'll need to reshape / view things to make this work.
    loss += criterion(logits.transpose(1, 2), target_batch.long())
        
    # Do back prop.
    loss.backward()
    optimizer.step()
        
    # Get the value in the tensor loss.
    loss = loss.item()
    
    if epoch % 100 == 0:
        print(f'Epoch {epoch} loss: {loss:.4f}')

Epoch 0 loss: 4.4364
Epoch 100 loss: 1.7578
Epoch 200 loss: 1.5684
Epoch 300 loss: 1.4286
Epoch 400 loss: 1.4797
Epoch 500 loss: 1.3564
Epoch 600 loss: 1.3440
Epoch 700 loss: 1.3495
Epoch 800 loss: 1.3161
Epoch 900 loss: 1.2960


In [45]:
from torch.distributions.categorical import Categorical

torch.manual_seed(1)

logits = torch.tensor([[-1.0, 1.0, 3.0]])

# Get the probabilities for these logits.
print('Probabilities:', logits.softmax(dim=-1))

# Get a Categorical random variable with the above probabilities for each of the classes.
m = Categorical(logits.softmax(dim=-1))
# Generate 10 things.
samples = m.sample(sample_shape=torch.Size([10]))
 
print(samples.numpy())

Probabilities: tensor([[0.0159, 0.1173, 0.8668]])
[[1]
 [2]
 [2]
 [2]
 [2]
 [1]
 [2]
 [2]
 [2]
 [2]]


### Random decoding.
- This compounds problems: once you make a mistake, you can't undo it.

In [75]:
def random_sample(
    model,
    starting_str, 
    len_generated_text=500, 
):

    # Encode starting string into a tensor using char2str.
    encoded_input = torch.tensor([char2int[s] for s in starting_str])
    
    # Reshape to be 1 by ??? - let PyTorch figure this out.
    encoded_input = encoded_input.unsqueeze(0)

    # This will be what you generate, but it starts off with something.
    generated_str = starting_str

    # Put model in eval mode. This matters if we had dropout o batch / layer norms.
    model.eval()
    
    hidden, cell = model.init_hidden(1)
    
    hidden = hidden.to(device)
    
    cell = cell.to(device)
        
    # Build up the starting hidden and cell states.
    # You can do this all in one go?
    for c in range(len(starting_str)-1):
        # Feed each letter 1 by 1 and then get the final hidden state.
        out = encoded_input[:, c:c+1]
        # Pass out through, note we update hidden and cell and use them again
        _, (hidden, cell) = model(out, hidden, cell)
    
    # Gte the last char; note we did not do go to the last char above.
    last_char = encoded_input[:, -1:]

    # Generate chars one at a time, add them to generated_str.
    # Do this over and over until you get the desired length.
    for i in range(len_generated_text):
        
        # Use hidden and cell from the above.
        # Use last_char, which will be updated over and over.
        logits, (hidden, cell) = model(last_char, hidden, cell)
        
        # Get the logits.
        logits = logits[:, -1, :]

        # m is a random variable with probabilities based on the softmax of the logits.
        m = Categorical(logits.softmax(dim=-1))
        
        # Generate from m 1 char.
        last_char = m.sample().view(1,1)

        # Add the geenrated char to generated_str, but pass it through int2str so that 
        generated_str += int2char[last_char.item()]
        
    return generated_str

torch.manual_seed(1)
model.to(device)
print(random_sample(model, starting_str='The island'))

The island had build ane used a different mingly.

“Anglo, audd on writted Harding, “orded on the
moons of the rich bime direction of the mind; the “Bonadventure!” said Pencroft.

““The shunger soon an’th the trees. The reporter wheen, Ayrton or exactly
through the currently case, cried the settlers in the
purpost in such year, which
they she or descent of the sand. Their discharge collections escaped, and
fallows of his shoopes and soon on the lad been massing the most carning with will in a
hander sprin
