In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt

In [1]:
print('''
# ###########################################################################################################
#  NLP with PyTorch - Encoding Text Data
# ###########################################################################################################
''')


# ###########################################################################################################
#  NLP with PyTorch - Encoding Text Data
# ###########################################################################################################



In [3]:
with open('../../../notebooks/Data/shakespeare.txt', mode='r', encoding='utf-8') as f:
    text = f.read()

print(text)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [7]:
print(type(text))
print(len(text))

<class 'str'>
5445609


In [8]:
print(text[:1000])


                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
  His tender heir might bear his memory:
  But thou contracted to thine own bright eyes,
  Feed'st thy light's flame with self-substantial fuel,
  Making a famine where abundance lies,
  Thy self thy foe, to thy sweet self too cruel:
  Thou that art now the world's fresh ornament,
  And only herald to the gaudy spring,
  Within thine own bud buriest thy content,
  And tender churl mak'st waste in niggarding:
    Pity the world, or else this glutton be,
    To eat the world's due, by the grave and thee.


                     2
  When forty winters shall besiege thy brow,
  And dig deep trenches in thy beauty's field,
  Thy youth's proud livery so gazed on now,
  Will be a tattered weed of small worth held:  
  Then being asked, where all thy beauty lies,
  Where all the treasure of thy lusty days;
  To say within thine own deep su

In [10]:
# Derive unique characters from the text.
all_unique_chars = set(text)
print(len(all_unique_chars))
print(all_unique_chars)

84
{'F', 'z', 'h', 'u', 'm', 'U', 'k', ':', 'X', 'g', 'W', 'I', 'Y', '!', ']', 'f', 'Z', 'a', '(', 'A', '8', 'P', '"', 'R', ' ', "'", 'G', '6', 'l', 'K', '<', '>', 'p', 'o', ',', 'j', '\n', 'b', '&', '.', 'H', '[', 'L', '7', '2', 'e', 'q', 'n', '|', 'i', 't', 'D', '`', '_', 'x', 'C', '-', 'y', '4', 'V', '3', 'c', 'J', 'r', 'S', 'E', 'd', 'M', 'N', 'T', 'v', 'w', ';', 'Q', '5', '1', ')', '0', 's', '}', 'O', '9', 'B', '?'}


In [13]:
# number -> letter (Decoder takes the number values and return respective character - basically a
# lookup dictionary)
decoder = dict(enumerate(all_unique_chars))
decoder

{0: 'F',
 1: 'z',
 2: 'h',
 3: 'u',
 4: 'm',
 5: 'U',
 6: 'k',
 7: ':',
 8: 'X',
 9: 'g',
 10: 'W',
 11: 'I',
 12: 'Y',
 13: '!',
 14: ']',
 15: 'f',
 16: 'Z',
 17: 'a',
 18: '(',
 19: 'A',
 20: '8',
 21: 'P',
 22: '"',
 23: 'R',
 24: ' ',
 25: "'",
 26: 'G',
 27: '6',
 28: 'l',
 29: 'K',
 30: '<',
 31: '>',
 32: 'p',
 33: 'o',
 34: ',',
 35: 'j',
 36: '\n',
 37: 'b',
 38: '&',
 39: '.',
 40: 'H',
 41: '[',
 42: 'L',
 43: '7',
 44: '2',
 45: 'e',
 46: 'q',
 47: 'n',
 48: '|',
 49: 'i',
 50: 't',
 51: 'D',
 52: '`',
 53: '_',
 54: 'x',
 55: 'C',
 56: '-',
 57: 'y',
 58: '4',
 59: 'V',
 60: '3',
 61: 'c',
 62: 'J',
 63: 'r',
 64: 'S',
 65: 'E',
 66: 'd',
 67: 'M',
 68: 'N',
 69: 'T',
 70: 'v',
 71: 'w',
 72: ';',
 73: 'Q',
 74: '5',
 75: '1',
 76: ')',
 77: '0',
 78: 's',
 79: '}',
 80: 'O',
 81: '9',
 82: 'B',
 83: '?'}

In [14]:
# letter -> number (Encoder takes a letter and return the corresponding number for it)
encoder = {char: idx for idx, char in decoder.items()} # Dictionary generator.
encoder


{'F': 0,
 'z': 1,
 'h': 2,
 'u': 3,
 'm': 4,
 'U': 5,
 'k': 6,
 ':': 7,
 'X': 8,
 'g': 9,
 'W': 10,
 'I': 11,
 'Y': 12,
 '!': 13,
 ']': 14,
 'f': 15,
 'Z': 16,
 'a': 17,
 '(': 18,
 'A': 19,
 '8': 20,
 'P': 21,
 '"': 22,
 'R': 23,
 ' ': 24,
 "'": 25,
 'G': 26,
 '6': 27,
 'l': 28,
 'K': 29,
 '<': 30,
 '>': 31,
 'p': 32,
 'o': 33,
 ',': 34,
 'j': 35,
 '\n': 36,
 'b': 37,
 '&': 38,
 '.': 39,
 'H': 40,
 '[': 41,
 'L': 42,
 '7': 43,
 '2': 44,
 'e': 45,
 'q': 46,
 'n': 47,
 '|': 48,
 'i': 49,
 't': 50,
 'D': 51,
 '`': 52,
 '_': 53,
 'x': 54,
 'C': 55,
 '-': 56,
 'y': 57,
 '4': 58,
 'V': 59,
 '3': 60,
 'c': 61,
 'J': 62,
 'r': 63,
 'S': 64,
 'E': 65,
 'd': 66,
 'M': 67,
 'N': 68,
 'T': 69,
 'v': 70,
 'w': 71,
 ';': 72,
 'Q': 73,
 '5': 74,
 '1': 75,
 ')': 76,
 '0': 77,
 's': 78,
 '}': 79,
 'O': 80,
 '9': 81,
 'B': 82,
 '?': 83}

In [81]:
# Encode the entire text with encoder.
encoded_text = np.array([encoder[char] for char in text])
encoded_text[:500]

array([36, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
       24, 24, 24, 24, 24, 75, 36, 24, 24,  0, 63, 33,  4, 24, 15, 17, 49,
       63, 45, 78, 50, 24, 61, 63, 45, 17, 50,  3, 63, 45, 78, 24, 71, 45,
       24, 66, 45, 78, 49, 63, 45, 24, 49, 47, 61, 63, 45, 17, 78, 45, 34,
       36, 24, 24, 69,  2, 17, 50, 24, 50,  2, 45, 63, 45, 37, 57, 24, 37,
       45, 17,  3, 50, 57, 25, 78, 24, 63, 33, 78, 45, 24,  4, 49,  9,  2,
       50, 24, 47, 45, 70, 45, 63, 24, 66, 49, 45, 34, 36, 24, 24, 82,  3,
       50, 24, 17, 78, 24, 50,  2, 45, 24, 63, 49, 32, 45, 63, 24, 78,  2,
       33,  3, 28, 66, 24, 37, 57, 24, 50, 49,  4, 45, 24, 66, 45, 61, 45,
       17, 78, 45, 34, 36, 24, 24, 40, 49, 78, 24, 50, 45, 47, 66, 45, 63,
       24,  2, 45, 49, 63, 24,  4, 49,  9,  2, 50, 24, 37, 45, 17, 63, 24,
        2, 49, 78, 24,  4, 45,  4, 33, 63, 57,  7, 36, 24, 24, 82,  3, 50,
       24, 50,  2, 33,  3, 24, 61, 33, 47, 50, 63, 17, 61, 50, 45, 66, 24,
       50, 33, 24, 50,  2

In [85]:
# Declare a function for one-hot encoding.
def one_hot_encoder(encoded_text, number_of_unique_chars):

    # encoded_text -> batch of encoded text
    # number_of_unique_chars -> len(set(text))

    one_hot = np.zeros((encoded_text.size, number_of_unique_chars), dtype=np.float32)
    # OR use "one_hot = one_hot.astype(np.float32)"
    # Use the 'dtype=float32' to get precision on PyTorch.
    one_hot[np.arange(one_hot.shape[0]), encoded_text.flatten()] = 1.
    # FANCY INDEXING = Passing an array of indices to access multiple array elements at once.
    one_hot = one_hot.reshape((*encoded_text.shape, number_of_unique_chars)) # Not mandatory.
    return one_hot

In [86]:
ex = np.array([1, 2, 0, 1, 3])
print(ex)
print(one_hot_encoder(ex, 5))

[1 2 0 1 3]
[[0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0.]]


In [87]:
oh = np.zeros((5, 4), dtype=np.float32)
# FANCY INDEXING = Passing an array of indices to access multiple array elements at once.
oh[np.arange(oh.shape[0]), ex.flatten()] = 1.
oh.reshape(5, 4)

array([[0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.]], dtype=float32)

In [88]:
print('''
# ###########################################################################################################
#  NLP with PyTorch - Generating Training Batches
# ###########################################################################################################
''')


# ###########################################################################################################
#  NLP with PyTorch - Generating Training Batches
# ###########################################################################################################



In [110]:
def generate_batches(encoded_text, sample_per_batch=10, sequence_length=50):

    # X : 'encoded_text' of length 'sequence_length' -> [0, 1, 2], [1, 2, 3]
    # Y : 'encoded_text' shifted by one to right.    -> [1, 2, 3], [2, 3, 4] : shifted to right by one index.

    # Number of characters per batch
    chars_per_batch = sample_per_batch * sequence_length
    # Number of batches available.
    number_of_batches = int(len(encoded_text) / chars_per_batch)

    # Cut-off the end of the encoded_text that won't fit evenly into a batch (Loss little bit info).
    encoded_text = encoded_text[:number_of_batches * chars_per_batch]

    encoded_text = encoded_text.reshape(sample_per_batch, -1)

    for n in range(0, encoded_text.shape[1], sequence_length):

        x = encoded_text[:, n:n + sequence_length]
        # Create a 'x' like zeros array.
        y = np.zeros_like(x)

        try:

            y[:, :-1] = x[:, 1:]
            y[:, -1] = encoded_text[:, n + sequence_length]

        except IndexError:

            y[:, :-1] = x[:, 1:]
            y[:, -1] = encoded_text[:, 0]

        yield  x, y

In [116]:
my_sample = np.arange(20)
my_sample

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [117]:
my_batch_generator = generate_batches(my_sample, sample_per_batch=2, sequence_length=5)
x, y = next(my_batch_generator)
x, y

(array([[ 0,  1,  2,  3,  4],
        [10, 11, 12, 13, 14]]),
 array([[ 1,  2,  3,  4,  5],
        [11, 12, 13, 14, 15]]))

In [118]:
print('''
# ###########################################################################################################
#  NLP with PyTorch - Creating the LSTM Model
# ###########################################################################################################
''')


# ###########################################################################################################
#  NLP with PyTorch - Creating the LSTM Model
# ###########################################################################################################



In [157]:
class CharacterModel(nn.Module):

    def __init__(self, all_unique_chars, hidden_size=256, num_layers=4, drop_p=0.5, use_gpu=False):
        super().__init__()
        self.drop_p = drop_p
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.use_gpu = use_gpu

        self.all_chars = all_unique_chars
        self.decoder = dict(enumerate(all_unique_chars))
        self.encoder = {char: idx for idx, char in self.decoder.items()}

        self.lstm = nn.LSTM(len(all_unique_chars),
                            hidden_size,
                            num_layers=num_layers,
                            dropout=drop_p,
                            batch_first=True)
        self.dropout = nn.Dropout(drop_p)
        self.fc_linear = nn.Linear(hidden_size, len(all_unique_chars))

    def forward(self, X, hidden):

        lstm, hidden = self.lstm(X, hidden)
        lstm = self.dropout(lstm)
        lstm = lstm.contiguous().view(-1, self.hidden_size)
        return self.fc_linear(lstm), hidden

    def hidden_state(self, batch_size):

        if self.use_gpu:
            hidden = (torch.zeros(self.num_layers, batch_size, self.hidden_size).cuda(),
                      torch.zeros(self.num_layers, batch_size, self.hidden_size).cuda())
        else:
            hidden = (torch.zeros(self.num_layers, batch_size, self.hidden_size),
                      torch.zeros(self.num_layers, batch_size, self.hidden_size))
        return hidden

In [158]:
model = CharacterModel(all_unique_chars, hidden_size=512, num_layers=3)
model

CharacterModel(
  (lstm): LSTM(84, 512, num_layers=3, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc_linear): Linear(in_features=512, out_features=84, bias=True)
)

In [125]:
total = 0
for p in model.parameters():
    total += p.numel()
total


5470292

In [126]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [127]:
train_percent = 0.9
train_idx = int(len(encoded_text) * train_percent)
train_idx

4901048

In [128]:
training_data = encoded_text[:train_idx]
test_data = encoded_text[train_idx:]

In [129]:
print('''
# ###########################################################################################################
#  NLP with PyTorch - Training LSTM Model
# ###########################################################################################################
''')


# ###########################################################################################################
#  NLP with PyTorch - Training LSTM Model
# ###########################################################################################################



In [130]:
epochs = 60
batch_size = 100
seq_length = 100

tracker = 0

num_chars = max(encoded_text) + 1 # + 1 because indexes start at zero.

In [132]:
# Set to training mode.
model.train()

if model.use_gpu:
    model = model.cuda()

In [134]:
for i in range(epochs):

    hidden = model.hidden_state(batch_size)

    for x, y in generate_batches(encoded_text, batch_size, seq_length):

        tracker += 1

        x = one_hot_encoder(x, num_chars)

        x = torch.tensor(x)
        target = torch.tensor(y)

        if model.use_gpu:
            x = x.cuda()
            target = target.cuda()

        # Reset the hidden state. to avoid back propagation of hidden layer.
        hidden = tuple([state.data for state in hidden])

        model.zero_grad()

        lstm, hidden = model(x, hidden)
        loss = criterion(lstm, target.view(batch_size * seq_length).long())

        # Back-propagate before 'gradient clipping'.
        loss.backward()

        # Gradient clipping to avoid gradient explosions.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)

        optimizer.step()

        if tracker % 25 == 0:

            val_hidden = model.hidden_state(batch_size)

            val_losses = []
            model.eval()

            for x, y in generate_batches(test_data, batch_size, seq_length):

                x = one_hot_encoder(x, num_chars)

                x = torch.tensor(x)
                target = torch.tensor(y)

                if model.use_gpu:
                    x = x.cuda()
                    target = target.cuda()

                val_hidden = tuple([state.data for state in val_hidden])
                lstm , val_hidden = model(x, val_hidden)

                val_loss = criterion(lstm, target.view(batch_size * seq_length).long())

                val_losses.append(val_loss.item())

            model.train()

            print(f'epoch: {i}, step: {tracker} -> validation loss -> {val_loss.item()}')



epoch: 0, step: 25 -> validation loss -> 3.201533794403076


KeyboardInterrupt: 

In [None]:
# Save the trained model.
model_name = 'shakespeare_model.net___'
torch.save(model.state_dict(), model_name)


In [135]:
print('''
# ###########################################################################################################
#  NLP with PyTorch - Generating Predictions
# ###########################################################################################################
''')


# ###########################################################################################################
#  NLP with PyTorch - Generating Predictions
# ###########################################################################################################



In [159]:
# Change 'map_location' to CPU hence the model trained with GPU.
model = model.load_state_dict(torch.load(f='shakespeare_model.net', map_location=torch.device('cpu')))
model

<All keys matched successfully>

In [160]:
def predict_next_char(model, char, hidden=None, k=1):

    encoded_text = model.encoder[char]

    encoded_text = np.array([[encoded_text]])

    encoded_text = one_hot_encoder(encoded_text, len(model.all_chars))

    inputs = torch.tensor(encoded_text)

    # if model.use_gpu:
    #     inputs = inputs.cuda()

    hidden = tuple([state.data for state in model.hidden])

    lstm_out, hidden = model(inputs, hidden)

    probs = F.softmax(lstm_out, dim=1)

    # If the network is using a GPU for training and validation, probabilities returns by the model
    # should move back to the CPU, in order to further use with numpy.
    if model .use_gpu:
        probs = probs.cpu()

    # 'topk(int)' we can use to define how many top matching outputs we need consider from the probability
    # tensor
    probs, index_positions = probs.topk(k)

    index_positions = index_positions.numpy().squeeze()
    probs = probs.numpy().flatten()
    # Probabilities per index.
    probs = probs / probs.sum()

    char = np.random.choice(index_positions, p=probs)

    return model.decoder[char], hidden

In [161]:
def generate_text(model, future_predicts, seed='The', k=1):

    # if model.use_gpu:
    #     model = model.cuda()
    # else:
    #     model = model.cpu()

    model.eval()

    output_chars = [c for c in seed]
    hidden = model.hidden_state(batch_size=1)

    for char in seed:
        char, hidden = predict_next_char(model, char, hidden, k=k)

    output_chars.append(char)

    for i in range(future_predicts):
        char, hidden = predict_next_char(model, output_chars[-1], hidden, k=k)

        output_chars.append(char)

    return ''.join(output_chars)

In [162]:
print(generate_text(model, 1000, seed='The', k=1))

AttributeError: '_IncompatibleKeys' object has no attribute 'eval'