In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt

In [8]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [9]:
print('''
# ###########################################################################################################
#  NLP with PyTorch - Encoding Text Data
# ###########################################################################################################
''')


# ###########################################################################################################
#  NLP with PyTorch - Encoding Text Data
# ###########################################################################################################



In [10]:
with open('../../../notebooks/Data/shakespeare.txt', mode='r', encoding='utf-8') as f:
    text = f.read()

print(text)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [11]:
print(type(text))
print(len(text))

<class 'str'>
5445609


In [12]:
print(text[:1000])


                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
  His tender heir might bear his memory:
  But thou contracted to thine own bright eyes,
  Feed'st thy light's flame with self-substantial fuel,
  Making a famine where abundance lies,
  Thy self thy foe, to thy sweet self too cruel:
  Thou that art now the world's fresh ornament,
  And only herald to the gaudy spring,
  Within thine own bud buriest thy content,
  And tender churl mak'st waste in niggarding:
    Pity the world, or else this glutton be,
    To eat the world's due, by the grave and thee.


                     2
  When forty winters shall besiege thy brow,
  And dig deep trenches in thy beauty's field,
  Thy youth's proud livery so gazed on now,
  Will be a tattered weed of small worth held:  
  Then being asked, where all thy beauty lies,
  Where all the treasure of thy lusty days;
  To say within thine own deep su

In [13]:
# Derive unique characters from the text.
all_unique_chars = set(text)
print(len(all_unique_chars))
print(all_unique_chars)

84
{'P', '(', 'v', 'X', 'Y', 'r', 'd', 'u', '[', 'g', 'y', 'k', 'z', 'o', 'C', 'N', 'f', '}', '0', '6', 'F', 'W', '"', ':', 'M', '>', 'U', 'c', ')', 'h', 'D', 'V', '<', '8', 't', 'K', '|', 'H', 'q', 'Q', 'I', '\n', 'A', '.', 'p', 'w', '-', ',', 'n', '_', '5', 'x', '`', 'L', '2', '7', 'G', 'b', '9', '?', 'l', 'j', 'e', '&', 'J', 'm', '4', ' ', '!', 's', 'T', 'a', 'i', '3', 'Z', '1', 'O', ';', 'S', "'", 'R', 'E', ']', 'B'}


In [14]:
# number -> letter (Decoder takes the number values and return respective character - basically a
# lookup dictionary)
decoder = dict(enumerate(all_unique_chars))
decoder

{0: 'P',
 1: '(',
 2: 'v',
 3: 'X',
 4: 'Y',
 5: 'r',
 6: 'd',
 7: 'u',
 8: '[',
 9: 'g',
 10: 'y',
 11: 'k',
 12: 'z',
 13: 'o',
 14: 'C',
 15: 'N',
 16: 'f',
 17: '}',
 18: '0',
 19: '6',
 20: 'F',
 21: 'W',
 22: '"',
 23: ':',
 24: 'M',
 25: '>',
 26: 'U',
 27: 'c',
 28: ')',
 29: 'h',
 30: 'D',
 31: 'V',
 32: '<',
 33: '8',
 34: 't',
 35: 'K',
 36: '|',
 37: 'H',
 38: 'q',
 39: 'Q',
 40: 'I',
 41: '\n',
 42: 'A',
 43: '.',
 44: 'p',
 45: 'w',
 46: '-',
 47: ',',
 48: 'n',
 49: '_',
 50: '5',
 51: 'x',
 52: '`',
 53: 'L',
 54: '2',
 55: '7',
 56: 'G',
 57: 'b',
 58: '9',
 59: '?',
 60: 'l',
 61: 'j',
 62: 'e',
 63: '&',
 64: 'J',
 65: 'm',
 66: '4',
 67: ' ',
 68: '!',
 69: 's',
 70: 'T',
 71: 'a',
 72: 'i',
 73: '3',
 74: 'Z',
 75: '1',
 76: 'O',
 77: ';',
 78: 'S',
 79: "'",
 80: 'R',
 81: 'E',
 82: ']',
 83: 'B'}

In [15]:
# letter -> number (Encoder takes a letter and return the corresponding number for it)
encoder = {char: idx for idx, char in decoder.items()} # Dictionary generator.
encoder


{'P': 0,
 '(': 1,
 'v': 2,
 'X': 3,
 'Y': 4,
 'r': 5,
 'd': 6,
 'u': 7,
 '[': 8,
 'g': 9,
 'y': 10,
 'k': 11,
 'z': 12,
 'o': 13,
 'C': 14,
 'N': 15,
 'f': 16,
 '}': 17,
 '0': 18,
 '6': 19,
 'F': 20,
 'W': 21,
 '"': 22,
 ':': 23,
 'M': 24,
 '>': 25,
 'U': 26,
 'c': 27,
 ')': 28,
 'h': 29,
 'D': 30,
 'V': 31,
 '<': 32,
 '8': 33,
 't': 34,
 'K': 35,
 '|': 36,
 'H': 37,
 'q': 38,
 'Q': 39,
 'I': 40,
 '\n': 41,
 'A': 42,
 '.': 43,
 'p': 44,
 'w': 45,
 '-': 46,
 ',': 47,
 'n': 48,
 '_': 49,
 '5': 50,
 'x': 51,
 '`': 52,
 'L': 53,
 '2': 54,
 '7': 55,
 'G': 56,
 'b': 57,
 '9': 58,
 '?': 59,
 'l': 60,
 'j': 61,
 'e': 62,
 '&': 63,
 'J': 64,
 'm': 65,
 '4': 66,
 ' ': 67,
 '!': 68,
 's': 69,
 'T': 70,
 'a': 71,
 'i': 72,
 '3': 73,
 'Z': 74,
 '1': 75,
 'O': 76,
 ';': 77,
 'S': 78,
 "'": 79,
 'R': 80,
 'E': 81,
 ']': 82,
 'B': 83}

In [16]:
# Encode the entire text with encoder.
encoded_text = np.array([encoder[char] for char in text])
encoded_text[:500]

array([41, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67,
       67, 67, 67, 67, 67, 75, 41, 67, 67, 20,  5, 13, 65, 67, 16, 71, 72,
        5, 62, 69, 34, 67, 27,  5, 62, 71, 34,  7,  5, 62, 69, 67, 45, 62,
       67,  6, 62, 69, 72,  5, 62, 67, 72, 48, 27,  5, 62, 71, 69, 62, 47,
       41, 67, 67, 70, 29, 71, 34, 67, 34, 29, 62,  5, 62, 57, 10, 67, 57,
       62, 71,  7, 34, 10, 79, 69, 67,  5, 13, 69, 62, 67, 65, 72,  9, 29,
       34, 67, 48, 62,  2, 62,  5, 67,  6, 72, 62, 47, 41, 67, 67, 83,  7,
       34, 67, 71, 69, 67, 34, 29, 62, 67,  5, 72, 44, 62,  5, 67, 69, 29,
       13,  7, 60,  6, 67, 57, 10, 67, 34, 72, 65, 62, 67,  6, 62, 27, 62,
       71, 69, 62, 47, 41, 67, 67, 37, 72, 69, 67, 34, 62, 48,  6, 62,  5,
       67, 29, 62, 72,  5, 67, 65, 72,  9, 29, 34, 67, 57, 62, 71,  5, 67,
       29, 72, 69, 67, 65, 62, 65, 13,  5, 10, 23, 41, 67, 67, 83,  7, 34,
       67, 34, 29, 13,  7, 67, 27, 13, 48, 34,  5, 71, 27, 34, 62,  6, 67,
       34, 13, 67, 34, 29

In [17]:
# Declare a function for one-hot encoding.
def one_hot_encoder(encoded_text, number_of_unique_chars):

    # encoded_text -> batch of encoded text
    # number_of_unique_chars -> len(set(text))

    one_hot = np.zeros((encoded_text.size, number_of_unique_chars), dtype=np.float32)
    # OR use "one_hot = one_hot.astype(np.float32)"
    # Use the 'dtype=float32' to get precision on PyTorch.
    one_hot[np.arange(one_hot.shape[0]), encoded_text.flatten()] = 1.
    # FANCY INDEXING = Passing an array of indices to access multiple array elements at once.
    one_hot = one_hot.reshape((*encoded_text.shape, number_of_unique_chars)) # Not mandatory.
    return one_hot

In [18]:
ex = np.array([1, 2, 0, 1, 3])
print(ex)
print(one_hot_encoder(ex, 5))

[1 2 0 1 3]
[[0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0.]]


In [19]:
oh = np.zeros((5, 4), dtype=np.float32)
# FANCY INDEXING = Passing an array of indices to access multiple array elements at once.
oh[np.arange(oh.shape[0]), ex.flatten()] = 1.
oh.reshape(5, 4)

array([[0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.]], dtype=float32)

In [20]:
print('''
# ###########################################################################################################
#  NLP with PyTorch - Generating Training Batches
# ###########################################################################################################
''')


# ###########################################################################################################
#  NLP with PyTorch - Generating Training Batches
# ###########################################################################################################



In [21]:
def generate_batches(encoded_text, sample_per_batch=10, sequence_length=50):

    # X : 'encoded_text' of length 'sequence_length' -> [0, 1, 2], [1, 2, 3]
    # Y : 'encoded_text' shifted by one to right.    -> [1, 2, 3], [2, 3, 4] : shifted to right by one index.

    # Number of characters per batch
    chars_per_batch = sample_per_batch * sequence_length
    # Number of batches available.
    number_of_batches = int(len(encoded_text) / chars_per_batch)

    # Cut-off the end of the encoded_text that won't fit evenly into a batch (Loss little bit info).
    encoded_text = encoded_text[:number_of_batches * chars_per_batch]

    encoded_text = encoded_text.reshape(sample_per_batch, -1)

    for n in range(0, encoded_text.shape[1], sequence_length):

        x = encoded_text[:, n:n + sequence_length]
        # Create a 'x' like zeros array.
        y = np.zeros_like(x)

        try:

            y[:, :-1] = x[:, 1:]
            y[:, -1] = encoded_text[:, n + sequence_length]

        except IndexError:

            y[:, :-1] = x[:, 1:]
            y[:, -1] = encoded_text[:, 0]

        yield  x, y

In [22]:
my_sample = np.arange(20)
my_sample

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [23]:
my_batch_generator = generate_batches(my_sample, sample_per_batch=2, sequence_length=5)
x, y = next(my_batch_generator)
x, y

(array([[ 0,  1,  2,  3,  4],
        [10, 11, 12, 13, 14]]),
 array([[ 1,  2,  3,  4,  5],
        [11, 12, 13, 14, 15]]))

In [24]:
print('''
# ###########################################################################################################
#  NLP with PyTorch - Creating the LSTM Model
# ###########################################################################################################
''')


# ###########################################################################################################
#  NLP with PyTorch - Creating the LSTM Model
# ###########################################################################################################



In [25]:
class CharacterModel(nn.Module):

    def __init__(self, all_unique_chars, hidden_size=256, num_layers=4, drop_p=0.5, use_gpu=False):
        super().__init__()
        self.drop_p = drop_p
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.use_gpu = use_gpu

        self.all_chars = all_unique_chars
        self.decoder = dict(enumerate(all_unique_chars))
        self.encoder = {char: idx for idx, char in self.decoder.items()}

        self.lstm = nn.LSTM(len(all_unique_chars),
                            hidden_size,
                            num_layers=num_layers,
                            dropout=drop_p,
                            batch_first=True)
        self.dropout = nn.Dropout(drop_p)
        self.fc_linear = nn.Linear(hidden_size, len(all_unique_chars))

    def forward(self, X, hidden):

        lstm, hidden = self.lstm(X, hidden)
        lstm = self.dropout(lstm)
        lstm = lstm.contiguous().view(-1, self.hidden_size)
        return self.fc_linear(lstm), hidden

    def hidden_state(self, batch_size):

        if self.use_gpu:
            hidden = (torch.zeros(self.num_layers, batch_size, self.hidden_size).cuda(),
                      torch.zeros(self.num_layers, batch_size, self.hidden_size).cuda())
        else:
            hidden = (torch.zeros(self.num_layers, batch_size, self.hidden_size),
                      torch.zeros(self.num_layers, batch_size, self.hidden_size))
        return hidden

In [34]:
model = CharacterModel(all_unique_chars, hidden_size=512, num_layers=3).to(device)
model

CharacterModel(
  (lstm): LSTM(84, 512, num_layers=3, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc_linear): Linear(in_features=512, out_features=84, bias=True)
)

In [35]:
total = 0
for p in model.parameters():
    total += p.numel()
total


5470292

In [36]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss().to(device)

In [37]:
train_percent = 0.9
train_idx = int(len(encoded_text) * train_percent)
train_idx

4901048

In [38]:
training_data = encoded_text[:train_idx]
test_data = encoded_text[train_idx:]

In [39]:
print('''
# ###########################################################################################################
#  NLP with PyTorch - Training LSTM Model
# ###########################################################################################################
''')


# ###########################################################################################################
#  NLP with PyTorch - Training LSTM Model
# ###########################################################################################################



In [40]:
epochs = 60
batch_size = 100
seq_length = 100

tracker = 0

num_chars = max(encoded_text) + 1 # + 1 because indexes start at zero.

In [41]:
# Set to training mode.
model.train()

In [44]:
for i in range(epochs):

    hidden = model.hidden_state(batch_size)

    for x, y in generate_batches(encoded_text, batch_size, seq_length):

        tracker += 1

        x = one_hot_encoder(x, num_chars)

        x = torch.tensor(x, device=device)
        target = torch.tensor(y, device=device)

        # Reset the hidden state. to avoid back propagation of hidden layer.
        hidden = tuple([state.data.to(device) for state in hidden])

        model.zero_grad()

        lstm, hidden = model(x, hidden)
        loss = criterion(lstm, target.view(batch_size * seq_length).long())

        # Back-propagate before 'gradient clipping'.
        loss.backward()

        # Gradient clipping to avoid gradient explosions.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)

        optimizer.step()

        if tracker % 25 == 0:

            val_hidden = model.hidden_state(batch_size)

            val_losses = []
            model.eval()

            for x, y in generate_batches(test_data, batch_size, seq_length):

                x = one_hot_encoder(x, num_chars)

                x = torch.tensor(x)
                target = torch.tensor(y)

                if model.use_gpu:
                    x = x.cuda()
                    target = target.cuda()

                val_hidden = tuple([state.data for state in val_hidden])
                lstm , val_hidden = model(x, val_hidden)

                val_loss = criterion(lstm, target.view(batch_size * seq_length).long())

                val_losses.append(val_loss.item())

            model.train()

            print(f'epoch: {i}, step: {tracker} -> validation loss -> {val_loss.item()}')



RuntimeError: Input and parameter tensors are not at the same device, found input tensor at cpu and parameter tensor at cuda:0

In [None]:
# Save the trained model.
model_name = 'shakespeare_model.net___'
torch.save(model.state_dict(), model_name)


In [135]:
print('''
# ###########################################################################################################
#  NLP with PyTorch - Generating Predictions
# ###########################################################################################################
''')


# ###########################################################################################################
#  NLP with PyTorch - Generating Predictions
# ###########################################################################################################



In [159]:
# Change 'map_location' to CPU hence the model trained with GPU.
model = model.load_state_dict(torch.load(f='shakespeare_model.net', map_location=torch.device('cpu')))
model

<All keys matched successfully>

In [160]:
def predict_next_char(model, char, hidden=None, k=1):

    encoded_text = model.encoder[char]

    encoded_text = np.array([[encoded_text]])

    encoded_text = one_hot_encoder(encoded_text, len(model.all_chars))

    inputs = torch.tensor(encoded_text)

    # if model.use_gpu:
    #     inputs = inputs.cuda()

    hidden = tuple([state.data for state in model.hidden])

    lstm_out, hidden = model(inputs, hidden)

    probs = F.softmax(lstm_out, dim=1)

    # If the network is using a GPU for training and validation, probabilities returns by the model
    # should move back to the CPU, in order to further use with numpy.
    if model .use_gpu:
        probs = probs.cpu()

    # 'topk(int)' we can use to define how many top matching outputs we need consider from the probability
    # tensor
    probs, index_positions = probs.topk(k)

    index_positions = index_positions.numpy().squeeze()
    probs = probs.numpy().flatten()
    # Probabilities per index.
    probs = probs / probs.sum()

    char = np.random.choice(index_positions, p=probs)

    return model.decoder[char], hidden

In [161]:
def generate_text(model, future_predicts, seed='The', k=1):

    # if model.use_gpu:
    #     model = model.cuda()
    # else:
    #     model = model.cpu()

    model.eval()

    output_chars = [c for c in seed]
    hidden = model.hidden_state(batch_size=1)

    for char in seed:
        char, hidden = predict_next_char(model, char, hidden, k=k)

    output_chars.append(char)

    for i in range(future_predicts):
        char, hidden = predict_next_char(model, output_chars[-1], hidden, k=k)

        output_chars.append(char)

    return ''.join(output_chars)

In [162]:
print(generate_text(model, 1000, seed='The', k=1))

AttributeError: '_IncompatibleKeys' object has no attribute 'eval'

In [163]:
# This lecture series (NLP) is not completely done due to the performance issue in the laptop.
# Must re-evaluate and try alone to the same or in a different approach.