# RNN for Text Generation

## Generating Text (encoded variables)

We saw how to generate continuous values, now let's see how to generalize this to generate categorical sequences (such as words or letters).

In [1]:
import torch
from torch import nn
import torch.nn.functional as F

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
with open('../Data/shakespeare.txt','r',encoding='utf8') as f:
    text = f.read()

In [3]:
text[:1000]

"\n                     1\n  From fairest creatures we desire increase,\n  That thereby beauty's rose might never die,\n  But as the riper should by time decease,\n  His tender heir might bear his memory:\n  But thou contracted to thine own bright eyes,\n  Feed'st thy light's flame with self-substantial fuel,\n  Making a famine where abundance lies,\n  Thy self thy foe, to thy sweet self too cruel:\n  Thou that art now the world's fresh ornament,\n  And only herald to the gaudy spring,\n  Within thine own bud buriest thy content,\n  And tender churl mak'st waste in niggarding:\n    Pity the world, or else this glutton be,\n    To eat the world's due, by the grave and thee.\n\n\n                     2\n  When forty winters shall besiege thy brow,\n  And dig deep trenches in thy beauty's field,\n  Thy youth's proud livery so gazed on now,\n  Will be a tattered weed of small worth held:  \n  Then being asked, where all thy beauty lies,\n  Where all the treasure of thy lusty days;\n  To sa

In [4]:
print(text[:1000])


                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
  His tender heir might bear his memory:
  But thou contracted to thine own bright eyes,
  Feed'st thy light's flame with self-substantial fuel,
  Making a famine where abundance lies,
  Thy self thy foe, to thy sweet self too cruel:
  Thou that art now the world's fresh ornament,
  And only herald to the gaudy spring,
  Within thine own bud buriest thy content,
  And tender churl mak'st waste in niggarding:
    Pity the world, or else this glutton be,
    To eat the world's due, by the grave and thee.


                     2
  When forty winters shall besiege thy brow,
  And dig deep trenches in thy beauty's field,
  Thy youth's proud livery so gazed on now,
  Will be a tattered weed of small worth held:  
  Then being asked, where all thy beauty lies,
  Where all the treasure of thy lusty days;
  To say within thine own deep su

In [5]:
len(text)

5447699

## Encode Entire Text

In [6]:
# Get all the unique chars
# NOTE(review): this is a plain set, and the integer codes below are assigned
# by iterating over it. Set iteration order for strings is not guaranteed to
# be the same in a different interpreter session (string hash randomization),
# so the char <-> int mapping is only stable within one kernel session.
# Persist the mapping alongside any saved model weights.
all_characters = set(text)

In [7]:
all_characters

{'\n',
 ' ',
 '!',
 '"',
 '&',
 "'",
 '(',
 ')',
 ',',
 '-',
 '.',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '>',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 ']',
 '_',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '|',
 '}'}

In [8]:
len(all_characters)

84

In [9]:
# There are 84 pairs
for pair in enumerate(all_characters):
    print(pair)

(0, 'T')
(1, 'x')
(2, 'V')
(3, 's')
(4, 'c')
(5, '}')
(6, 'J')
(7, 'A')
(8, '0')
(9, '7')
(10, '>')
(11, 'S')
(12, 'D')
(13, 't')
(14, '4')
(15, '(')
(16, '|')
(17, 'I')
(18, 'F')
(19, ';')
(20, '!')
(21, '[')
(22, 'i')
(23, 'Z')
(24, 'M')
(25, 'j')
(26, 'Y')
(27, '6')
(28, 'w')
(29, 'o')
(30, 'y')
(31, '&')
(32, 'n')
(33, ' ')
(34, '<')
(35, '3')
(36, '.')
(37, 'd')
(38, '\n')
(39, ',')
(40, 'g')
(41, 'k')
(42, '"')
(43, 'O')
(44, ':')
(45, 'm')
(46, '_')
(47, 'e')
(48, 'Q')
(49, 'W')
(50, 'K')
(51, 'H')
(52, '8')
(53, 'v')
(54, 'p')
(55, 'C')
(56, 'r')
(57, '?')
(58, 'q')
(59, 'h')
(60, "'")
(61, 'N')
(62, 'X')
(63, '9')
(64, '5')
(65, 'a')
(66, 'P')
(67, 'l')
(68, 'E')
(69, ']')
(70, 'G')
(71, 'u')
(72, 'z')
(73, '`')
(74, '-')
(75, '2')
(76, 'U')
(77, 'B')
(78, 'f')
(79, 'R')
(80, '1')
(81, ')')
(82, 'b')
(83, 'L')


In [10]:
# decoder: num --> letter

decoder = dict(enumerate(all_characters))
decoder

{0: 'T',
 1: 'x',
 2: 'V',
 3: 's',
 4: 'c',
 5: '}',
 6: 'J',
 7: 'A',
 8: '0',
 9: '7',
 10: '>',
 11: 'S',
 12: 'D',
 13: 't',
 14: '4',
 15: '(',
 16: '|',
 17: 'I',
 18: 'F',
 19: ';',
 20: '!',
 21: '[',
 22: 'i',
 23: 'Z',
 24: 'M',
 25: 'j',
 26: 'Y',
 27: '6',
 28: 'w',
 29: 'o',
 30: 'y',
 31: '&',
 32: 'n',
 33: ' ',
 34: '<',
 35: '3',
 36: '.',
 37: 'd',
 38: '\n',
 39: ',',
 40: 'g',
 41: 'k',
 42: '"',
 43: 'O',
 44: ':',
 45: 'm',
 46: '_',
 47: 'e',
 48: 'Q',
 49: 'W',
 50: 'K',
 51: 'H',
 52: '8',
 53: 'v',
 54: 'p',
 55: 'C',
 56: 'r',
 57: '?',
 58: 'q',
 59: 'h',
 60: "'",
 61: 'N',
 62: 'X',
 63: '9',
 64: '5',
 65: 'a',
 66: 'P',
 67: 'l',
 68: 'E',
 69: ']',
 70: 'G',
 71: 'u',
 72: 'z',
 73: '`',
 74: '-',
 75: '2',
 76: 'U',
 77: 'B',
 78: 'f',
 79: 'R',
 80: '1',
 81: ')',
 82: 'b',
 83: 'L'}

In [11]:
# encoder: letter --> num
encoder = {char: ind for ind,char in decoder.items()}

In [12]:
encoder

{'T': 0,
 'x': 1,
 'V': 2,
 's': 3,
 'c': 4,
 '}': 5,
 'J': 6,
 'A': 7,
 '0': 8,
 '7': 9,
 '>': 10,
 'S': 11,
 'D': 12,
 't': 13,
 '4': 14,
 '(': 15,
 '|': 16,
 'I': 17,
 'F': 18,
 ';': 19,
 '!': 20,
 '[': 21,
 'i': 22,
 'Z': 23,
 'M': 24,
 'j': 25,
 'Y': 26,
 '6': 27,
 'w': 28,
 'o': 29,
 'y': 30,
 '&': 31,
 'n': 32,
 ' ': 33,
 '<': 34,
 '3': 35,
 '.': 36,
 'd': 37,
 '\n': 38,
 ',': 39,
 'g': 40,
 'k': 41,
 '"': 42,
 'O': 43,
 ':': 44,
 'm': 45,
 '_': 46,
 'e': 47,
 'Q': 48,
 'W': 49,
 'K': 50,
 'H': 51,
 '8': 52,
 'v': 53,
 'p': 54,
 'C': 55,
 'r': 56,
 '?': 57,
 'q': 58,
 'h': 59,
 "'": 60,
 'N': 61,
 'X': 62,
 '9': 63,
 '5': 64,
 'a': 65,
 'P': 66,
 'l': 67,
 'E': 68,
 ']': 69,
 'G': 70,
 'u': 71,
 'z': 72,
 '`': 73,
 '-': 74,
 '2': 75,
 'U': 76,
 'B': 77,
 'f': 78,
 'R': 79,
 '1': 80,
 ')': 81,
 'b': 82,
 'L': 83}

In [13]:
encoded_text = np.array([encoder[char] for char in text])

In [14]:
# Now every character in the text is represented by a number

encoded_text[:500]

array([38, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
       33, 33, 33, 33, 33, 80, 38, 33, 33, 18, 56, 29, 45, 33, 78, 65, 22,
       56, 47,  3, 13, 33,  4, 56, 47, 65, 13, 71, 56, 47,  3, 33, 28, 47,
       33, 37, 47,  3, 22, 56, 47, 33, 22, 32,  4, 56, 47, 65,  3, 47, 39,
       38, 33, 33,  0, 59, 65, 13, 33, 13, 59, 47, 56, 47, 82, 30, 33, 82,
       47, 65, 71, 13, 30, 60,  3, 33, 56, 29,  3, 47, 33, 45, 22, 40, 59,
       13, 33, 32, 47, 53, 47, 56, 33, 37, 22, 47, 39, 38, 33, 33, 77, 71,
       13, 33, 65,  3, 33, 13, 59, 47, 33, 56, 22, 54, 47, 56, 33,  3, 59,
       29, 71, 67, 37, 33, 82, 30, 33, 13, 22, 45, 47, 33, 37, 47,  4, 47,
       65,  3, 47, 39, 38, 33, 33, 51, 22,  3, 33, 13, 47, 32, 37, 47, 56,
       33, 59, 47, 22, 56, 33, 45, 22, 40, 59, 13, 33, 82, 47, 65, 56, 33,
       59, 22,  3, 33, 45, 47, 45, 29, 56, 30, 44, 38, 33, 33, 77, 71, 13,
       33, 13, 59, 29, 71, 33,  4, 29, 32, 13, 56, 65,  4, 13, 47, 37, 33,
       13, 29, 33, 13, 59

## One Hot Encoding

As previously discussed, we need to one-hot encode our data in order for it to work with the network structure. Make sure to review numpy if any of these operations confuse you!

In [64]:
def one_hot_encoder(encoded_text, num_uni_chars):
    '''
    Turn an array of integer codes into float32 one-hot vectors.

    encoded_text : numpy array of integer codes (any shape, e.g. a batch)

    num_uni_chars : number of unique characters (len(set(text))); this is
                    the length of every one-hot vector

    Returns an array of shape (*encoded_text.shape, num_uni_chars).
    '''

    # Technique taken from:
    # https://stackoverflow.com/questions/29831489/convert-array-of-indices-to-1-hot-encoded-numpy-array

    total = encoded_text.size

    # One row per input code, allocated directly as float32 because
    # PyTorch expects float32 inputs further down the pipeline.
    one_hot = np.zeros((total, num_uni_chars), dtype=np.float32)

    # Row r gets its single 1 in the column named by the r-th code.
    rows = np.arange(total)
    cols = encoded_text.flatten()
    one_hot[rows, cols] = 1.0

    # Restore the input's leading dimensions, adding the one-hot axis last.
    return one_hot.reshape((*encoded_text.shape, num_uni_chars))

In [65]:
one_hot_encoder(np.array([1,2,0]),3)

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]], dtype=float32)

The output is a matrix of shape (3 (length of the array) x 3 (number of unique elements)).<br>
To represent 1, there is a 1 at index 1.<br>
To represent 2, there is a 1 at index 2.<br>
To represent 0, there is a 1 at index 0.<br>

Similarly, for our text this function produces one row per character, with a single 1 placed at the index equal to that character's encoded value.

Few things before we move on to the next step:

In [66]:
example_text = np.arange(10)
example_text
# Assuming each number represents a word

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [67]:
# If we wanted to break this into 5 batches
example_text.reshape((5,-1))

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7],
       [8, 9]])

Now let's create the batch generator function:

In [68]:
def generate_batches(encoded_text, samp_per_batch=10, seq_len=50):

    '''
    Generate (using yield) batches for training.

    X: Encoded Text of length seq_len
    Y: Encoded Text shifted by one

    Example:

    X:

    [[1 2 3]]

    Y:

    [[ 2 3 4]]

    encoded_text : Complete Encoded Text (1-D numpy array) to make batches from
    samp_per_batch : Number of samples (rows) per batch
    seq_len : Length of character sequence in each sample

    Yields (x, y) tuples, both of shape (samp_per_batch, seq_len).
    Trailing characters that do not fill a complete batch are dropped.
    For the last batch there is no "next" character, so the final target
    column wraps around to the first column of the reshaped text.
    '''

    # Total number of characters per batch
    # Example: If samp_per_batch is 2 and seq_len is 50, then 100
    # characters come out per batch.
    char_per_batch = samp_per_batch * seq_len

    # Number of complete batches available (floor division rounds down)
    num_batches_avail = len(encoded_text) // char_per_batch

    # Cut off end of encoded_text that
    # won't fit evenly into a batch
    encoded_text = encoded_text[:num_batches_avail * char_per_batch]

    # One row per sample: each row is a contiguous slice of the text
    encoded_text = encoded_text.reshape((samp_per_batch, -1))
    row_len = encoded_text.shape[1]

    # Slide a window of seq_len columns across the rows
    for n in range(0, row_len, seq_len):

        # Grab feature characters
        x = encoded_text[:, n:n+seq_len]

        # y is the target: x shifted over by 1
        y = np.zeros_like(x)
        y[:, :-1] = x[:, 1:]

        # The last target of each row is the character that follows the
        # window. Check the bound explicitly rather than catching every
        # exception, so unrelated errors are not silently swallowed.
        if n + seq_len < row_len:
            y[:, -1] = encoded_text[:, n+seq_len]
        else:
            # Final window: nothing follows it, wrap to the first column
            y[:, -1] = encoded_text[:, 0]

        yield x, y

In [69]:
encoded_text

array([38, 33, 33, ..., 38, 38, 38])

In [70]:
# Just for demonstration of what generate_batches() outputs
# Assume sample_text is an encoded representation of chars,
# i.e. each number represents a char

sample_text = np.arange(30)
sample_text

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])

In [71]:
len(sample_text)

30

In [72]:
batch_generator = generate_batches(sample_text,samp_per_batch=2,seq_len=5)

In [73]:
# Grab first batch
x, y = next(batch_generator)

In [74]:
x

array([[ 0,  1,  2,  3,  4],
       [15, 16, 17, 18, 19]])

In [75]:
x.shape

(2, 5)

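This batch of x holds 10 characters split into 2 samples of 5 characters each. Note that the two samples are not adjacent in the text: each row is read from its own half of the sequence (0–4 and 15–19).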

In [76]:
y

array([[ 1,  2,  3,  4,  5],
       [16, 17, 18, 19, 20]])

This batch of y is the same thing as x, but 1 element shifted

# Creating the LSTM Model

**Note! We will have options for GPU users and CPU users. CPU will take MUCH LONGER to train and you may encounter RAM issues depending on your hardware. If that is the case, consider using cloud services like AWS, GCP, or Azure. Note, these may cost you money to use!**

In [77]:
class CharModel(nn.Module):
    '''
    Character-level LSTM: one-hot characters in, per-character logits out.

    all_chars  : collection of the unique characters; its length is both the
                 LSTM input size and the number of output classes
    num_hidden : number of features in the LSTM hidden state
    num_layers : number of stacked LSTM layers
    drop_prob  : dropout probability (between LSTM layers and before the
                 final linear layer)
    use_gpu    : flag stored on the model for callers to inspect; the model
                 itself is not moved to any device here
    '''

    def __init__(self, all_chars, num_hidden=256, num_layers=4, drop_prob=0.5, use_gpu=False):

        # SET UP ATTRIBUTES
        super().__init__()
        self.drop_prob = drop_prob
        self.num_layers = num_layers
        self.num_hidden = num_hidden
        self.use_gpu = use_gpu

        # CHARACTER SET, ENCODER, and DECODER
        self.all_chars = all_chars

        # decoder: num --> char, encoder: char --> num.
        # The encoder is derived from this instance's own decoder so the two
        # always agree and the class does not depend on a notebook global.
        self.decoder = dict(enumerate(all_chars))
        self.encoder = {char: ind for ind, char in self.decoder.items()}

        # len(self.all_chars) is the input size
        # num_hidden is the num of features in hidden state
        # num_layers is the num of layers
        # batch_first=True: inputs are (batch, seq, feature)
        self.lstm = nn.LSTM(len(self.all_chars), num_hidden, num_layers, dropout=drop_prob, batch_first=True)

        self.dropout = nn.Dropout(drop_prob)

        # Fully connected linear layer with input_size=num_hidden
        # output will be len of all chars, so each output represents a char
        self.fc_linear = nn.Linear(num_hidden, len(self.all_chars))

    def forward(self, x, hidden):
        '''
        x      : one-hot input of shape (batch, seq, len(all_chars))
        hidden : the tuple (h_0, c_0)

        Returns (final_out, hidden) where final_out has shape
        (batch * seq, len(all_chars)) and hidden is the updated (h_n, c_n).
        '''
        lstm_output, hidden = self.lstm(x, hidden)

        # pass lstm output to a dropout layer
        drop_output = self.dropout(lstm_output)

        # flatten batch and sequence dims so every time step becomes one row
        # for the linear layer
        drop_output = drop_output.contiguous().view(-1, self.num_hidden)

        final_out = self.fc_linear(drop_output)

        return final_out, hidden

    def hidden_state(self, batch_size):
        '''
        Build a zero-initialised (h_0, c_0) tuple, each of shape
        (num_layers, batch_size, num_hidden).

        The tensors are created with the same device and dtype as the model
        parameters, so the same call works for both CPU and GPU models.
        '''
        weight = next(self.parameters())
        shape = (self.num_layers, batch_size, self.num_hidden)

        hidden = (torch.zeros(shape, device=weight.device, dtype=weight.dtype),
                  torch.zeros(shape, device=weight.device, dtype=weight.dtype))

        return hidden

## Instance of the Model

In [78]:
model_test = CharModel(
    all_chars=all_characters,
    num_hidden=512,
    num_layers=3,
    drop_prob=0.5,
    use_gpu=False,
)

model_test

CharModel(
  (lstm): LSTM(84, 512, num_layers=3, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5)
  (fc_linear): Linear(in_features=512, out_features=84, bias=True)
)

Try to make the total_parameters be roughly the same magnitude as the number of characters in the text.<br>
Otherwise, your model may overfit or underfit the training data

In [79]:
# Element count of every parameter tensor in the model
total_param = [int(p.numel()) for p in model_test.parameters()]

In [80]:
sum(total_param)

5470292

In [81]:
len(encoded_text)

5447699

### Optimizer and Loss

In [103]:
# NOTE(review): in this notebook `model` is first assigned further down, in
# the "Now let's train the model for real" section, so on a fresh kernel this
# cell raises NameError. An optimizer built here is also tied to whichever
# `model` object existed when the cell ran; if `model` is re-created later,
# the optimizer must be re-created too or it keeps updating the old instance.
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
# Cross-entropy over the per-character logits
criterion = nn.CrossEntropyLoss()

## Training Data and Validation Data

In [104]:
# fraction of the data to be used for training (0.1 = 10%); the rest becomes validation data
train_percent = 0.1

In [105]:
train_ind = int(len(encoded_text) * (train_percent))
train_ind

544769

This is the cutoff index for our training set

In [106]:
train_data = encoded_text[:train_ind]
val_data = encoded_text[train_ind:]

# Training the Network

## Variables

Feel free to play around with these values!

In [107]:
encoded_text

array([38, 33, 33, ..., 38, 38, 38])

In [108]:
## VARIABLES

# Epochs to train for
epochs = 60
# batch size 
batch_size = 100

# Length of sequence
seq_len = 100

# for printing report purposes
# always start at 0
tracker = 0

# number of unique characters, i.e. the length of each one-hot vector.
# ndarray.max() runs in NumPy; the builtin max() would loop over every
# element of this multi-million-entry array in Python.
num_char = encoded_text.max() + 1

### Let's examine the output of the first iteration to better understand what the model is doing

In [109]:
# Walk through a single training batch by hand to see what the model is fed.
# Nothing is trained in this cell; the real training loop comes later.
model_test.train()

# Fresh (h_0, c_0) tuple for one batch
hidden = model_test.hidden_state(batch_size)

# Take only the first batch from the generator.
# batch_size = 100 and seq_len = 100, so x and y each have shape (100, 100):
# 100 samples of 100 encoded characters.
# (Raises StopIteration if train_data is too short for even one batch.)
x, y = next(generate_batches(train_data, batch_size, seq_len))
print("x shape:", x.shape)
print("x:", x[0])
print("\n")

# One Hot Encode incoming data of shape (100,100)
# one_hot_encoder returns x of shape (100,100,84), because there are
# 84 unique chars in this text data.
# Each encoded character becomes a length-84 vector with a single 1 at
# index == encoded value.
# For example: if 4 represents 'A', there will be a 1 at index 4.
x = one_hot_encoder(x,num_char)

# Convert Numpy Arrays to Tensor
inputs = torch.from_numpy(x)
print('inputs:', inputs)
print("\n")
targets = torch.from_numpy(y)
print('targets:', targets)
print("\n")

x shape: (100, 100)
x: [38 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 80 38
 33 33 18 56 29 45 33 78 65 22 56 47  3 13 33  4 56 47 65 13 71 56 47  3
 33 28 47 33 37 47  3 22 56 47 33 22 32  4 56 47 65  3 47 39 38 33 33  0
 59 65 13 33 13 59 47 56 47 82 30 33 82 47 65 71 13 30 60  3 33 56 29  3
 47 33 45 22]


inputs: tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 1., 0.],
         ...,


hidden is the tuple (h_0, c_0)

In [110]:
hidden[0].shape  # h_0

torch.Size([3, 100, 512])

- 3 is the number of layers
- 100 is the batch size
- 512 is the num_hidden or hidden_size

In [111]:
len(hidden[0][0][0])

512

In [112]:
hidden[1].shape  # c_0

torch.Size([3, 100, 512])

In [113]:
# Both entries of the hidden tuple (h_0 and c_0) have the same shape
for state in hidden:
    print(state.shape)

torch.Size([3, 100, 512])
torch.Size([3, 100, 512])


In [114]:
inputs.shape

torch.Size([100, 100, 84])

In [115]:
targets.shape

torch.Size([100, 100])

Train Data:

In [116]:
train_data.shape

(544769,)

In [117]:
train_data

array([38, 33, 33, ..., 33, 59, 47])

x after one-hot encoding

In [118]:
x.shape

(100, 100, 84)

In [119]:
# This represents 38 (the 1 is at index 38), which encodes '\n'
x[0][0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)

In [120]:
# This represents 28 (the 1 is at index 28), which encodes 'w'
x[1][0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)

### Now let's train the model for real

In [121]:
model = CharModel(
    all_chars=all_characters,
    num_hidden=512,
    num_layers=3,
    drop_prob=0.5,
    use_gpu=False,
)

# The optimizer stores references to the parameters of one specific model
# instance, so it has to be built again whenever `model` is re-created;
# otherwise optimizer.step() would keep updating the previous instance and
# this model would never learn.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

model

CharModel(
  (lstm): LSTM(84, 512, num_layers=3, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5)
  (fc_linear): Linear(in_features=512, out_features=84, bias=True)
)

In [122]:
## VARIABLES

# Epochs to train for
epochs = 60
# batch size 
batch_size = 100

# Length of sequence
seq_len = 100

# for printing report purposes
# always start at 0
tracker = 0

# number of unique characters, i.e. the length of each one-hot vector.
# ndarray.max() runs in NumPy; the builtin max() would loop over every
# element of this multi-million-entry array in Python.
num_char = encoded_text.max() + 1

In [123]:
# Set model to train
model.train()

# Wherever the model's weights live is where every batch has to go.
device = next(model.parameters()).device

for i in range(epochs):

    # Start every epoch from a zero hidden state
    hidden = model.hidden_state(batch_size)

    for x,y in generate_batches(train_data,batch_size,seq_len):

        tracker += 1

        # One Hot Encode incoming data
        x = one_hot_encoder(x,num_char)

        # Convert Numpy Arrays to Tensor (on the model's device)
        inputs = torch.from_numpy(x).to(device)
        targets = torch.from_numpy(y).to(device)

        # Reset Hidden State
        # Keep the values but cut the graph; if we don't, we would
        # backpropagate through all training history
        hidden = tuple(state.detach().to(device) for state in hidden)

        model.zero_grad()

        # Call the module itself (not .forward) so nn.Module hooks run
        lstm_output, hidden = model(inputs,hidden)
        loss = criterion(lstm_output,targets.view(batch_size*seq_len).long())

        loss.backward()

        # POSSIBLE EXPLODING GRADIENT PROBLEM!
        # LET'S CLIP JUST IN CASE
        nn.utils.clip_grad_norm_(model.parameters(),max_norm=5)

        optimizer.step()

        ###################################
        ### CHECK ON VALIDATION SET ######
        #################################

        if tracker % 25 == 0:

            val_hidden = model.hidden_state(batch_size)
            val_losses = []
            model.eval()

            # No gradients are needed for validation; skipping the autograd
            # graph saves memory and time.
            with torch.no_grad():

                for x,y in generate_batches(val_data,batch_size,seq_len):

                    # One Hot Encode incoming data
                    x = one_hot_encoder(x,num_char)

                    # Convert Numpy Arrays to Tensor (on the model's device)
                    inputs = torch.from_numpy(x).to(device)
                    targets = torch.from_numpy(y).to(device)

                    val_hidden = tuple(state.detach().to(device) for state in val_hidden)

                    lstm_output, val_hidden = model(inputs,val_hidden)

                    val_loss = criterion(lstm_output,targets.view(batch_size*seq_len).long())

                    val_losses.append(val_loss.item())

            # Reset to training mode after val for loop
            model.train()

            # Report the mean over all validation batches rather than only
            # the last batch; guard against an empty validation set.
            mean_val_loss = sum(val_losses) / len(val_losses) if val_losses else float('nan')

            print(f"Epoch: {i} Step: {tracker} Val Loss: {mean_val_loss}")

lstm output: tensor([[ 0.0128,  0.0186, -0.0103,  ..., -0.0350, -0.0245,  0.0400],
        [ 0.0169,  0.0246, -0.0188,  ..., -0.0303, -0.0213,  0.0342],
        [ 0.0195,  0.0431, -0.0215,  ..., -0.0272, -0.0142,  0.0187],
        ...,
        [ 0.0348,  0.0319, -0.0147,  ..., -0.0176, -0.0114,  0.0393],
        [ 0.0078,  0.0479, -0.0252,  ..., -0.0214, -0.0096,  0.0317],
        [ 0.0148,  0.0307, -0.0285,  ..., -0.0338, -0.0167,  0.0373]],
       grad_fn=<AddmmBackward>)
lstm shape: torch.Size([10000, 84])
target shape: torch.Size([10000])
lstm output: tensor([[ 1.5902e-02,  3.9078e-02, -2.1018e-02,  ..., -3.1244e-02,
         -1.6492e-02,  2.9342e-02],
        [ 9.3370e-03,  6.1259e-02, -5.4215e-03,  ..., -2.1160e-02,
         -2.7503e-02,  2.9383e-02],
        [ 9.1621e-03,  2.3921e-02, -1.2162e-02,  ..., -1.8516e-02,
         -1.4021e-02,  1.1166e-02],
        ...,
        [ 2.2576e-02,  3.5748e-02, -1.3561e-03,  ..., -3.3841e-02,
         -1.2663e-02,  4.1614e-02],
        [ 2.8

KeyboardInterrupt: 

In [None]:
len(targets.view(batch_size*seq_len).long())

In [None]:
targets.view(batch_size*seq_len).long().shape

In [None]:
type(targets.view(batch_size*seq_len).long())