In [11]:
## data colllection 
import nltk 
nltk.download('gutenberg')
from nltk.corpus import gutenberg

## load the dataset 
data = gutenberg.raw('shakespeare-hamlet.txt')
## save to a file 
with open('hamlet.txt','w') as file:
    file.write(data)

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/sandipmahata/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [12]:
## data preprocessing 
import numpy as np 
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

## load the dataset 
with open('hamlet.txt','r')as file:
    text = file.read().lower()
    
## TOkenizer the text 
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index)+1
total_words

4818

In [13]:
tokenizer.index_word

{1: 'the',
 2: 'and',
 3: 'to',
 4: 'of',
 5: 'i',
 6: 'you',
 7: 'a',
 8: 'my',
 9: 'it',
 10: 'in',
 11: 'that',
 12: 'ham',
 13: 'is',
 14: 'not',
 15: 'his',
 16: 'this',
 17: 'with',
 18: 'your',
 19: 'but',
 20: 'for',
 21: 'me',
 22: 'lord',
 23: 'as',
 24: 'what',
 25: 'he',
 26: 'be',
 27: 'so',
 28: 'him',
 29: 'haue',
 30: 'king',
 31: 'will',
 32: 'no',
 33: 'our',
 34: 'we',
 35: 'on',
 36: 'are',
 37: 'if',
 38: 'all',
 39: 'then',
 40: 'shall',
 41: 'by',
 42: 'thou',
 43: 'come',
 44: 'or',
 45: 'hamlet',
 46: 'good',
 47: 'do',
 48: 'hor',
 49: 'her',
 50: 'let',
 51: 'now',
 52: 'thy',
 53: 'how',
 54: 'more',
 55: 'they',
 56: 'from',
 57: 'enter',
 58: 'at',
 59: 'was',
 60: 'oh',
 61: 'like',
 62: 'most',
 63: 'there',
 64: 'well',
 65: 'know',
 66: 'selfe',
 67: 'would',
 68: 'them',
 69: 'loue',
 70: 'may',
 71: "'tis",
 72: 'vs',
 73: 'sir',
 74: 'qu',
 75: 'which',
 76: 'did',
 77: 'why',
 78: 'laer',
 79: 'giue',
 80: 'thee',
 81: 'ile',
 82: 'must',
 83: 'hat

In [14]:
## create input sequences
input_sequences=[]
for line in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1,len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [15]:
input_sequences

[[1, 687],
 [1, 687, 4],
 [1, 687, 4, 45],
 [1, 687, 4, 45, 41],
 [1, 687, 4, 45, 41, 1886],
 [1, 687, 4, 45, 41, 1886, 1887],
 [1, 687, 4, 45, 41, 1886, 1887, 1888],
 [1180, 1889],
 [1180, 1889, 1890],
 [1180, 1889, 1890, 1891],
 [57, 407],
 [57, 407, 2],
 [57, 407, 2, 1181],
 [57, 407, 2, 1181, 177],
 [57, 407, 2, 1181, 177, 1892],
 [407, 1182],
 [407, 1182, 63],
 [408, 162],
 [408, 162, 377],
 [408, 162, 377, 21],
 [408, 162, 377, 21, 247],
 [408, 162, 377, 21, 247, 882],
 [18, 66],
 [451, 224],
 [451, 224, 248],
 [451, 224, 248, 1],
 [451, 224, 248, 1, 30],
 [408, 407],
 [451, 25],
 [408, 6],
 [408, 6, 43],
 [408, 6, 43, 62],
 [408, 6, 43, 62, 1893],
 [408, 6, 43, 62, 1893, 96],
 [408, 6, 43, 62, 1893, 96, 18],
 [408, 6, 43, 62, 1893, 96, 18, 566],
 [451, 71],
 [451, 71, 51],
 [451, 71, 51, 1894],
 [451, 71, 51, 1894, 567],
 [451, 71, 51, 1894, 567, 378],
 [451, 71, 51, 1894, 567, 378, 80],
 [451, 71, 51, 1894, 567, 378, 80, 3],
 [451, 71, 51, 1894, 567, 378, 80, 3, 273],
 [451, 71

In [None]:
## Pad Sequences 
max_sequence_len = max([len(x) for x in input_sequences])
max__len

14

In [32]:
input_sequences = np.array(pad_sequences(input_sequences,maxlen = max_sequence_len,padding='pre'))
input_sequences

array([[   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       [   0,    0,    0, ...,  687,    4,   45],
       ...,
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4],
       [   0,    0,    0, ..., 1047,    4,  193]], dtype=int32)

In [33]:
## create predictors and label 
import tensorflow as tf 
x,y = input_sequences[:,:-1], input_sequences[:,-1]


In [34]:
x

array([[   0,    0,    0, ...,    0,    0,    1],
       [   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       ...,
       [   0,    0,    0, ...,  687,    4,   45],
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4]], dtype=int32)

In [35]:
y

array([ 687,    4,   45, ..., 1047,    4,  193], dtype=int32)

In [36]:
y = tf.keras.utils.to_categorical(y,num_classes=total_words)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [37]:
### split the data into training and testing sets 
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [38]:
x_train.shape,x_test.shape

((20585, 13), (5147, 13))

In [None]:
## train our LSTM RNN