<b>Loading the text</b>

In [1]:
#Opening the text file
with open('Murakami_test.txt','r') as f:
  doc=f.read()
f.close()

In [2]:
#importing spaCy and loading the English library
import spacy
nlp=spacy.load('en_core_web_sm')

<b>Tokenization</b>

In [24]:
#Tokenizing the text and removing punctuations
words=[word.text.lower() for word in nlp(doc) if word.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n']

In [25]:
words

['when',
 'the',
 'phone',
 'rang',
 'i',
 'was',
 'in',
 'the',
 'kitchen',
 'boiling',
 'a',
 'potrul',
 'of',
 'spaghetti',
 'and',
 'whistling',
 'along',
 'with',
 'an',
 'fm',
 'broadcast',
 'of',
 'the',
 'overture',
 'to',
 'rossini',
 '’s',
 'the',
 'thieving',
 'magpie',
 'which',
 'has',
 'to',
 'be',
 'the',
 'perfect',
 'music',
 'for',
 'cooking',
 'pasta',
 'i',
 'wanted',
 'to',
 'ignore',
 'the',
 'phone',
 'not',
 'only',
 'because',
 'the',
 'spaghetti',
 'was',
 'nearly',
 'done',
 'but',
 'because',
 'claudio',
 'abbado',
 'was',
 'bringing',
 'the',
 'london',
 'symphony',
 'to',
 'its',
 'musical',
 'climax',
 'finally',
 'though',
 'i',
 'had',
 'to',
 'give',
 'in',
 'it',
 'could',
 'have',
 'been',
 'somebody',
 'with',
 'news',
 'of',
 'a',
 'job',
 'opening',
 'i',
 'lowered',
 'the',
 'flame',
 'went',
 'to',
 'the',
 'living',
 'room',
 'and',
 'picked',
 'up',
 'the',
 'receiver',
 '“',
 'ten',
 'minutes',
 'please',
 '”',
 'said',
 'a',
 'woman',
 'on',

In [26]:
#Create sequence of tokens of size 25, +1 for the label which we will predict
word_sequences=[]
train_len=25 + 1
for i in range(train_len,len(words)):
  w=words[i-train_len:i]
  word_sequences.append(w)

In [27]:
word_sequences[0]

['when',
 'the',
 'phone',
 'rang',
 'i',
 'was',
 'in',
 'the',
 'kitchen',
 'boiling',
 'a',
 'potrul',
 'of',
 'spaghetti',
 'and',
 'whistling',
 'along',
 'with',
 'an',
 'fm',
 'broadcast',
 'of',
 'the',
 'overture',
 'to',
 'rossini']

<h1><b>Keras</b></h1>

In [28]:
import keras

In [29]:
from keras.preprocessing.text import Tokenizer

In [30]:
#creating an instance of the tokenizer
tokenizer=Tokenizer()

In [67]:
#fitting the tokenizer on the text
tokenizer.fit_on_texts(word_sequences)

In [32]:
#tokenizing the word sequences
sequences=tokenizer.texts_to_sequences(word_sequences)

In [34]:
sequences[0]

[59,
 1,
 46,
 150,
 2,
 12,
 9,
 1,
 58,
 404,
 8,
 403,
 10,
 16,
 7,
 400,
 399,
 21,
 77,
 398,
 397,
 10,
 1,
 395,
 5,
 151]

[59,
 1,
 46,
 150,
 2,
 12,
 9,
 1,
 58,
 404,
 8,
 403,
 10,
 16,
 7,
 400,
 399,
 21,
 77,
 398,
 397,
 10,
 1,
 395,
 5,
 151]

In [36]:
" ".join([tokenizer.index_word[i] for i in sequences[0]])

'when the phone rang i was in the kitchen boiling a potrul of spaghetti and whistling along with an fm broadcast of the overture to rossini'

In [40]:
#vocabulary size=number of unique words in the text
vocab_size=len(tokenizer.word_counts)

In [41]:
vocab_size

406

In [42]:
#converting the sequences into a matrix
import numpy as np
sequences=np.array(sequences)

In [43]:
sequences

array([[ 59,   1,  46, ..., 395,   5, 151],
       [  1,  46, 150, ...,   5, 151,  11],
       [ 46, 150,   2, ..., 151,  11,   1],
       ...,
       [  4,   3, 394, ..., 405,   7,  27],
       [  3, 394,   2, ...,   7,  27,   3],
       [394,   2, 396, ...,  27,   3, 406]])

<h1><b>Creating a Recurrent Neural Network</b></h1>

In [44]:
#importing the layers to build an RNN
import keras
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense

In [45]:
#creating the RNN
def create_model(vocabulary_size, seq_len):
    model = Sequential()
    model.add(Embedding(vocabulary_size, 25, input_length=seq_len))
    model.add(LSTM(150, return_sequences=True))
    model.add(LSTM(150))
    model.add(Dense(150, activation='relu'))

    model.add(Dense(vocabulary_size, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
   
    model.summary()
    
    return model

In [47]:
#Splitting the matrix into input sequences and labels
from tensorflow.keras.utils import to_categorical
X=sequences[:,:-1]
y=sequences[:,-1]
y=to_categorical(y,num_classes=vocab_size+1)

In [48]:
seq_len=X.shape[1]

In [49]:
seq_len

25

In [51]:
#Creating the model
model=create_model(vocab_size+1,seq_len)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 25)            10175     
                                                                 
 lstm (LSTM)                 (None, 25, 150)           105600    
                                                                 
 lstm_1 (LSTM)               (None, 150)               180600    
                                                                 
 dense (Dense)               (None, 150)               22650     
                                                                 
 dense_1 (Dense)             (None, 407)               61457     
                                                                 
Total params: 380,482
Trainable params: 380,482
Non-trainable params: 0
_________________________________________________________________


In [65]:
#Fitting the model
model.fit(X, y, batch_size=128, epochs=300,verbose=1)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

<keras.callbacks.History at 0x7f4afc957e90>

<h1><b>Text Generation</b></h1>

In [53]:
from random import randint
from keras.preprocessing.sequence import pad_sequences

In [54]:
#Generate new text
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    '''
    INPUTS:
    model : model that was trained on text data
    tokenizer : tokenizer that was fit on text data
    seq_len : length of training sequence
    seed_text : raw string text to serve as the seed
    num_gen_words : number of words to be generated by model
    '''
    
    # Final Output
    output_text = []
    
    # Intial Seed Sequence
    input_text = seed_text
    
    # Create num_gen_words
    for i in range(num_gen_words):
        
        # Take the input text string and encode it to a sequence
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        
        # Pad sequences to our trained rate (50 words in the video)
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        
        # Predict Class Probabilities for each word
        pred_word_ind = np.argmax(model.predict(pad_encoded, verbose=0)[0])
        
        # Grab word
        pred_word = tokenizer.index_word[pred_word_ind] 
        
        # Update the sequence of input text (shifting one over with the new word)
        input_text += ' ' + pred_word
        
        output_text.append(pred_word)
        
    # Make it look like a sentence.
    return ' '.join(output_text)

In [60]:
import random
random.seed(101)
random_pick = random.randint(0,len(word_sequences))

In [61]:
#Choosing random text sequence
random_seed_text = word_sequences[random_pick]

In [62]:
seed_text = ' '.join(random_seed_text)

In [63]:
seed_text

'poured the contents of the pot into a colander thanks to the phone call the spaghetti was a little softer than al dente but it had'

In [66]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=50)

'not been dealt a mortal blow i started eating and thinking understand each other understand each other ’s feelings in ten minutes what was she talking about maybe it was just a prank call or some new sales pitch in any case it had nothing to do with me after'