In [None]:
import os
import re
import string
import warnings

import nltk
import numpy as np
import tensorflow as tf
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.contrib import legacy_seq2seq


In [None]:
# Read the whole chapter into memory. The context manager guarantees the
# file handle is closed even if reading raises.
with open("HarryPotterCh1_SorcererStone.txt","r") as f:
    textbook = f.read()

Here we define our own helper functions for preprocessing and tokenizing the text, and for building an index of its words.

In [None]:
def preprocess_text(string):
    """Normalise raw text for tokenisation.

    Every newline character is replaced by a single space and the result
    is lower-cased.

    Args:
        string: the raw text.

    Returns:
        The flattened, lower-cased text.
    """
    single_line = string.replace("\n", " ")
    return single_line.lower()

In [None]:
# Normalise the raw chapter text, then tokenise it on whitespace.
text=preprocess_text(textbook)
text_words=text.split()
text_len=len(text_words)
# Preview only the first tokens instead of dumping the whole list into the cell output.
text_words[:20]

In [None]:
#Create dictionary from list of words in text
def dictionary(words):
    """Build a word -> integer index mapping from a list of words.

    Duplicates are removed and the unique words are sorted before being
    numbered. Sorting makes the mapping deterministic: enumerating a bare
    set of strings can yield a different order in every interpreter
    session, which would make the indices incompatible with a model
    checkpoint saved in an earlier session.

    Args:
        words: iterable of word strings (may contain duplicates).

    Returns:
        dict mapping each distinct word to a unique index in
        ``range(number_of_distinct_words)``.
    """
    unique_words = sorted(set(words))
    return {word: i for i, word in enumerate(unique_words)}



In [None]:
# Vocabulary: maps every distinct word of the text to an integer id.
words_index=dictionary(text_words)
# Preview a few entries instead of dumping the whole vocabulary into the cell output.
list(words_index.items())[:10]

Create sequences of length 10 (given 10 words as input, predict the next word, which is then appended to the previous words).

In [None]:
# Number of words in each input sequence fed to the model.
seq_len=10

In [None]:
def create_model_inputs(batch_size, seq_len):
    '''Reset the default graph and create the placeholders the model is fed through.

    Args:
        batch_size: currently unused; kept for the existing call signature.
        seq_len: currently unused; kept for the existing call signature.

    Returns:
        A tuple (inputs, targets, keep_prob):
          * inputs    -- int32 placeholder named 'inputs', two unspecified dimensions
          * targets   -- int32 placeholder named 'targets', two unspecified dimensions
          * keep_prob -- float32 placeholder named 'keep_prob'
    '''
    # Side effect: any previously built default graph is discarded.
    tf.reset_default_graph()

    unspecified_2d = [None, None]
    word_ids = tf.placeholder(tf.int32, unspecified_2d, name='inputs')
    next_word_ids = tf.placeholder(tf.int32, unspecified_2d, name='targets')
    dropout_keep = tf.placeholder(tf.float32, name='keep_prob')

    return word_ids, next_word_ids, dropout_keep

In [None]:
def build_RNN(vocabulary_size, embedding_size, inputs, seq_len, num_hidden, lstm_layer_numbers, keep_prob, batch_size):
    '''Assemble the embedding layer and the stacked LSTM decoder.

    Args:
        vocabulary_size: number of rows of the embedding matrix.
        embedding_size: number of columns of the embedding matrix.
        inputs: tensor of word indices to look up in the embedding matrix.
        seq_len: number of pieces the embedded input is split into along axis 1.
        num_hidden: number of units of each LSTM cell.
        lstm_layer_numbers: number of LSTM cells to stack.
        keep_prob: output keep probability used by the dropout wrappers.
        batch_size: batch size used to create the zero initial state.

    Returns:
        A tuple (initial_state, outputs, final_state) where outputs and
        final_state are what legacy_seq2seq.rnn_decoder returns.
    '''
    # Embedding layer: words are represented by embedding_size-dimensional
    # vectors instead of vocabulary-sized one-hot vectors. The embedding
    # matrix is initialised from a uniform distribution over [-1, 1).
    embedding_matrix = tf.Variable(tf.random_uniform((vocabulary_size, embedding_size), -1, 1))
    embedded_inputs = tf.nn.embedding_lookup(embedding_matrix, inputs)

    # One LSTM cell per layer, each wrapped with output dropout for regularisation.
    cells = [tf.contrib.rnn.BasicLSTMCell(num_hidden) for _ in range(lstm_layer_numbers)]
    cells_with_dropout = [
        tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)
        for cell in cells
    ]
    stacked_lstm = tf.contrib.rnn.MultiRNNCell(cells_with_dropout)

    # All-zero starting state for the whole stack.
    initial_state = stacked_lstm.zero_state(batch_size, tf.float32)

    # rnn_decoder consumes a list of tensors, so cut the embedded input into
    # seq_len pieces along axis 1 and drop that axis from every piece.
    time_slices = tf.split(embedded_inputs, seq_len, 1)
    seq_input = [tf.squeeze(time_slice, [1]) for time_slice in time_slices]

    outputs, final_state = legacy_seq2seq.rnn_decoder(
        seq_input,
        initial_state,
        stacked_lstm,
        loop_function=None,
        scope='rnnlm',
    )
    return initial_state, outputs, final_state

In [None]:
def get_batches(text_words,text_len,seq_len, batch_size,number_of_words_in_one_batch,n_batches,word_to_index=None):
    '''Generate (input, target) batches of word indices for next-word prediction.

    The target sequence is the input sequence shifted left by one position,
    so the target at position i is the word that follows input word i. The
    very last target wraps around to the first input word.

    Args:
        text_words: list of word tokens.
        text_len: unused; kept for the existing call signature.
        seq_len: unused; kept for the existing call signature.
        batch_size: unused; kept for the existing call signature.
        number_of_words_in_one_batch: number of word indices per yielded batch.
        n_batches: number of batches worth of words taken from text_words.
        word_to_index: optional dict mapping word -> index. Defaults to the
            notebook-level ``words_index`` vocabulary.

    Yields:
        Tuples (input_indices, target_indices), each a list of at most
        number_of_words_in_one_batch ints.
    '''
    if word_to_index is None:
        # Fall back to the vocabulary built at notebook level.
        word_to_index = words_index

    # Keep only as many words as are needed for n_batches batches.
    text_all_batches = text_words[:n_batches*number_of_words_in_one_batch]

    # Translate words to indices; words missing from the vocabulary are skipped.
    input_seq = [word_to_index[word] for word in text_all_batches if word in word_to_index]

    # Build the targets as a NEW list rotated left by one. The input list must
    # not be aliased or mutated, otherwise inputs and targets would be identical.
    output_seq = input_seq[1:] + input_seq[:1]

    for start in range(0, len(input_seq), number_of_words_in_one_batch):
        stop = start + number_of_words_in_one_batch
        yield input_seq[start:stop], output_seq[start:stop]

In [None]:
#Define Parameters
# vocabulary size: number of distinct words in words_index
n_input= len(words_index)
# number of units in each LSTM cell
num_hidden = 256
# number of stacked LSTM layers
lstm_layer_numbers=2
# dimensionality of the word embedding vectors
embed_size=256
# number of sequences per training batch
batch_size= 256
# learning rate passed to the Adam optimizer
learning_rate=0.001

In [None]:
# Build the graph: placeholders, embedding + stacked LSTM, output projection.
inputs,targets,keep_prob=create_model_inputs(batch_size,seq_len)
initial_state, outputs, final_state = build_RNN(n_input,embed_size,inputs,seq_len,num_hidden,lstm_layer_numbers,keep_prob,batch_size)

# `outputs` is a Python list with one tensor per time step. Stack it along
# axis 1 so the projected logits are batch-major [batch, seq_len, vocab],
# which is the layout sequence_loss expects and the layout of `targets`.
# Passing the list directly would produce time-major logits that are paired
# with the wrong targets inside the loss.
rnn_outputs = tf.stack(outputs, axis=1)
logits = tf.contrib.layers.fully_connected(rnn_outputs, n_input, activation_fn=None)

probs = tf.nn.softmax(logits, name='probs')

# Loss: cross-entropy averaged over every position, all positions weighted equally.
cost =  tf.contrib.seq2seq.sequence_loss(
        logits,
        targets,
        tf.ones([batch_size, (seq_len)])
    )

optimizer = tf.train.AdamOptimizer(learning_rate)
# Gradient clipping to avoid exploding gradients
gradients = optimizer.compute_gradients(cost)
capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
train_op = optimizer.apply_gradients(capped_gradients)

saver = tf.train.Saver()
# tf.initialize_all_variables is deprecated; this is its replacement.
init_op = tf.global_variables_initializer()

number_of_words_in_one_batch= seq_len*batch_size
n_batches = text_len//number_of_words_in_one_batch
epochs = 35
checkpoint="./model/savedmodel.ckpt"

#Execute the graph
# The context manager closes the session even if training raises.
with tf.Session() as sess:
    sess.run(init_op)
    for epoch in range(epochs):
        print(epoch)
        # Every epoch starts from the zero state; within an epoch the final
        # state of one batch is fed in as the initial state of the next.
        state = sess.run(initial_state)
        avg_cost_train = 0
        avg_acc_train= 0
        for ii, (x, y) in enumerate(get_batches(text_words,text_len,seq_len,batch_size,number_of_words_in_one_batch,n_batches), 1):
            # The generator yields flat lists; reshape them to [batch_size, seq_len]
            # to match the placeholders and the loss weights.
            y = np.array(y).reshape(batch_size,(seq_len))
            x = np.array(x).reshape(batch_size,(seq_len))

            state, loss, _= sess.run([final_state, cost,train_op], feed_dict={inputs: x,
                                                            targets: y,keep_prob: 0.5,initial_state: state})

            avg_cost_train += loss / n_batches
        print("cost_train=", avg_cost_train)

    #Save the model into a file
    # Make sure the checkpoint directory exists before saving.
    os.makedirs(os.path.dirname(checkpoint), exist_ok=True)
    save_path = saver.save(sess, checkpoint)