In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import os

#### Load GloVe embeddings

In [2]:
embeddings_index = {}

f = open ('glove.6B.100d.txt','r', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    embeddings_index[word]=coefs

f.close()
print ('Found %s word embeddings'%(len(embeddings_index)))

Found 400000 word embeddings


### Load comments

In [3]:
input_texts = []
target_texts = []

for line in open('./data/poetry/robert_frost.txt'):
    line = line.rstrip()
    input_line = '<sos>'+line
    target_line = line+'<eos>'
    input_texts.append(input_line)
    target_texts.append(target_line)
    
all_lines = input_texts + target_texts

In [4]:
MAX_WORDS=20000
BATCH_SIZE=16
EPOCHS=2
OOV_TOKEN=0
EMBEDDING_DIM = 100
MAX_SEQ_LENGTH=100
VALIDATION_SPLIT_RATIO= 0.3
LSTM_UNITS=64

### Tokenize comments

In [5]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=MAX_WORDS, oov_token=OOV_TOKEN, filters='')
tokenizer.fit_on_texts(all_lines)
word2Idx = tokenizer.word_index

print ('Found %d unique words'%(len(word2Idx)))
print (word2Idx['girl'])

input_sequences = tokenizer.texts_to_sequences(input_texts)
target_sequences = tokenizer.texts_to_sequences(target_texts)
print ('Found %d input sequences'%(len(input_sequences)))
print ('Found %d output sequences'%(len(target_sequences)))

max_seq_len_from_data = min (MAX_SEQ_LENGTH, max(len(s) for s in input_sequences))

padded_input_sequences = tf.keras.preprocessing.sequence.pad_sequences(input_sequences, padding='post', maxlen=max_seq_len_from_data)
padded_target_sequences = tf.keras.preprocessing.sequence.pad_sequences(target_sequences, padding='post', maxlen=max_seq_len_from_data)

print ('Created %d padded input sequences'%(len(padded_input_sequences)))
print ('Created %d padded target sequences'%(len(padded_target_sequences)))

assert ('<sos>' in word2Idx)
assert ('<eos>' in word2Idx)

Found 4617 unique words
1268
Found 1581 input sequences
Found 1581 output sequences
Created 1581 padded input sequences
Created 1581 padded target sequences


In [6]:
num_words = min (MAX_WORDS, len(word2Idx)+1)
print ('Min words to be considered are %d'%(num_words))

loaded_embeddings_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word2Idx.items():
    if (i<num_words):
        if word in embeddings_index.keys():
            embedding_vector = embeddings_index[word]
            loaded_embeddings_matrix[i] = embedding_vector

print (loaded_embeddings_matrix.shape)

Min words to be considered are 4618
(4618, 100)


In [7]:
for i, seq in enumerate(padded_target_sequences):
    print (seq)
    if (i>2):
        break

[ 136  574  575    8    4  576 3470    0    0    0    0]
[   5  577    6   69   31  987 3471    0    0    0    0]
[   5   27   28  988  166    6 3472    0    0    0    0]
[   5  184   72   28   17  140   17    6 3473    0    0]


### Important Step:

If we look at this problem statement, we have a set of input sequences and corresponding target sequences.

Input Seq (T*D)-> Target Seq (T'*D')

    T -> number of input sequences
    D -> length of each input sequence

    T' -> number of target sequences
    D' -> length of each target sequence.

In a seq2seq scenario the target value needs to be one-hot encoded and that's what we are doing below.

In [8]:
one_hot_targets = np.zeros((len(target_sequences), max_seq_len_from_data, num_words))
print (one_hot_targets.shape)
for i, seq in enumerate(padded_target_sequences):
    for j, word in enumerate(seq):
        if (word>0):
            one_hot_targets[i,j,word]=1

(1581, 11, 4618)


### Build Model

In [9]:
embed_layer = tf.keras.layers.Embedding(input_dim = num_words,
                               output_dim = EMBEDDING_DIM,
                               input_length=max_seq_len_from_data, 
                               embeddings_initializer=tf.keras.initializers.Constant(loaded_embeddings_matrix),
                               trainable=False)

In [23]:
input_ = tf.keras.layers.Input(shape=(max_seq_len_from_data))
initial_h = tf.keras.layers.Input(shape=(LSTM_UNITS,))
initial_c = tf.keras.layers.Input(shape=(LSTM_UNITS,))

x = embed_layer(input_)

lstm_layer = tf.keras.layers.LSTM(units=LSTM_UNITS, return_sequences=True, return_state=True)
x,_,_ = lstm_layer(x)

dense_layer = tf.keras.layers.Dense(num_words, activation='softmax')
output = dense_layer(x)

model = tf.keras.models.Model([input_, initial_h, initial_c], output)

model.compile (optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [24]:
from sklearn.model_selection import train_test_split

train_sentences, test_sentences, train_targets, test_targets = train_test_split (padded_input_sequences, one_hot_targets)

In [25]:
h = np.zeros((len(train_sentences),LSTM_UNITS))
c = np.zeros((len(train_sentences),LSTM_UNITS))
model.fit ([train_sentences,h,c], train_targets, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=VALIDATION_SPLIT_RATIO)

Train on 829 samples, validate on 356 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x225278f6ba8>

### Prediction Model

In [26]:
input2_ = tf.keras.layers.Input(shape=(1,))

x = embed_layer(input2_)

x,h,c = lstm_layer(x)

output2_ = dense_layer(x)

pred_model = tf.keras.models.Model([input2_, initial_h, initial_c], [output2_,h,c])
pred_model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_11 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           multiple             461800      input_11[0][0]                   
__________________________________________________________________________________________________
lstm_2 (LSTM)                   multiple             42240       embedding[4][0]                  
__________________________________________________________________________________________________
input_9 (InputLayer)            [(None, 64)]         0                                            
____________________________________________________________________________________________

In [57]:
test_input_word = '<sos>'
test_input_word_idx = word2Idx[test_input_word]
test_input_word_vector = np.array([[test_input_word_idx]])

print (test_input_word_vector.shape)

h = np.zeros([1,LSTM_UNITS])
c = np.zeros([1,LSTM_UNITS])

(1, 1)


In [58]:
output_poem = []

idx2word = {v:k for k, v in word2Idx.items()}

for i in range(max_seq_len_from_data):
    
    print ('Start word is ',test_input_word_vector)
    o,h,c = pred_model.predict([test_input_word_vector, h, c])
    
    prob_word = np.argmax(o)
    
    pred_word = idx2word[prob_word]
    print ('Next Start word is ',pred_word)
    
    if (pred_word == '<eos>'):
        break
    
    test_input_word_vector[0,0] = prob_word
    
    output_poem.append(pred_word + ' ')

Start word is  [[15]]
Next Start word is  the
Start word is  [[2]]
Next Start word is  the
Start word is  [[2]]
Next Start word is  the
Start word is  [[2]]
Next Start word is  the
Start word is  [[2]]
Next Start word is  the
Start word is  [[2]]
Next Start word is  the
Start word is  [[2]]
Next Start word is  the
Start word is  [[2]]
Next Start word is  the
Start word is  [[2]]
Next Start word is  the
Start word is  [[2]]
Next Start word is  the
Start word is  [[2]]
Next Start word is  the


In [53]:
print (output_poem)

['the ', 'the ', 'the ', 'the ', 'the ', 'the ', 'the ', 'the ', 'the ', 'the ', 'the ']
