In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import os

In [2]:
embeddings_index = {}

f = open ('../glove.6B.100d.txt','r', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    embeddings_index[word]=coefs

f.close()
print ('Found %s word embeddings'%(len(embeddings_index)))

Found 400000 word embeddings


In [150]:
input_texts = []
target_texts = []

for line in open('./data/poetry/robert_frost.txt'):
    line = line.rstrip()
    input_line = '<sos> '+line
    target_line = line+' <eos>'
    input_texts.append(input_line)
    target_texts.append(target_line)
    
all_lines = input_texts + target_texts

In [173]:
MAX_WORDS=20000
EPOCHS=1
OOV_TOKEN=0
EMBEDDING_DIM = 100
MAX_SEQ_LENGTH=20
VALIDATION_SPLIT_RATIO= 0.2
LSTM_UNITS=1000

In [167]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=MAX_WORDS, oov_token=OOV_TOKEN, filters='')
tokenizer.fit_on_texts(all_lines)
word2Idx = tokenizer.word_index

print ('Found %d unique words'%(len(word2Idx)))

input_sequences = tokenizer.texts_to_sequences(input_texts)
target_sequences = tokenizer.texts_to_sequences(target_texts)
print ('Found %d input sequences'%(len(input_sequences)))
print ('Found %d output sequences'%(len(target_sequences)))

max_seq_len_from_data = min (MAX_SEQ_LENGTH, max(len(s) for s in input_sequences))
print ('max seq length is %d'%(max_seq_len_from_data))

padded_input_sequences = tf.keras.preprocessing.sequence.pad_sequences(input_sequences, padding='post', maxlen=max_seq_len_from_data)
padded_target_sequences = tf.keras.preprocessing.sequence.pad_sequences(target_sequences, padding='post', maxlen=max_seq_len_from_data)

print ('Created %d padded input sequences'%(len(padded_input_sequences)))
print ('Created %d padded target sequences'%(len(padded_target_sequences)))

assert ('<sos>' in word2Idx)
assert ('<eos>' in word2Idx)

Found 3057 unique words
Found 1581 input sequences
Found 1581 output sequences
max seq length is 12
Created 1581 padded input sequences
Created 1581 padded target sequences


In [168]:
num_words = min (MAX_WORDS, len(word2Idx)+1)
print ('Min words to be considered are %d'%(num_words))

loaded_embeddings_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word2Idx.items():
    if (i<num_words):
        if word in embeddings_index.keys():
            embedding_vector = embeddings_index[word]
            loaded_embeddings_matrix[i] = embedding_vector

print (loaded_embeddings_matrix.shape)

Min words to be considered are 3058
(3058, 100)


In [169]:
one_hot_targets = np.zeros((len(target_sequences), max_seq_len_from_data, num_words))
print (one_hot_targets.shape)
for i, seq in enumerate(padded_target_sequences):
    for j, word in enumerate(seq):
        if (word>0):
            one_hot_targets[i,j,word]=1

(1581, 12, 3058)


In [170]:
embed_layer = tf.keras.layers.Embedding(input_dim = num_words,
                               output_dim = EMBEDDING_DIM,
                               input_length=max_seq_len_from_data, 
                               embeddings_initializer=tf.keras.initializers.Constant(loaded_embeddings_matrix),
                               trainable=True)

In [171]:
input_ = tf.keras.layers.Input(shape=(max_seq_len_from_data))
initial_h = tf.keras.layers.Input(shape=(LSTM_UNITS,))
initial_c = tf.keras.layers.Input(shape=(LSTM_UNITS,))

x = embed_layer(input_)

lstm_layer_0 = tf.keras.layers.LSTM(units=LSTM_UNITS, return_sequences=True, return_state=True)
x,h,c = lstm_layer_0(x)

print (x.shape)
print (h.shape)
print (c.shape)
dense_layer = tf.keras.layers.Dense(num_words, activation='softmax')
output = dense_layer(x)

model = tf.keras.models.Model([input_, initial_h, initial_c], output)

model.compile (optimizer=tf.keras.optimizers.Adam(lr=0.01), loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

(?, 12, 1000)
(?, 1000)
(?, 1000)
Model: "model_15"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_46 (InputLayer)           [(None, 12)]         0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 12, 100)      305800      input_46[0][0]                   
__________________________________________________________________________________________________
lstm_8 (LSTM)                   [(None, 12, 1000), ( 4404000     embedding_6[0][0]                
__________________________________________________________________________________________________
input_47 (InputLayer)           [(None, 1000)]       0                                            
_________________________________________________________

In [174]:
h = np.zeros((len(padded_input_sequences),LSTM_UNITS))
c = np.zeros((len(padded_input_sequences),LSTM_UNITS))
history = model.fit ([padded_input_sequences,h,c], one_hot_targets, epochs=EPOCHS)



In [175]:
input2_ = tf.keras.layers.Input(shape=(1,))
initial_h_1 = tf.keras.layers.Input(shape=(LSTM_UNITS,))
initial_c_1 = tf.keras.layers.Input(shape=(LSTM_UNITS,))

x = embed_layer(input2_)
x,h,c = lstm_layer_0(x, initial_state = [initial_h_1, initial_c_1])

output2_ = dense_layer(x)

pred_model = tf.keras.models.Model([input2_, initial_h_1, initial_c_1], [output2_,h,c])
pred_model.summary()

Model: "model_16"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_49 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         multiple             305800      input_49[0][0]                   
__________________________________________________________________________________________________
input_50 (InputLayer)           [(None, 1000)]       0                                            
__________________________________________________________________________________________________
input_51 (InputLayer)           [(None, 1000)]       0                                            
___________________________________________________________________________________________

In [197]:
word1 = 'horse'
word1idx = word2Idx[word1]
word1Vector = np.array([[word1idx]])

word2 = 'night'
word2idx = word2Idx[word2]
word2Vector = np.array([[word2idx]])

h = np.zeros((1,LSTM_UNITS))
c = np.zeros((1,LSTM_UNITS))

o1,h1,c1 = pred_model.predict([word1Vector, h, c])
probs = o1[0,0]
wordId = np.argmax(probs)
print (wordId)
print (idx2word[wordId])

o2,h2,c2 = pred_model.predict([word2Vector, h, c])
probs = o2[0,0]
wordId = np.argmax(probs)
print (wordId)
print (idx2word[wordId])

from scipy import spatial
cosdiff = spatial.distance.cosine(h1, h2)
print (cosdiff)

5
how
4
world
0.2102702260017395


0.12282532453536987
