<a href="https://colab.research.google.com/github/ptsurko/AIML-notebooks/blob/master/Use_Word_Embedding_Layers_for_Deep_Learning_with_Keras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
from scipy.spatial.distance import cosine

np.random.seed(1337)
from keras.preprocessing.text import one_hot
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

# https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/

Using TensorFlow backend.


In [0]:
# define documents
docs = ['Well done!',
		'Good work',
		'Great effort',
		'nice work',
		'Excellent!',
		'Weak',
		'Poor effort!',
		'not good',
		'poor work',
		'Could have done better.']
# define class labels
labels = np.array([1,1,1,1,1,0,0,0,0,0])

In [0]:
tokenizer = Tokenizer(num_words=50, oov_token="<OOV>")
tokenizer.fit_on_texts(docs)

word_index = tokenizer.word_index
vocab_size=len(word_index) + 1
words = list(tokenizer.word_index.keys())[1:]

encoded_docs = tokenizer.texts_to_sequences(docs)

print('word_index: ', str(word_index))
print('len(word_index): ', len(word_index))
print('vocab_size: ', vocab_size)
print('words: ', words)
print(encoded_docs)

word_index:  {'<OOV>': 1, 'work': 2, 'done': 3, 'good': 4, 'effort': 5, 'poor': 6, 'well': 7, 'great': 8, 'nice': 9, 'excellent': 10, 'weak': 11, 'not': 12, 'could': 13, 'have': 14, 'better': 15}
len(word_index):  15
vocab_size:  16
words:  ['work', 'done', 'good', 'effort', 'poor', 'well', 'great', 'nice', 'excellent', 'weak', 'not', 'could', 'have', 'better']
[[7, 3], [4, 2], [8, 5], [9, 2], [10], [11], [6, 5], [12, 4], [6, 2], [13, 14, 3, 15]]


In [0]:
# pad documents to a max length of 4 words
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

[[ 7  3  0  0]
 [ 4  2  0  0]
 [ 8  5  0  0]
 [ 9  2  0  0]
 [10  0  0  0]
 [11  0  0  0]
 [ 6  5  0  0]
 [12  4  0  0]
 [ 6  2  0  0]
 [13 14  3 15]]


In [0]:
np.random.seed(42)

# define the model
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=max_length))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# summarize the model
print(model.summary())

model.fit(padded_docs, labels, epochs=50, verbose=0, shuffle=False)

# evaluate the model
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

embeddings = model.layers[0].get_weights()[0]
print('embeddings.shape: ', embeddings.shape)
print('embeddings: ', embeddings)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 4, 8)              128       
_________________________________________________________________
flatten_3 (Flatten)          (None, 32)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 161
Trainable params: 161
Non-trainable params: 0
_________________________________________________________________
None
Accuracy: 89.999998
embeddings.shape:  (16, 8)
embeddings:  [[ 0.05721183 -0.03809275 -0.00354055  0.05120752 -0.00139931  0.09407571
  -0.04230647 -0.01367894]
 [ 0.03533186 -0.0128661  -0.02535462 -0.0490261  -0.00229315 -0.00074725
  -0.00457001  0.03586162]
 [-0.07674701 -0.07548218  0.04792396  0.0756007  -0.01280425  0.06967522
  -0.0931087   0.06812938]
 [-0.02533895 -0.0

In [0]:
word1 = 'work'
word2 = 'better'

weight1 = embeddings[word_index[word1] - 1]
weight2 = embeddings[word_index[word2] - 1]

print('cosine distance between %s and %s = %s' % (word1, word2, cosine(weight1, weight2)))

cosine distance between work and better = 0.6851622462272644


In [0]:
# import io

# out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
# out_m = io.open('meta.tsv', 'w', encoding='utf-8')

# for num, word in enumerate(words):
#   vec = embeddings[num+1] # skip 0, it's padding.
#   out_m.write(word + "\n")
#   out_v.write('\t'.join([str(x) for x in vec]) + "\n")
# out_v.close()
# out_m.close()

# # If you are running this tutorial in Colaboratory, you can use the following snippet to download these files to your local machine (or use the file browser, View -> Table of contents -> File browser).


In [0]:
# try:
#   from google.colab import files
# except ImportError:
#    pass
# else:
#   files.download('vecs.tsv')
#   files.download('meta.tsv')