This notebook is based on 
https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/

In [87]:
import os
import math
import numpy as np
import pandas as pd
# pip install -q -U tensorflow==1.7.0
import tensorflow as tf

from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten

# Record the TensorFlow version the notebook was executed with.
print("Tensorflow version:", tf.__version__)

Tensorflow version: 1.7.0


In [88]:
# Toy corpus: ten short review snippets.
docs = [
    'Well done!',
    'Good work',
    'Great effort',
    'nice work',
    'Excellent!',
    'Weak',
    'Poor effort!',
    'not good',
    'poor work',
    'Could have done better.',
]
# Class labels aligned with docs: the first five are labelled 1, the last five 0.
labels = np.array([1] * 5 + [0] * 5)
# Build the vocabulary (word -> integer index) from the corpus.
t = Tokenizer()
t.fit_on_texts(docs)

In [89]:
# Word -> integer index mapping built by fit_on_texts; indices start at 1.
t.word_index

{'work': 1,
 'done': 2,
 'good': 3,
 'effort': 4,
 'poor': 5,
 'well': 6,
 'great': 7,
 'nice': 8,
 'excellent': 9,
 'weak': 10,
 'not': 11,
 'could': 12,
 'have': 13,
 'better': 14}

### Create Input Sequence from Text (Non-preferred way)
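Keras provides the one_hot() function that creates a hash of each word as an efficient integer encoding. Because the encoding is a hash, different words can collide on the same integer (in the output below, "poor" and "work" both map to 7), and the integers do not match the Tokenizer's word_index, which is why this approach is not preferred here.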

In [90]:
# Tokenizer indices start at 1, so one extra slot is needed for index 0
# (the value used for padding).
vocab_size = 1 + len(t.word_index)
# Hash every word of every document into the range given by vocab_size.
encoded_docs = [
    keras.preprocessing.text.one_hot(doc, vocab_size)
    for doc in docs
]
print(encoded_docs)

[[8, 14], [6, 7], [11, 10], [14, 7], [9], [14], [7, 10], [8, 6], [7, 7], [5, 5, 14, 4]]


The sequences have different lengths, and Keras prefers inputs to be vectorized and all inputs to have the same length. We will pad all input sequences to a length of 4. Again, we can do this with a built-in Keras function, in this case the pad_sequences() function.

In [91]:
# Every document is zero-padded after its last token up to this many word ids.
max_length = 4
padded_docs = keras.preprocessing.sequence.pad_sequences(
    encoded_docs,
    padding='post',
    maxlen=max_length,
)
print(padded_docs)

[[ 8 14  0  0]
 [ 6  7  0  0]
 [11 10  0  0]
 [14  7  0  0]
 [ 9  0  0  0]
 [14  0  0  0]
 [ 7 10  0  0]
 [ 8  6  0  0]
 [ 7  7  0  0]
 [ 5  5 14  4]]


### Create Input Sequence from Text (Non-preferred way)
tokenizer.texts_to_matrix(texts) function
    - Return: numpy array of shape (len(texts), num_words).
    - Arguments:
        - texts: list of texts to vectorize.
        - mode: one of "binary", "count", "tfidf", "freq" (default: "binary").

In [92]:
# Bag-of-words view: one row per document, with a 1 in the column of each word
# index present in that document (word order is lost).
print(t.texts_to_matrix(docs, mode='binary'))

[[0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1.]]


### Create Input Sequence from Text (preferred way)

In [93]:
# Map each document to its Tokenizer word indices, then zero-pad on the right
# so every row has max_length entries.
text_seq = t.texts_to_sequences(docs)
text_seq_padded = keras.preprocessing.sequence.pad_sequences(
    text_seq, maxlen=max_length, padding='post'
)

print('Text sequences :')
print(text_seq)
print('Text sequences padded :')
print(text_seq_padded)

Text sequences :
[[6, 2], [3, 1], [7, 4], [8, 1], [9], [10], [5, 4], [11, 3], [5, 1], [12, 13, 2, 14]]
Text sequences padded :
[[ 6  2  0  0]
 [ 3  1  0  0]
 [ 7  4  0  0]
 [ 8  1  0  0]
 [ 9  0  0  0]
 [10  0  0  0]
 [ 5  4  0  0]
 [11  3  0  0]
 [ 5  1  0  0]
 [12 13  2 14]]


### Load the entire GloVe word embedding file into memory as a dictionary

In [100]:
# Load the whole GloVe embedding file into memory as {word: vector}.
# Each line of the file is a word followed by its float coefficients.
embeddings_index = {}
# `with` guarantees the file handle is closed even if parsing a line raises.
with open('../data/raw/glove.6B.100d.txt', encoding="utf8") as f:
    for line in f:
        values = line.split()
        if not values:
            # skip blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [106]:
# Sanity-check the loaded embeddings: show the shape and the values of the
# vector stored for the word 'the'.
print(embeddings_index.get('the').shape)
print(embeddings_index.get('the'))

(100,)
[-0.038194 -0.24487   0.72812  -0.39961   0.083172  0.043953 -0.39141
  0.3344   -0.57545   0.087459  0.28787  -0.06731   0.30906  -0.26384
 -0.13231  -0.20757   0.33395  -0.33848  -0.31743  -0.48336   0.1464
 -0.37304   0.34577   0.052041  0.44946  -0.46971   0.02628  -0.54155
 -0.15518  -0.14107  -0.039722  0.28277   0.14393   0.23464  -0.31021
  0.086173  0.20397   0.52624   0.17164  -0.082378 -0.71787  -0.41531
  0.20335  -0.12763   0.41367   0.55187   0.57908  -0.33477  -0.36559
 -0.54857  -0.062892  0.26584   0.30205   0.99775  -0.80481  -3.0243
  0.01254  -0.36942   2.2167    0.72201  -0.24978   0.92136   0.034514
  0.46745   1.1079   -0.19358  -0.074575  0.23353  -0.052062 -0.22044
  0.057162 -0.15806  -0.30798  -0.41625   0.37972   0.15006  -0.53212
 -0.2055   -1.2526    0.071624  0.70565   0.49744  -0.42063   0.26148
 -1.538    -0.30223  -0.073438 -0.28312   0.37104  -0.25217   0.016215
 -0.017099 -0.38984   0.87424  -0.72569  -0.51058  -0.52028  -0.1459
  0.8278    0.

In [101]:
# Build the weight matrix for the Embedding layer: row i holds the GloVe
# vector of the word whose Tokenizer index is i.
word_feature = 100
embedding_matrix = np.zeros((vocab_size, word_feature))
for word, index in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # word has no pretrained vector; its row stays all zeros
        continue
    embedding_matrix[index] = embedding_vector

In [102]:
# Show row 1 of the weight matrix, i.e. the vector stored for word index 1
# ('work' in t.word_index).
embedding_matrix[1:2,:]

array([[-1.16190001e-01,  4.54470009e-01, -6.92160010e-01,
         3.45799997e-02,  2.63480008e-01, -3.81390005e-01,
        -2.27899998e-01,  3.72330010e-01, -2.05789998e-01,
         2.90199995e-01,  1.21140003e-01, -4.27289993e-01,
         5.55729985e-01, -9.42860022e-02, -4.99669999e-01,
        -2.94779986e-01,  7.41090000e-01,  2.51910001e-01,
        -2.74679989e-01,  2.31910005e-01,  3.82039999e-03,
         4.52519991e-02,  2.49699995e-01, -4.15789992e-01,
         3.13069999e-01, -5.84959984e-01, -3.27389985e-01,
        -6.61889970e-01,  1.49090007e-01, -2.57710010e-01,
        -9.48580027e-01,  4.18089986e-01, -2.95379996e-01,
        -4.27110009e-02, -6.99699998e-01,  5.78920007e-01,
        -6.92709982e-02, -3.96329984e-02, -5.64630004e-03,
        -2.96160012e-01, -5.74479997e-01,  1.60099998e-01,
        -1.06710002e-01,  1.00960001e-01, -4.29560006e-01,
        -2.77850002e-01, -3.00170004e-01, -6.95370018e-01,
         1.79649994e-01, -4.67229992e-01,  1.25110000e-0

### Embedding layer with GloVe word embedding weights
The embedding layer can be seeded with the GloVe word embedding weights. This notebook loads the 100-dimensional version (`glove.6B.100d.txt`), so the Embedding layer must be defined with output_dim set to **100** (it would be **50** for the 50-dimensional file). Finally, we do not want to update the learned word weights in this model, so we set the `trainable` attribute of the Embedding layer to False.

In [112]:
# define model
model = Sequential()
# Frozen embedding layer seeded with the GloVe weight matrix. input_length is
# tied to max_length so it always matches the width of the padded sequences
# instead of repeating the literal 4.
e = Embedding(
    vocab_size,
    word_feature,
    weights=[embedding_matrix],
    input_length=max_length,
    trainable=False,
)

In [113]:
model.add(e)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model; summary() prints the table itself and returns None,
# so wrapping it in print() only adds a stray "None" line
model.summary()
# fit the model on the Tokenizer-based sequences: the rows of embedding_matrix
# are keyed by t.word_index, so the hashed ids in padded_docs would look up
# the wrong GloVe vectors
model.fit(text_seq_padded, labels, epochs=50, verbose=0)
# evaluate the model (on the same documents it was trained on)
loss, accuracy = model.evaluate(text_seq_padded, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 4, 100)            1500      
_________________________________________________________________
flatten_5 (Flatten)          (None, 400)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 401       
Total params: 1,901
Trainable params: 401
Non-trainable params: 1,500
_________________________________________________________________
None
Accuracy: 100.000000
