# Embedding Layer

In [103]:
import pandas as pd
# Two tiny example sentences used as the corpus for the rest of the notebook.
texts = [
    'This is a text',
    'This is not a text',
]
# Single-column frame holding the raw sentences; later cells add columns to it.
textFrame = pd.DataFrame({'data': texts})
textFrame

Unnamed: 0,data
0,This is a text
1,This is not a text


In [104]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences 
from keras.utils import to_categorical

# Settings shared by the cells below.
max_review_length = 6       # every sequence is padded to this many tokens
embedding_vecor_length = 3  # number of values in each embedding vector
top_words = 10              # vocabulary cap handed to the Tokenizer

# num_words caps the vocabulary: when the corpus contains more unique words
# than this, only the most frequent ones are kept.
tokenizer = Tokenizer(num_words=top_words)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
# One more than the number of known words (in the output below, ids start at 1
# and 0 only appears as padding).
input_dim = len(word_index) + 1
sequences = tokenizer.texts_to_sequences(texts)
print('Found %s unique tokens.' % len(word_index))

# Pad every sequence to max_review_length so all rows have the same length,
# e.g. [... 0, 0, 1, 3, 50] where 1, 3, 50 are the ids of individual words.
data = pad_sequences(sequences, maxlen=max_review_length)

print('Shape of data tensor:', data.shape)
print(data)
# Keep the padded id sequence next to the sentence it came from.
textFrame['sequence'] = data.tolist()
textFrame

Found 5 unique tokens.
Shape of data tensor: (2, 6)
[[0 0 1 2 3 4]
 [0 1 2 5 3 4]]


Unnamed: 0,data,sequence
0,This is a text,"[0, 0, 1, 2, 3, 4]"
1,This is not a text,"[0, 1, 2, 5, 3, 4]"


In [117]:
from keras.models import Sequential
from keras.layers import Embedding

# Model made of a single Embedding layer: every integer token id in a padded
# sequence is looked up in a (top_words x embedding_vecor_length) weight matrix.
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
# Nothing is trained in this cell, so predict() simply returns the randomly
# initialised embedding vectors for the token ids in `data`.
model.compile(optimizer='adam', loss='categorical_crossentropy')
output_array = model.predict(data)

# Store each text's embedding next to the text itself.
textFrame['Embedding'] = output_array.tolist()
# Use the fully qualified option name: the bare 'precision' pattern matches
# more than one option in newer pandas releases and raises OptionError.
pd.set_option('display.precision', 2)
textFrame

Unnamed: 0,data,sequence,Embedding
0,This is a text,"[0, 0, 1, 2, 3, 4]","[-0.0014113187789916992, 0.008530724793672562,..."
1,This is not a text,"[0, 1, 2, 5, 3, 4]","[-0.0014113187789916992, 0.008530724793672562,..."


In [118]:
# Print each layer's output shape and parameter count.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_34 (Embedding)     (None, 6, 3)              30        
_________________________________________________________________
flatten_3 (Flatten)          (None, 18)                0         
Total params: 30
Trainable params: 30
Non-trainable params: 0
_________________________________________________________________


In [119]:
# Show the model configuration: one entry per layer with its class name and settings.
model.get_config()

[{'class_name': 'Embedding',
  'config': {'activity_regularizer': None,
   'batch_input_shape': (None, 6),
   'dtype': 'float32',
   'embeddings_constraint': None,
   'embeddings_initializer': {'class_name': 'RandomUniform',
    'config': {'maxval': 0.05, 'minval': -0.05, 'seed': None}},
   'embeddings_regularizer': None,
   'input_dim': 10,
   'input_length': 6,
   'mask_zero': False,
   'name': 'embedding_34',
   'output_dim': 3,
   'trainable': True}},
 {'class_name': 'Flatten', 'config': {'name': 'flatten_3', 'trainable': True}}]

In [120]:
# For every layer, print its configuration followed by its current weights.
for lyr in model.layers:
    for details in (lyr.get_config(), lyr.get_weights()):
        print(details)

{'name': 'embedding_34', 'trainable': True, 'batch_input_shape': (None, 6), 'dtype': 'float32', 'input_dim': 10, 'output_dim': 3, 'embeddings_initializer': {'class_name': 'RandomUniform', 'config': {'minval': -0.05, 'maxval': 0.05, 'seed': None}}, 'embeddings_regularizer': None, 'activity_regularizer': None, 'embeddings_constraint': None, 'mask_zero': False, 'input_length': 6}
[array([[-0.00141132,  0.00853072,  0.02273766],
       [ 0.03596164, -0.03251224, -0.04678869],
       [-0.02188659,  0.00193819, -0.00553908],
       [-0.00947953, -0.04847676,  0.00643707],
       [-0.01232052,  0.03661312,  0.03740862],
       [-0.04615786, -0.0401098 , -0.02656391],
       [ 0.00160072,  0.02230993, -0.02462446],
       [-0.01392549,  0.00434928,  0.03618317],
       [ 0.0289198 ,  0.01454613, -0.01566162],
       [-0.01825454, -0.0113125 ,  0.04260141]], dtype=float32)]
{'name': 'flatten_3', 'trainable': True}
[]


In [121]:
# Weights of the first layer (the Embedding): one row per token id, one column per embedding dimension.
model.layers[0].get_weights()

[array([[-0.00141132,  0.00853072,  0.02273766],
        [ 0.03596164, -0.03251224, -0.04678869],
        [-0.02188659,  0.00193819, -0.00553908],
        [-0.00947953, -0.04847676,  0.00643707],
        [-0.01232052,  0.03661312,  0.03740862],
        [-0.04615786, -0.0401098 , -0.02656391],
        [ 0.00160072,  0.02230993, -0.02462446],
        [-0.01392549,  0.00434928,  0.03618317],
        [ 0.0289198 ,  0.01454613, -0.01566162],
        [-0.01825454, -0.0113125 ,  0.04260141]], dtype=float32)]

In [122]:
# Prediction for the first text; compare with the weight matrix above — each
# token id of the sequence is replaced by the matching row of the weights.
output_array[0]

array([-0.00141132,  0.00853072,  0.02273766, -0.00141132,  0.00853072,
        0.02273766,  0.03596164, -0.03251224, -0.04678869, -0.02188659,
        0.00193819, -0.00553908, -0.00947953, -0.04847676,  0.00643707,
       -0.01232052,  0.03661312,  0.03740862], dtype=float32)

## Adding a Flatten layer

In [124]:
from keras.models import Sequential
# Flatten must be imported here: it is used below but was never imported,
# which raises NameError on a fresh kernel.
from keras.layers import Embedding, Flatten

# Embedding followed by Flatten: the per-text (max_review_length x
# embedding_vecor_length) block of vectors is reshaped into one flat vector.
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(Flatten())
# Nothing is trained in this cell; predict() returns the freshly initialised
# embedding values, flattened.
model.compile(optimizer='adam', loss='categorical_crossentropy')
output_array = model.predict(data)

# Store the flattened embedding of each text next to the text itself.
textFrame['EmbeddingsFlatten'] = output_array.tolist()
textFrame

Unnamed: 0,data,sequence,Embedding,EmbeddingsFlatten
0,This is a text,"[0, 0, 1, 2, 3, 4]","[-0.0014113187789916992, 0.008530724793672562,...","[-0.04179590940475464, 0.01228167861700058, -0..."
1,This is not a text,"[0, 1, 2, 5, 3, 4]","[-0.0014113187789916992, 0.008530724793672562,...","[-0.04179590940475464, 0.01228167861700058, -0..."


In [125]:
# Flattened embedding of the first text.
output_array[0]

array([-0.04179591,  0.01228168, -0.03246965, -0.04179591,  0.01228168,
       -0.03246965,  0.01011391, -0.02434975,  0.00832511, -0.03656663,
        0.01334569, -0.01831834,  0.00389238,  0.01628676, -0.01201253,
        0.01853235,  0.04914785,  0.01473973], dtype=float32)