In [1]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential

In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
import numpy as np


In [3]:
# Toy corpus: six short sentences used to demonstrate tokenisation,
# padding and embedding lookup.
sentences = [
    'Man live in the house',
    'King live in the kingdom',
    'Man love women',
    'King love queen',
    'Queen live to eat apple',
    'Girl like to eat mango',
]

In [25]:
# Vocabulary-size cap: passed to Tokenizer(num_words=...) and used as the
# Embedding layer's input dimension (number of rows in the embedding table).
voc_size = 10000

In [26]:
# Word-level tokenizer limited to voc_size words; "<OOV>" is the placeholder
# token for out-of-vocabulary words (it takes index 1 in word_index).
tokenizer = Tokenizer(num_words = voc_size, oov_token = "<OOV>")

In [27]:
# Build the word -> index vocabulary from the example sentences.
tokenizer.fit_on_texts(sentences)

In [28]:
# Dictionary mapping each word to its integer index in the fitted vocabulary.
word_index = tokenizer.word_index

In [29]:
# Inspect the vocabulary: words appear lower-cased, index 1 is "<OOV>",
# and index 0 is not assigned to any word.
word_index

{'<OOV>': 1,
 'live': 2,
 'man': 3,
 'in': 4,
 'the': 5,
 'king': 6,
 'love': 7,
 'queen': 8,
 'to': 9,
 'eat': 10,
 'house': 11,
 'kingdom': 12,
 'women': 13,
 'apple': 14,
 'girl': 15,
 'like': 16,
 'mango': 17}

In [30]:
# Replace every word of each sentence with its integer index from the vocabulary.
sequences = tokenizer.texts_to_sequences(sentences)

In [31]:
# One list of word indices per sentence; lengths differ (3 or 5 tokens here).
sequences

[[3, 2, 4, 5, 11],
 [6, 2, 4, 5, 12],
 [3, 7, 13],
 [6, 7, 8],
 [8, 2, 9, 10, 14],
 [15, 16, 9, 10, 17]]

In [32]:
# Fixed length that every index sequence is brought to before it is fed
# to the Embedding layer.
sentences_length = 8

# padding='post' appends zeros after the last real token, so each row
# ends up with exactly `sentences_length` entries.
padding_sequences = pad_sequences(
    sequences,
    maxlen=sentences_length,
    padding='post',
)

In [33]:
# 6 x 8 integer array: one row per sentence, right-padded with 0.
padding_sequences

array([[ 3,  2,  4,  5, 11,  0,  0,  0],
       [ 6,  2,  4,  5, 12,  0,  0,  0],
       [ 3,  7, 13,  0,  0,  0,  0,  0],
       [ 6,  7,  8,  0,  0,  0,  0,  0],
       [ 8,  2,  9, 10, 14,  0,  0,  0],
       [15, 16,  9, 10, 17,  0,  0,  0]], dtype=int32)

In [34]:
dim  = 10 # feature dimension: each word is embedded as a 10-dimensional vector

In [35]:
# Minimal model consisting of a single Embedding layer: it maps each of the
# `sentences_length` integer indices in a row to a `dim`-dimensional vector
# taken from a (voc_size x dim) lookup table.
# NOTE(review): newer Keras releases deprecate the `input_length` argument —
# confirm against the installed TensorFlow/Keras version.
model = Sequential([
    Embedding(input_dim=voc_size, output_dim=dim, input_length=sentences_length),
])
model.compile(optimizer='adam', loss='mse')

In [36]:
# Expect output shape (None, 8, 10) and voc_size * dim = 10000 * 10 = 100,000
# parameters, all of them in the embedding table.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 8, 10)             100000    
Total params: 100,000
Trainable params: 100,000
Non-trainable params: 0
_________________________________________________________________


In [40]:
# Look up the embedding vector of every token in every padded sentence:
# the result has one (8 x 10) matrix per sentence.
# NOTE(review): no training step is visible in this notebook, so these are
# presumably the layer's initial weights — confirm.
print(model.predict(padding_sequences))

[[[-0.01509737  0.00686573  0.02186376 -0.04065489 -0.03159928
   -0.00730455  0.00557702 -0.04573032 -0.03333386 -0.03662892]
  [ 0.00053348 -0.02660242  0.02762609  0.04323545  0.03662442
   -0.00653205  0.01806704  0.0423425   0.0098186   0.02856291]
  [-0.00529069 -0.03589702  0.0444488   0.04695028 -0.00875999
   -0.01862372  0.04488533  0.03828743 -0.00747294 -0.0323103 ]
  [ 0.04959127  0.04421295 -0.03254279 -0.02213754  0.0450354
    0.04669661  0.01731205  0.02626509 -0.0252524   0.00833527]
  [ 0.02914019 -0.03311793 -0.01243305  0.02759982  0.04595165
   -0.01624659 -0.00177119  0.02940159 -0.03582285 -0.03575338]
  [ 0.02127484  0.04767188 -0.01002558 -0.01240257 -0.02030493
    0.01009977  0.0098091  -0.04965622  0.04865109 -0.02660616]
  [ 0.02127484  0.04767188 -0.01002558 -0.01240257 -0.02030493
    0.01009977  0.0098091  -0.04965622  0.04865109 -0.02660616]
  [ 0.02127484  0.04767188 -0.01002558 -0.01240257 -0.02030493
    0.01009977  0.0098091  -0.04965622  0.0486510

In [42]:
# Padded index sequence of the first sentence, for comparison with its embedding.
padding_sequences[0]

array([ 3,  2,  4,  5, 11,  0,  0,  0], dtype=int32)

In [41]:
# Embedding of the first sentence: one 10-dim vector per position. The three
# trailing padding positions (index 0) all map to the same vector.
print(model.predict(padding_sequences)[0])

[[-0.01509737  0.00686573  0.02186376 -0.04065489 -0.03159928 -0.00730455
   0.00557702 -0.04573032 -0.03333386 -0.03662892]
 [ 0.00053348 -0.02660242  0.02762609  0.04323545  0.03662442 -0.00653205
   0.01806704  0.0423425   0.0098186   0.02856291]
 [-0.00529069 -0.03589702  0.0444488   0.04695028 -0.00875999 -0.01862372
   0.04488533  0.03828743 -0.00747294 -0.0323103 ]
 [ 0.04959127  0.04421295 -0.03254279 -0.02213754  0.0450354   0.04669661
   0.01731205  0.02626509 -0.0252524   0.00833527]
 [ 0.02914019 -0.03311793 -0.01243305  0.02759982  0.04595165 -0.01624659
  -0.00177119  0.02940159 -0.03582285 -0.03575338]
 [ 0.02127484  0.04767188 -0.01002558 -0.01240257 -0.02030493  0.01009977
   0.0098091  -0.04965622  0.04865109 -0.02660616]
 [ 0.02127484  0.04767188 -0.01002558 -0.01240257 -0.02030493  0.01009977
   0.0098091  -0.04965622  0.04865109 -0.02660616]
 [ 0.02127484  0.04767188 -0.01002558 -0.01240257 -0.02030493  0.01009977
   0.0098091  -0.04965622  0.04865109 -0.02660616]]

- Vậy là mỗi từ (mỗi chỉ số trong chuỗi) được biểu diễn bằng một vector 10 chiều; các vị trí đệm (chỉ số 0) dùng chung một vector.

In [43]:
# Embedding of the second sentence. Positions 1-3 hold the same indices as in
# the first sentence (2, 4, 5), so those rows match the first sentence's output.
print(model.predict(padding_sequences)[1])

[[ 0.03578847  0.02556732  0.02176673 -0.03255526 -0.01786099 -0.02463955
   0.00754889  0.00513943  0.04138016 -0.03765459]
 [ 0.00053348 -0.02660242  0.02762609  0.04323545  0.03662442 -0.00653205
   0.01806704  0.0423425   0.0098186   0.02856291]
 [-0.00529069 -0.03589702  0.0444488   0.04695028 -0.00875999 -0.01862372
   0.04488533  0.03828743 -0.00747294 -0.0323103 ]
 [ 0.04959127  0.04421295 -0.03254279 -0.02213754  0.0450354   0.04669661
   0.01731205  0.02626509 -0.0252524   0.00833527]
 [ 0.02211211 -0.04313502 -0.0348926   0.0373216  -0.0260461   0.03860049
   0.03122233  0.00642104 -0.0114817   0.048354  ]
 [ 0.02127484  0.04767188 -0.01002558 -0.01240257 -0.02030493  0.01009977
   0.0098091  -0.04965622  0.04865109 -0.02660616]
 [ 0.02127484  0.04767188 -0.01002558 -0.01240257 -0.02030493  0.01009977
   0.0098091  -0.04965622  0.04865109 -0.02660616]
 [ 0.02127484  0.04767188 -0.01002558 -0.01240257 -0.02030493  0.01009977
   0.0098091  -0.04965622  0.04865109 -0.02660616]]

In [45]:
# Embedding of the third sentence (3 tokens + 5 padding positions). Its first
# row equals the first sentence's first row because both start with index 3.
print(model.predict(padding_sequences)[2])

[[-0.01509737  0.00686573  0.02186376 -0.04065489 -0.03159928 -0.00730455
   0.00557702 -0.04573032 -0.03333386 -0.03662892]
 [ 0.04803871 -0.00506578 -0.02755824 -0.04440121  0.00737508  0.02998397
  -0.01811301 -0.01140507  0.04036574  0.03964809]
 [-0.00750855  0.01826353 -0.0308272  -0.04955001 -0.03889537 -0.03603083
  -0.00674325 -0.00289793  0.03089764 -0.00528123]
 [ 0.02127484  0.04767188 -0.01002558 -0.01240257 -0.02030493  0.01009977
   0.0098091  -0.04965622  0.04865109 -0.02660616]
 [ 0.02127484  0.04767188 -0.01002558 -0.01240257 -0.02030493  0.01009977
   0.0098091  -0.04965622  0.04865109 -0.02660616]
 [ 0.02127484  0.04767188 -0.01002558 -0.01240257 -0.02030493  0.01009977
   0.0098091  -0.04965622  0.04865109 -0.02660616]
 [ 0.02127484  0.04767188 -0.01002558 -0.01240257 -0.02030493  0.01009977
   0.0098091  -0.04965622  0.04865109 -0.02660616]
 [ 0.02127484  0.04767188 -0.01002558 -0.01240257 -0.02030493  0.01009977
   0.0098091  -0.04965622  0.04865109 -0.02660616]]