In [3]:
### Libraries used Tensorflow>2.0 and keras
!pip install tensorflow



In [2]:
import tensorflow as tf
print(tf.__version__)

2.15.0


In [3]:
### tensorflow >2.0
from tensorflow.keras.preprocessing.text import one_hot

In [4]:
sent = ['The glass of milk',
        'The glass of juice',
        'The cup of tea',
        'I am a good boy',
        'I am a good developer',
        'Understanding the meaning of words',
        'your videos are good']

In [5]:
sent

['The glass of milk',
 'The glass of juice',
 'The cup of tea',
 'I am a good boy',
 'I am a good developer',
 'Understanding the meaning of words',
 'your videos are good']

In [6]:
### Vocabulary size
voc_size = 10000

In [8]:
## one hot representation
onehot_repr = [one_hot(words,voc_size) for words in sent]
print(onehot_repr)

[[6026, 3678, 2556, 5068], [6026, 3678, 2556, 8945], [6026, 8665, 2556, 3046], [3950, 5050, 8635, 4390, 7993], [3950, 5050, 8635, 4390, 8559], [4759, 6026, 3730, 2556, 1450], [5062, 4193, 385, 4390]]


In [9]:
## Word Embedding Representation

from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [10]:
import numpy as np

In [11]:
# PRE PADDING
sent_length = 8
embedded_docs = pad_sequences(onehot_repr, padding='pre',maxlen=sent_length)
print(embedded_docs)

[[   0    0    0    0 6026 3678 2556 5068]
 [   0    0    0    0 6026 3678 2556 8945]
 [   0    0    0    0 6026 8665 2556 3046]
 [   0    0    0 3950 5050 8635 4390 7993]
 [   0    0    0 3950 5050 8635 4390 8559]
 [   0    0    0 4759 6026 3730 2556 1450]
 [   0    0    0    0 5062 4193  385 4390]]


In [27]:
# POST PADDING

# sent_length = 8
# embedded_docs = pad_sequences(onehot_repr, padding='post',maxlen=sent_length)
# print(embedded_docs)

[[6815 8342 4474 5037    0    0    0    0]
 [6815 8342 4474 5476    0    0    0    0]
 [6815 9500 4474 2970    0    0    0    0]
 [4503 5515 3337 7243 1624    0    0    0]
 [4503 5515 3337 7243 9423    0    0    0]
 [9120 6815 3273 4474 9941    0    0    0]
 [2116 1518 3695 7243    0    0    0    0]]


In [12]:
## if input size is not fixed we make sure that we need fix the same size as input size using prepadding or post padding

In [13]:
# feature dimention
dim=10

In [14]:
# we need to convert each and every value into vector
# this embedding layer will work similar like word2vec only

In [15]:
model=Sequential()
model.add(Embedding(voc_size,10,input_length=sent_length))
model.compile('adam','mse')

In [16]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 8, 10)             100000    
                                                                 
Total params: 100000 (390.62 KB)
Trainable params: 100000 (390.62 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [17]:
### 'The glass of milk' -- first sentence
embedded_docs[0]

array([   0,    0,    0,    0, 6026, 3678, 2556, 5068], dtype=int32)

In [18]:
print(model.predict(embedded_docs[0]))

[[ 0.01858428  0.02680499  0.01744356  0.01456142  0.04174199  0.0297544
  -0.03461748 -0.01493578  0.03189256  0.02773218]
 [ 0.01858428  0.02680499  0.01744356  0.01456142  0.04174199  0.0297544
  -0.03461748 -0.01493578  0.03189256  0.02773218]
 [ 0.01858428  0.02680499  0.01744356  0.01456142  0.04174199  0.0297544
  -0.03461748 -0.01493578  0.03189256  0.02773218]
 [ 0.01858428  0.02680499  0.01744356  0.01456142  0.04174199  0.0297544
  -0.03461748 -0.01493578  0.03189256  0.02773218]
 [ 0.01341571  0.04078356  0.00408313 -0.0032902  -0.03525608 -0.03551003
  -0.04975706 -0.04014855  0.01774174 -0.04581623]
 [-0.03134952  0.02977896 -0.03021019  0.01709202  0.01496582 -0.03488215
   0.03752308  0.00755079  0.0080502   0.04960139]
 [ 0.02465906 -0.04004981  0.01443166 -0.02130209  0.0154174   0.01511149
  -0.03810322  0.00458755 -0.02016917  0.00652839]
 [ 0.02853651  0.0420163  -0.00260217  0.01168463  0.00248306 -0.02899214
  -0.04324913  0.01823684 -0.00050105 -0.00713273]]


In [19]:
print(model.predict(embedded_docs))

[[[ 1.8584277e-02  2.6804987e-02  1.7443564e-02  1.4561418e-02
    4.1741993e-02  2.9754404e-02 -3.4617484e-02 -1.4935780e-02
    3.1892564e-02  2.7732182e-02]
  [ 1.8584277e-02  2.6804987e-02  1.7443564e-02  1.4561418e-02
    4.1741993e-02  2.9754404e-02 -3.4617484e-02 -1.4935780e-02
    3.1892564e-02  2.7732182e-02]
  [ 1.8584277e-02  2.6804987e-02  1.7443564e-02  1.4561418e-02
    4.1741993e-02  2.9754404e-02 -3.4617484e-02 -1.4935780e-02
    3.1892564e-02  2.7732182e-02]
  [ 1.8584277e-02  2.6804987e-02  1.7443564e-02  1.4561418e-02
    4.1741993e-02  2.9754404e-02 -3.4617484e-02 -1.4935780e-02
    3.1892564e-02  2.7732182e-02]
  [ 1.3415705e-02  4.0783558e-02  4.0831342e-03 -3.2901987e-03
   -3.5256077e-02 -3.5510026e-02 -4.9757063e-02 -4.0148545e-02
    1.7741743e-02 -4.5816232e-02]
  [-3.1349517e-02  2.9778961e-02 -3.0210186e-02  1.7092023e-02
    1.4965821e-02 -3.4882151e-02  3.7523080e-02  7.5507872e-03
    8.0502033e-03  4.9601387e-02]
  [ 2.4659064e-02 -4.0049814e-02  1.4431

In [20]:
### Assignment

sent_new = ["The world is a better place",
            "Marvel series is my favourite movie",
            "I like DC movies",
            "the cat is eating the food",
            "Tom and jerry is my favourite movie",
            "Python is my favourite programming language"]

In [21]:
sent_new

['The world is a better place',
 'Marvel series is my favourite movie',
 'I like DC movies',
 'the cat is eating the food',
 'Tom and jerry is my favourite movie',
 'Python is my favourite programming language']

In [22]:
voc_size = 50000

In [23]:
## one hot representation
onehot_repr_new = [one_hot(words,voc_size) for words in sent_new]
print(onehot_repr_new)

[[38599, 9753, 49882, 40449, 4590, 40639], [28329, 7267, 49882, 31436, 34222, 41570], [13760, 16719, 4746, 31267], [38599, 892, 49882, 26166, 38599, 38876], [600, 16429, 22915, 49882, 31436, 34222, 41570], [37111, 49882, 31436, 34222, 26084, 19487]]


In [24]:
# POST PADDING
sent_length = 10
embedded_docs = pad_sequences(onehot_repr, padding='post',maxlen=sent_length)
print(embedded_docs)

[[6026 3678 2556 5068    0    0    0    0    0    0]
 [6026 3678 2556 8945    0    0    0    0    0    0]
 [6026 8665 2556 3046    0    0    0    0    0    0]
 [3950 5050 8635 4390 7993    0    0    0    0    0]
 [3950 5050 8635 4390 8559    0    0    0    0    0]
 [4759 6026 3730 2556 1450    0    0    0    0    0]
 [5062 4193  385 4390    0    0    0    0    0    0]]


In [25]:
dim=20

In [26]:
model=Sequential()
model.add(Embedding(voc_size,20,input_length=sent_length))
model.compile('adam','mse')

In [27]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 10, 20)            1000000   
                                                                 
Total params: 1000000 (3.81 MB)
Trainable params: 1000000 (3.81 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [28]:
print(model.predict(embedded_docs))

[[[-0.02872952  0.03988805  0.04573771 ... -0.04198503 -0.0143496
    0.00879817]
  [-0.01666056 -0.0043491   0.00207754 ... -0.01819021 -0.03344234
    0.0145745 ]
  [ 0.04652074 -0.03324814 -0.03861304 ... -0.03522118 -0.00549439
    0.01036888]
  ...
  [ 0.02973226  0.01869703  0.01708371 ...  0.03882045  0.00567507
    0.03321124]
  [ 0.02973226  0.01869703  0.01708371 ...  0.03882045  0.00567507
    0.03321124]
  [ 0.02973226  0.01869703  0.01708371 ...  0.03882045  0.00567507
    0.03321124]]

 [[-0.02872952  0.03988805  0.04573771 ... -0.04198503 -0.0143496
    0.00879817]
  [-0.01666056 -0.0043491   0.00207754 ... -0.01819021 -0.03344234
    0.0145745 ]
  [ 0.04652074 -0.03324814 -0.03861304 ... -0.03522118 -0.00549439
    0.01036888]
  ...
  [ 0.02973226  0.01869703  0.01708371 ...  0.03882045  0.00567507
    0.03321124]
  [ 0.02973226  0.01869703  0.01708371 ...  0.03882045  0.00567507
    0.03321124]
  [ 0.02973226  0.01869703  0.01708371 ...  0.03882045  0.00567507
    0.03