# **DL project with simple RNN on IMDB dataset**

# Embedding

In [None]:
from tensorflow.keras.preprocessing.text import one_hot
# Toy corpus used to demonstrate word-index encoding and embeddings.
sent = [
    "the glass of milk",
    "the glass of juice",
    "the cup of tea",
    "I am a good boy",
    "I am a good developer",
    "understand the meaning of words",
    "your videos are good",
]

In [None]:
# Vocabulary size: passed to one_hot() and used as the Embedding input_dim.
voc_size = 10_000


In [None]:
# Encode each sentence as a list of integer word indices (one per word).
# Despite the name, one_hot() returns hashed indices rather than one-hot
# vectors: they identify the words but do not describe them.
# NOTE(review): hashing can map different words to the same index - confirm
# this is acceptable for the chosen voc_size.
onehot_repr = [one_hot(sentence, voc_size) for sentence in sent]
onehot_repr

[[4996, 9703, 5222, 3173],
 [4996, 9703, 5222, 605],
 [4996, 1156, 5222, 6671],
 [1285, 2645, 7072, 9864, 4680],
 [1285, 2645, 7072, 9864, 9033],
 [2840, 4996, 2441, 5222, 4026],
 [5365, 3265, 5251, 9864]]

In [None]:
import numpy as np
from tensorflow.keras.layers import Embedding
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential

In [None]:
# pad_sequences ensures that all sentences have the same length (here, 8).

In [None]:
# Bring every encoded sentence to a common length of `sent_length` tokens;
# padding='post' appends the zeros after the word indices.
sent_length = 8
embedded_docs = pad_sequences(
    onehot_repr,
    maxlen=sent_length,
    padding='post',
)
print(embedded_docs)

[[4996 9703 5222 3173    0    0    0    0]
 [4996 9703 5222  605    0    0    0    0]
 [4996 1156 5222 6671    0    0    0    0]
 [1285 2645 7072 9864 4680    0    0    0]
 [1285 2645 7072 9864 9033    0    0    0]
 [2840 4996 2441 5222 4026    0    0    0]
 [5365 3265 5251 9864    0    0    0    0]]


# Feature representation

In [None]:
# Embedding size: every word is represented by a vector of 10 features.
dim = 10

In [None]:
# Minimal model consisting of a single Embedding layer, which maps each
# integer word index (valid range 0 .. voc_size - 1) to a dense vector of
# length `dim`.
# `input_length` is deliberately not passed: the argument is deprecated in
# Keras 3 and only triggers a warning. The input shape is taken from an
# explicit model.build(...) call or from the first batch of data instead.
model = Sequential()
model.add(Embedding(input_dim=voc_size, output_dim=dim))
# The model is not trained in this demo; compile just attaches an optimizer
# and a loss so the model is fully configured.
model.compile(optimizer='adam', loss='mse')



In [None]:
# Fix the input shape (any batch size, sent_length tokens per sentence) so
# the layer weights are created and summary() can report parameter counts.
model.build((None, sent_length))
model.summary()



In [None]:
# Inspect the first padded sentence: its word indices followed by zero padding.
embedded_docs[0]

array([4996, 9703, 5222, 3173,    0,    0,    0,    0], dtype=int32)

In [None]:
# Show the full padded index matrix: one row per sentence, sent_length columns.
embedded_docs

array([[4996, 9703, 5222, 3173,    0,    0,    0,    0],
       [4996, 9703, 5222,  605,    0,    0,    0,    0],
       [4996, 1156, 5222, 6671,    0,    0,    0,    0],
       [1285, 2645, 7072, 9864, 4680,    0,    0,    0],
       [1285, 2645, 7072, 9864, 9033,    0,    0,    0],
       [2840, 4996, 2441, 5222, 4026,    0,    0,    0],
       [5365, 3265, 5251, 9864,    0,    0,    0,    0]], dtype=int32)

In [None]:
# Look up the embedding vector of every word index in every padded sentence.
# The result holds one 8x10 matrix per sentence (7 sentences in total):
# 8 token positions, each described by a 10-feature word embedding.
model.predict(embedded_docs)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step


array([[[-0.04709667, -0.02995545,  0.04402837,  0.04710596,
          0.0066764 ,  0.01808305, -0.0111316 ,  0.04791249,
          0.02274882, -0.00613145],
        [-0.01497085,  0.02657837, -0.03164606,  0.01426245,
         -0.03127106, -0.03977277,  0.04753743,  0.04756797,
          0.02091589,  0.00577608],
        [ 0.03606024,  0.01854564,  0.01493331, -0.04309701,
          0.03366515, -0.02038604,  0.01246505, -0.04189955,
          0.0413645 , -0.04142636],
        [ 0.04146774,  0.01293756,  0.04853961, -0.01298034,
         -0.00409126,  0.04132279,  0.03374065,  0.04218635,
         -0.03644489,  0.00980975],
        [ 0.00586449,  0.02019073,  0.04617555, -0.01507942,
          0.03215369,  0.04611954, -0.00047123, -0.01664779,
          0.04870106, -0.01587888],
        [ 0.00586449,  0.02019073,  0.04617555, -0.01507942,
          0.03215369,  0.04611954, -0.00047123, -0.01664779,
          0.04870106, -0.01587888],
        [ 0.00586449,  0.02019073,  0.04617555, -0.0

In [None]:
# Embed only the first sentence. Slicing with [:1] keeps the leading batch
# axis, so the model receives a batch containing a single sentence.
model.predict(embedded_docs[:1])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step


array([[[-0.04709667, -0.02995545,  0.04402837,  0.04710596,
          0.0066764 ,  0.01808305, -0.0111316 ,  0.04791249,
          0.02274882, -0.00613145],
        [-0.01497085,  0.02657837, -0.03164606,  0.01426245,
         -0.03127106, -0.03977277,  0.04753743,  0.04756797,
          0.02091589,  0.00577608],
        [ 0.03606024,  0.01854564,  0.01493331, -0.04309701,
          0.03366515, -0.02038604,  0.01246505, -0.04189955,
          0.0413645 , -0.04142636],
        [ 0.04146774,  0.01293756,  0.04853961, -0.01298034,
         -0.00409126,  0.04132279,  0.03374065,  0.04218635,
         -0.03644489,  0.00980975],
        [ 0.00586449,  0.02019073,  0.04617555, -0.01507942,
          0.03215369,  0.04611954, -0.00047123, -0.01664779,
          0.04870106, -0.01587888],
        [ 0.00586449,  0.02019073,  0.04617555, -0.01507942,
          0.03215369,  0.04611954, -0.00047123, -0.01664779,
          0.04870106, -0.01587888],
        [ 0.00586449,  0.02019073,  0.04617555, -0.0

In [None]:
# Embed only the second sentence. Slicing with [1:2] keeps the leading batch
# axis, so the model receives a batch containing a single sentence.
model.predict(embedded_docs[1:2])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step


array([[[-0.04709667, -0.02995545,  0.04402837,  0.04710596,
          0.0066764 ,  0.01808305, -0.0111316 ,  0.04791249,
          0.02274882, -0.00613145],
        [-0.01497085,  0.02657837, -0.03164606,  0.01426245,
         -0.03127106, -0.03977277,  0.04753743,  0.04756797,
          0.02091589,  0.00577608],
        [ 0.03606024,  0.01854564,  0.01493331, -0.04309701,
          0.03366515, -0.02038604,  0.01246505, -0.04189955,
          0.0413645 , -0.04142636],
        [-0.03668784,  0.04283092, -0.03460098,  0.04984171,
          0.02685782,  0.01293851, -0.04556641, -0.01656036,
         -0.04841533, -0.02076366],
        [ 0.00586449,  0.02019073,  0.04617555, -0.01507942,
          0.03215369,  0.04611954, -0.00047123, -0.01664779,
          0.04870106, -0.01587888],
        [ 0.00586449,  0.02019073,  0.04617555, -0.01507942,
          0.03215369,  0.04611954, -0.00047123, -0.01664779,
          0.04870106, -0.01587888],
        [ 0.00586449,  0.02019073,  0.04617555, -0.0