# Integer Encoding of Text

In [1]:
import numpy as np

In [2]:
# Random text taken as part of docs list

docs = ['recurrent neural network',
		'neural network',
		'artificial neural',
		'connections between nodes',
		'can create a cycle',
		'allowing output',
		'some nodes to affect subsequent',
		'exhibit temporal',
		'dynamic behavior',
		'type of Neural Network',
    'affect subsequent']

In [3]:
# Tokenization is done to assign integer value to the words.
# We initialize Tokenizer with oov_token with default "<nothing>" .
# oov is "out of vocabulary" to ensure any new text which is not part of the existing text will be assigned "nothing"

from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(oov_token='<nothing>')

In [4]:
tokenizer.fit_on_texts(docs)

In [5]:
# It display all the integer value assigned by the tokenizer to the words

tokenizer.word_index

{'<nothing>': 1,
 'neural': 2,
 'network': 3,
 'nodes': 4,
 'affect': 5,
 'subsequent': 6,
 'recurrent': 7,
 'artificial': 8,
 'connections': 9,
 'between': 10,
 'can': 11,
 'create': 12,
 'a': 13,
 'cycle': 14,
 'allowing': 15,
 'output': 16,
 'some': 17,
 'to': 18,
 'exhibit': 19,
 'temporal': 20,
 'dynamic': 21,
 'behavior': 22,
 'type': 23,
 'of': 24}

In [6]:
# It display the count of unique word in the docs

tokenizer.word_counts

OrderedDict([('recurrent', 1),
             ('neural', 4),
             ('network', 3),
             ('artificial', 1),
             ('connections', 1),
             ('between', 1),
             ('nodes', 2),
             ('can', 1),
             ('create', 1),
             ('a', 1),
             ('cycle', 1),
             ('allowing', 1),
             ('output', 1),
             ('some', 1),
             ('to', 1),
             ('affect', 2),
             ('subsequent', 2),
             ('exhibit', 1),
             ('temporal', 1),
             ('dynamic', 1),
             ('behavior', 1),
             ('type', 1),
             ('of', 1)])

In [7]:
# it display the count of how many sentences we have in the docs

tokenizer.document_count

11

In [8]:
# It will display what is the token integer value of each words in the sentence

sequences = tokenizer.texts_to_sequences(docs)
sequences

[[7, 2, 3],
 [2, 3],
 [8, 2],
 [9, 10, 4],
 [11, 12, 13, 14],
 [15, 16],
 [17, 4, 18, 5, 6],
 [19, 20],
 [21, 22],
 [23, 24, 2, 3],
 [5, 6]]

Explanation of the above

**Sentence 1**  "recurrent neural network" </br>
**Integer Tokens of each word  **  recurrent = 7, neural = 2 , network = 3 </br>
Hence sequence of sentence 1 becomes [[7,2,3]]


# Padding

In [9]:
from keras.utils import pad_sequences

In [10]:
# Since in each sentence, count of words are not same hence we will use padding to have same number of word length in each sentences

sequences = pad_sequences(sequences,padding='post')

In [11]:
# sequence post padding

sequences

array([[ 7,  2,  3,  0,  0],
       [ 2,  3,  0,  0,  0],
       [ 8,  2,  0,  0,  0],
       [ 9, 10,  4,  0,  0],
       [11, 12, 13, 14,  0],
       [15, 16,  0,  0,  0],
       [17,  4, 18,  5,  6],
       [19, 20,  0,  0,  0],
       [21, 22,  0,  0,  0],
       [23, 24,  2,  3,  0],
       [ 5,  6,  0,  0,  0]], dtype=int32)

## Sentiment Analysis of IMDB review Dataset

In [12]:
from keras.datasets import imdb
from keras import Sequential
from keras.layers import Dense,SimpleRNN,Embedding,Flatten

In [13]:
# Loading imdb dataset
(X_train,y_train),(X_test,y_test) = imdb.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [14]:
y_test

array([0, 1, 1, ..., 0, 0, 0])

In [15]:
# checking length of X_train dataset

print(len(X_train[2]))
print(len(X_train[3]))

141
550


We can see that length of X_train Dataset for each rows are not same

In [16]:
# X_train[2] before padding
X_train[2]

[1,
 14,
 47,
 8,
 30,
 31,
 7,
 4,
 249,
 108,
 7,
 4,
 5974,
 54,
 61,
 369,
 13,
 71,
 149,
 14,
 22,
 112,
 4,
 2401,
 311,
 12,
 16,
 3711,
 33,
 75,
 43,
 1829,
 296,
 4,
 86,
 320,
 35,
 534,
 19,
 263,
 4821,
 1301,
 4,
 1873,
 33,
 89,
 78,
 12,
 66,
 16,
 4,
 360,
 7,
 4,
 58,
 316,
 334,
 11,
 4,
 1716,
 43,
 645,
 662,
 8,
 257,
 85,
 1200,
 42,
 1228,
 2578,
 83,
 68,
 3912,
 15,
 36,
 165,
 1539,
 278,
 36,
 69,
 44076,
 780,
 8,
 106,
 14,
 6905,
 1338,
 18,
 6,
 22,
 12,
 215,
 28,
 610,
 40,
 6,
 87,
 326,
 23,
 2300,
 21,
 23,
 22,
 12,
 272,
 40,
 57,
 31,
 11,
 4,
 22,
 47,
 6,
 2307,
 51,
 9,
 170,
 23,
 595,
 116,
 595,
 1352,
 13,
 191,
 79,
 638,
 89,
 51428,
 14,
 9,
 8,
 106,
 607,
 624,
 35,
 534,
 6,
 227,
 7,
 129,
 113]

In [17]:
# X_train[3] before padding
X_train[3]

[1,
 4,
 18609,
 16085,
 33,
 2804,
 4,
 2040,
 432,
 111,
 153,
 103,
 4,
 1494,
 13,
 70,
 131,
 67,
 11,
 61,
 15305,
 744,
 35,
 3715,
 761,
 61,
 5766,
 452,
 9214,
 4,
 985,
 7,
 64317,
 59,
 166,
 4,
 105,
 216,
 1239,
 41,
 1797,
 9,
 15,
 7,
 35,
 744,
 2413,
 31,
 8,
 4,
 687,
 23,
 4,
 33929,
 7339,
 6,
 3693,
 42,
 38,
 39,
 121,
 59,
 456,
 10,
 10,
 7,
 265,
 12,
 575,
 111,
 153,
 159,
 59,
 16,
 1447,
 21,
 25,
 586,
 482,
 39,
 4,
 96,
 59,
 716,
 12,
 4,
 172,
 65,
 9,
 579,
 11,
 6004,
 4,
 1615,
 5,
 23005,
 7,
 5168,
 17,
 13,
 7064,
 12,
 19,
 6,
 464,
 31,
 314,
 11,
 87564,
 6,
 719,
 605,
 11,
 8,
 202,
 27,
 310,
 4,
 3772,
 3501,
 8,
 2722,
 58,
 10,
 10,
 537,
 2116,
 180,
 40,
 14,
 413,
 173,
 7,
 263,
 112,
 37,
 152,
 377,
 4,
 537,
 263,
 846,
 579,
 178,
 54,
 75,
 71,
 476,
 36,
 413,
 263,
 2504,
 182,
 5,
 17,
 75,
 2306,
 922,
 36,
 279,
 131,
 2895,
 17,
 2867,
 42,
 17,
 35,
 921,
 18435,
 192,
 5,
 1219,
 3890,
 19,
 20523,
 217,
 4122,
 1710,
 

As can be seen length are not the same. We will need to use padding to maintain same length of each row

In [18]:
# adding padding to have length of maximum 50 for each row
X_train = pad_sequences(X_train,padding='post',maxlen=50)
X_test = pad_sequences(X_test,padding='post',maxlen=50)

In [19]:
# X_train[2] post padding
X_train[2]

array([  215,    28,   610,    40,     6,    87,   326,    23,  2300,
          21,    23,    22,    12,   272,    40,    57,    31,    11,
           4,    22,    47,     6,  2307,    51,     9,   170,    23,
         595,   116,   595,  1352,    13,   191,    79,   638,    89,
       51428,    14,     9,     8,   106,   607,   624,    35,   534,
           6,   227,     7,   129,   113], dtype=int32)

In [20]:
# X_train[3] post padding
X_train[3]

array([  132,     8,    67,     6,    22,    15,     9,   283,     8,
        5168,    14,    31,     9,   242,   955,    48,    25,   279,
       22148,    23,    12,  1685,   195,    25,   238,    60,   796,
       13713,     4,   671,     7,  2804,     5,     4,   559,   154,
         888,     7,   726,    50,    26,    49,  7008,    15,   566,
          30,   579,    21,    64,  2574], dtype=int32)

In [21]:
print(len(X_train[2]))
print(len(X_train[3]))

50
50


Both X_train[2] and X_train[3] has same length post padding

# Creating Simple RRN model

In [24]:

model = Sequential()


# SimpleRNN is used for processing sequential data, such as time series data or sequences of text.
# number of hidden units(neruon) = 32
# input_shape =(50,1), sequence of length 50 with each step having a single feature
# With return_sequences = False it follows many-to-one architecture
    # If return_sequences = True it will return sequences as output
    # If return_sequences = Falsethe layer will return only the output of the final time step

model.add(SimpleRNN(32,input_shape=(50,1),return_sequences=False))
# 1 Dense layer as we only have classification problem with activation = Sigmoid
model.add(Dense(1,activation='sigmoid'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_1 (SimpleRNN)    (None, 32)                1088      
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 1121 (4.38 KB)
Trainable params: 1121 (4.38 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [23]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

model.fit(X_train,y_train,epochs=5,validation_data=(X_test,y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7b936d05eaa0>

# Encodings with keras Embeddings

In [25]:
docs1 = ['recurrent neural network',
		'neural network',
		'artificial neural',
		'connections between nodes',
		'can create a cycle',
		'allowing output',
		'some nodes to affect subsequent',
		'exhibit temporal',
		'dynamic behavior',
		'type of Neural Network',
    'affect subsequent']

In [26]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()

In [27]:
tokenizer.fit_on_texts(docs1)

In [28]:
# It display all the interger value assigned by the tokenizer to the words
tokenizer.word_index

{'neural': 1,
 'network': 2,
 'nodes': 3,
 'affect': 4,
 'subsequent': 5,
 'recurrent': 6,
 'artificial': 7,
 'connections': 8,
 'between': 9,
 'can': 10,
 'create': 11,
 'a': 12,
 'cycle': 13,
 'allowing': 14,
 'output': 15,
 'some': 16,
 'to': 17,
 'exhibit': 18,
 'temporal': 19,
 'dynamic': 20,
 'behavior': 21,
 'type': 22,
 'of': 23}

In [29]:
# it gives the count of all unique words we have int the documents

len(tokenizer.word_index)

23

In [30]:
# generating the sequence of docs1

sequences = tokenizer.texts_to_sequences(docs1)
sequences

[[6, 1, 2],
 [1, 2],
 [7, 1],
 [8, 9, 3],
 [10, 11, 12, 13],
 [14, 15],
 [16, 3, 17, 4, 5],
 [18, 19],
 [20, 21],
 [22, 23, 1, 2],
 [4, 5]]

In [31]:
# adding padding to the sequences

from keras.utils import pad_sequences
sequences = pad_sequences(sequences,padding='post')
sequences

array([[ 6,  1,  2,  0,  0],
       [ 1,  2,  0,  0,  0],
       [ 7,  1,  0,  0,  0],
       [ 8,  9,  3,  0,  0],
       [10, 11, 12, 13,  0],
       [14, 15,  0,  0,  0],
       [16,  3, 17,  4,  5],
       [18, 19,  0,  0,  0],
       [20, 21,  0,  0,  0],
       [22, 23,  1,  2,  0],
       [ 4,  5,  0,  0,  0]], dtype=int32)


# Embeddings

The purpose of an embedding layer is typically to learn meaningful representations of categorical data, such as word embeddings in natural language processing tasks or item embeddings in recommendation systems.

In [35]:
model = Sequential()

# 23 is total unique Vocabulary length, which means it expects input values ranging from 0 to 22. The input values will be used as indices to map to the corresponding embeddings.
# Embedding layer will convert integer values (ranging from 0 to 22) into dense vectors of size 2.
# The input sequences should have a length of 5.

model.add(Embedding(23,output_dim=2,input_length=5))

model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 5, 2)              46        
                                                                 
Total params: 46 (184.00 Byte)
Trainable params: 46 (184.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [33]:
model.compile('adam','accuracy')

In [37]:
pred = model.predict(sequences)
print(pred)

InvalidArgumentError: ignored

In [38]:
from keras.datasets import imdb
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras import Sequential
from keras.layers import Dense,SimpleRNN,Embedding,Flatten

In [39]:
(X_train,y_train),(X_test,y_test) = imdb.load_data()

In [40]:
X_train = pad_sequences(X_train,padding='post',maxlen=50)
X_test = pad_sequences(X_test,padding='post',maxlen=50)

In [41]:
X_train.shape


(25000, 50)

In [48]:
X_train[2]

array([  215,    28,   610,    40,     6,    87,   326,    23,  2300,
          21,    23,    22,    12,   272,    40,    57,    31,    11,
           4,    22,    47,     6,  2307,    51,     9,   170,    23,
         595,   116,   595,  1352,    13,   191,    79,   638,    89,
       51428,    14,     9,     8,   106,   607,   624,    35,   534,
           6,   227,     7,   129,   113], dtype=int32)

In [42]:
model = Sequential()
model.add(Embedding(10000,output_dim=2,input_length=50))
model.add(SimpleRNN(32,return_sequences=False))
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 50, 2)             20000     
                                                                 
 simple_rnn_2 (SimpleRNN)    (None, 32)                1120      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 21153 (82.63 KB)
Trainable params: 21153 (82.63 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [49]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])


In [50]:
history = model.fit(X_train, y_train,epochs=1,validation_data=(X_test,y_test))

InvalidArgumentError: ignored

# Predictions

In [45]:
X_test[0][0:50].reshape(1,-1).shape

(1, 50)

In [46]:
test_data = X_test[0][0:50].reshape(1,-1)

In [47]:
model.predict(test_data)

InvalidArgumentError: ignored