<a href="https://colab.research.google.com/github/souradipta93/NLP/blob/main/3_embedding_layer_keras_toy_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text classification with a Keras embedding layer

In [None]:
# Uncomment the next line to install TensorFlow if it is missing from this environment.
#!pip install tensorflow --user

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
import numpy as np

from numpy import array
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.text import hashing_trick
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Embedding, Dense



In [None]:
# Ten toy movie reviews: the first five are negative, the last five positive.
negative_reviews = [
    'Never coming back!',
    'unflinching scenes of violence',
    'slower than a soap opera',
    'complete waste of time',
    'Bad mistake',
]
positive_reviews = [
    'A wonderful little production',
    'dialogue is witty and the characters are likable!',
    'Rocks!',
    'artistic work',
    'Could not have done better',
]
reviews = negative_reviews + positive_reviews

# Target labels (Y), aligned with `reviews`: 1 = negative, 0 = positive.
labels = array([1] * len(negative_reviews) + [0] * len(positive_reviews))

In [None]:
# Sanity check: number of reviews in the toy dataset.
len(reviews)

10

In [None]:
# Largest number of space-separated words in any review; used later as the padding length.
max_words = max((len(text.split(" ")) for text in reviews), default=0)
max_words

8

## Get vocabulary and word indices

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer is used in the following cells to find the distinct words in the
# reviews, which gives the vocabulary size.
cv = CountVectorizer()

In [None]:
# Fit the vectorizer on the reviews; the resulting matrix has one row per review
# and one column per distinct word found.
test_cv = cv.fit_transform(reviews)
test_cv.shape

(10, 35)

In [None]:
# Number of columns = number of distinct words in the fitted vocabulary.
test_cv.shape[1]

35

In [None]:
# Vocabulary size, reused below as the hashing range and as the embedding layer's input_dim.
VOCAB_SIZE = test_cv.shape[1]
VOCAB_SIZE

35

In [None]:
# Word -> column-index mapping learned by CountVectorizer (shown for inspection only).
cv.vocabulary_

{'never': 17,
 'coming': 7,
 'back': 3,
 'unflinching': 29,
 'scenes': 23,
 'of': 19,
 'violence': 30,
 'slower': 24,
 'than': 26,
 'soap': 25,
 'opera': 20,
 'complete': 8,
 'waste': 31,
 'time': 28,
 'bad': 4,
 'mistake': 16,
 'wonderful': 33,
 'little': 15,
 'production': 21,
 'dialogue': 10,
 'is': 13,
 'witty': 32,
 'and': 0,
 'the': 27,
 'characters': 6,
 'are': 1,
 'likable': 14,
 'rocks': 22,
 'artistic': 2,
 'work': 34,
 'could': 9,
 'not': 18,
 'have': 12,
 'done': 11,
 'better': 5}

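**Keras hashes each word to an integer index between 1 and VOCAB_SIZE - 1 and replaces the word with that index. Because this is a hash, different words can end up with the same index, and these indices are unrelated to `cv.vocabulary_` above.**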

In [None]:
# Encode every review as a list of integer word indices using the Keras hashing trick.
# Each word is hashed into the range [1, VOCAB_SIZE - 1]; index 0 is never produced,
# so it stays free for padding.
#
# `one_hot` performs the same hashing with Python's built-in `hash`, which is salted
# per process for strings, so its indices change on every kernel restart. The 'md5'
# hash is stable, which keeps the encoding reproducible under Restart & Run All.
#
# NOTE: this is hashing, not a true one-hot encoding. With a range this small,
# distinct words can collide on the same index.
encoded_reviews = [
    hashing_trick(review, VOCAB_SIZE, hash_function='md5') for review in reviews
]
encoded_reviews

[[18, 6, 33],
 [31, 1, 13, 33],
 [7, 30, 30, 18, 13],
 [4, 34, 13, 21],
 [16, 6],
 [30, 23, 32, 12],
 [13, 23, 1, 13, 10, 23, 18, 30],
 [24],
 [10, 21],
 [21, 20, 7, 21, 29]]

## Padding reviews
- lengths of sentences are different
- pad these reviews to max words

In [None]:
# Right-pad each encoded review with zeros so that every row has max_words entries.
padded_reviews = pad_sequences(
    encoded_reviews,
    padding='post',
    maxlen=max_words,
)
padded_reviews

array([[18,  6, 33,  0,  0,  0,  0,  0],
       [31,  1, 13, 33,  0,  0,  0,  0],
       [ 7, 30, 30, 18, 13,  0,  0,  0],
       [ 4, 34, 13, 21,  0,  0,  0,  0],
       [16,  6,  0,  0,  0,  0,  0,  0],
       [30, 23, 32, 12,  0,  0,  0,  0],
       [13, 23,  1, 13, 10, 23, 18, 30],
       [24,  0,  0,  0,  0,  0,  0,  0],
       [10, 21,  0,  0,  0,  0,  0,  0],
       [21, 20,  7, 21, 29,  0,  0,  0]])

**each review is padded by appending zeros at the end -  padding=post setting**

### neural network to learn to classify these reviews

- input_dim = How large is the vocabulary? 
- output_dim = How many numbers in the vector that you wish to return.
- input_length = How many items are in the input feature vector that you need to transform?

## Creating the embedding layer

In [None]:
# Model made of a single embedding layer, so its output is the embedding vectors themselves.
model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=5, input_length=max_words),
])
model.compile(optimizer='adam', loss='mse')

In [None]:
# Show the layer, its output shape and its parameter count.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 8, 5)              175       
Total params: 175
Trainable params: 175
Non-trainable params: 0
_________________________________________________________________


In [None]:
# View the word embeddings: for each review, 8 padded positions * 5 dimensions per word.
# The model has not been fitted at this point, so these are the layer's initial weights.
print(model.predict(padded_reviews))

[[[ 0.02628726  0.02608377  0.01073365  0.02550543  0.00356968]
  [-0.01923581 -0.0143186   0.01217812  0.04515283  0.01381182]
  [ 0.03764497 -0.04185117  0.00226804  0.00752691 -0.04151941]
  [ 0.04958353 -0.00881586 -0.02619383  0.01204515 -0.02920728]
  [ 0.04958353 -0.00881586 -0.02619383  0.01204515 -0.02920728]
  [ 0.04958353 -0.00881586 -0.02619383  0.01204515 -0.02920728]
  [ 0.04958353 -0.00881586 -0.02619383  0.01204515 -0.02920728]
  [ 0.04958353 -0.00881586 -0.02619383  0.01204515 -0.02920728]]

 [[ 0.0437868  -0.00890525  0.04524394  0.04425904 -0.04062217]
  [ 0.0179368   0.0137301  -0.02281821  0.0290441   0.02564264]
  [ 0.03283925  0.02162002  0.02333589 -0.0476038  -0.03974309]
  [ 0.03764497 -0.04185117  0.00226804  0.00752691 -0.04151941]
  [ 0.04958353 -0.00881586 -0.02619383  0.01204515 -0.02920728]
  [ 0.04958353 -0.00881586 -0.02619383  0.01204515 -0.02920728]
  [ 0.04958353 -0.00881586 -0.02619383  0.01204515 -0.02920728]
  [ 0.04958353 -0.00881586 -0.02619383

## Use embedding layer to build a classification model

In [None]:
# Binary classifier: embedding -> flatten -> single sigmoid output.
model = Sequential()
# Keep a handle on the embedding layer so its learned weights can be inspected later.
embedding_layer = Embedding(VOCAB_SIZE, 5, input_length=max_words)
model.add(embedding_layer)
# Flatten the (max_words, 5) embedding output into one vector per review.
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in print() appended a stray "None" to the output.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 8, 5)              175       
_________________________________________________________________
flatten_1 (Flatten)          (None, 40)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 41        
Total params: 216
Trainable params: 216
Non-trainable params: 0
_________________________________________________________________
None


### Explaining the output

- 8 is `max_words`, the length every review is padded to
- 5 is the length of the vector used to represent each word
- 5 * 35 (VOCAB_SIZE) is 175 - number of parameters in the embedding layer
- Output size from embedding layer is 8*5 = 40
- single output neuron is connected to the flattened embedding output by 41 parameters
- 40 weights (one per flattened embedding value) and a single bias

In [None]:
# Train the classifier on the padded reviews for 30 epochs, logging one line per epoch.
model.fit(x=padded_reviews, y=labels, epochs=30, verbose=2)

Epoch 1/30
1/1 - 0s - loss: 0.6921 - acc: 0.5000
Epoch 2/30
1/1 - 0s - loss: 0.6906 - acc: 0.5000
Epoch 3/30
1/1 - 0s - loss: 0.6891 - acc: 0.6000
Epoch 4/30
1/1 - 0s - loss: 0.6876 - acc: 0.7000
Epoch 5/30
1/1 - 0s - loss: 0.6861 - acc: 0.8000
Epoch 6/30
1/1 - 0s - loss: 0.6846 - acc: 0.8000
Epoch 7/30
1/1 - 0s - loss: 0.6830 - acc: 1.0000
Epoch 8/30
1/1 - 0s - loss: 0.6815 - acc: 1.0000
Epoch 9/30
1/1 - 0s - loss: 0.6800 - acc: 1.0000
Epoch 10/30
1/1 - 0s - loss: 0.6784 - acc: 1.0000
Epoch 11/30
1/1 - 0s - loss: 0.6769 - acc: 1.0000
Epoch 12/30
1/1 - 0s - loss: 0.6753 - acc: 1.0000
Epoch 13/30
1/1 - 0s - loss: 0.6738 - acc: 1.0000
Epoch 14/30
1/1 - 0s - loss: 0.6722 - acc: 1.0000
Epoch 15/30
1/1 - 0s - loss: 0.6707 - acc: 1.0000
Epoch 16/30
1/1 - 0s - loss: 0.6691 - acc: 1.0000
Epoch 17/30
1/1 - 0s - loss: 0.6675 - acc: 1.0000
Epoch 18/30
1/1 - 0s - loss: 0.6659 - acc: 1.0000
Epoch 19/30
1/1 - 0s - loss: 0.6643 - acc: 1.0000
Epoch 20/30
1/1 - 0s - loss: 0.6627 - acc: 1.0000
Epoch 21/

<tensorflow.python.keras.callbacks.History at 0x185982c28b0>

In [None]:
# Learned weights of the embedding layer: one row per vocabulary index.
embedding_weights = embedding_layer.get_weights()
print(embedding_weights[0].shape)
print(embedding_weights)

(35, 5)
[array([[ 0.00896169,  0.05281427,  0.07611971, -0.00220037, -0.04760801],
       [-0.01334171, -0.02854192, -0.01416   ,  0.07467426, -0.01148151],
       [-0.00307142, -0.04682486,  0.03691106, -0.04811405,  0.02341738],
       [ 0.00628116,  0.03010892, -0.01977941,  0.00172827, -0.01123383],
       [ 0.01848032,  0.00916144, -0.06602555,  0.0179408 ,  0.04225676],
       [ 0.03119499, -0.03585517,  0.01607618, -0.04502418, -0.04649937],
       [-0.00925162, -0.01809002, -0.07927017, -0.018263  ,  0.03500897],
       [-0.00839186,  0.03043676, -0.03704572, -0.01208869,  0.03305728],
       [-0.01109999, -0.00028785,  0.03328579,  0.04230466,  0.03021765],
       [ 0.03680978,  0.01648321,  0.04533643, -0.03523693,  0.0032534 ],
       [ 0.04923539, -0.07009612, -0.00024977, -0.00541267, -0.01721736],
       [ 0.01752696,  0.01674411, -0.03417907, -0.04716414, -0.02547703],
       [ 0.02454761,  0.03098752, -0.04716522, -0.03430349,  0.01053312],
       [ 0.00906456,  0.07495

**every word in the vocabulary is represented by a vector of 5 numbers (the embedding dimension)**

In [None]:
# Score the trained model on the same reviews it was trained on (no held-out data here).
loss, accuracy = model.evaluate(x=padded_reviews, y=labels, verbose=0)
print(f'Accuracy: {accuracy}')

Accuracy: 1.0
