In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding

In [2]:
reviews = ['nice food',
        'amazing restaurant',
        'too good',
        'just loved it!',
        'will go again',
        'horrible food',
        'never go there',
        'poor service',
        'poor quality',
        'needs improvement']

sentiment = np.array([1,1,1,1,1,0,0,0,0,0])

In [3]:
one_hot('nice food', 50)

[3, 14]

In [4]:
vocab_size = 50
encoded_reviews = [one_hot(d, vocab_size) for d in reviews] # giving them index for each word they will be one hot encoded automatically while training in the keras layer
encoded_reviews

[[3, 14],
 [29, 8],
 [12, 37],
 [12, 39, 36],
 [32, 35, 6],
 [47, 14],
 [15, 35, 36],
 [10, 47],
 [10, 21],
 [2, 7]]

In [5]:
max_length = 3
padded_reviews = pad_sequences(encoded_reviews, maxlen = max_length, padding = 'post') # padding all sentence because all are not of same length
padded_reviews

array([[ 3, 14,  0],
       [29,  8,  0],
       [12, 37,  0],
       [12, 39, 36],
       [32, 35,  6],
       [47, 14,  0],
       [15, 35, 36],
       [10, 47,  0],
       [10, 21,  0],
       [ 2,  7,  0]])

In [6]:
embeded_vec_size = 4
model = Sequential()
model.add(Embedding(vocab_size, embeded_vec_size, input_length=3, name = 'embedding')) # the size of the dictionary(no. of total words), no. of features for each word, size of each sentence, name of the layer so we can use it later to get weights
model.add(Flatten()) # for each sentence we will get 3 vectors of size 4(coz we have 3 words for each sentence and 4 features for each words) so joining them for training
model.add(Dense(1, activation='sigmoid')) # 1 neuron layer for output and then back propagation

In [7]:
X = padded_reviews
y = sentiment

In [8]:
model.compile(optimizer='adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 4)              200       
                                                                 
 flatten (Flatten)           (None, 12)                0         
                                                                 
 dense (Dense)               (None, 1)                 13        
                                                                 
Total params: 213
Trainable params: 213
Non-trainable params: 0
_________________________________________________________________


In [9]:
model.fit(X, y, epochs=50, verbose=0)

<keras.callbacks.History at 0x198b61112a0>

In [10]:
model.evaluate(X, y)



[0.6418737173080444, 1.0]

In [11]:
weights = model.get_layer('embedding').get_weights()[0] # this is the weights we want for each word there are four 

In [13]:
weights[3], weights[37] # these are the one_hot indexes we calculated before

(array([ 0.06177153, -0.02295771, -0.06856312,  0.03601143], dtype=float32),
 array([ 0.00479292,  0.06162761,  0.09652323, -0.05066398], dtype=float32))

## we dont need to predict anything because its not our real goal here, our real goal is to extract the features, weights of each word here so we can use them afterwards for diff. problems