# Sentiment Classification of IMDB Reviews

## Dataset

In [1]:
import tensorflow_datasets as tfds

# Load the IMDB Reviews dataset; as_supervised=True yields (review, label) pairs
df = tfds.load("imdb_reviews", as_supervised=True)

# Show the first two training examples
first_two = df['train'].take(2)
for example in first_two:
    print(example)

(<tf.Tensor: shape=(), dtype=string, numpy=b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.">, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on

## Train and Test Datasets

In [9]:
import numpy as np

# Reviews and labels for the training and test splits
def _to_reviews_and_labels(split):
    """Materialise one dataset split into Python/NumPy containers.

    Args:
        split: Iterable of (review, label) tensor pairs, e.g. df['train'].

    Returns:
        (reviews, labels): a list of UTF-8 decoded review strings and a 1-D
        float64 array holding the matching labels.
    """
    reviews, labels = [], []
    for review, label in split:
        reviews.append(review.numpy().decode('utf8'))
        labels.append(label.numpy())
    # Build the array once at the end: np.append inside the loop copies the
    # whole array on every iteration. float64 is the dtype the labels had
    # when they were appended to an empty np.array([]).
    return reviews, np.array(labels, dtype=np.float64)


review_tr, label_tr = _to_reviews_and_labels(df['train'])  # training reviews and labels
review_te, label_te = _to_reviews_and_labels(df['test'])  # test reviews and labels

# first review and its label
print(f'the first review:\n "{review_tr[0]}"')
print(f"label: {label_tr[0]}")

the first review:
 "This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
label: 0.0


## Encoding Reviews

In [11]:
# Hyperparameters shared by the tokenizer, the padding step and the model
vsize = 10000  # vocabulary size: Tokenizer num_words and Embedding input dimension
rlen = 120  # length every encoded review is padded/truncated to
emb_dim = 16  # dimension of each embedding vector
trunc = 'post'  # reviews longer than rlen are cut at the end
# Explicit placeholder for out-of-vocabulary words. An empty string here would
# be exported as a blank label line when the vocabulary is written to w.tsv.
oov = "<OOV>"

In [12]:
# Encoding setup
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit the tokenizer on the training reviews only, then keep its word -> index map
encoder = Tokenizer(
    num_words=vsize,
    oov_token=oov,
)
encoder.fit_on_texts(review_tr)
word_index = encoder.word_index

In [15]:
# Encoded sequence generation
def _encode_and_pad(texts):
    """Return (sequences, padded) for a list of review strings.

    `sequences` is the tokenizer's integer encoding of each text; `padded` is
    the same data passed through pad_sequences with maxlen=rlen and the
    configured truncation side.
    """
    sequences = encoder.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=rlen, truncating=trunc)
    return sequences, padded


# training split
sequence_tr, padded_tr = _encode_and_pad(review_tr)

# test split
sequence_te, padded_te = _encode_and_pad(review_te)

## Embedding

In [21]:
# Build the classifier layer by layer:
# embedding -> flatten -> 20-unit ReLU layer -> single sigmoid output
import tensorflow as tf

model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vsize, emb_dim, input_length=rlen))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(20, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Configure training: binary cross-entropy loss, Adam optimizer, accuracy metric
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 120, 16)           160000    
                                                                 
 flatten_2 (Flatten)         (None, 1920)              0         
                                                                 
 dense_2 (Dense)             (None, 20)                38420     
                                                                 
 dense_3 (Dense)             (None, 1)                 21        
                                                                 
Total params: 198,441
Trainable params: 198,441
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Train for a fixed number of epochs; the test split is passed as validation data
nepochs = 20
model.fit(
    x=padded_tr,
    y=label_tr,
    epochs=nepochs,
    validation_data=(padded_te, label_te),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x244ba920670>

### Model Test

In [41]:
def getSentiment(review):
    """Score a single review with the trained model.

    The text is encoded with the fitted `encoder`, padded/truncated to `rlen`
    tokens with the same settings used for the training data, and passed
    through `model`.

    Args:
        review: Review text as one string.

    Returns:
        The first element of the model's prediction for the review, i.e. the
        sigmoid output; values closer to 1 indicate positive sentiment.

    Side effects:
        Prints a one-line description of the score scale.
    """
    # texts_to_sequences expects a list of texts, hence the one-element list
    seq = encoder.texts_to_sequences([review])
    pad = pad_sequences(seq, maxlen=rlen, truncating=trunc)
    result = model.predict(pad, verbose=0)
    print('Sentiment score between 0 and 1 (closer to 1 - positve sentiment):\n')
    return result[0][0]

In [42]:
# Try the classifier on a hand-written negative review; the score returned by
# getSentiment is displayed as the cell output.
reviews = "I have not seen ever a worst moview like this. Total waste of money."
getSentiment(reviews)

Sentiment score between 0 and 1 (closer to 1 - positve sentiment):



0.42266902

## Generating Files to Inspect the Embedding in the Projector

In [43]:
# Extract the learned embedding matrix from the model's first layer

embedding_layers = model.layers[0]
layer_weights = embedding_layers.get_weights()
embedding_weights = layer_weights[0]
print(embedding_weights.shape)

(10000, 16)


In [45]:
# Index -> word lookup taken from the fitted tokenizer; used below to label
# each embedding row with its word.
reverse_word_index = encoder.index_word

In [47]:
# generating tensorflow files for visualization of embeddings

import io
# Write two line-aligned files: v.tsv holds one tab-separated embedding vector
# per line, w.tsv holds the word that vector belongs to.
# Row 0 is skipped: the loop starts at word index 1, and pad_sequences fills
# with 0 by default, so row 0 does not correspond to a word.
# The upper bound is capped by the vocabulary actually learned so that a corpus
# with fewer than vsize - 1 distinct words does not raise KeyError
# (assumes index_word keys run contiguously from 1).
n_rows = min(vsize, len(reverse_word_index) + 1)

# The context manager closes both files even if the loop raises.
with io.open('v.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('w.tsv', 'w', encoding='utf-8') as out_w:
    for i in range(1, n_rows):
        word = reverse_word_index[i]
        out_w.write(word + "\n")
        word_weights_vec = embedding_weights[i]
        out_v.write('\t'.join([str(x) for x in word_weights_vec]) + "\n")

print('files generated')

files generated
