# **Sentiment analysis of IMDB reviews using Word2Vec and an LSTM in Keras and TensorFlow**

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

In [None]:
# Fetch the NLTK resources used later in the notebook: the English stop-word
# list (stopwords.words) and the 'punkt' models needed by word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Download (or reuse the cached copy of) the IMDB reviews dataset as
# (text, label) pairs and keep handles to the two labelled splits.
imdb = tfds.load('imdb_reviews', as_supervised=True)
train_data = imdb['train']
test_data = imdb['test']

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…







HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteISYL5H/imdb_reviews-train.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteISYL5H/imdb_reviews-test.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteISYL5H/imdb_reviews-unsupervised.tfrecord


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))



[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [None]:
# Materialise the tf.data splits into plain Python lists of review texts and
# NumPy arrays of labels.
training_sentences = []
training_labels = []
testing_sentences = []
testing_labels = []

for sentence, label in train_data:
    # sentence.numpy() is a bytes object. Calling str() on it yields the
    # literal repr ("b'...'" with escape sequences) rather than the review
    # text, so decode it explicitly instead.
    training_sentences.append(sentence.numpy().decode('utf-8'))
    training_labels.append(label.numpy())

for sentence, label in test_data:
    testing_sentences.append(sentence.numpy().decode('utf-8'))
    testing_labels.append(label.numpy())

training_labels = np.array(training_labels)
testing_labels = np.array(testing_labels)

In [None]:
# Word2Vec is trained on every review, so pool the training reviews followed
# by the test reviews into a single list.
all_sentences = [*training_sentences, *testing_sentences]

In [None]:
# Normalise every review into a list of lower-cased, purely alphabetic,
# non-stop-word tokens.

# Loop-invariant helpers: build them once instead of once per review.
stop_words = set(stopwords.words('english'))
table = str.maketrans('', '', string.punctuation)

reviews_cleaned = list()

for review in all_sentences:
  tokens = [word.lower() for word in word_tokenize(review)]
  # Strip punctuation characters from inside each token.
  stripped = [w.translate(table) for w in tokens]
  # Keep alphabetic tokens only (this also drops tokens emptied by the
  # punctuation stripping) and filter out English stop words.
  words = [w for w in stripped if w.isalpha() and w not in stop_words]
  reviews_cleaned.append(words)


In [None]:
# Shared hyper-parameters.
#   embedding_dim - dimensionality of the Word2Vec vectors
#   max_length    - length every encoded review is padded/truncated to
#   trunc_type    - which end pad_sequences cuts over-long reviews from
#   oov_tok       - placeholder token for out-of-vocabulary words
embedding_dim, max_length = 128, 64
trunc_type, oov_tok = 'post', "<OOV>"

In [None]:
# Train Word2Vec on the cleaned reviews; min_count=0 keeps every token,
# however rare, in the vocabulary.
# NOTE(review): `size=` and `wv.vocab` look like the gensim 3.x spellings -
# confirm the pinned gensim version before upgrading.
embedding_model = Word2Vec(
    sentences=reviews_cleaned,
    size=embedding_dim,
    window=7,
    min_count=0,
)
vocab = [word for word in embedding_model.wv.vocab]

In [None]:
# Number of distinct tokens in the Word2Vec vocabulary.
vocab_size = len(vocab)
print(vocab_size)

135725


In [None]:
# Fit a Keras tokenizer on the cleaned, already-tokenised reviews and keep its
# word -> integer-id mapping for encoding text later on.
# NOTE(review): `oov_tok` is defined in the hyper-parameter cell but is not
# passed to Tokenizer here; words missing from the index are presumably
# dropped when texts are encoded - confirm this is intended.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(reviews_cleaned)
word_index = tokenizer.word_index

In [None]:
# Encode the raw review strings of each split as lists of tokenizer ids.
training_sequences, testing_sequences = [
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (training_sentences, testing_sentences)
]

In [None]:
# Bring every encoded review to exactly max_length ids: shorter reviews are
# zero-padded at the end, longer ones are cut according to trunc_type.
pad_kwargs = dict(maxlen=max_length, truncating=trunc_type, padding='post')
training_padded = pad_sequences(training_sequences, **pad_kwargs)
testing_padded = pad_sequences(testing_sequences, **pad_kwargs)

In [None]:
# Sanity check: show one encoded training review (token ids followed by the
# trailing zero padding added by pad_sequences).
training_padded[4]

array([  278,   927,   246,    54,  2596,     3,   520,   300,  1354,
          39,  9705,   162,  8787,   522, 18374,   226,    86,    81,
        6168,   226,    86,   305, 20632,   219,    76,   304,   304,
         352,    47,    12,   710,  1848,   506,   111,  2880,  6342,
         183,    37,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0], dtype=int32)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, LSTM, Dense

In [None]:

# Build the embedding matrix in the *tokenizer's* id space: row i holds the
# Word2Vec vector of the word the Keras tokenizer encodes as i.
# gensim's own get_keras_embedding() orders rows by gensim's 0-based
# vocabulary index, whereas the tokenizer ids are 1-based with 0 reserved for
# padding. Feeding tokenizer ids into that layer looks up the wrong word's
# vector for every token and sends the largest id out of range.
# Row 0 stays all-zero and is used by the padding id.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim), dtype='float32')
for word, idx in word_index.items():
    # Guard against tokens the tokenizer knows but Word2Vec does not.
    if word in embedding_model.wv:
        embedding_matrix[idx] = embedding_model.wv[word]

model = Sequential()
# Frozen embedding layer initialised with the pre-trained Word2Vec vectors.
e = tf.keras.layers.Embedding(
    input_dim=embedding_matrix.shape[0],
    output_dim=embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False,
)
model.add(e)
model.add(LSTM(units=32, dropout=0.2, recurrent_dropout=0.2))
# Single sigmoid unit: binary sentiment output.
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summarize the model (summary() prints by itself and returns None, so it is
# not wrapped in print())
model.summary()






Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 128)         17372800  
_________________________________________________________________
lstm (LSTM)                  (None, 32)                20608     
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 17,393,441
Trainable params: 20,641
Non-trainable params: 17,372,800
_________________________________________________________________
None


In [None]:
# Number of full passes over the training data.
num_epochs: int = 20

In [None]:
# Train the model, holding out 20% of the training arrays for validation;
# the returned History object keeps the per-epoch metrics.
history = model.fit(
    training_padded,
    training_labels,
    epochs=num_epochs,
    batch_size=64,
    validation_split=0.2,
)

Epoch 1/20

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

plt.plot(range(num_epochs),history.history['accuracy'],label='Training')
plt.plot(range(num_epochs),history.history['val_accuracy'],label='Validation')
plt.legend()
plt.show()

In [None]:
# Score the trained model on the held-out test split; evaluate() returns the
# loss followed by the compiled metric (accuracy).
test_metrics = model.evaluate(testing_padded, testing_labels)
loss, acc = test_metrics
print('Test accuracy', acc * 100)

In [None]:
# Two hand-written reviews used to try the trained model.
sample1 = (
    "Wow.. what a movie. "
    "I would love to watch it again"
)
sample2 = (
    "Why someone spend money and time on this movie.. "
    "please don't watch"
)
samples = list((sample1, sample2))

In [None]:
# Encode the sample reviews exactly like the training data: same tokenizer,
# same target length, same padding side and - previously missing here - the
# same truncation side, so long inputs are cut the way the model saw during
# training.
sample_to_tokens = tokenizer.texts_to_sequences(samples)
sample_to_padded_tokens = pad_sequences(
    sample_to_tokens,
    maxlen=max_length,
    truncating=trunc_type,
    padding='post',
)


In [None]:
# Display the encoded, zero-padded sample reviews.
sample_to_padded_tokens

In [None]:
# Sigmoid output per sample, in (0, 1).
# NOTE(review): presumably values near 1 mean "positive" (label 1 in the
# tfds imdb_reviews dataset) - confirm the label convention.
model.predict(sample_to_padded_tokens)