

# PART_1 
Compare Models: LSTM, GRU, Dense, Naïve Bayes

In [None]:
from keras.preprocessing import sequence
from keras import models, layers, optimizers, datasets, utils, losses
import numpy as np

# Settings shared by the model cells in this notebook.
vocabulary_size = 10000  # passed to load_data as num_words
maxlen = 40              # every review is padded/truncated to this many tokens
batch_size = 25

# Load the IMDB reviews as integer sequences, then bring each split to a
# fixed width of `maxlen` tokens.
train_split, test_split = datasets.imdb.load_data(num_words=vocabulary_size)
x_train, y_train = train_split
x_test, y_test = test_split
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)



Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [None]:
#LSTM
# Functional-API graph: token ids -> 128-d embeddings -> LSTM -> sigmoid score.
inputs = layers.Input(shape=(maxlen,))
e = layers.Embedding(input_dim=vocabulary_size, output_dim=128)(inputs)
h = layers.LSTM(units=128, dropout=0.8, recurrent_dropout=0.8)(e)
outputs = layers.Dense(units=1, activation='sigmoid')(h)
model = models.Model(inputs=inputs, outputs=outputs)

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# The test split is also passed as validation data, so the per-epoch
# validation figures are computed on the same samples as the final evaluation.
model.fit(x=x_train, y=y_train,
          epochs=4,
          batch_size=batch_size,
          validation_data=(x_test, y_test))

# With one compiled metric, evaluate() yields the loss followed by accuracy.
evaluation = model.evaluate(x=x_test, y=y_test, batch_size=batch_size)
score, acc = evaluation
print('Test accuracy:', acc)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Test accuracy: 0.796720027923584


In [None]:
#GRU
from keras.preprocessing import sequence
from keras import models, layers, optimizers, datasets, utils, losses

# Settings and data are re-created here so the cell can run on its own.
vocabulary_size = 10000
maxlen = 40
batch_size = 25

train_split, test_split = datasets.imdb.load_data(num_words=vocabulary_size)
x_train, y_train = train_split
x_test, y_test = test_split
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

# Same graph as the LSTM cell, with a GRU as the recurrent layer.
inputs = layers.Input(shape=(maxlen,))
e = layers.Embedding(input_dim=vocabulary_size, output_dim=128)(inputs)
h = layers.GRU(units=128, dropout=0.8, recurrent_dropout=0.8)(e)
outputs = layers.Dense(units=1, activation='sigmoid')(h)
model = models.Model(inputs=inputs, outputs=outputs)

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# The test split is also used as validation data during training.
model.fit(x=x_train, y=y_train,
          epochs=4,
          batch_size=batch_size,
          validation_data=(x_test, y_test))

# With one compiled metric, evaluate() yields the loss followed by accuracy.
evaluation = model.evaluate(x=x_test, y=y_test, batch_size=batch_size)
score, acc = evaluation
print('Test accuracy:', acc)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Test accuracy: 0.8116400241851807


In [None]:
#Dense
from keras.preprocessing import sequence
from keras import models, layers, optimizers, datasets, utils, losses

vocabulary_size = 10000
maxlen = 40
batch_size = 25

(x_train, y_train), (x_test, y_test) = datasets.imdb.load_data(num_words=vocabulary_size)
x_train = sequence.pad_sequences(x_train, maxlen)
x_test = sequence.pad_sequences(x_test, maxlen)


# Feed-forward baseline: embeddings -> flatten -> Dense(128) -> sigmoid score.
inputs = layers.Input(shape=(maxlen,))
e = layers.Embedding(vocabulary_size, 128)(inputs)
# Collapse the (maxlen, 128) embedding grid into one vector per review.
# Dense only transforms the last axis, so without this step the network would
# score every token position separately and emit maxlen outputs per review,
# while the labels carry a single value per review.
flat = layers.Flatten()(e)
h = layers.Dense(128, activation="relu")(flat)
outputs = layers.Dense(1, activation='sigmoid')(h)
model = models.Model(inputs, outputs)

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# The test split is also used as validation data during training.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=4,
          validation_data=(x_test, y_test))

# NOTE: any accuracy recorded before the Flatten layer was added came from the
# per-token model; re-run the cell to refresh it.
score, acc = model.evaluate(x_test, y_test,
                            batch_size=batch_size)
print('Test accuracy:', acc)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Test accuracy: 0.5577864646911621


In [None]:
# Naive Bayes
from keras.preprocessing import sequence
from keras import models, layers, optimizers, datasets, utils, losses
from sklearn.naive_bayes import GaussianNB

vocabulary_size = 10000
maxlen = 40
batch_size = 25

(x_train, y_train), (x_test, y_test) = datasets.imdb.load_data(num_words=vocabulary_size)
x_train = sequence.pad_sequences(x_train, maxlen)
x_test = sequence.pad_sequences(x_test, maxlen)


# Gaussian Naive Bayes baseline fitted directly on the padded token-id matrix.
# No Keras layers are involved: the estimator consumes the integer matrix as-is.
# NOTE(review): the features are raw word indices, which GaussianNB models as
# continuous values - confirm this is the intended baseline representation.
h = GaussianNB()
h.fit(x_train, y_train)

# score() reports mean accuracy on the held-out reviews.
acc = h.score(x_test, y_test)
print('Test accuracy:', acc)

Test accuracy: 0.51028


In the task of classifying movie reviews as positive or negative, the GRU performed best with an accuracy of 81.2%, followed by the LSTM at 79.7%. The Dense network performed poorly with an accuracy of 55.8%, and the Gaussian Naive Bayes model was the least accurate at 51.0%. The main reason the LSTM and GRU perform so well is that the 40 words of each review are related to one another: each word influences the next, so the input is sequential. The LSTM maintains a cell state c(t) that carries context from previous inputs when predicting the output, and the GRU similarly uses its hidden state as memory. Both are recurrent neural networks that treat each review as a sequence of related words, which is why they perform well. In the Dense model and the Gaussian Naive Bayes model, each word of the review is treated as a separate feature and no relationship between words is learned, so these models do not perform well on this task.

# PART_2
Pairwise distance in embedding space

In [None]:
# Inspect the first padded training review: an array of maxlen (40) word ids.
x_train[0]

array([  22,   21,  134,  476,   26,  480,    5,  144,   30, 5535,   18,
         51,   36,   28,  224,   92,   25,  104,    4,  226,   65,   16,
         38, 1334,   88,   12,   16,  283,    5,   16, 4472,  113,  103,
         32,   15,   16, 5345,   19,  178,   32], dtype=int32)

In [None]:
# Fetch the word -> id vocabulary and invert it so ids can be mapped back to words.
imdb = datasets.imdb
word_index = imdb.get_word_index()
reverse_word_index = {index: word for word, index in word_index.items()}

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [None]:
def _decode_review(encoded):
    """Map a sequence of token ids back to words, using "?" for unknown ids.

    Ids are shifted down by 3 before the lookup; presumably this undoes the
    default index offset applied by imdb.load_data (TODO confirm).
    """
    return [reverse_word_index.get(token - 3, "?") for token in encoded]


# Decode training reviews 7 and 8 into lists of words.
movie_review1 = _decode_review(x_train[7])
movie_review2 = _decode_review(x_train[8])

In [None]:
# Word at position 2 of decoded training review 7.
movie_review1[2]

'dull'

In [None]:
# Word at position 3 of decoded training review 7.
movie_review1[3]

'boring'

In [None]:
# Word at position 34 of decoded training review 8.
movie_review2[34]

'best'

**Pairwise distance for Dense model**


In [None]:
# getting embedding from Dense network
inputs = layers.Input(shape=(maxlen,))
e = layers.Embedding(vocabulary_size, 128)(inputs)
emb = layers.Flatten()(e)
h = layers.Dense(128, activation="relu")(emb)
outputs = layers.Dense(1, activation='sigmoid')(h)

# Train on the sentiment output only. If the flattened embeddings were an
# output of the trained model, the single loss/target pair would either be
# applied to them as well or be rejected, depending on the Keras version.
classifier = models.Model(inputs, outputs)
classifier.compile(loss='binary_crossentropy',
                   optimizer='adam',
                   metrics=['accuracy'])

# Inference-only view over the same layers and weights; predict() returns
# [sentiment score, flattened embeddings], which the following cells index.
model = models.Model(inputs, outputs=[outputs, emb])

classifier.fit(x_train, y_train,
               batch_size=batch_size,
               epochs=4,
               validation_data=(x_test, y_test))


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x7f7cea222cc0>

In [None]:
# Run the two-output model; element 1 of the result holds the flattened embeddings.
y_pred = model.predict(x_train)
# Restore review 7 to one 128-d vector per token position.
emb1 = y_pred[1][7].reshape(40, 128)
# Positions 2 and 3 are the tokens looked up above ('dull' and 'boring').
emb_dull, emb_boring = emb1[2], emb1[3]

In [None]:
 # Restore review 8 to one 128-d vector per token position, then pick
 # position 34 (the token looked up above as 'best').
 emb2 = y_pred[1][8].reshape(40, 128)
 emb_best = emb2[34]

In [None]:
from scipy.spatial import distance
# Euclidean distance between the 'dull' and 'boring' token embeddings.
d1 = distance.euclidean(emb_dull, emb_boring)
print(d1)

1.2481439113616943


In [None]:
# Euclidean distance between the 'dull' and 'best' token embeddings.
d2 = distance.euclidean(emb_dull, emb_best)
d2

2.5505788326263428

In [None]:
# Euclidean distance between the 'boring' and 'best' token embeddings.
d3 = distance.euclidean(emb_boring, emb_best)
d3

2.688584089279175

**Pairwise distance for LSTM model**

In [None]:
# LSTM classifier that also exposes its (flattened) embedding-layer output.
inputs = layers.Input(shape=(maxlen,))
e = layers.Embedding(vocabulary_size, 128)(inputs)
emb = layers.Flatten()(e)  # side output only; the LSTM consumes `e` directly
h = layers.LSTM(128, dropout=0.8, recurrent_dropout=0.8)(e)
outputs = layers.Dense(1, activation='sigmoid')(h)

# Train on the sentiment output only. If the flattened embeddings were an
# output of the trained model, the single loss/target pair would either be
# applied to them as well or be rejected, depending on the Keras version.
classifier = models.Model(inputs, outputs)
classifier.compile(loss='binary_crossentropy',
                   optimizer='adam',
                   metrics=['accuracy'])

# Inference-only view over the same layers and weights; predict() returns
# [sentiment score, flattened embeddings], which the following cells index.
model = models.Model(inputs, outputs=[outputs, emb])

classifier.fit(x_train, y_train,
               batch_size=batch_size,
               epochs=4,
               validation_data=(x_test, y_test))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x7f7cf64c6dd8>

In [None]:
# Element 1 of the prediction list holds the flattened embeddings per review.
y_pred = model.predict(x_train)
# Review 7, reshaped to one 128-d vector per token position.
emb1 = y_pred[1][7].reshape(40, 128)
# Positions 2 and 3 are the tokens looked up earlier ('dull' and 'boring').
emb_dull, emb_boring = emb1[2], emb1[3]

In [None]:
 # Review 8, reshaped to one 128-d vector per token position; position 34
 # is the token looked up earlier as 'best'.
 emb2 = y_pred[1][8].reshape(40, 128)
 emb_best = emb2[34]

In [None]:
from scipy.spatial import distance
# Euclidean distance between the 'dull' and 'boring' token embeddings.
d1 = distance.euclidean(emb_dull, emb_boring)
print(d1)

1.1012248992919922


In [None]:
# Euclidean distance between the 'dull' and 'best' token embeddings.
d2 = distance.euclidean(emb_dull, emb_best)
d2

2.8943264484405518

In [None]:
# Euclidean distance between the 'boring' and 'best' token embeddings.
d3 = distance.euclidean(emb_boring, emb_best)
d3

2.622497320175171

**Pairwise distance for GRU model**

In [None]:
# GRU classifier that also exposes its (flattened) embedding-layer output.
inputs = layers.Input(shape=(maxlen,))
e = layers.Embedding(vocabulary_size, 128)(inputs)
emb = layers.Flatten()(e)  # side output only; the GRU consumes `e` directly
h = layers.GRU(128, dropout=0.8, recurrent_dropout=0.8)(e)
outputs = layers.Dense(1, activation='sigmoid')(h)

# Train on the sentiment output only. If the flattened embeddings were an
# output of the trained model, the single loss/target pair would either be
# applied to them as well or be rejected, depending on the Keras version.
classifier = models.Model(inputs, outputs)
classifier.compile(loss='binary_crossentropy',
                   optimizer='adam',
                   metrics=['accuracy'])

# Inference-only view over the same layers and weights; predict() returns
# [sentiment score, flattened embeddings], which the following cells index.
model = models.Model(inputs, outputs=[outputs, emb])

classifier.fit(x_train, y_train,
               batch_size=batch_size,
               epochs=4,
               validation_data=(x_test, y_test))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x7f7ce60fd668>

In [None]:
# Element 1 of the prediction list holds the flattened embeddings per review.
y_pred = model.predict(x_train)
# Review 7, reshaped to one 128-d vector per token position.
emb1 = y_pred[1][7].reshape(40, 128)
# Positions 2 and 3 are the tokens looked up earlier ('dull' and 'boring').
emb_dull, emb_boring = emb1[2], emb1[3]

In [None]:
 # Review 8, reshaped to one 128-d vector per token position; position 34
 # is the token looked up earlier as 'best'.
 emb2 = y_pred[1][8].reshape(40, 128)
 emb_best = emb2[34]

In [None]:
from scipy.spatial import distance
# Euclidean distance between the 'dull' and 'boring' token embeddings.
d1 = distance.euclidean(emb_dull, emb_boring)
print(d1)

1.2452424764633179


In [None]:
# Euclidean distance between the 'dull' and 'best' token embeddings.
d2 = distance.euclidean(emb_dull, emb_best)
d2

2.708897113800049

In [None]:
# Euclidean distance between the 'boring' and 'best' token embeddings.
d3 = distance.euclidean(emb_boring, emb_best)
d3

2.720167875289917