In [0]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<a href="https://drive.google.com/open?id=1Cq6Yg53UaL0OLDnAz5n2ZFGwv9TeHVGT" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import tensorflow as tf
print(tf.__version__)


#!pip install -q tensorflow-datasets

2.2.0


In [0]:
#importing tensorflow datasets
import tensorflow_datasets as tfds
imdb, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)

In [0]:
import numpy as np

train_data, test_data = imdb['train'], imdb['test']

training_sentences = []
training_labels = []

testing_sentences = []
testing_labels = []

# str(s.tonumpy()) is needed in Python3 instead of just s.numpy()
for s,l in train_data:
  training_sentences.append(str(s.numpy()))
  training_labels.append(l.numpy())
  
for s,l in test_data:
  testing_sentences.append(str(s.numpy()))
  testing_labels.append(l.numpy())
  
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)



In [10]:
print(training_labels_final[:5])
print(training_sentences[:5])

[0 0 0 1 1]
['b"This was an absolutely terrible movie. Don\'t be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie\'s ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor\'s like Christopher Walken\'s good name. I could barely sit through it."', "b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The

In [0]:
vocab_size = 10000 #num of words in model vocabulary
embedding_dim = 16 #embedding dimension
max_length = 120 #max num of words in a sentence, short sentence -> padded, long sent -> truncated
trunc_type='post' #truncating words from rear
oov_tok = "<OOV>" #marking out of vocabulary words


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

#{'word': 'index' for all words in vocab}
word_index = tokenizer.word_index

#converting sentences into seq of numbers
sequences = tokenizer.texts_to_sequences(training_sentences)
#padding or truncating for achieving same length
padded = pad_sequences(sequences,maxlen=max_length, truncating=trunc_type)

#same thing on test dataset
#testing sequences are tokenized based on word_index that was learned from the training words
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences,maxlen=max_length)


In [19]:
#Testing cell, run the cell below first
text = padded[2]
ans = ' '.join([reverse_word_index.get(i, '???') for i in text])
print(ans)
#print(reverse_word_index)
print(padded[2])

<OOV> photographs the <OOV> rocky mountains in a superb fashion and jimmy stewart and walter brennan give enjoyable performances as they always seem to do br br but come on hollywood a <OOV> telling the people of dawson city <OOV> to <OOV> themselves a <OOV> yes a <OOV> and to <OOV> the law themselves then <OOV> battling it out on the streets for control of the town br br nothing even remotely resembling that happened on the canadian side of the border during the <OOV> gold rush mr mann and company appear to have mistaken dawson city for <OOV> the canadian north for the american wild west br br canadian viewers be prepared for a <OOV> madness type of enjoyable
[   1 6175    2    1 4916 4029    9    4  912 1622    3 1969 1307    3
 2384 8836  201  746  361   15   34  208  308    6   83    8    8   19
  214   22  352    4    1  990    2   82    5 3608  545    1    6    1
  539    4    1  434    4    1    3    6    1    2 1176  539   95    1
 8111   10   46   22    2 1996   16 1153    5  

In [34]:
#for decoding a review from padded sequence to text
#reverse_word_index = {'index': 'word'}
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
def decode_review(text):
  #replacing index with words in vocabulary, replacing with "??" if word is not found
  return ' '.join([reverse_word_index.get(i, '??') for i in text])

print(decode_review(padded[0]))
print(training_sentences[0])


?? ?? b this was an absolutely terrible movie don't be <OOV> in by christopher walken or michael <OOV> both are great actors but this must simply be their worst role in history even their great acting could not redeem this movie's ridiculous storyline this movie is an early nineties us propaganda piece the most pathetic scenes were those when the <OOV> rebels were making their cases for <OOV> maria <OOV> <OOV> appeared phony and her pseudo love affair with walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning i am disappointed that there are movies like this ruining <OOV> like christopher <OOV> good name i could barely sit through it
b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most 

In [20]:
#Neural network model, top layer embedding, output of embedding layer will be flattened, then a layer of 6 neurons & the output layer consists of 1 neuron
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           160000    
_________________________________________________________________
flatten (Flatten)            (None, 1920)              0         
_________________________________________________________________
dense (Dense)                (None, 6)                 11526     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 7         
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


In [22]:
#training for 10 epochs
#this model is overfitted......
num_epochs = 10
model.fit(padded, training_labels_final, epochs=num_epochs, validation_data=(testing_padded, testing_labels_final))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7efbad299160>

In [38]:
#predicting

#on test data
print(model.predict(testing_padded[:5]))
print(testing_labels_final[:5])

#on training data
print(model.predict(padded[:5]))
print(training_labels_final[:5])



[[2.4394315e-01]
 [9.9601620e-01]
 [1.6095247e-07]
 [1.6940733e-06]
 [9.9996567e-01]]
[1 1 0 0 1]
[[9.4601192e-16]
 [4.4422499e-08]
 [3.0696822e-06]
 [1.0000000e+00]
 [9.9999917e-01]]
[0 0 0 1 1]


In [29]:
#generating vecs.tsv & meta.tsv for projecting on http://projector.tensorflow.org/

# e = output of embedding
e = model.layers[0]

weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

[[2.4394315e-01]
 [9.9601620e-01]
 [1.6095247e-07]
 [1.6940733e-06]
 [9.9996567e-01]]
(10000, 16)


In [0]:
import io
#saving the words on meta.tsv & weights on vecs.tsv for projection
out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')
for word_num in range(1, vocab_size):
  word = reverse_word_index[word_num]
  embeddings = weights[word_num]
  out_m.write(word + "\n")
  out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
out_v.close()
out_m.close()

In [0]:
try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download('vecs.tsv')
  files.download('meta.tsv')

#go to http://projector.tensorflow.org/ , upload these two files which will allow you to plot these 16 dimensional vectors in a 3 dimensional 

In [24]:
sentence = "I really think this is amazing. honest."
sequence = tokenizer.texts_to_sequences(sentence)
print(sequence)

[[11], [], [1431], [966], [4], [1537], [1537], [4715], [], [790], [2019], [11], [2929], [2184], [], [790], [2019], [11], [579], [], [11], [579], [], [4], [1782], [4], [4517], [11], [2929], [1275], [], [], [2019], [1003], [2929], [966], [579], [790], []]
