<a href="https://colab.research.google.com/github/praveenbenedict/tf-developer-certificate-coursera/blob/master/3%20-%20Natural%20Language%20Processing%20in%20Tensorflow/Week%202/2.1.1%20-%20IMDB_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
import tensorflow as tf

In [24]:
import tensorflow_datasets as tfds

In [25]:
# Load the IMDB reviews dataset along with its metadata object.
# as_supervised=True makes each example a (text, label) pair.
imdb, imdb_info = tfds.load(
    'imdb_reviews',
    as_supervised=True,
    with_info=True,
)

In [26]:
import numpy as np

In [27]:
# Pull the two splits out of the object returned by tfds.load.
train_data = imdb['train']
test_data = imdb['test']

In [28]:
# Peek at a single (review, label) example to see what the raw data looks like.
for review, label in train_data.take(1):
  print(review)
  print(label)

tf.Tensor(b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.", shape=(), dtype=string)
tf.Tensor(0, shape=(), dtype=int64)


In [29]:
# Accumulators for the decoded review texts and their labels, one pair per split.
training_sentences = []
training_labels = []
testing_sentences = []
testing_labels = []

In [30]:
# Start from empty lists so that re-running this cell does not append a
# second copy of the training set to the lists built by a previous run.
training_sentences, training_labels = [], []
for sentence, label in train_data:
  # Each element is a pair of scalar tensors: decode the bytes to a Python
  # str and unwrap the label into a NumPy integer.
  training_sentences.append(sentence.numpy().decode('utf8'))
  training_labels.append(label.numpy())

In [31]:
# Start from empty lists so that re-running this cell does not append a
# second copy of the test set to the lists built by a previous run.
testing_sentences, testing_labels = [], []
for sentence, label in test_data:
  # Each element is a pair of scalar tensors: decode the bytes to a Python
  # str and unwrap the label into a NumPy integer.
  testing_sentences.append(sentence.numpy().decode('utf8'))
  testing_labels.append(label.numpy())

In [32]:
# Sanity check: show the first decoded review and confirm it is a plain Python str.
first_review = training_sentences[0]
print(first_review, type(first_review), sep='\n')

This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.
<class 'str'>


In [33]:
# Hyperparameters shared by the tokenizer, the padding step and the model.
vocab_size: int = 10_000   # Tokenizer(num_words=...) and Embedding(input_dim=...)
embedding_dim: int = 16    # intended size of each embedding vector
max_length: int = 120      # every sequence is padded/truncated to this length
trunc_type: str = 'post'   # value passed as pad_sequences(truncating=...)
oov_tok: str = '<OOV>'     # value passed as Tokenizer(oov_token=...)

In [34]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [35]:
# Tokenizer configured with num_words=vocab_size; oov_tok is passed as its
# out-of-vocabulary token.
tokenizer = Tokenizer(
    num_words=vocab_size,
    oov_token=oov_tok,
)

In [36]:
# Build the vocabulary from the training reviews only.
tokenizer.fit_on_texts(training_sentences)

# Encode every training review as a list of word indices, then pad/truncate
# each one to exactly max_length entries.
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(
    sequences,
    truncating=trunc_type,
    maxlen=max_length,
)

# word -> index mapping exposed by the fitted tokenizer.
word_index = tokenizer.word_index

In [37]:
# Encode the test reviews with the tokenizer that was fitted on the training data.
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
# Truncate on the same side as the training data. Without `truncating`,
# pad_sequences falls back to its default ('pre'), so over-long test reviews
# would keep their tail while over-long training reviews keep their head.
testing_padded = pad_sequences(
    testing_sequences,
    maxlen=max_length,
    truncating=trunc_type,
)

In [38]:
# Turn the Python label lists into NumPy arrays for use as training/validation targets.
training_labels_final = np.asarray(training_labels)
testing_labels_final = np.asarray(testing_labels)

In [39]:
# Baseline classifier: the embedded sequence is flattened before the dense layers.
# The embedding width comes from `embedding_dim` (set next to vocab_size) rather
# than a repeated literal 16, so the hyperparameter has a single source of truth.
# Note: `model` is re-assigned by a later cell, so this version is only used for
# the summary printed right after it.
model = tf.keras.Sequential([
  tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(6, activation='relu'),
  tf.keras.layers.Dense(1, activation='sigmoid'),  # single sigmoid output unit
])

In [40]:
# Show the layers, output shapes and parameter counts of the Flatten-based model.
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 120, 16)           160000    
                                                                 
 flatten_1 (Flatten)         (None, 1920)              0         
                                                                 
 dense_4 (Dense)             (None, 6)                 11526     
                                                                 
 dense_5 (Dense)             (None, 1)                 7         
                                                                 
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


In [41]:
# Classifier that is actually trained below: the embedded sequence is averaged
# over the sequence axis (GlobalAveragePooling1D) instead of being flattened.
# The embedding width comes from `embedding_dim` (set next to vocab_size) rather
# than a repeated literal 16, so the hyperparameter has a single source of truth.
model = tf.keras.Sequential([
  tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
  tf.keras.layers.GlobalAveragePooling1D(),
  tf.keras.layers.Dense(6, activation='relu'),
  tf.keras.layers.Dense(1, activation='sigmoid'),  # single sigmoid output unit
])

In [42]:
# Show the layers, output shapes and parameter counts of the GlobalAveragePooling1D-based model.
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 120, 16)           160000    
                                                                 
 global_average_pooling1d_1   (None, 16)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_6 (Dense)             (None, 6)                 102       
                                                                 
 dense_7 (Dense)             (None, 1)                 7         
                                                                 
Total params: 160,109
Trainable params: 160,109
Non-trainable params: 0
_________________________________________________________________


In [43]:
# Configure training: RMSprop optimizer, binary cross-entropy loss, accuracy metric.
# `metrics` is passed as a list, which is the form documented for Model.compile;
# a bare string is not accepted by every Keras version.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [44]:
# Train for 15 epochs in mini-batches of 16, using the test split as validation data.
# verbose=2 prints one summary line per epoch.
model.fit(
    padded,
    training_labels_final,
    validation_data=(testing_padded, testing_labels_final),
    epochs=15,
    batch_size=16,
    verbose=2,
)

Epoch 1/15
1563/1563 - 7s - loss: 0.5753 - accuracy: 0.7233 - val_loss: 0.4170 - val_accuracy: 0.8224 - 7s/epoch - 5ms/step
Epoch 2/15
1563/1563 - 7s - loss: 0.3768 - accuracy: 0.8356 - val_loss: 0.3411 - val_accuracy: 0.8520 - 7s/epoch - 5ms/step
Epoch 3/15
1563/1563 - 17s - loss: 0.3233 - accuracy: 0.8626 - val_loss: 0.3283 - val_accuracy: 0.8579 - 17s/epoch - 11ms/step
Epoch 4/15
1563/1563 - 11s - loss: 0.2985 - accuracy: 0.8754 - val_loss: 0.3296 - val_accuracy: 0.8576 - 11s/epoch - 7ms/step
Epoch 5/15
1563/1563 - 9s - loss: 0.2823 - accuracy: 0.8835 - val_loss: 0.3295 - val_accuracy: 0.8579 - 9s/epoch - 6ms/step
Epoch 6/15
1563/1563 - 7s - loss: 0.2712 - accuracy: 0.8900 - val_loss: 0.3338 - val_accuracy: 0.8554 - 7s/epoch - 5ms/step
Epoch 7/15
1563/1563 - 7s - loss: 0.2617 - accuracy: 0.8955 - val_loss: 0.3387 - val_accuracy: 0.8536 - 7s/epoch - 4ms/step
Epoch 8/15
1563/1563 - 6s - loss: 0.2545 - accuracy: 0.8982 - val_loss: 0.3411 - val_accuracy: 0.8546 - 6s/epoch - 4ms/step
Epo

<keras.callbacks.History at 0x7f59e44a8070>

In [45]:
# First padded training sequence: one row of word indices.
padded[0]

array([   0,    0,    0,   12,   14,   33,  425,  392,   18,   90,   28,
          1,    9,   32, 1366, 3585,   40,  486,    1,  197,   24,   85,
        154,   19,   12,  213,  329,   28,   66,  247,  215,    9,  477,
         58,   66,   85,  114,   98,   22, 5675,   12, 1322,  643,  767,
         12,   18,    7,   33,  400, 8170,  176, 2455,  416,    2,   89,
       1231,  137,   69,  146,   52,    2,    1, 7577,   69,  229,   66,
       2933,   16,    1, 2904,    1,    1, 1479, 4940,    3,   39, 3900,
        117, 1584,   17, 3585,   14,  162,   19,    4, 1231,  917, 7917,
          9,    4,   18,   13,   14, 4139,    5,   99,  145, 1214,   11,
        242,  683,   13,   48,   24,  100,   38,   12, 7181, 5515,   38,
       1366,    1,   50,  401,   11,   98, 1197,  867,  141,   10],
      dtype=int32)

In [46]:
# Label of the first training example.
training_labels[0]

0

In [47]:
# List the model's layers; the Embedding layer is the first entry.
model.layers

[<keras.layers.core.embedding.Embedding at 0x7f59f00968e0>,
 <keras.layers.pooling.global_average_pooling1d.GlobalAveragePooling1D at 0x7f59e41b4fd0>,
 <keras.layers.core.dense.Dense at 0x7f59dd4bc310>,
 <keras.layers.core.dense.Dense at 0x7f59e49bf850>]

In [49]:
# Keep a handle on the first layer of the model (the Embedding layer).
embeddings = model.layers[0]

In [50]:
 # The layer's weights are returned as a list of NumPy arrays.
 embeddings.get_weights()

[array([[-0.03950284,  0.00981644,  0.02337485, ..., -0.02010072,
          0.05551388,  0.0274117 ],
        [-0.03535434, -0.02995031,  0.03202409, ..., -0.06507205,
          0.05761032, -0.04644955],
        [-0.01860887,  0.07449695,  0.01216679, ..., -0.08233895,
          0.03951284, -0.01336469],
        ...,
        [ 0.05731193,  0.0853012 ,  0.00741141, ..., -0.05495796,
         -0.06891952,  0.03709858],
        [ 0.18065713,  0.16675375,  0.13660309, ..., -0.09641556,
         -0.16312271,  0.15392897],
        [ 0.1522331 ,  0.15813637,  0.06885725, ..., -0.06363111,
         -0.03831554,  0.09995979]], dtype=float32)]

In [55]:
# Confirm the container type returned by get_weights().
type(embeddings.get_weights())

list

In [57]:
# Number of weight arrays held by the Embedding layer.
len(embeddings.get_weights())

1

In [58]:
# Take the first (and, per the length check, only) array returned by get_weights():
# the embedding matrix.
weights = embeddings.get_weights()[0]

In [59]:
# Shape of the embedding matrix; the recorded output is (10000, 16).
weights.shape

(10000, 16)

In [64]:
# Display the embedding matrix (NumPy abbreviates the repr of large arrays with "...").
weights

array([[-0.03950284,  0.00981644,  0.02337485, ..., -0.02010072,
         0.05551388,  0.0274117 ],
       [-0.03535434, -0.02995031,  0.03202409, ..., -0.06507205,
         0.05761032, -0.04644955],
       [-0.01860887,  0.07449695,  0.01216679, ..., -0.08233895,
         0.03951284, -0.01336469],
       ...,
       [ 0.05731193,  0.0853012 ,  0.00741141, ..., -0.05495796,
        -0.06891952,  0.03709858],
       [ 0.18065713,  0.16675375,  0.13660309, ..., -0.09641556,
        -0.16312271,  0.15392897],
       [ 0.1522331 ,  0.15813637,  0.06885725, ..., -0.06363111,
        -0.03831554,  0.09995979]], dtype=float32)

In [61]:
# index -> word lookup provided by the fitted tokenizer.
reverse_word_index = tokenizer.index_word
# Show only the first few (word, index) pairs instead of dumping the whole
# dictionary into the notebook output.
print(list(word_index.items())[:10])



In [62]:
# Show only the first few (index, word) pairs instead of dumping the whole
# dictionary into the notebook output.
print(list(reverse_word_index.items())[:10])



In [63]:
import io
# Open the two UTF-8 output files for writing: vecs.tsv receives the embedding
# vectors and meta.tsv the matching words (both are filled by the export loop).
out_v = open('vecs.tsv', mode='w', encoding='utf-8')
out_m = open('meta.tsv', mode='w', encoding='utf-8')

In [65]:
# Export one line per vocabulary entry: the word goes to meta.tsv (out_m) and
# its tab-separated embedding vector to vecs.tsv (out_v).
# Using the already-open handles as context managers guarantees that both
# files are closed even if the loop raises part-way through.
with out_v, out_m:
  # The loop starts at 1; index 0 only shows up as padding in `padded`.
  for word_num in range(1, vocab_size):
    word = reverse_word_index.get(word_num)
    if word is None:
      # The tokenizer saw fewer distinct words than vocab_size: nothing to
      # export for this index (skipping keeps both files line-aligned).
      continue
    # Named `word_vector` rather than `embeddings` so the Embedding layer bound
    # to `embeddings` in an earlier cell is not overwritten by a NumPy row.
    word_vector = weights[word_num]
    out_m.write(word + '\n')
    out_v.write('\t'.join(str(x) for x in word_vector) + '\n')