<a href="https://colab.research.google.com/github/shstreuber/AI/blob/main/Week5_MovieReviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

**Notebook to classify IMDB movie reviews**

This notebook classifies IMDB movie reviews as either positive (thumbs up) or negative (thumbs down).  Classifying texts in this way is called *Sentiment Mining*.

The notebook comes from the Coursera class **Natural Language Processing in Tensorflow**.

Adapted by Jung Hee Kim and Michael Glass.

In [1]:
# TensorFlow provides the Keras layers and model API used later in this notebook.
import tensorflow as tf
# Print the installed version so the results can be tied to a specific release.
print(tf.__version__)

# Uncomment the next line if the tensorflow_datasets package is not installed.
# !pip install -q tensorflow-datasets

2.18.0


In [None]:
# Load the IMDB movie review dataset from the TensorFlow Datasets collection.
# The collection takes care of downloading the data and exposing it as
# ready-to-iterate datasets.
#
import tensorflow_datasets as tfds

imdb, info = tfds.load(
    "imdb_reviews",
    with_info=True,       # also return the metadata object describing the dataset
    as_supervised=True,   # each example is a (review text, label) pair
)


Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.MRY02V_1.0.0/imdb_reviews-train.tfrecor…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.MRY02V_1.0.0/imdb_reviews-test.tfrecord…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
# Split the dataset into training and testing portions and pull every example
# out of TensorFlow into plain Python objects.
#   * Each review becomes one long Python string holding the whole review.
#   * Each sentiment label is a number, 0 or 1.
#   * The labels are finally packed into NumPy arrays for training.
#
import numpy as np

train_data, test_data = imdb['train'], imdb['test']


def _collect_examples(dataset):
    """Return (sentences, labels) lists gathered from a dataset of (text, label) pairs.

    Each text tensor is converted to bytes with .numpy() and then decoded as
    UTF-8 to obtain a Python str; each label tensor is converted with .numpy().
    """
    sentences, labels = [], []
    for text_tensor, label_tensor in dataset:
        sentences.append(text_tensor.numpy().decode('utf8'))
        labels.append(label_tensor.numpy())
    return sentences, labels


training_sentences, training_labels = _collect_examples(train_data)
testing_sentences, testing_labels = _collect_examples(test_data)

training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)


In [None]:
# Tokenize the reviews: convert each review to a sequence of numbers
#   (where each number represents a different word in the vocabulary).
#
# It will use a tokenizer object.
#
vocab_size = 10000    # Size of the vocabulary: only word numbers below this
                      #   value are kept when reviews are converted to tokens
oov_tok = "<OOV>"     # Words outside that vocabulary are replaced by this fake
                      #   word. "OOV" means "out of vocabulary"
embedding_dim = 16    # When word embeddings are computed, this is the length
                      #   of the vector for each word.
max_length = 120      # Movie reviews will be truncated to max 120 tokens,
trunc_type='post'     #   by chopping off the tail ends.

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# The tokenizer object will do most of the conversion from a single movie
# review string to a list of numbers.
#
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
#
# This tells the tokenizer to build a vocabulary list from our training
#  data. It counts how often each word occurs and numbers the words by
#  frequency. The vocab_size limit is applied later, when texts are
#  converted to sequences.
#
tokenizer.fit_on_texts(training_sentences)
#
#  This is a dictionary saying which number was assigned to each word
word_index = tokenizer.word_index
#
#  This will convert the text of a review to a list of numbers (tokens),
#    one for each word. Do this on the training data.
sequences = tokenizer.texts_to_sequences(training_sentences)
#
#  Cut each review to max_length tokens, or pad out the short ones with zeros.
padded = pad_sequences(sequences,maxlen=max_length, truncating=trunc_type)

#  Now do the same for the testing data. Convert to tokens and make all
#  the same length.  Notice that it is using the same word-numbering that
#  came from the training data. (New words will be replaced by the OOV fake word.)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
#  The truncation mode must be passed here as well: pad_sequences defaults to
#  truncating='pre', which would keep the END of long test reviews while the
#  training reviews above keep their BEGINNING.
testing_padded = pad_sequences(testing_sequences,maxlen=max_length, truncating=trunc_type)


In [None]:
#
# Build the inverse of word_index (number -> word) so that a list of
# token-numbers can be turned back into readable words.
reverse_word_index = {token_id: word for word, token_id in word_index.items()}

def decode_review(text):
    """Turn a sequence of token-numbers back into a space-separated string.

    Numbers that have no entry in reverse_word_index are shown as '?'.
    """
    words = (reverse_word_index.get(token_id, '?') for token_id in text)
    return ' '.join(words)

# You can print out individual reviews here: first the decoded token-list,
#  then the original review it came from.  Notice that the zero-padding of
#  short reviews was put at the beginning (represented by ?)
print(decode_review(padded[3]))   # Print review #3
print(training_sentences[3])

In [None]:
# Now create a neural network for classification.
# Input is a token-sequence of numbers, max_length numbers for each review.
# Output is a single number between 0 and 1 (the sigmoid of the last neuron);
#  the training labels it is compared against are 0 or 1.
#
model = tf.keras.Sequential([
    # Declare the input shape explicitly.  The Embedding layer's old
    #  input_length argument is deprecated in Keras 3, so without this line
    #  the model would still be unbuilt when model.summary() is called below
    #  and the summary would show no output shapes or parameter counts.
    tf.keras.Input(shape=(max_length,)),
    # Embedding layer will convert each number to a vector.  The tensorflow
    #  Embedding layer is trained at the same time the neural network is trained,
    #  from your training data.  It adjusts the embedding vectors for each word
    #  to achieve better classification accuracy, similar to how it adjusts
    #  the weights of neurons. It is possible to put other embedding layers
    #  here that are pre-trained, for example LSA or Word2Vec.
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    #
    # Output is one embedding_dim vector per token,
    #   a max_length x embedding_dim 2D matrix for each review.
    #  Here we flatten it to 1D for input to the hidden layer.
    tf.keras.layers.Flatten(),
    # Standard hidden layer
    tf.keras.layers.Dense(6, activation='relu'),
    # Output layer, one neuron for a 2-class classifier.
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()


In [None]:
# Train the model on the padded training reviews.  The testing data is passed
# as validation data, so it is evaluated at the end of every training epoch.
#
num_epochs = 10
model.fit(
    padded,
    training_labels_final,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels_final),
)

In [None]:
# If you are curious about the embedding, this code will extract the trained
# embedding as a 2-D matrix, one row for each different word in the vocabulary.
#
#e = model.layers[0]
#weights = e.get_weights()[0]
#print(weights.shape) # shape: (vocab_size, embedding_dim)

In [None]:
# Further, if you want to see the embedding, this code will store the
# embedding as two .tsv spreadsheet files.  One file contains only
# the embedding vectors, one per row. The other file contains only
# the words, one per row.
#import io
#
#out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
#out_m = io.open('meta.tsv', 'w', encoding='utf-8')
#for word_num in range(1, vocab_size):
#  word = reverse_word_index[word_num]
#  embeddings = weights[word_num]
#  out_m.write(word + "\n")
#  out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
#out_v.close()
#out_m.close()

In [None]:
#  And this cell will prompt you to download and save the files.
#  You can upload and visualize them at http://projector.tensorflow.com
#
#  (You can also visualize the built-in embeddings that are already on that web site,
#   which are more interesting because they are general purpose.)
#
#try:
#  from google.colab import files
#except ImportError:
#  pass
#else:
#  files.download('vecs.tsv')
#  files.download('meta.tsv')

In [None]:
# Practice tokenizing sentences, using your tokenizer object that was trained
#  on the movie review data.
#
sentence = "I really think this is amazing. honest."
sequence = tokenizer.texts_to_sequences([sentence])
print(sequence)
print(tokenizer.word_index)