In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Default hyperparameters shared by the tokenizer, the model and training.

# Vocabulary size: passed to the Tokenizer (num_words) and the Embedding layer (input_dim).
SIZE_VOCAB = 20000

# Length of the embedding vector produced for each word.
EMBED_DIM = 128

# Number of tokens per review fed to the model; seq_pad() pads/cuts every review to this length.
MAX_SEQ_LEN = 200

# Number of training epochs.
EPOCH = 1

# Number of samples per gradient update during training.
BATCH_SIZE = 32

# Shared Functions

In [None]:
def seq_pad(raw_texts, tokenizer, max_seq_len=MAX_SEQ_LEN):
    """Convert raw texts into fixed-length integer sequences.

    The texts are first mapped to sequences of word indices by the given
    tokenizer, which must already have been fitted with fit_on_texts(). Each
    sequence is then brought to exactly max_seq_len entries by pad_sequences:
    shorter sequences get zeros appended at the end (post-padding), longer ones
    are cut down to max_seq_len using pad_sequences' default truncation side.

    Post-padding is chosen so that the CuDNN implementation of the RNN layer
    can be used, see
    https://keras.io/guides/understanding_masking_and_padding/#padding-sequence-data

    :param raw_texts: A list of raw texts, e.g. ['blablab bla', 'foo fooobar', ...]
    :param tokenizer: A Tokenizer instance on which fit_on_texts() has already been run.
    :param max_seq_len: Length of every returned sequence (number of tokens per text).
        Defaults to MAX_SEQ_LEN.
    :return: The padded sequences as returned by pad_sequences, one row per input text.
    """
    return keras.preprocessing.sequence.pad_sequences(
        tokenizer.texts_to_sequences(raw_texts),
        maxlen=max_seq_len,
        padding='post',
    )


def predict(reviews, model):
    """Print the predicted sentiment of each review.

    Every review is tokenized and padded via seq_pad() and then scored by the
    model. A score of 0.5 or more is reported as 'positive', anything below as
    'negative'. Results are printed one per line as `<review> --> <sentiment>`;
    nothing is returned.

    NOTE: this function reads the notebook-level `tokenizer` variable, so the
    tokenizer cell must have been run before calling it.

    :param reviews: A list of strings, each string is a movie review.
    :param model: The trained model.
    """
    scores = model.predict(seq_pad(reviews, tokenizer))

    # Label every score first, then print, so output only starts once all
    # reviews have been classified.
    sentiments = []
    for score in scores:
        sentiments.append('positive' if score >= 0.5 else 'negative')

    for review, sentiment in zip(reviews, sentiments):
        print(review, '-->', sentiment)

# Preprocess The Dataset

In [None]:
# Load the IMDB movie-review dataset; later cells read its `review` and `sentiment` columns.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv',
)

In [None]:
# Encode the sentiment column as integers: 1 for 'positive', 0 for everything else.
labels = (df.sentiment == 'positive').to_numpy().astype(int)
reviews = df.review.to_numpy()

# Split the data 50/50 into training and validation sets, keeping the
# positive/negative ratio equal in both halves (stratify) and fixing the seed
# so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    reviews, labels, test_size=0.5, stratify=labels, random_state=42
)

In [None]:
# Build the word index from the training reviews only; the vocabulary is
# capped via num_words=SIZE_VOCAB.
# NOTE: this cell takes a while to run, so avoid re-running it unnecessarily.
tokenizer = keras.preprocessing.text.Tokenizer(
    num_words=SIZE_VOCAB,
)
tokenizer.fit_on_texts(texts=X_train)

# Create Model

In [None]:
model = keras.Sequential()
# NOTE: mask_zero is deliberately left disabled on the embedding layer. If it
# is set to True, we encounter this error:
#   CancelledError:  [_Derived_]RecvAsync is cancelled.
# This is apparently an ongoing issue with Keras, refer to these two issues:
#   https://github.com/tensorflow/tensorflow/issues/33721
#   https://github.com/tensorflow/tensorflow/issues/45594
# Downgrading tensorflow to 1.14 is suggested there as a fix. While the
# downgrade itself works on Kaggle, GPU could not be enabled with 1.14 even
# after installing tensorflow-gpu, so that supposed fix is not usable here.
model.add(
    layers.Embedding(
        input_dim=SIZE_VOCAB,
        output_dim=EMBED_DIM,
        # mask_zero=True,  # disabled, see NOTE above
        input_length=MAX_SEQ_LEN,
    ),
)
# Two stacked bidirectional LSTMs, following
# https://keras.io/examples/nlp/bidirectional_lstm_imdb/
# The first one returns the full sequence so the second one has a sequence to read.
model.add(
    layers.Bidirectional(layers.LSTM(64, return_sequences=True))
)
model.add(layers.Bidirectional(layers.LSTM(32)))
# Binary classification head. The sigmoid is required: the model is compiled
# with loss="binary_crossentropy", which by default expects probabilities in
# [0, 1] rather than raw logits, and predict() thresholds the output at 0.5.
model.add(layers.Dense(1, activation="sigmoid"))

model.summary()

# Compile and Train Model

In [None]:
# Adam optimizer with binary cross-entropy loss; accuracy is reported as the metric.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Tokenize and pad both splits up front so the fit() call below stays readable.
train_sequences = seq_pad(X_train, tokenizer)
val_sequences = seq_pad(X_val, tokenizer)

# Must turn on GPU to speed things up: without GPU each epoch takes about
# 25 min to train, with GPU about 1 min. HUGE difference.
model.fit(
    x=train_sequences,
    y=y_train,
    validation_data=(val_sequences, y_val),
    batch_size=BATCH_SIZE,
    epochs=EPOCH,
    # Early stopping is currently disabled; uncomment to enable it.
    # callbacks=[
    #     keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3, min_delta=0.01)
    # ],
)

# Examples

In [None]:
# Hand-written sample reviews used to try out the trained model. They range
# from single words and short phrases to multi-sentence paragraphs, and end
# with pairs of "love"/"hate" texts that are identical apart from that word.
test_reviews = [
    'this movie sucks',
    'this movie is great',
    'The movie is a flop',
    'I recommend it to my friends',
    'I will ask my friends to watch it if I want them to suffer',
    'This movie is so bad that words cannot express how terrible it is.',
    'I love it',
    'Like it',
    'Adore it',
    'Amazing',
    'Terrible',
    'I hate it',
    'Disgusting',
    'This is obviously a good movie that everyone should see. It captures the essense of a happy life and presents it with great artistic touch. The director uses great and beautiful shots to convey a sense of emptiness when a person is in a state of total tranquility.',
    'I will not recommend this movie because the shots are simply too shaky. It is okay to use shaky cam if it serves a purpose, such as in the Bourne movie, the first one to be specific. However, when shaky cam is used for the sake of shaky cam because it seems like the audience like it, then it is totally pointless.',
    'I love it. I actually love the movie. I think I love it. I have no reason not to love it. Loving the movie is what I am feeling right now',
    'I hate it. I actually hate the movie. I think I hate it. I have no reason not to hate it. Hating the movie is what I am feeling right now',
    'I love it. I love it. I love it. I love it. I love it. I love it. I love it. I love it. I love it. I love it. I love it. I love it. I love it.',
    'I hate it. I hate it. I hate it. I hate it. I hate it. I hate it. I hate it. I hate it. I hate it. I hate it. I hate it. I hate it. I hate it.',
]
# Prints each review followed by '-->' and its predicted sentiment.
predict(test_reviews, model)

# Average Length of Reviews

In [None]:
# Pair every training review with its label so reviews can be filtered by sentiment.
train_df = pd.DataFrame({'review': X_train, 'sentiment': y_train})

# Report the mean number of words per review for each class.
# str.split() with no argument splits on any run of whitespace, so repeated
# spaces, tabs and newlines do not produce empty "words" that would inflate
# the count (which split(' ') does).
for sentiment_name, sentiment_value in (('positive', 1), ('negative', 0)):
    class_reviews = train_df.loc[train_df.sentiment == sentiment_value, 'review']
    print(
        f'Average {sentiment_name} review length',
        np.mean([len(r.split()) for r in class_reviews]),
    )

# Poke The Model More

In [None]:
# Variants of a single positive review: the first five entries each append one
# more sentence to the previous entry, and the last three reuse an earlier
# prefix with a different final sentence. Presumably meant to probe how review
# length and small wording changes affect the prediction.
poking_reviews = [
    'This movie is so good that words cannot express how amazing it is.',
    'This movie is so good that words cannot express how amazing it is. To begin with, it has good characters development.',
    'This movie is so good that words cannot express how amazing it is. To begin with, it has good characters development. Each character has a full arc and their decisions are in-line with their personalities.',
    'This movie is so good that words cannot express how amazing it is. To begin with, it has good characters development. Each character has a full arc and their decisions are in-line with their personalities. Second, the acting is on-point.',
    'This movie is so good that words cannot express how amazing it is. To begin with, it has good characters development. Each character has a full arc and their decisions are in-line with their personalities. Second, the acting is on-point. Everyone, from the main characters to the supporting cast, does a great job such that you do not even realize that they are acting',
    'This movie is so good that words cannot express how amazing it is. To begin with, it has good characters development. Each character has a full arc and their decisions are in-line with their personalities. Second, the acting is on-point. Third, the screenplay is good',
    'This movie is so good that words cannot express how amazing it is. To begin with, it has good characters development. Each character has a full arc and their decisions are in-line with their personalities. Second, the acting is on-point. Third, the screenplay is very good',
    'This movie is so good that words cannot express how amazing it is. To begin with, it has good characters development. Each character has a full arc and their decisions are in-line with their personalities. Second, the acting is good',
]
# Prints each review followed by '-->' and its predicted sentiment.
predict(poking_reviews, model)