In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import re

In [None]:
# Load the IMDB reviews CSV into a DataFrame and preview its first rows.
DATA_PATH = "../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Row count of the loaded DataFrame.
len(df)

In [None]:
def remove_html_tags(text):
    """Replace every HTML-like tag in ``text`` with a single space.

    A tag is the shortest run that starts with ``<`` and ends with ``>``
    without crossing a line break (``.`` does not match newlines here).

    Args:
        text: The string to clean.

    Returns:
        ``text`` with each matched tag substituted by one space character.
    """
    return re.sub('<.*?>', ' ', text)

def remove_between_square_brackets(text):
    """Delete every ``[...]`` group from ``text``, brackets included.

    A group runs from a ``[`` to the next ``]`` and may span line breaks.
    An opening bracket with no closing bracket after it is left untouched.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all bracketed groups removed (nothing is inserted in
        their place).
    """
    # Raw string on purpose: in a normal literal '\[' and '\]' are invalid
    # escape sequences, which newer Python versions warn about.
    return re.sub(r'\[[^]]*\]', '', text)

def remove_special_characters(text, remove_digits=True):
    """Strip non-alphanumeric characters, collapse whitespace and lowercase.

    Steps, in order:
      1. Delete every character that is not an ASCII letter, whitespace or
         (only when ``remove_digits`` is False) an ASCII digit.
      2. Replace each run of two or more whitespace characters with a single
         space. A lone whitespace character (including a tab or newline) is
         kept as is.
      3. Lowercase the result.

    Args:
        text: The string to clean.
        remove_digits: When True (the default) digits are deleted together
            with the other special characters; when False they are kept.

    Returns:
        The cleaned, lowercased string. Leading/trailing whitespace is not
        stripped.
    """
    # The letter range must be 'A-Z': 'A-z' also spans the ASCII characters
    # between 'Z' and 'a' ([, \, ], ^, _, `), which would then be kept.
    if remove_digits:
        pattern = r'[^a-zA-Z\s]'
    else:
        pattern = r'[^a-zA-Z0-9\s]'
    text = re.sub(pattern, '', text)
    # Deleting characters can leave consecutive spaces behind; squeeze them.
    text = re.sub(r'\s\s+', ' ', text)
    return text.lower()

def preprocess_review(text):
    """Run the full cleaning pipeline on one raw review.

    The review is passed, in this order, through ``remove_html_tags``,
    ``remove_between_square_brackets`` and ``remove_special_characters``
    (called with its default arguments).

    Args:
        text: The raw review string.

    Returns:
        The cleaned review string produced by the last step.
    """
    cleaning_steps = (
        remove_html_tags,
        remove_between_square_brackets,
        remove_special_characters,
    )
    for step in cleaning_steps:
        text = step(text)
    return text

In [None]:
# Show one raw review (row position 666) before any cleaning.
df['review'].iloc[666]

In [None]:
# Same review (row position 666) after the cleaning pipeline, for comparison.
preprocess_review(df['review'].iloc[666])

In [None]:
# Clean every review in place; the raw text in `df['review']` is overwritten.
df['review'] = df['review'].apply(preprocess_review)

In [None]:
# Encode the string labels as uint8: 1 where the label equals "positive",
# 0 for anything else.
#
# The dtype guard makes the cell safe to re-run: once the column is numeric,
# comparing it with "positive" again would silently turn every label into 0.
# NOTE(review): assumes `sentiment` is the last column of the CSV, so the
# in-place assignment yields the same column order as the previous
# add/drop/rename sequence — confirm against the data file.
if not pd.api.types.is_numeric_dtype(df["sentiment"]):
    df["sentiment"] = (df["sentiment"] == "positive").astype("uint8")

In [None]:
# Preview the first rows after cleaning the reviews and encoding the labels.
df.head()

### Split the data

In [None]:
# Only the first 5000 rows (by position) are used as features and labels.
X = df['review'].iloc[:5000]
y = df['sentiment'].iloc[:5000]


In [None]:
from sklearn.model_selection import train_test_split

# First hold out 10% of the samples as the test set, then split 10% of the
# remainder off as the validation set. Both splits use the same fixed seed.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Shapes of the features and labels of each split, to check the split sizes.
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

In [None]:
# Wrap each (features, labels) split in a tf.data.Dataset of element pairs.
train_dataset, val_dataset, test_dataset = (
    tf.data.Dataset.from_tensor_slices((features, labels))
    for features, labels in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

In [None]:
# Print the text and the label of the first element of the training dataset.
for example, label in train_dataset.take(1):
    print('text: ', example.numpy())
    print('label: ', label.numpy())

In [None]:
# Number of examples per batch of the training dataset.
BATCH_SIZE = 32

In [None]:
# Training data: shuffle with a 5000-element buffer, then batch and prefetch.
# NOTE: this cell reassigns the same names, so running it a second time would
# batch the already-batched datasets again.
train_dataset = (
    train_dataset
    .shuffle(5000)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Evaluation data keeps its order and uses batches of 500.
val_dataset = (
    val_dataset
    .batch(500)
    .prefetch(tf.data.AUTOTUNE)
)
test_dataset = (
    test_dataset
    .batch(500)
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
# Print the first three texts and labels of one training batch.
# `example` and `label` stay bound after the loop; later cells reuse `example`.
for example, label in train_dataset.take(1):
    print('texts: ', example.numpy()[:3])
    print()
    print('labels: ', label.numpy()[:3])

In [None]:
# Upper bound on the vocabulary size kept by the text vectorizer.
VOCAB_SIZE = 2000

# `TextVectorization` has a stable home at `tf.keras.layers`; the
# `tf.keras.layers.experimental.preprocessing` path is a legacy alias that
# recent Keras releases no longer provide. Prefer the stable location and
# fall back to the experimental one on older TensorFlow versions.
try:
    TextVectorization = tf.keras.layers.TextVectorization
except AttributeError:
    TextVectorization = tf.keras.layers.experimental.preprocessing.TextVectorization

encoder = TextVectorization(max_tokens=VOCAB_SIZE)
# Build the vocabulary from the training texts only (the labels are dropped).
encoder.adapt(train_dataset.map(lambda text, label: text))

In [None]:
# Vocabulary held by the encoder, as a NumPy array; show its first 50 entries.
vocab = np.array(encoder.get_vocabulary())
vocab[:50]

In [None]:
# Vectorize the batch of texts left in `example` by the earlier preview loop
# and keep the first three encoded rows as a NumPy array.
encoded_example = encoder(example)[:3].numpy()
encoded_example

In [None]:
# Compare the first text of the batch with the vocabulary entries its encoded
# ids map back to.
n = 0
print("Original: ", example[n].numpy())
print("\nRound-trip: ", " ".join(vocab[encoded_example[n]]))
print()

### Create model

In [None]:
# Binary text classifier:
# raw strings -> token ids -> embeddings -> two stacked BiLSTMs -> dense head.
model = tf.keras.Sequential()

# Text vectorization is part of the model, so it accepts raw strings.
model.add(encoder)
model.add(
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=128,
        # Use masking to handle the variable sequence lengths
        mask_zero=True,
    )
)

# The first recurrent layer returns the full sequence so that the second one
# can consume it; the second returns only its final output.
model.add(
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(128, return_sequences=True)
    )
)
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))

# Dense head ending in a single sigmoid unit.
model.add(tf.keras.layers.Dense(256, activation='relu'))
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

In [None]:
# Checkpointing: write weights only, and only when validation accuracy
# reaches a new maximum.
checkpoint_filepath = '/tmp/checkpoint'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
)

# Early stopping watches validation loss (not accuracy) with a patience of
# 3 epochs, and restores the best weights when it stops training.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=3, restore_best_weights=True
)

In [None]:
# Train for at most 10 epochs, validating after each one; the callbacks are
# passed in the same order as before (early stopping, then checkpointing).
training_callbacks = [early_stop, model_checkpoint_callback]
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10,
    callbacks=training_callbacks,
)

In [None]:
# Three hand-written reviews used as a quick sanity check of the trained model.
examples = [
  "The movie was great, i would recommend this movie to my family, the storyline is amazing",
  "The movie was meh, not great not terrible, but there is something off in the plot, but still this movie have great CGI effect",
  "The movie was terrible, dont watch this movie, dont waste your time watching these movies!"
]

# The raw strings go straight into the model, whose first layer is the text
# encoder; the model's last layer is a single sigmoid unit.
model.predict(examples)