In [None]:
import tensorflow as tf
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
import os, re
from tensorflow.keras.utils import plot_model
import matplotlib.pyplot as plt
from tqdm import tqdm

# ---------------------------------------------------------------------------
# Environment checks: report whether TensorFlow can see at least one physical
# GPU, and which TensorFlow version is installed.
# ---------------------------------------------------------------------------
if tf.config.list_physical_devices('GPU'):
    print('GPU is used.')
else:
    print('GPU is NOT used.')
print("Tensorflow version: " + tf.__version__)


# Load the IMDB movie review sentiment data


In [None]:
# Download the IMDB archive into the Keras cache; extract=True also unpacks it.
data_url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
zip_path = keras.utils.get_file(
    fname="aclImdb_v1.tar.gz",
    origin=data_url,
    cache_subdir="datasets/aclImdb_v1/",
    extract=True,
)

# The code below assumes the extracted aclImdb/ tree sits in the same directory
# as the path returned by get_file.
archive_dir = os.path.dirname(zip_path)
train_folder_pos = archive_dir + "/aclImdb/train/pos/"
train_folder_neg = archive_dir + "/aclImdb/train/neg/"
test_folder_pos = archive_dir + "/aclImdb/test/pos/"
test_folder_neg = archive_dir + "/aclImdb/test/neg/"


## Read IMDB data

In [None]:
def add_spaces_to_punctuation(string):
    """Return ``string`` with a space inserted on both sides of each symbol.

    A "symbol" is any single character that is neither a word character
    (letters, digits, underscore) nor whitespace. Existing whitespace is left
    untouched, so a symbol that was already followed by a space ends up
    followed by two.
    """
    symbol = re.compile(r"([^\w\s])")
    return symbol.sub(r" \1 ", string)


def read_files_in_directory(directory):
    """Read every file below ``directory`` and return the normalised texts.

    Each file is decoded as UTF-8, passed through
    ``add_spaces_to_punctuation`` and lower-cased.

    Args:
        directory: Root folder to walk recursively.

    Returns:
        A list with one string per file, ordered by sorted directory and file
        name. A missing or empty directory yields an empty list.

    Raises:
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    docs = []
    for root, dirs, files in os.walk(directory):
        # os.walk lists entries in filesystem order; sorting (in place for
        # `dirs`, which also steers the descent) makes the document order
        # reproducible across machines.
        dirs.sort()
        for file_name in sorted(files):
            file_path = os.path.join(root, file_name)
            # Explicit encoding: without it open() falls back to the platform
            # default, which is not UTF-8 everywhere.
            with open(file_path, "r", encoding="utf-8") as file:
                file_contents = add_spaces_to_punctuation(file.read())
            docs.append(file_contents.lower())
    return docs


# Load the four review sets (train/test x positive/negative). Each one is a
# list of strings as returned by read_files_in_directory.
train_pos, train_neg, test_pos, test_neg = (
    read_files_in_directory(folder)
    for folder in (
        train_folder_pos,
        train_folder_neg,
        test_folder_pos,
        test_folder_neg,
    )
)


# Data preprocessing

We first build the vocabulary from the training reviews, keeping the 20,000 most frequent tokens.



In [None]:
# Count how often each token occurs in the training reviews.
# NOTE: splitting on a single space yields an empty-string token wherever two
# spaces are adjacent (add_spaces_to_punctuation pads symbols with spaces), so
# "" is counted like any other token and can end up in the vocabulary.
vocab_size = 20000
token_counts = {}
for review in train_pos + train_neg:
    for token in review.split(" "):
        token_counts[token] = token_counts.get(token, 0) + 1

# Keep the `vocab_size` most frequent tokens. The sort is stable, so tokens
# with equal counts stay in first-seen order.
vocab = sorted(token_counts, key=token_counts.get, reverse=True)[:vocab_size]

# Both lookup layers share the same special tokens.
special_tokens = dict(oov_token="[UNK]", mask_token="[ZERO]")
# Mapping tokens to integers
token_to_num = keras.layers.StringLookup(vocabulary=vocab, **special_tokens)
# Mapping integers back to original tokens
num_to_token = keras.layers.StringLookup(
    vocabulary=token_to_num.get_vocabulary(),
    invert=True,
    **special_tokens,
)

# From here on vocab_size is the size reported by the lookup layer (which,
# per the StringLookup docs, includes the mask and OOV entries).
vocab_size = token_to_num.vocabulary_size()
print(f"The size of the vocabulary ={vocab_size}")
print("Top 20 tokens in the vocabulary: ", token_to_num.get_vocabulary()[:20])


In [None]:
def docs_to_nums(docs):
    """Convert each document into a tensor of vocabulary indices.

    Every document is split on single spaces with ``tf.strings.split`` and the
    resulting tokens are mapped through the ``token_to_num`` lookup layer. A
    tqdm progress bar tracks the loop.

    Args:
        docs: Iterable of document strings.

    Returns:
        A list with one lookup result per document, in input order.
    """
    return [token_to_num(tf.strings.split(doc, sep=" ")) for doc in tqdm(docs)]


# Encode the reviews; positives come first, then negatives, in each split.
x_train = docs_to_nums(train_pos + train_neg)
x_test = docs_to_nums(test_pos + test_neg)

# Labels follow the same ordering: 1 for every positive review, 0 for every
# negative one.
y_train = np.array([1 for _ in train_pos] + [0 for _ in train_neg])
y_test = np.array([1 for _ in test_pos] + [0 for _ in test_neg])


# Build the model


In [None]:
def build_model(vocab_size, embedding_dim=128, lstm_units=64, mask_zero=False):
    """Build and compile a single-LSTM binary classifier over token-id sequences.

    The network is Embedding -> LSTM -> Dense(1, sigmoid). Both the LSTM and
    the Dense layer are created without bias terms.

    Args:
        vocab_size: Number of distinct token ids (Embedding input dimension).
        embedding_dim: Size of each embedding vector.
        lstm_units: Number of units in the LSTM layer.
        mask_zero: Forwarded to ``layers.Embedding``. The default ``False``
            embeds id 0 like any other token. Pass ``True`` to have Keras
            treat id 0 as padding and mask it in the LSTM.

    Returns:
        A ``keras.Model`` compiled with the Adam optimizer, binary
        cross-entropy loss and the accuracy metric.
    """
    # Input for variable-length sequences of integers
    inputs = keras.Input(shape=(None,), dtype="int32")
    # Embed each integer in an `embedding_dim`-dimensional vector
    x = layers.Embedding(vocab_size, embedding_dim, mask_zero=mask_zero)(inputs)
    # One unidirectional LSTM; return_sequences is left at its default, so
    # only the last output is handed to the classifier.
    x = layers.LSTM(lstm_units, use_bias=False)(x)
    # Add a classifier
    outputs = layers.Dense(1, activation="sigmoid", use_bias=False)(x)
    model = keras.Model(inputs, outputs)
    model.compile("adam", "binary_crossentropy", metrics=["accuracy"])
    return model


# Train and evaluate the model


In [None]:
# First run: bring every sequence to `maxlen` ids, with the padding appended
# after the tokens ("post").
maxlen = 500
x_train_padded, x_test_padded = (
    keras.utils.pad_sequences(sequences, maxlen=maxlen, padding="post")
    for sequences in (x_train, x_test)
)

# Build the model for this run and show its layers.
model = build_model(vocab_size)
model.summary()
plot_model(model, show_shapes=True)


In [None]:
# Train on the padded sequences prepared in the previous cell; the test split
# is passed as validation data.
history1 = model.fit(
    x=x_train_padded,
    y=y_train,
    validation_data=(x_test_padded, y_test),
    epochs=30,
    batch_size=32,
)
# Loss and accuracy of the trained model on the training data itself.
model.evaluate(x=x_train_padded, y=y_train, batch_size=32)


In [None]:
# Second run: same pipeline, but the padding is placed before the tokens
# ("pre"). This overwrites x_train_padded, x_test_padded and model.
maxlen = 500
x_train_padded, x_test_padded = (
    keras.utils.pad_sequences(sequences, maxlen=maxlen, padding="pre")
    for sequences in (x_train, x_test)
)

# Fresh model for this run.
model = build_model(vocab_size)
model.summary()
# NOTE: plot_model is not the last expression of this cell, so whatever it
# returns is discarded here.
plot_model(model, show_shapes=True)

# Same training settings as the first run; the test split is passed as
# validation data.
history2 = model.fit(
    x=x_train_padded,
    y=y_train,
    validation_data=(x_test_padded, y_test),
    epochs=30,
    batch_size=32,
)

# Loss and accuracy of the trained model on the training data itself.
model.evaluate(x=x_train_padded, y=y_train, batch_size=32)

In [None]:
def learning_plots(history):
    """Plot the recorded loss and accuracy curves side by side.

    The left panel shows the ``loss`` and ``val_loss`` series; the right panel
    shows every series whose name contains ``accuracy``. Epochs are numbered
    from 1 on the x axis.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value sequences (e.g. the return value of ``model.fit``).
    """
    metrics = history.history
    # Explicit Axes objects instead of pyplot's implicit "current axes".
    _, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(15, 4))

    panels = (
        (loss_ax, 'Training and validation loss', 'Loss',
         lambda name: name == 'loss' or name == 'val_loss'),
        (accuracy_ax, 'Training and validation accuracy', 'Accuracy',
         lambda name: 'accuracy' in name),
    )
    for ax, title, ylabel, is_selected in panels:
        has_curve = False
        for name, values in metrics.items():
            if is_selected(name):
                ax.plot(range(1, len(values) + 1), values, label=name)
                has_curve = True
        ax.set_title(title)
        ax.set_xlabel('Epochs')
        ax.set_ylabel(ylabel)
        # Only draw a legend when at least one curve was plotted, which
        # avoids matplotlib's "no artists with labels" warning.
        if has_curve:
            ax.legend()
    plt.show()


