In this notebook we will demonstrate different text classification models trained using the IMDB reviews dataset. 

In [1]:
# To install only the requirements of this notebook, uncomment the lines below and run this cell.
# The pinned commands are kept inside a bare string literal so nothing is installed by default;
# since that string is the cell's last expression, Jupyter echoes it back as the cell output.
# NOTE(review): tensorflow==1.14.0 looks older than what the rest of this notebook relies on
# (tf.config.list_physical_devices and tensorflow_datasets are used further down), and
# tensorflow_datasets is missing from the pinned list -- confirm the pins before using them.

# ===========================
"""
!pip install numpy==1.19.5
!pip install wget==3.2
!pip install tensorflow==1.14.0
"""

# Unpinned alternative that also installs tensorflow_datasets:
#!pip install numpy wget tensorflow tensorflow_datasets

# ===========================

'\n!pip install numpy==1.19.5\n!pip install wget==3.2\n!pip install tensorflow==1.14.0\n'

In [3]:
#Make the necessary imports
import os
import sys
import numpy as np
import tarfile
import wget
import warnings
warnings.filterwarnings("ignore") 
from zipfile import ZipFile
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.initializers import Constant

Here we set the paths to the external resources used in this notebook: the pre-trained [GloVe](https://nlp.stanford.edu/projects/glove/) word embeddings and the [IMDB reviews dataset](http://ai.stanford.edu/~amaas/data/sentiment/).

In [5]:
#!pip install tensorflow_datasets

In [6]:
from tensorflow.keras.datasets import imdb

# Vocabulary cap handed to the Keras loader.
MAX_WORDS = 10000

# Fetch the Keras copy of IMDb; it arrives as a (train, test) pair of (data, labels) tuples.
keras_train_split, keras_test_split = imdb.load_data(num_words=MAX_WORDS)
train_data, train_labels = keras_train_split
test_data, test_labels = keras_test_split

# Reviews in this copy are already encoded as sequences of word indices,
# so this prints a list of integers rather than text.
print(train_data[0])

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]


In [7]:
# Root of the local cache that holds the pre-trained word vectors.
BASE_DIR = '../../layer-cake/.vector_cache/'

# Root of the on-disk IMDB dataset.
DATA_DIR = '../datasets/IMDB/'

GLOVE_DIR = os.path.join(BASE_DIR, 'GloVe')

# Use os.path.join rather than string concatenation: DATA_DIR already ends with a
# separator, so DATA_DIR + '/train' produced '../datasets/IMDB//train'.
TRAIN_DATA_DIR = os.path.join(DATA_DIR, 'train')
TEST_DATA_DIR = os.path.join(DATA_DIR, 'test')

print("GLOVE_DIR: ", GLOVE_DIR)
print("TRAIN_DATA_DIR: ", TRAIN_DATA_DIR)
print("TEST_DATA_DIR: ", TEST_DATA_DIR)

GLOVE_DIR:  ../../layer-cake/.vector_cache/GloVe
TRAIN_DATA_DIR:  ../datasets/IMDB//train
TEST_DATA_DIR:  ../datasets/IMDB//test


In [8]:
# The train/ and test/ dataset directories each hold only a pos/ and a neg/ folder of text files.

# Hyper-parameters shared by every model in this notebook.
VALIDATION_SPLIT = 0.2         # fraction of the training reviews held out for validation
EMBEDDING_DIM = 100            # width of each word vector (the GloVe file used below is the 100d one)
MAX_NUM_WORDS = 20_000         # vocabulary cap passed to the tokenizer
MAX_SEQUENCE_LENGTH = 1_000    # every review is padded to this many tokens

# Starting points for this notebook:
#   https://github.com/keras-team/keras/blob/master/examples/pretrained_word_embeddings.py
#   https://github.com/keras-team/keras/blob/master/examples/imdb_lstm.py

### Loading and Preprocessing
 

In [9]:
# Original directory-based loader, kept for reference only. It is wrapped in a string
# literal, so none of it runs: it would walk the pos/ and neg/ sub-folders of a dataset
# directory and return parallel lists of review texts and 0/1 labels.
"""
def get_data(data_dir):
    texts = []  # list of text samples
    labels_index = {'pos':1, 'neg':0}  # dictionary mapping label name to numeric id
    labels = []  # list of label ids
    for name in sorted(os.listdir(data_dir)):
        path = os.path.join(data_dir, name)
        if os.path.isdir(path):
            if name=='pos' or name=='neg':
                label_id = labels_index[name]
                for fname in sorted(os.listdir(path)):
                        fpath = os.path.join(path, fname)
                        text = open(fpath,encoding='utf8').read()
                        texts.append(text)
                        labels.append(label_id)
    return texts, labels

train_texts, train_labels = get_data(TRAIN_DATA_DIR)
test_texts, test_labels = get_data(TEST_DATA_DIR)
"""

# Alias the Keras-loaded splits under the names the rest of the notebook uses.
# NOTE: at this point the "texts" are integer-encoded word-index sequences, not raw strings
# (see the printed sample below); the tensorflow_datasets cell that follows rebinds these
# names to decoded review text.
train_texts, train_labels = train_data, train_labels
test_texts, test_labels = test_data, test_labels

# Label name -> numeric id; its length is used as the size of the models' output layers.
labels_index = {'pos':1, 'neg':0} 

# Peek at the first training sample and the sample at index 24999 of the test split.
print("train_texts[0]:", train_texts[0])
print("train_labels[0]", train_labels[0])

print("test_texts[24999]:", test_texts[24999])
print("test_labels[24999]:", test_labels[24999])

train_texts[0]: [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
train_labels[0] 1
test_texts[24999]: [1, 6, 

In [10]:
import tensorflow_datasets as tfds


def _split_texts_and_labels(split):
    """Decode one supervised tfds split into parallel lists of texts and labels.

    The split is walked exactly once, so every review stays paired with the label
    yielded alongside it and the data is only decoded a single time.

    Args:
        split: a dataset of (text, label) pairs, as produced by
            tfds.load(..., as_supervised=True).

    Returns:
        (texts, labels): texts is a list of UTF-8 decoded strings; labels is the
        matching list of label values as yielded by tfds.as_numpy.
    """
    texts, labels = [], []
    for raw_text, label in tfds.as_numpy(split):
        texts.append(raw_text.decode('utf-8'))  # reviews arrive as bytes
        labels.append(label)
    return texts, labels


# Load the IMDb dataset
imdb_data = tfds.load("imdb_reviews", as_supervised=True)

# Split the data into train and test sets
train_data, test_data = imdb_data['train'], imdb_data['test']

# Extract the reviews (decoded from bytes) together with their labels.
train_texts, train_labels = _split_texts_and_labels(train_data)
test_texts, test_labels = _split_texts_and_labels(test_data)

# Now, train_texts and test_texts can be fed into a tokenizer.
print(train_texts[0])           # Example: First review in the training set
print(train_labels[0])          # Example: Label of the first review in the training set

print("test_texts[24999]:", test_texts[24999])
print("test_labels[24999]:", test_labels[24999])


2024-10-18 09:13:04.393163: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.
0
test_texts[24999]: They just don't make cartoons like they used to. This one had wit, great characters, and the greatest ensemble of voice over artists ever assembled for a daytime cartoon show. This still remains as one of the highest rated daytime cartoon shows, and one of the most hon

In [11]:
# Turn the review texts into lists of word indexes with the Keras Tokenizer.
# The vocabulary is learned from the training reviews only; the same fitted tokenizer
# then encodes both the training and the test reviews.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(train_texts)

# Word -> integer index mapping learned while fitting.
word_index = tokenizer.word_index

# One list of word indexes per review, for each split.
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(corpus) for corpus in (train_texts, test_texts)
)

print('Found %s unique tokens.' % len(word_index))

Found 88582 unique tokens.


In [23]:
# Sanity check of the tokenizer output for the training split: container type and size,
# then the type, length and contents of the first encoded review.
print("train_sequences:", type(train_sequences), len(train_sequences))              #This is a list of lists, one list for each review
print("train_sequences[0]:", type(train_sequences[0]), len(train_sequences[0]))     #This is a list of word indexes for the first review
print("train_sequences[0]:", train_sequences[0])                                    #This will print a list of word indexes (depends on the tokenizer)

train_sequences: <class 'list'> 25000
train_sequences[0]: <class 'list'> 115
train_sequences[0]: [11, 13, 32, 424, 391, 17, 89, 27, 10553, 8, 31, 1365, 3584, 39, 485, 11037, 196, 23, 84, 153, 18, 11, 212, 328, 27, 65, 246, 214, 8, 476, 57, 65, 84, 113, 97, 21, 5674, 11, 1321, 642, 766, 11, 17, 6, 32, 399, 8169, 175, 2454, 415, 1, 88, 1230, 136, 68, 145, 51, 1, 7576, 68, 228, 65, 2932, 15, 19499, 2903, 18510, 1478, 4939, 2, 38, 3899, 116, 1583, 16, 3584, 13, 161, 18, 3, 1230, 916, 7916, 8, 3, 17, 12, 13, 4138, 4, 98, 144, 1213, 10, 241, 682, 12, 47, 23, 99, 37, 11, 7180, 5514, 37, 1365, 13886, 49, 400, 10, 97, 1196, 866, 140, 9]


In [24]:
# Same sanity check for the test split, looking at the review stored at index 24999.
print("test_sequences:", type(test_sequences), len(test_sequences))                               #This is a list of lists, one list for each review
print("test_sequences[24999]:", type(test_sequences[24999]), len(test_sequences[24999]))          #This is a list of word indexes for the 25000th review
print("test_sequences[24999]:", test_sequences[24999])                                            #This will print a list of word indexes (depends on the tokenizer)

test_sequences: <class 'list'> 25000
test_sequences[24999]: <class 'list'> 52
test_sequences[24999]: [33, 40, 89, 94, 2465, 37, 33, 340, 5, 11, 28, 66, 2206, 84, 102, 2, 1, 830, 3143, 4, 541, 117, 2713, 123, 6520, 15, 3, 7607, 1069, 120, 11, 128, 1286, 14, 28, 4, 1, 4097, 1146, 7607, 1069, 284, 2, 28, 4, 1, 88, 14051, 1573, 447, 8607, 2126]


In [12]:
# Convert the index lists into fixed-width integer matrices for the neural networks.
# Shorter reviews get an initial padding of 0s until they are MAX_SEQUENCE_LENGTH long.
trainvalid_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# One-hot encode the integer class labels.
trainvalid_labels = to_categorical(np.asarray(train_labels))

# test_labels is rebound under its own name, so re-running this cell used to feed the
# already one-hot matrix back into to_categorical. Only encode while the labels are
# still a flat vector of class ids, which makes the cell safe to run more than once.
test_labels = np.asarray(test_labels)
if test_labels.ndim == 1:
    test_labels = to_categorical(test_labels)

Splitting the train data into train and valid is done


In [None]:
# split the training data into a training set and a validation set

# Fixed seed so the shuffle (and therefore the train/validation split) is the same on
# every fresh run of the notebook.
SPLIT_SEED = 42
indices = np.random.default_rng(SPLIT_SEED).permutation(trainvalid_data.shape[0])

# Apply the same permutation to data and labels so each row keeps its label.
# NOTE: both arrays are rebound in place, so running this cell a second time without
# re-running the padding cell permutes the already shuffled arrays again.
trainvalid_data = trainvalid_data[indices]
trainvalid_labels = trainvalid_labels[indices]

# Number of shuffled rows reserved for validation.
num_validation_samples = int(VALIDATION_SPLIT * trainvalid_data.shape[0])

In [None]:
# Row index where the validation tail starts. Slicing with an explicit boundary instead of
# [:-k] / [-k:] keeps the split correct when num_validation_samples is 0: with negative
# slicing, k == 0 yields an empty training set and puts every row into validation.
split_at = max(trainvalid_data.shape[0] - num_validation_samples, 0)

x_train, x_val = trainvalid_data[:split_at], trainvalid_data[split_at:]
y_train, y_val = trainvalid_labels[:split_at], trainvalid_labels[split_at:]

#This is the data we will use for CNN and RNN training

print('Splitting the train data into train and valid is done')

In [13]:
print('Preparing embedding matrix.')

GLOVE_MODEL = 'glove.6B.100d.txt'
print("GLOVE_MODEL: ", GLOVE_MODEL)

# Step 1: read the GloVe text file into a word -> vector dictionary.
# Each line is split on whitespace; the first field is the word and the remaining
# fields are parsed as the float32 components of its vector.
embeddings_index = {}
glove_path = os.path.join(GLOVE_DIR, GLOVE_MODEL)
with open(glove_path, encoding='utf8') as glove_file:
    for record in glove_file:
        fields = record.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

print('Found %s word vectors in Glove embeddings.' % len(embeddings_index))

# Step 2: build the embedding matrix. Row i holds the GloVe vector of the word whose
# tokenizer index is i; the extra "+ 1" row accounts for index 0.
num_words = 1 + min(MAX_NUM_WORDS, len(word_index))
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for token, row in word_index.items():
    glove_vector = embeddings_index.get(token)
    # Indexes beyond the vocabulary cap are skipped; words without a GloVe vector
    # keep their all-zeros row.
    if row <= MAX_NUM_WORDS and glove_vector is not None:
        embedding_matrix[row] = glove_vector

# Step 3: wrap the matrix in an Embedding layer. trainable=False keeps the
# pre-trained vectors fixed during training.
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)
print("Preparing of embedding matrix is done")

Preparing embedding matrix.
GLOVE_MODEL:  glove.6B.100d.txt
Found 400000 word vectors in Glove embeddings.
Preparing of embedding matrix is done


In [14]:
# Inspect the pre-trained embedding layer: its class, then its full configuration.
# NOTE: get_config() includes the constant initializer's value array, so the printout is long.
print("embedding_layer:", type(embedding_layer))
print("embedding_layer:", embedding_layer.get_config())

embedding_layer: <class 'keras.layers.core.embedding.Embedding'>
embedding_layer: {'name': 'embedding', 'trainable': False, 'dtype': 'float32', 'batch_input_shape': (None, 1000), 'input_dim': 20001, 'output_dim': 100, 'embeddings_initializer': {'class_name': 'Constant', 'config': {'value': array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.038194  , -0.24487001,  0.72812003, ..., -0.1459    ,
         0.82779998,  0.27061999],
       [-0.071953  ,  0.23127   ,  0.023731  , ..., -0.71894997,
         0.86894   ,  0.19539   ],
       ...,
       [ 0.40121001, -0.6886    , -0.17046   , ..., -0.63893002,
        -0.90948999, -0.69011003],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.077072  , -0.1725    ,  0.20935   , ..., -0.24908   ,
        -0.73106998,  0.13907   ]])}}, 'embeddings_regularizer': None, 'activity_regularizer': None, 'embeddings_constraint': None, 

In [15]:
import tensorflow as tf

# Function to detect and set the best available device
def set_device():
    """Choose the TensorFlow device string to run on and announce the choice on stdout.

    Returns:
        "/device:GPU:0" when TensorFlow lists a physical device of type 'GPU' or,
        failing that, of type 'MPS'; "/device:CPU:0" otherwise.
    """
    # Probed in priority order; a later probe only runs when the earlier ones found nothing.
    # NOTE(review): confirm that 'MPS' is a device type TensorFlow actually reports. If Apple
    # Metal GPUs are listed under 'GPU' instead, the first banner is printed on Apple
    # hardware as well and the second probe never matches.
    accelerator_probes = (
        ('GPU', "Using GPU (CUDA)"),
        ('MPS', "Using Apple MPS (Metal Performance Shaders)"),
    )
    for device_type, banner in accelerator_probes:
        if tf.config.list_physical_devices(device_type):
            print(banner)
            # Both accelerator kinds are addressed through the same GPU device string.
            return "/device:GPU:0"

    print("Using CPU")
    return "/device:CPU:0"

### 1D CNN Model with pre-trained embedding

In [16]:
# Pick the device once; device_name is reused by the model cells in their tf.device(...) scopes.
device_name = set_device()
print("Running on device:", device_name)

Using GPU (CUDA)
Running on device: /device:GPU:0


In [17]:
print('Define a 1D CNN model.')

with tf.device(device_name):

    # Frozen pre-trained embedding, then three Conv1D stages (the first two followed by
    # max-pooling, the last by global max-pooling) and a small dense classifier with one
    # softmax unit per class.
    cnnmodel = Sequential([
        embedding_layer,
        Conv1D(128, 5, activation='relu'),
        MaxPooling1D(5),
        Conv1D(128, 5, activation='relu'),
        MaxPooling1D(5),
        Conv1D(128, 5, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(128, activation='relu'),
        Dense(len(labels_index), activation='softmax'),
    ])

    cnnmodel.compile(
        loss='categorical_crossentropy',
        optimizer='rmsprop',
        metrics=['acc'],
    )

    # Train on the training portion while monitoring the held-out validation portion.
    cnnmodel.fit(
        x_train,
        y_train,
        batch_size=128,
        epochs=10,
        validation_data=(x_val, y_val),
    )

    # Final check on the test set, which was not used during training.
    score, acc = cnnmodel.evaluate(test_data, test_labels)
    print('Test accuracy with CNN:', acc)

Define a 1D CNN model.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy with CNN: 0.8460000157356262


### 1D CNN model with an embedding layer trained from scratch

In [19]:
# Inspect the training labels: container type and shape, then the first one-hot row.
print("y_train:", type(y_train), y_train.shape)
print("y_train[0]:", y_train[0])

y_train: <class 'numpy.ndarray'> (20000, 2)
y_train[0]: [0. 1.]


In [21]:
# Inspect the training inputs: container type and shape, then the first padded sequence
# (padding zeros come first, the word indexes sit at the end of the row).
print("x_train:", type(x_train), x_train.shape)
print("x_train[0]:", x_train[0])

x_train: <class 'numpy.ndarray'> (20000, 1000)
x_train[0]: [    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     

In [20]:
# Same CNN architecture as the pre-trained variant, but with a fresh Embedding layer
# (128-dimensional, vocabulary size MAX_NUM_WORDS) whose weights are learned during training.
# NOTE: this rebinds cnnmodel, replacing any model previously stored under that name.
print(f"Using device: {device_name}")

print("Defining and training a CNN model, training embedding layer on the fly instead of using pre-trained embeddings")

# Define the CNN model
cnnmodel = Sequential()

# Force the Embedding layer to run on the CPU
# NOTE(review): the layer is only created and added inside this scope, while compile/fit run
# under the other device scope below -- confirm this actually pins the embedding to the CPU.
with tf.device('/CPU:0'):
    cnnmodel.add(Embedding(MAX_NUM_WORDS, 128, input_length=MAX_SEQUENCE_LENGTH))

# Rest of the model can run on the GPU
with tf.device(device_name):
    # Three Conv1D stages: the first two followed by max-pooling, the last by global max-pooling.
    cnnmodel.add(Conv1D(128, 5, activation='relu'))
    cnnmodel.add(MaxPooling1D(5))
    cnnmodel.add(Conv1D(128, 5, activation='relu'))
    cnnmodel.add(MaxPooling1D(5))
    cnnmodel.add(Conv1D(128, 5, activation='relu'))
    cnnmodel.add(GlobalMaxPooling1D())
    # Dense classifier head with one softmax unit per entry of labels_index.
    cnnmodel.add(Dense(128, activation='relu'))
    cnnmodel.add(Dense(len(labels_index), activation='softmax'))

    cnnmodel.compile(loss='categorical_crossentropy',
                     optimizer='rmsprop',
                     metrics=['acc'])

    # Train the model, monitoring the validation set after each epoch.
    cnnmodel.fit(x_train, y_train,
            batch_size=128,
            epochs=10, validation_data=(x_val, y_val))

    # Evaluate on the test set; evaluate() is unpacked as (loss, accuracy).
    score, acc = cnnmodel.evaluate(test_data, test_labels)
    print('Test accuracy with CNN:', acc)

Using device: /device:GPU:0
Defining and training a CNN model, training embedding layer on the fly instead of using pre-trained embeddings
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy with CNN: 0.8470799922943115


### LSTM model with an embedding layer trained from scratch

In [None]:
print("Defining and training an LSTM model, training embedding layer on the fly")

# Define the RNN model
rnnmodel = Sequential()

# Force the Embedding layer to run on the CPU
with tf.device('/CPU:0'):
    rnnmodel.add(Embedding(MAX_NUM_WORDS, 128))

with tf.device(device_name):
    rnnmodel.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
    # The labels are one-hot encoded (one column per class), so the head is a softmax over
    # len(labels_index) units trained with categorical cross-entropy, exactly like the CNN
    # models. The previous head (two independent sigmoid units + binary cross-entropy)
    # did not treat the classes as mutually exclusive and made the 'accuracy' metric
    # resolve to a per-output binary accuracy, which is not comparable to the CNN numbers.
    rnnmodel.add(Dense(len(labels_index), activation='softmax'))

rnnmodel.compile(loss='categorical_crossentropy',
                 optimizer='adam',
                 metrics=['accuracy'])

In [None]:
# Train the LSTM defined in the previous cell on the training portion, monitoring the
# held-out validation portion after each epoch.
print('Training the RNN')

# fit(...) is the last expression of the cell, so its return value is echoed as cell output.
rnnmodel.fit(x_train, y_train,
        batch_size=128,
        epochs=10,
        validation_data=(x_val, y_val))

In [None]:
# Evaluate the trained LSTM on the test set. The result is unpacked as (score, acc):
# the model is compiled with a single metric, so score receives the loss value.
score, acc = rnnmodel.evaluate(test_data, test_labels,
                        batch_size=32)

print('Test accuracy with RNN:', acc)

### LSTM Model using pre-trained Embedding Layer

In [None]:
print("Defining and training an LSTM model, using pre-trained embedding layer")

rnnmodel2 = Sequential()
# Reuses the frozen pre-trained embedding layer built earlier (trainable=False).
rnnmodel2.add(embedding_layer)
rnnmodel2.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
# The labels are one-hot encoded (one column per class), so the head is a softmax over
# len(labels_index) units trained with categorical cross-entropy, exactly like the CNN
# models. The previous head (two independent sigmoid units + binary cross-entropy) did not
# treat the classes as mutually exclusive and made the 'accuracy' metric resolve to a
# per-output binary accuracy, which is not comparable to the CNN numbers.
rnnmodel2.add(Dense(len(labels_index), activation='softmax'))
rnnmodel2.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
print('Training the RNN')

# Train for a single epoch, monitoring the held-out validation portion.
rnnmodel2.fit(x_train, y_train,
              batch_size=32,
              epochs=1,
              validation_data=(x_val, y_val))

# Evaluate on the test set; evaluate() is unpacked as (loss, accuracy).
score, acc = rnnmodel2.evaluate(test_data, test_labels,
                                batch_size=32)
print('Test accuracy with RNN:', acc)