In [8]:
import numpy as np
import keras
import tensorflow as tf
from keras.layers import Dense
from keras.models import Sequential
from keras.layers import TextVectorization
from keras.utils import pad_sequences

In [4]:
# Define data
# Toy sentiment dataset: each movie review is written next to its label
# (1 = positive, 0 = negative). The pairs are unzipped into two parallel
# sequences so a review and its label cannot drift out of alignment.
texts, labels = map(list, zip(
    ("This movie is amazing!", 1),
    ("The acting was terrible and the plot was boring.", 0),
    ("I loved everything about this movie!", 1),
    ("The special effects were impressive but the story fell flat.", 0),
    ("The dialogue was hilarious and the characters were engaging.", 1),
    ("I couldn't even make it through this movie. It was that bad.", 0),
))
labels = np.array(labels)  # the model expects an array, not a Python list

In [7]:
# Define TextVectorization layer
# The layer maps each raw review string to a fixed-length row of integer
# token ids. Its vocabulary is learned later, when the layer is adapted to
# the training data.

max_len = 100  # every output row is padded or cropped to this many token ids
vectorizer = TextVectorization(
    max_tokens=10000,                           # maximum number of words kept in the vocabulary
    standardize='lower_and_strip_punctuation',  # lowercase the text and strip punctuation
    split='whitespace',                         # tokens are separated by whitespace
    output_mode='int',                          # emit integer indices into the vocabulary
    output_sequence_length=max_len,             # length of each sequence after padding/cropping
)

In [None]:
# Learn the vocabulary from the training texts. Until the layer has been
# adapted it has no word -> integer mapping to apply.
vectorizer.adapt(texts)

# Vectorize input data
X = vectorizer(texts)

# The vectorizer already pads/crops every row to max_len (via
# output_sequence_length), so pad_sequences does not change any lengths here;
# its practical effect is to hand back a NumPy array instead of a tensor.
X = pad_sequences(X, maxlen=max_len)

# Sanity check: exactly one fixed-length row of token ids per review.
assert X.shape == (len(texts), max_len), X.shape

In [14]:
# Display the vectorized reviews: one row per review, token ids first,
# followed by zeros used as padding up to max_len.
X

array([[ 4,  5, 18, 33,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0],
       [ 2, 34,  3, 12,  9,  2, 15,  3, 31,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0],
       [ 8, 17, 23, 35,  4,  5,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0

In [13]:
# Length of the first vectorized review; it should equal max_len.
len(X[0])

100

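`vectorizer.adapt(texts)` fits the TextVectorization layer to the training data (`texts`). This call builds the vocabulary, i.e. the mapping from each word in the dataset to an integer index. It does not determine the length of the output sequences: that length is fixed by the `output_sequence_length` parameter passed when the layer was created, which pads or truncates every review to `max_len` tokens. Adapting is necessary because, without a vocabulary, the layer has no way to convert words into integers; the fixed output length is what lets the layer handle variable-length inputs during training and inference. <br>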

After adapting the layer to the training data, the X = vectorizer(texts) line is used to vectorize the text data into a tensor of integer values. Each review is converted into a sequence of integers, where each integer represents a word in the vocabulary. The resulting tensor has shape (num_samples, sequence_length) where num_samples is the number of samples (in this case, the number of movie reviews) and sequence_length is the length of the output sequences (which is max_len in this code).

In [9]:
# Build model
# The inputs are integer token ids, i.e. arbitrary vocabulary indices rather
# than numeric magnitudes, so they must not be fed straight into a Dense layer.
# An Embedding layer first maps every id to a learned dense vector; the vectors
# of one review are then averaged into a single fixed-size feature vector.
# NOTE: vectorizer.vocabulary_size() requires the vectorizer to have been
# adapted already, so this cell must run after the adapt() cell.
model = Sequential()
model.add(keras.Input(shape=(max_len,)))
model.add(keras.layers.Embedding(
    input_dim=vectorizer.vocabulary_size(),  # includes the padding and [UNK] entries
    output_dim=16,
    mask_zero=True,                          # id 0 is padding; keep it out of the average
))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))    # probability that the review is positive

In [10]:
# Compile model
# Binary cross-entropy pairs with the single sigmoid output and the 0/1
# labels; accuracy is the metric reported during training and evaluation.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train model
# One pass over the toy dataset. The call is left as the cell's final
# expression so the returned History object is displayed.
model.fit(X, labels, epochs=1, batch_size=32)



<keras.callbacks.History at 0x7f7a89180520>

In [11]:
# Evaluate model on new data
# Two unseen reviews (one positive, one negative) are encoded with the
# existing `vectorizer` and scored by the trained model.
new_texts = [
    "I was blown away by this movie!",
    "I wouldn't recommend this movie to anyone.",
]
new_labels = np.array([1, 0])

# Vectorize and pad in one step; the result has max_len columns per review.
new_X = pad_sequences(vectorizer(new_texts), maxlen=max_len)

loss, accuracy = model.evaluate(new_X, new_labels)
print("Test set accuracy: {:.2f}".format(accuracy))


Test set accuracy: 0.50


### Custom functions

In [15]:
# Define custom functions for standardizing and splitting text
def custom_standardization(input_data):
    """Lowercase the input strings and blank out non-alphanumeric characters.

    Every character outside ``a-z``, ``A-Z`` and ``0-9`` is replaced by a
    single space (it is not removed), so punctuation acts as a token
    boundary: "couldn't" becomes "couldn t".
    """
    return tf.strings.regex_replace(
        tf.strings.lower(input_data),  # tf.strings: TensorFlow's string-operation module
        '[^a-zA-Z0-9]',                # anything that is not a letter or a digit
        ' ',
    )

def custom_splitting(input_data):
    """Split each input string into tokens with ``tf.strings.split``.

    No separator is supplied, so the function's default splitting rule
    (whitespace) applies.
    """
    return tf.strings.split(input_data, sep=None)

In [16]:
# Define data
# Same toy reviews as before, each written next to its sentiment label
# (1 = positive, 0 = negative) and then unzipped into parallel sequences.
texts, labels = map(list, zip(
    ("This movie is amazing!", 1),
    ("The acting was terrible and the plot was boring.", 0),
    ("I loved everything about this movie!", 1),
    ("The special effects were impressive but the story fell flat.", 0),
    ("The dialogue was hilarious and the characters were engaging.", 1),
    ("I couldn't even make it through this movie. It was that bad.", 0),
))
labels = tf.constant(labels)  # labels are held as a TensorFlow tensor in this section

In [17]:
# Define TextVectorization layer with custom functions
# The callables replace the built-in standardization and splitting steps;
# the remaining settings are the same as for the first vectorizer.
max_len = 100
vectorizer = TextVectorization(
    standardize=custom_standardization,
    split=custom_splitting,
    max_tokens=10000,
    output_mode='int',
    output_sequence_length=max_len,
)

# Adapt the TextVectorization layer to the training data (learns the vocabulary)
vectorizer.adapt(texts)

# Vectorize the input data: one row of max_len token ids per review
X = vectorizer(texts)

In [18]:
# Pad/crop sequences to ensure they all have the same length
# NOTE(review): the vectorizer is already built with
# output_sequence_length=max_len, so this looks like a no-op on the row
# lengths; confirm it is only kept to get a NumPy array back.
X = pad_sequences(X, maxlen=max_len)

In [19]:
# Print the vocabulary
# The list position of a word is the integer id the vectorizer emits for it;
# position 0 is the padding token '' and position 1 is '[UNK]'.
vocabulary = vectorizer.get_vocabulary()
print(vocabulary)

# Print the first example after vectorization
# (token ids of the first review followed by zero padding)
print(X[0])


['', '[UNK]', 'the', 'was', 'this', 'movie', 'were', 'it', 'i', 'and', 'through', 'that', 'terrible', 't', 'story', 'special', 'plot', 'make', 'loved', 'is', 'impressive', 'hilarious', 'flat', 'fell', 'everything', 'even', 'engaging', 'effects', 'dialogue', 'couldn', 'characters', 'but', 'boring', 'bad', 'amazing', 'acting', 'about']
[ 4  5 19 34  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0]
