<a href="https://colab.research.google.com/github/ronsoare/machine_learning/blob/main/text_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import sklearn
import tensorflow as tf
import string

In [78]:
text = "I would like to LEARN MORE ABOUT NPL, because this is very important to my carrer!"

In [79]:
dataset = [
 "I write, erase, rewrite",
 "Erase again, and then",
 "A poppy blooms.",
]

In [80]:
class Vectorizer:
  """Minimal text vectorizer: standardize, tokenize, index and decode text."""

  def standardize(self, text):
    """Lower-case ``text`` and drop every ``string.punctuation`` character."""
    strip_punctuation = str.maketrans("", "", string.punctuation)
    return text.lower().translate(strip_punctuation)

  def tokenizer(self, text):
    """Split ``text`` on whitespace and return the resulting list of tokens."""
    return text.split()

  def make_vocabulary(self, dataset):
    """Build ``vocabulary`` (token -> index) and ``inverse_vocabulary`` (index -> token).

    Index 0 is reserved for the empty string and index 1 for 'UNK'; every
    other token is numbered in order of first appearance in ``dataset``.
    """
    self.vocabulary = {'': 0, 'UNK': 1}
    for document in dataset:
      for word in self.tokenizer(self.standardize(document)):
        # setdefault leaves known tokens untouched and gives new ones the next free index.
        self.vocabulary.setdefault(word, len(self.vocabulary))
    self.inverse_vocabulary = {index: word for word, index in self.vocabulary.items()}

  def encoded(self, text):
    """Return the vocabulary indices of the tokens in ``text`` (1 for unknown tokens)."""
    indices = []
    for word in self.tokenizer(self.standardize(text)):
      indices.append(self.vocabulary.get(word, 1))
    return indices

  def decode(self, int_sequence):
    """Map indices back to tokens joined by spaces ('UNK' for unknown indices)."""
    words = [self.inverse_vocabulary.get(index, 'UNK') for index in int_sequence]
    return " ".join(words)

In [81]:
vectorizer = Vectorizer()
vectorizer.make_vocabulary(dataset)

In [82]:
vectorizer.vocabulary

{'': 0,
 'UNK': 1,
 'i': 2,
 'write': 3,
 'erase': 4,
 'rewrite': 5,
 'again': 6,
 'and': 7,
 'then': 8,
 'a': 9,
 'poppy': 10,
 'blooms': 11}

In [83]:
int_sequence = vectorizer.encoded(text)
decoded_sequence = vectorizer.decode(int_sequence)
print(text)
print(int_sequence)
print(decoded_sequence)

I would like to LEARN MORE ABOUT NPL, because this is very important to my carrer!
[2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
i UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK


### Another way to do that is to use the TextVectorization layer from Keras.

In [84]:
from tensorflow.keras.layers import TextVectorization

In [87]:
vectorization = TextVectorization(
    output_mode = 'int'
)

In [88]:
vectorization.adapt(dataset)

In [89]:
vectorization.get_vocabulary()

['',
 '[UNK]',
 'erase',
 'write',
 'then',
 'rewrite',
 'poppy',
 'i',
 'blooms',
 'and',
 'again',
 'a']

In [90]:
encoded_sequence = vectorization(text)
print(encoded_sequence)

tf.Tensor([7 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1], shape=(16,), dtype=int64)


In [91]:
# Round-trip a sample sentence through the adapted layer: encode it to integer ids,
# then map every id back to its vocabulary token.
sample = "I write, rewrite, and still rewrite again"
vocabulary = vectorization.get_vocabulary()
inverse_vocabulary = {index: token for index, token in enumerate(vocabulary)}
encode = vectorization(sample)
decode = " ".join([inverse_vocabulary[int(token_id)] for token_id in encode])
print(encode, decode, sep="\n")

tf.Tensor([ 7  3  5  9  1  5 10], shape=(7,), dtype=int64)
i write rewrite and [UNK] rewrite again


In [2]:
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  3298k      0  0:00:24  0:00:24 --:--:-- 3803k


In [3]:
!rm -r aclImdb/train/unsup/

In [4]:
!cat aclImdb/train/pos/4077_10.txt

I first saw this back in the early 90s on UK TV, i did like it then but i missed the chance to tape it, many years passed but the film always stuck with me and i lost hope of seeing it TV again, the main thing that stuck with me was the end, the hole castle part really touched me, its easy to watch, has a great story, great music, the list goes on and on, its OK me saying how good it is but everyone will take there own best bits away with them once they have seen it, yes the animation is top notch and beautiful to watch, it does show its age in a very few parts but that has now become part of it beauty, i am so glad it has came out on DVD as it is one of my top 10 films of all time. Buy it or rent it just see it, best viewing is at night alone with drink and food in reach so you don't have to stop the film.<br /><br />Enjoy

### Separate 20% of train set for validation

In [9]:
import os, pathlib, shutil, random
from tensorflow import keras

In [6]:
# Carve a validation set out of the training data by moving 20% of the files
# of each class from aclImdb/train/<class> to aclImdb/val/<class>.
base_dir = pathlib.Path('aclImdb')
val_dir = base_dir/'val'
train_dir = base_dir/'train'
for category in ('neg', 'pos'):
  category_val_dir = val_dir/category
  # Re-running the cell must not move a further 20% out of the training set,
  # so skip any class whose validation directory is already populated.
  if category_val_dir.is_dir() and any(category_val_dir.iterdir()):
    continue
  os.makedirs(category_val_dir, exist_ok=True)
  # os.listdir returns names in arbitrary order; sort first so the seeded
  # shuffle picks the same files on every machine and every run.
  files = sorted(os.listdir(train_dir/category))
  random.Random(1337).shuffle(files)
  num_val_samples = int(0.2*len(files))
  # files[-0:] is the whole list, so the "nothing to move" case is handled explicitly.
  val_files = files[-num_val_samples:] if num_val_samples > 0 else []
  for fname in val_files:
    shutil.move(
        train_dir/category/fname,
        category_val_dir/fname
    )

In [7]:
# Build the batched datasets for the three splits, in train / val / test order.
batch_size = 32
train_ds, val_ds, test_ds = [
    keras.utils.text_dataset_from_directory(f"aclImdb/{split}", batch_size=batch_size)
    for split in ("train", "val", "test")
]

Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [8]:
# Look at the first batch only: shapes, dtypes and the first review/label pair.
for inputs, target in train_ds.take(1):
  print("inputs.shape", inputs.shape)
  print("inputs.dtype", inputs.dtype)
  print("targets.shape", target.shape)
  print("Inputs[0]", inputs[0])
  print("targets[0]", target[0])

inputs.shape (32,)
inputs.dtype <dtype: 'string'>
targets.shape (32,)
Inputs[0] tf.Tensor(b'This clunker of a film sets a new standard for bad filmmaking. Jared Rushton gives an adequate performance of a very poorly-created character in an ill-fated movie, thereby creating a net effect of a very bad movie. The film\'s main thrust is how a boy\'s temporary excursion into the Canadian wilderness after surviving a plane crash solo allows the disgruntled adolescent to deal with his anguish over discovering his mother\'s extramarital affair. Unfortunately it turns into a bizarre collage of random "survival events" (including two especially hokey scenes involving fighting a bear) and strange hallucinations that make you wonder if this kid isn\'t just sitting in an alley somewhere on pot dreaming up this whole movie (and what a nightmare it is!). Furthermore, despite the heralds of some reviewers of the family viewability of the film, there are several scenes not suitable for very young child

### Using multi-hot encoding with unigrams (single words)

In [14]:
# Instantiate our vectorizer: multi-hot output, with max_tokens set to 20000.
text_vectorization = TextVectorization(max_tokens=20000, output_mode="multi_hot")

In [15]:
# Prepare the data: adapt the vectorizer on the raw training text (labels dropped),
# then vectorize the train, validation and test splits.
text_only_train_ds = train_ds.map(lambda text, label: text)
text_vectorization.adapt(text_only_train_ds)

binary_1gram_train_ds, binary_1gram_val_ds, binary_1gram_test_ds = [
    split_ds.map(lambda text, label: (text_vectorization(text), label), num_parallel_calls=4)
    for split_ds in (train_ds, val_ds, test_ds)
]

In [None]:
# Show one sample batch: each review is encoded as a single 20000-dimensional multi-hot vector.
for inputs, targets in binary_1gram_train_ds.take(1):
  print("inputs.shape:", inputs.shape)
  print("inputs.dtype:", inputs.dtype)
  print("targets.shape:", targets.shape)
  print("targets.dtype:", targets.dtype)
  print("inputs[0]:", inputs[0].numpy())
  print("targets[0]:", targets[0])

inputs.shape: (32, 20000)
inputs.dtype: <dtype: 'float32'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: [1. 1. 1. ... 0. 0. 0.]
targets[0]: tf.Tensor(1, shape=(), dtype=int32)


In [16]:
#prepare the function of our model for the experiments we'll do.
from tensorflow import keras
from tensorflow.keras import layers

def get_model(max_tokens =20000, hidden_state = 16):
  """Build and compile a small dense binary classifier over fixed-size text vectors.

  Args:
    max_tokens: Length of each input vector.
    hidden_state: Number of units in the hidden Dense layer.

  Returns:
    A compiled `keras.Model` with one sigmoid output, using the rmsprop optimizer,
    binary cross-entropy loss and the accuracy metric.
  """
  # `shape` must be a tuple; `(max_tokens)` without the trailing comma is a plain int.
  inputs = keras.Input(shape=(max_tokens,))
  x = layers.Dense(hidden_state, activation='relu')(inputs)
  x = layers.Dropout(0.5)(x)
  outputs = layers.Dense(1,activation='sigmoid')(x)
  model = keras.Model(inputs, outputs)
  model.compile(optimizer="rmsprop", loss='binary_crossentropy', metrics=['accuracy'])
  return model

In [None]:
# Build the model and print its architecture; training and test-set evaluation follow in the next cells.
model = get_model()
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense_2 (Dense)             (None, 16)                320016    
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320033 (1.22 MB)
Trainable params: 320033 (1.22 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Define callbacks: checkpoint the model to binary_1gram.keras; with save_best_only=True
# only the best model seen so far is kept (per Keras' ModelCheckpoint semantics).
callbacks = [
    keras.callbacks.ModelCheckpoint("binary_1gram.keras", save_best_only=True),
]

# Train the model for 10 epochs; both datasets are wrapped in .cache().

model.fit(binary_1gram_train_ds.cache(),
          validation_data=binary_1gram_val_ds.cache(), epochs=10, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7e4606dc1f30>

In [None]:
# Reload the checkpoint written during training, using the same relative path the
# ModelCheckpoint callback saved to so the cell also works outside of /content.
best_model = keras.models.load_model("binary_1gram.keras")

In [None]:
print(f"Test accuracy: {best_model.evaluate(binary_1gram_test_ds)[1]:.3f}")

Test accuracy: 0.888


### Now let's use bigrams with binary encoding

In [None]:
text_vectorization = TextVectorization(
    ngrams = 2,
    max_tokens = 20000,
    output_mode ='multi_hot',
)

In [None]:
# Adapt the bigram vectorizer on the raw training text, then vectorize all three splits.
text_vectorization.adapt(text_only_train_ds)

binary_2gram_train_ds, binary_2gram_val_ds, binary_2gram_test_ds = [
    split_ds.map(lambda text, label: (text_vectorization(text), label), num_parallel_calls=4)
    for split_ds in (train_ds, val_ds, test_ds)
]

In [None]:
# Show one sample batch: each review is again a single 20000-dimensional multi-hot vector,
# but the vocabulary now also contains bigrams.
for inputs, targets in binary_2gram_train_ds.take(1):
  print("inputs.shape:", inputs.shape)
  print("inputs.dtype:", inputs.dtype)
  print("targets.shape:", targets.shape)
  print("targets.dtype:", targets.dtype)
  print("inputs[0]:", inputs[0].numpy())
  print("targets[0]:", targets[0])

inputs.shape: (32, 20000)
inputs.dtype: <dtype: 'float32'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: [1. 1. 1. ... 0. 0. 0.]
targets[0]: tf.Tensor(0, shape=(), dtype=int32)


In [None]:
model = get_model()
callbacks = [keras.callbacks.ModelCheckpoint("binary_2gram.keras", save_best_only=True)]
model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense_6 (Dense)             (None, 16)                320016    
                                                                 
 dropout_3 (Dropout)         (None, 16)                0         
                                                                 
 dense_7 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320033 (1.22 MB)
Trainable params: 320033 (1.22 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
history = model.fit(binary_2gram_train_ds.cache(),
                    validation_data = binary_2gram_val_ds, epochs=10,
                    callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
best_model = keras.models.load_model("binary_2gram.keras")

In [None]:
print(f"Test accuracy: {best_model.evaluate(binary_2gram_test_ds)[1]:.3f}")

Test accuracy: 0.890


## BIGRAMS WITH TF-IDF: Term Frequency-Inverse Document Frequency.

In [17]:
text_vectorization = TextVectorization(
    ngrams=2,
    max_tokens=20000,
    output_mode= 'tf_idf'
)

In [22]:
# Adapt the TF-IDF bigram vectorizer on the raw training text, then vectorize all three splits.
text_vectorization.adapt(text_only_train_ds)
tfidf_2gram_train_ds, tfidf_2gram_val_ds, tfidf_2gram_tes_ds = [
    split_ds.map(lambda text, label: (text_vectorization(text), label), num_parallel_calls=4)
    for split_ds in (train_ds, val_ds, test_ds)
]

In [23]:
model = get_model()
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense_2 (Dense)             (None, 16)                320016    
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320033 (1.22 MB)
Trainable params: 320033 (1.22 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [24]:
callbacks =[keras.callbacks.ModelCheckpoint("tfidf_2gram.keras", save_best_only=True)]
history = model.fit(tfidf_2gram_train_ds.cache(), validation_data= tfidf_2gram_val_ds,
                    epochs=10, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [25]:
best_model = keras.models.load_model("tfidf_2gram.keras")
print(f"Test accuracy: {best_model.evaluate(tfidf_2gram_tes_ds)[1]:.3f}")

Test accuracy: 0.893


### How can we export our model together with its input preprocessing? That's easy: we just need to create an end-to-end model, like this:

In [26]:
inputs = keras.Input(shape=(1,), dtype="string")
preprocess_inputs = text_vectorization(inputs)
outputs = best_model(preprocess_inputs)
end_to_end_model = keras.Model(inputs, outputs)

In [27]:
end_to_end_model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 20000)             1         
 Vectorization)                                                  
                                                                 
 model_1 (Functional)        (None, 1)                 320033    
                                                                 
Total params: 320034 (1.22 MB)
Trainable params: 320033 (1.22 MB)
Non-trainable params: 1 (8.00 Byte)
_________________________________________________________________


In [28]:
raw_text_data = tf.convert_to_tensor([
 ["That was an excellent movie, I loved it."],
])
predictions = end_to_end_model(raw_text_data)
print(f"{float(predictions[0] * 100):.2f} percent positive")

87.55 percent positive


### Sequence model approach: this kind of text model takes the order of the words into account.

In [36]:
# Limit the vocabulary to the 20000 most common words.
# Configure TextVectorization to return an integer id for each token, with an output sequence length of 600.
# In other words, sequences longer than 600 tokens are truncated and shorter ones are padded.
text_only_train_ds = train_ds.map(lambda x, y: x)
max_length = 600
max_tokens = 20000
text_vectorization = TextVectorization(
    max_tokens=max_tokens,
    output_sequence_length =max_length,
    output_mode = 'int',
)

In [37]:
# Adapt the integer-sequence vectorizer on the raw training text, then vectorize all three splits.
text_vectorization.adapt(text_only_train_ds)
int_train_ds, int_val_ds, int_test_ds = [
    split_ds.map(lambda text, label: (text_vectorization(text), label), num_parallel_calls=4)
    for split_ds in (train_ds, val_ds, test_ds)
]

In [17]:
from tensorflow.keras import layers

In [13]:
# Map each integer token id to a one-hot vector of depth max_tokens, then build and compile
# a bidirectional LSTM classifier on top of those vectors.
inputs = keras.Input(shape=(None,), dtype='int64')
embedded = tf.one_hot(inputs, depth=max_tokens)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop", loss='binary_crossentropy', metrics=["accuracy"])
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, None)]            0         
                                                                 
 tf.one_hot_1 (TFOpLambda)   (None, None, 20000)       0         
                                                                 
 bidirectional (Bidirection  (None, 64)                5128448   
 al)                                                             
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 5128513 (19.56 MB)
Trainable params: 5128513 (19.56 MB)
Non-trainable params: 0 (0.00 Byte)
_____________________

In [14]:
# Training was stopped on purpose. The one-hot approach is simple but not a good idea, for two reasons:
# training is very slow, and one-hot vectors treat every token as independent of all the others,
# which does not hold for text classification, where words are related to one another through context.
# save_best_only=True matches the other checkpoints in this notebook.
callbacks = [keras.callbacks.ModelCheckpoint("one_hot_bidir_lstm.keras", save_best_only=True)]
history = model.fit(int_train_ds.cache(), validation_data=int_val_ds.cache(),
                    epochs=10, callbacks=callbacks)

Epoch 1/10
  8/625 [..............................] - ETA: 3:00:14 - loss: 0.6904 - accuracy: 0.5391

KeyboardInterrupt: 

## A better idea is to use WORD EMBEDDINGS. Let's move on!

In [14]:
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = layers.Embedding(input_dim=max_tokens, output_dim=256)(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=["accuracy"])
model.summary()
callbacks = [keras.callbacks.ModelCheckpoint("embedding_bidir_lstm.keras", save_best_only=True)]

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 256)         5120000   
                                                                 
 bidirectional (Bidirection  (None, 64)                73984     
 al)                                                             
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 5194049 (19.81 MB)
Trainable params: 5194049 (19.81 MB)
Non-trainable params: 0 (0.00 Byte)
_____________________

In [15]:
history = model.fit(int_train_ds.cache(), validation_data=int_val_ds.cache(),
                    epochs=10, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [18]:
best_model = keras.models.load_model("embedding_bidir_lstm.keras")
print(f"Test accuracy: {best_model.evaluate(int_test_ds)[1]:.3f}")

Test accuracy: 0.850


## Using masking

In [21]:
# Same layers as the previous embedding model, except that the Embedding layer is
# created with mask_zero=True.
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = layers.Embedding(input_dim=max_tokens, output_dim=256, mask_zero=True)(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=["accuracy"])
model.summary()
# NOTE(review): the checkpoint name says "gru" although the recurrent layer is an LSTM;
# the name is left as is because the evaluation cell loads this exact file name.
callbacks= [keras.callbacks.ModelCheckpoint("embeddings_bidir_gru_with_masking.keras", save_best_only=True)]

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_1 (Embedding)     (None, None, 256)         5120000   
                                                                 
 bidirectional_1 (Bidirecti  (None, 64)                73984     
 onal)                                                           
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 5194049 (19.81 MB)
Trainable params: 5194049 (19.81 MB)
Non-trainable params: 0 (0.00 Byte)
___________________

In [22]:
model.fit(int_train_ds, validation_data=int_val_ds,
          epochs=10, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7c21af1db400>

In [23]:
best_model = keras.models.load_model("embeddings_bidir_gru_with_masking.keras")
print(f"Test accuracy: {best_model.evaluate(int_test_ds)[1]:.3f}")

Test accuracy: 0.874


## Using pretrained word embeddings

In [24]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2024-05-10 16:34:19--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-05-10 16:34:19--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-05-10 16:34:20--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [25]:
!unzip -q glove.6B.zip

In [32]:
# Parse the GloVe text file into {word: vector}. Each line holds a word followed by
# its space-separated coefficients.
path_to_glove_file = "glove.6B.100d.txt"
embedding_index = {}
# Name the encoding explicitly so parsing does not depend on the platform's default.
with open(path_to_glove_file, encoding="utf-8") as f:
  for line in f:
    # Split off the word only; the remainder of the line is the coefficient string.
    word, coef = line.split(maxsplit=1)
    coef = np.fromstring(coef, "f", sep=" ")
    embedding_index[word] = coef

In [48]:
print(f"Found {len(embedding_index)} word vectors.")

Found 400000 word vectors.


In [41]:
# Map every vocabulary token of the vectorizer to its integer index.
embedding_dimension = 100
vocabulary = text_vectorization.get_vocabulary()
word_index = dict(zip(vocabulary, range(len(vocabulary))))
# Show the size and a short sample instead of dumping the whole mapping into the output.
print(f"{len(word_index)} tokens; first entries: {list(word_index.items())[:10]}")



In [44]:
embedding_matrix = np.zeros((max_tokens, embedding_dimension)) # one row per token index, initialised to zeros
print(embedding_matrix.shape)

(20000, 100)


In [49]:
# Copy the GloVe vector of each vocabulary word into row i of embedding_matrix.
# Words without a GloVe vector keep their all-zero row.
for word, i in word_index.items():
  # Rows only exist for indices below max_tokens; skip anything beyond so a stale
  # vector from a previous iteration is never written out of range.
  if i >= max_tokens:
    continue
  embedding_vector = embedding_index.get(word)
  if embedding_vector is not None:
    embedding_matrix[i] = embedding_vector

In [71]:
# Embedding layer whose weights are initialised from embedding_matrix; trainable=False
# keeps those weights out of training. mask_zero=True is set as in the masking model above.
embedding_layer = layers.Embedding(input_dim= max_tokens, output_dim=embedding_dimension,
                                   embeddings_initializer=keras.initializers.Constant(embedding_matrix),
                                   trainable=False,
                                   mask_zero=True)

In [74]:
inputs = keras.Input(shape=(None,), dtype='int64')
embedded = embedding_layer(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop", loss='binary_crossentropy', metrics=["accuracy"])
model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_2 (Embedding)     (None, None, 100)         2000000   
                                                                 
 bidirectional_4 (Bidirecti  (None, 64)                34048     
 onal)                                                           
                                                                 
 dropout_3 (Dropout)         (None, 64)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 2034113 (7.76 MB)
Trainable params: 34113 (133.25 KB)
Non-trainable params: 2000000 (7.63 MB)
_________________

In [75]:
callbacks = [keras.callbacks.ModelCheckpoint("glove_embeddings_sequence_model.keras", save_best_only=True)]
model.fit(int_train_ds, validation_data=int_val_ds,
          epochs=10, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7c2109ac0310>

In [76]:
best_model = keras.models.load_model("glove_embeddings_sequence_model.keras")
print(f"Test accuracy: {best_model.evaluate(int_test_ds)[1]:.3f}")

Test accuracy: 0.878
