In [1]:
import keras_nlp

Using TensorFlow backend


In [2]:
# Unbatched input.
tokenizer = keras_nlp.models.BertTokenizer.from_preset("bert_base_en_uncased",)
tokenizer("The quick brown fox jumped.")

# Batched input.
tokenizer(["The quick brown fox jumped.", "The fox slept."])

# Detokenization.
tokenizer.detokenize(tokenizer("The quick brown fox jumped."))

# Custom vocabulary.
vocab = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
vocab += ["The", "quick", "brown", "fox", "jumped", "."]
tokenizer = keras_nlp.models.BertTokenizer(vocabulary=vocab)
tokenizer("The quick brown fox jumped.")

Downloading data from https://storage.googleapis.com/keras-nlp/models/bert_base_en_uncased/v1/vocab.txt


<tf.Tensor: shape=(6,), dtype=int32, numpy=array([ 5,  6,  7,  8,  9, 10])>

In [4]:
from keras_nlp.models import BertTokenizer

In [6]:
# Load a preset tokenizer.
tokenizer = BertTokenizer.from_preset("bert_base_en_uncased")

# Tokenize some input.
tokenizer("The quick brown fox tripped.")

# Detokenize some input.
tokenizer.detokenize([5, 6, 7, 8, 9])

<tf.Tensor: shape=(), dtype=string, numpy=b'[unused4] [unused5] [unused6] [unused7] [unused8]'>

In [8]:
import tensorflow as tf

In [9]:
preprocessor = keras_nlp.models.BertPreprocessor.from_preset("bert_base_en_uncased")

# Tokenize and pack a single sentence.
preprocessor("The quick brown fox jumped.")

# Tokenize a batch of single sentences.
preprocessor(["The quick brown fox jumped.", "Call me Ishmael."])

# Preprocess a batch of sentence pairs.
# When handling multiple sequences, always convert to tensors first!
first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."])
second = tf.constant(["The fox tripped.", "Oh look, a whale."])
preprocessor((first, second))

# Custom vocabulary.
vocab = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
vocab += ["The", "quick", "brown", "fox", "jumped", "."]
tokenizer = keras_nlp.models.BertTokenizer(vocabulary=vocab)
preprocessor = keras_nlp.models.BertPreprocessor(tokenizer)
preprocessor("The quick brown fox jumped.")

{'token_ids': <tf.Tensor: shape=(512,), dtype=int32, numpy=
 array([ 1,  5,  6,  7,  8,  9, 10,  2,  3,  3,  3,  3,  3,  3,  3,  3,  3,
         3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
         3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
         3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
         3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
         3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
         3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
         3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
         3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
         3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
         3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
         3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
         3,  3,  3,  3,  3, 

In [10]:
preprocessor((first, second))

{'token_ids': <tf.Tensor: shape=(2, 512), dtype=int32, numpy=
 array([[1, 5, 6, ..., 3, 3, 3],
        [1, 0, 0, ..., 3, 3, 3]])>,
 'segment_ids': <tf.Tensor: shape=(2, 512), dtype=int32, numpy=
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])>,
 'padding_mask': <tf.Tensor: shape=(2, 512), dtype=bool, numpy=
 array([[ True,  True,  True, ..., False, False, False],
        [ True,  True,  True, ..., False, False, False]])>}

In [None]:
# Mapping with tf.data.Dataset.

In [11]:
preprocessor = keras_nlp.models.BertPreprocessor.from_preset(
    "bert_base_en_uncased"
)

first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."])
second = tf.constant(["The fox tripped.", "Oh look, a whale."])
label = tf.constant([1, 1])

# Map labeled single sentences.
ds = tf.data.Dataset.from_tensor_slices((first, label))
ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)

# Map unlabeled single sentences.
ds = tf.data.Dataset.from_tensor_slices(first)
ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)

# Map labeled sentence pairs.
ds = tf.data.Dataset.from_tensor_slices(((first, second), label))
ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)

# Map unlabeled sentence pairs.
ds = tf.data.Dataset.from_tensor_slices((first, second))
# Watch out for tf.data's default unpacking of tuples here!
# Best to invoke the `preprocessor` directly in this case.
ds = ds.map(
    lambda first, second: preprocessor(x=(first, second)),
    num_parallel_calls=tf.data.AUTOTUNE,
)

In [None]:
# BertBackbone model

In [14]:
import numpy as np

In [15]:
input_data = {
    "token_ids": np.ones(shape=(1, 12), dtype="int32"),
    "segment_ids": np.array([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0]]),
    "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
}

# Pretrained BERT encoder.
model = keras_nlp.models.BertBackbone.from_preset("bert_base_en_uncased")
model(input_data)

# Randomly initialized BERT encoder with a custom config.
model = keras_nlp.models.BertBackbone(
    vocabulary_size=30552,
    num_layers=4,
    num_heads=4,
    hidden_dim=256,
    intermediate_dim=512,
    max_sequence_length=128,
)
model(input_data)

Downloading data from https://storage.googleapis.com/keras-nlp/models/bert_base_en_uncased/v1/model.h5


{'sequence_output': <tf.Tensor: shape=(1, 12, 256), dtype=float32, numpy=
 array([[[-0.49211618, -0.02429868, -1.851578  , ...,  1.1772653 ,
          -0.08377397, -1.9198692 ],
         [-0.53546405, -0.06992134, -1.2401865 , ...,  0.38118008,
           0.4098072 , -1.9182652 ],
         [-0.8774007 , -1.276829  , -0.9952731 , ...,  0.12106248,
           0.11782676, -2.1016893 ],
         ...,
         [-0.5384043 , -0.5705477 , -1.4774486 , ...,  0.22520651,
          -0.05159385, -1.9838173 ],
         [-1.1852553 , -0.7356736 , -1.9857584 , ...,  0.3143446 ,
          -0.46081302, -1.6065874 ],
         [-1.6154847 , -0.20775907, -1.6879659 , ...,  0.66859907,
          -0.0133528 , -1.3697277 ]]], dtype=float32)>,
 'pooled_output': <tf.Tensor: shape=(1, 256), dtype=float32, numpy=
 array([[-0.30399308, -0.39208707, -0.31638876, -0.050208  , -0.4902698 ,
          0.13524899,  0.40939018, -0.30869877, -0.19257137,  0.08193897,
         -0.26379967,  0.11055775,  0.01061076,  0.08

In [16]:
# Load architecture and weights from preset
model = keras_nlp.models.BertBackbone.from_preset(
    "bert_tiny_en_uncased"
)

# Load randomly initialized model from preset architecture
model = keras_nlp.models.BertBackbone.from_preset(
    "bert_tiny_en_uncased",
    load_weights=False
)

Downloading data from https://storage.googleapis.com/keras-nlp/models/bert_tiny_en_uncased/v1/model.h5
