<a href="https://colab.research.google.com/github/schmuecker/transfer-learning/blob/main/natural_language/text_classification_with_bert/bert_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install tensorflow_text

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import numpy as np

import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_hub as hub
import tensorflow_text as text

# import os
# os.environ["TF_FORCE_GPU_ALLOW_GROWTH"]="true"


In [3]:
# Show the TensorFlow version the kernel imported.
tf.__version__

'2.10.0'

In [4]:
# List the GPUs TensorFlow can see; an empty list means no GPU was detected.
devices = tf.config.list_physical_devices('GPU')
devices

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [5]:
# Ask TensorFlow to allocate GPU memory on demand instead of reserving it all up front.
physical_devices = tf.config.list_physical_devices('GPU')

if not physical_devices:
    print("No GPU found; memory growth not configured")

for gpu in physical_devices:
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
        print("Success")
    except (RuntimeError, ValueError) as error:
        # RuntimeError: the runtime was already initialized, so the setting can
        # no longer be changed. ValueError: the device is not a valid PhysicalDevice.
        print(f"Could not enable memory growth for {gpu.name}: {error}")

Success


**Read more about this dataset here:** https://ai.stanford.edu/~amaas/data/sentiment/

As described on that page:

> This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.

In [6]:
# Load the IMDB reviews into ./datasets as (text, label) pairs, together with
# the dataset metadata.
dataset, info = tfds.load(
    'imdb_reviews',
    data_dir='./datasets',
    with_info=True,
    as_supervised=True,
)

[1mDownloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to ./datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling datasets/imdb_reviews/plain_text/1.0.0.incompleteS9EWQ7/imdb_reviews-train.tfrecord*...:   0%|      …

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling datasets/imdb_reviews/plain_text/1.0.0.incompleteS9EWQ7/imdb_reviews-test.tfrecord*...:   0%|       …

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling datasets/imdb_reviews/plain_text/1.0.0.incompleteS9EWQ7/imdb_reviews-unsupervised.tfrecord*...:   0%…

[1mDataset imdb_reviews downloaded and prepared to ./datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [7]:
# Display the dataset metadata returned by tfds.load (features, splits, sizes).
info

tfds.core.DatasetInfo(
    name='imdb_reviews',
    full_name='imdb_reviews/plain_text/1.0.0',
    description="""
    Large Movie Review Dataset.
    This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.
    """,
    config_description="""
    Plain text
    """,
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    data_path='./datasets/imdb_reviews/plain_text/1.0.0',
    file_format=tfrecord,
    download_size=80.23 MiB,
    dataset_size=129.83 MiB,
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    supervised_keys=('text', 'label'),
    disable_shuffling=False,
    splits={
        'test': <SplitInfo num_examples=25000, num_shards=1>,
        'train': <Sp

In [8]:
# Display the mapping from split name to tf.data dataset.
dataset

{Split('train'): <PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>,
 Split('test'): <PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>,
 Split('unsupervised'): <PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>}

In [9]:
# Pick out the two labelled splits; the 'unsupervised' split is left untouched here.
train_dataset = dataset['train']
test_dataset = dataset['test']

In [10]:
# Confirm that the split is a tf.data dataset object.
type(train_dataset)

tensorflow.python.data.ops.dataset_ops.PrefetchDataset

In [11]:
# Number of elements in the training split (it has not been batched yet at this point).
len(train_dataset)

25000

In [12]:
# Number of elements in the test split (it has not been batched yet at this point).
len(test_dataset)

25000

In [13]:
# Input-pipeline settings
BUFFER_SIZE = 10_000  # size of the shuffle buffer, in examples
BATCH_SIZE = 64       # examples per batch

In [14]:
# Build the pipelines from the raw splits rather than from `train_dataset` /
# `test_dataset` themselves, so re-running this cell never shuffles or batches
# an already batched dataset.
train_dataset = (
    dataset['train']
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
test_dataset = (
    dataset['test']
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

In [15]:
# Materialise the batched training dataset as flat NumPy arrays of review
# texts (bytes) and their labels.
X_train = []
y_train = []

# Extend one batch at a time. The loop variables deliberately avoid the name
# `input`, which would shadow the Python builtin for the rest of the notebook.
for text_batch, label_batch in train_dataset:
    X_train.extend(text_batch.numpy())
    y_train.extend(label_batch.numpy())

X_train = np.array(X_train)
y_train = np.array(y_train)

X_train.shape, y_train.shape

((25000,), (25000,))

In [16]:
# Peek at the first three reviews and labels of one training batch.
# `example` and `label` stay bound after the loop and are reused by later cells.
for example, label in train_dataset.take(1):
    first_texts = example.numpy()[:3]
    first_labels = label.numpy()[:3]
    print('texts: ', first_texts)
    print()
    print('labels: ', first_labels)

texts:  [b"This should be required viewing for all young people. This is documentary at its best, from the haunting music and terrific narration by Olivier to its unflinching and penetrating analyses, The World at War is unforgettable and irreplaceable for anyone who wants to know about humanity's sorry experience at the nadir of the 20th century."
 b"Where to start ?! . . . I feel ... violated! Thats right, violated! I just spent 1.5hrs of my life, 1.5hrs that I could have spent doing something more useful, like watching paint dry, on this so called horror flick.<br /><br />Its not scary, its not funny, its not dramatic, its no action, its nothing...<br /><br />Its predictable, its boring, its tragic...<br /><br />I might come of a bit harsh here, but watch this movie and you will feel the same way ... or ... no, don't watch it...unless you want to feel violated also."
 b"**POSSIBLE SPOILERS**<br /><br />The biggest part of the movie that doesn't work IS the Wendigo, and when your tit

In [None]:
# Materialise the batched test dataset as flat NumPy arrays of review texts
# (bytes) and their labels.
X_test = []
y_test = []

# Extend one batch at a time. The loop variables deliberately avoid the name
# `input`, which would shadow the Python builtin for the rest of the notebook.
for text_batch, label_batch in test_dataset:
    X_test.extend(text_batch.numpy())
    y_test.extend(label_batch.numpy())

X_test = np.array(X_test)
y_test = np.array(y_test)

X_test.shape, y_test.shape

In [17]:
# Toy example: learn a vocabulary from three short sentences.
# Uses the stable `tf.keras.layers.TextVectorization` path instead of the
# legacy `experimental.preprocessing` alias.
e = tf.keras.layers.TextVectorization()
e.adapt([
    "I love samosas and jalebi",
    "I love biking and yoga",
    "I love tensorflow"
])

In [18]:
# Show the vocabulary the toy layer learned from the three sentences.
e.get_vocabulary()

['',
 '[UNK]',
 'love',
 'i',
 'and',
 'yoga',
 'tensorflow',
 'samosas',
 'jalebi',
 'biking']

In [19]:
# Encode a sentence containing a word ("pizza") that was not in the adapted sentences.
e(["I love pizza"]).numpy()

array([[3, 2, 1]])

In [20]:
# Cap the vocabulary size and learn the vocabulary from the training reviews.
VOCAB_SIZE = 1000
# Stable API path; the `experimental.preprocessing` alias is a legacy location.
encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)
# adapt() only needs the review text, so drop the labels. The lambda parameters
# avoid the name `text`, which is the imported tensorflow_text module.
encoder.adapt(train_dataset.map(lambda review, _label: review))

In [21]:
# Keep the vocabulary as a NumPy array so token ids can be used to index it,
# then show its first 25 entries.
vocab = np.array(encoder.get_vocabulary())
vocab[:25]

array(['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i',
       'this', 'that', 'br', 'was', 'as', 'for', 'with', 'movie', 'but',
       'film', 'on', 'not', 'you', 'are'], dtype='<U14')

In [22]:
# `example` is the text batch left bound by the earlier `train_dataset.take(1)` loop.
example[:2]

<tf.Tensor: shape=(2,), dtype=string, numpy=
array([b"This should be required viewing for all young people. This is documentary at its best, from the haunting music and terrific narration by Olivier to its unflinching and penetrating analyses, The World at War is unforgettable and irreplaceable for anyone who wants to know about humanity's sorry experience at the nadir of the 20th century.",
       b"Where to start ?! . . . I feel ... violated! Thats right, violated! I just spent 1.5hrs of my life, 1.5hrs that I could have spent doing something more useful, like watching paint dry, on this so called horror flick.<br /><br />Its not scary, its not funny, its not dramatic, its no action, its nothing...<br /><br />Its predictable, its boring, its tragic...<br /><br />I might come of a bit harsh here, but watch this movie and you will feel the same way ... or ... no, don't watch it...unless you want to feel violated also."],
      dtype=object)>

In [23]:
# Encode the whole batch first, then keep the token ids of the first three reviews.
encoded_batch = encoder(example)
encoded_example = encoded_batch[:3].numpy()
encoded_example

array([[ 11, 139,  28, ...,   0,   0,   0],
       [113,   6, 369, ...,   0,   0,   0],
       [621,   1,  13, ...,   0,   0,   0]])

In [24]:
# Compare each of the first three reviews with the text rebuilt from its token ids.
for n in range(3):
    original_text = example[n].numpy()
    round_trip_text = " ".join(vocab[encoded_example[n]])
    print("Original: ", original_text)
    print("Round-trip: ", round_trip_text)
    print()

Original:  b"This should be required viewing for all young people. This is documentary at its best, from the haunting music and terrific narration by Olivier to its unflinching and penetrating analyses, The World at War is unforgettable and irreplaceable for anyone who wants to know about humanity's sorry experience at the nadir of the 20th century."
Round-trip:  this should be [UNK] viewing for all young people this is documentary at its best from the [UNK] music and [UNK] [UNK] by [UNK] to its [UNK] and [UNK] [UNK] the world at war is [UNK] and [UNK] for anyone who wants to know about [UNK] sorry experience at the [UNK] of the [UNK] [UNK]                                                                                                                                                                                                                                                                                                                                                                

In [25]:
# TF Hub handles for the DistilBERT preprocessing model and its encoder
BERT_PREPROCESS_URL = "https://tfhub.dev/jeongukjae/distilbert_multi_cased_preprocess/2"
BERT_ENCODER_URL = "https://tfhub.dev/jeongukjae/distilbert_multi_cased_L-6_H-768_A-12/1"

bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_URL)
bert_encoder = hub.KerasLayer(BERT_ENCODER_URL)



In [26]:
def get_sentence_embeding(sentences):
    """Return the encoder's pooled output for each sentence.

    Args:
        sentences: the raw strings to embed, passed straight to `bert_preprocess`.

    Returns:
        The `'pooled_output'` entry of the `bert_encoder` result.
    """
    encoder_inputs = bert_preprocess(sentences)
    encoder_outputs = bert_encoder(encoder_inputs)
    return encoder_outputs['pooled_output']

# Try the helper on two unrelated sentences.
sample_sentences = [
    "500$ discount. hurry up",
    "Bhavin, are you up for a volleybal game tomorrow?",
]
get_sentence_embeding(sample_sentences)

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.01892428, -0.4616733 ,  0.30849636, ...,  0.188425  ,
         0.08047681, -0.08993998],
       [-0.26908082,  0.09576638, -0.21246633, ...,  0.30374333,
        -0.01314227, -0.08408737]], dtype=float32)>

In [31]:
class myCallback(tf.keras.callbacks.Callback):
  '''
  Halts training once the epoch loss is low enough AND the accuracy is high enough.

  Args:
    loss_threshold (float) - training stops only if the epoch loss is below this value
    accuracy_threshold (float) - training stops only if the epoch accuracy is above this value
  '''

  def __init__(self, loss_threshold=0.3, accuracy_threshold=0.8):
    super().__init__()
    self.loss_threshold = loss_threshold
    self.accuracy_threshold = accuracy_threshold

  def on_epoch_end(self, epoch, logs=None):
    '''
    Stops the training when both thresholds are met at the end of an epoch

    Args:
      epoch (integer) - index of epoch (required but unused in the function definition below)
      logs (dict) - metric results from the training epoch
    '''

    # Use None as the default instead of a shared mutable dict
    logs = logs or {}
    loss = logs.get('loss')
    accuracy = logs.get('accuracy')

    # A metric may be absent (e.g. the model was compiled without 'accuracy');
    # comparing None with a float would raise a TypeError, so skip the check.
    if loss is None or accuracy is None:
      return

    # Check loss and accuracy
    if loss < self.loss_threshold and accuracy > self.accuracy_threshold:

      # Stop if both thresholds are met
      print(f"\nLoss is lower than {self.loss_threshold} and accuracy higher than {self.accuracy_threshold}. Cancelling training!")
      self.model.stop_training = True

# Instantiate class with the default thresholds (loss < 0.3, accuracy > 0.8)
callbacks = myCallback()

In [27]:
# Bert layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Neural network layers
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l)

# Use inputs and outputs to construct a final model
model = tf.keras.Model(inputs=[text_input], outputs = [l])

In [28]:
# Print the layer-by-layer overview of the model (output shapes and parameter counts).
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_mask': (Non  0           ['text[0][0]']                   
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128)}                                                      
                                                                                                  
 keras_layer_1 (KerasLayer)     {'sequence_output':  134734080   ['keras_layer[0][0]',        

In [29]:
# Configure training: binary cross-entropy matches the single sigmoid output unit,
# and accuracy is tracked as the metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [32]:
# Train on the in-memory arrays for up to 100 epochs; the callback can end
# training earlier by setting `model.stop_training`.
model.fit(
    X_train,
    y_train,
    epochs=100,
    callbacks=[callbacks],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100

KeyboardInterrupt: ignored

In [None]:
# Report the Python interpreter version the kernel is running.
import sys
print(sys.version)