<a href="https://colab.research.google.com/github/rahul94jh/MSC-Research/blob/main/bert_finetuning_stop_clickbait.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
# Show which GPU Colab assigned to this runtime. If it is a slow one (the original
# note says "k8" - presumably a Tesla K80, TODO confirm), factory-reset the runtime
# and retry until a faster GPU such as a Tesla T4 is assigned.
!nvidia-smi

Sun Jul  4 10:23:40 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install -q tensorflow-text
!pip install -q tf-models-official

[K     |████████████████████████████████| 4.3MB 8.1MB/s 
[K     |████████████████████████████████| 1.6MB 7.6MB/s 
[K     |████████████████████████████████| 358kB 46.7MB/s 
[K     |████████████████████████████████| 1.2MB 52.0MB/s 
[K     |████████████████████████████████| 38.2MB 83kB/s 
[K     |████████████████████████████████| 215kB 57.6MB/s 
[K     |████████████████████████████████| 686kB 40.3MB/s 
[K     |████████████████████████████████| 102kB 13.4MB/s 
[K     |████████████████████████████████| 645kB 40.5MB/s 
[K     |████████████████████████████████| 61kB 9.4MB/s 
[K     |████████████████████████████████| 51kB 8.6MB/s 
[?25h  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


# Imports

In [3]:
import os, math
import numpy as np
import pandas as pd
import requests
import shutil
import re
from pathlib import Path

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import tensorflow_addons as tfa
from official.nlp.data import classifier_data_lib
from official.nlp.bert import tokenization
from official.nlp import optimization


from tensorflow import keras
from official.nlp import optimization  # to create AdamW optmizer
AUTO = tf.data.experimental.AUTOTUNE # let tf.data pick parallelism / prefetch buffer sizes in the Dataset API

# Only surface TensorFlow log records at ERROR level or above.
tf.get_logger().setLevel('ERROR')

import sys

# Make the project's helper scripts (stored on Google Drive) importable.
# NOTE(review): this path only exists once Drive is mounted under /content/drive;
# no mount call is visible in this notebook - confirm it happens beforehand.
sys.path.append('/content/drive/MyDrive/Colab Notebooks/clcikbait_detection/scripts')
from tf_dataset_helpers import read_tfrec_data
import model_helpers as mh
import visualization_helpers as vh

  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [4]:
# Report the library versions in use and whether TensorFlow can see a GPU.
gpu_devices = tf.config.experimental.list_physical_devices("GPU")
gpu_status = "available" if gpu_devices else "NOT AVAILABLE"

print("TF Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", gpu_status)

TF Version:  2.5.0
Eager mode:  True
Hub version:  0.12.0
GPU is available


# Configs

In [5]:
# BERT configs
# Name of the encoder to fine-tune; it must be a key of both lookup tables below.
bert_model_name = 'bert_en_uncased_L-12_H-768_A-12'

# Model name -> TF Hub handle of the BERT encoder.
map_name_to_handle = dict([
    ('bert_en_uncased_L-12_H-768_A-12',
     'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'),
    ('bert_en_cased_L-12_H-768_A-12',
     'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/4'),
])

# Model name -> TF Hub handle of the matching text-preprocessing model.
map_model_to_preprocess = dict([
    ('bert_en_uncased_L-12_H-768_A-12',
     'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'),
    ('bert_en_cased_L-12_H-768_A-12',
     'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3'),
])

In [6]:
# --- Storage locations (Google Drive) ---------------------------------------
# Every path is derived from a single project root so the Drive location only
# has to be edited in one place. The resulting strings are identical to the
# previously hard-coded literals.
DRIVE_PROJECT_DIR = '/content/drive/MyDrive/Colab Notebooks/clcikbait_detection'
DATASET_DIR = os.path.join(DRIVE_PROJECT_DIR, 'dataset', 'Stop_clickbait')

# The trailing '' component keeps the trailing slash the original literal had.
tfrecFiles_path = os.path.join(DATASET_DIR, 'tfrec_data', '')
model_root_path = os.path.join(DATASET_DIR, 'saved_models')
saved_model_name = f'stop_clickbait_finetuned_{bert_model_name}'
saved_model_path = os.path.join(model_root_path, saved_model_name)
# Weight checkpoints live in a sub-folder of the saved-models folder.
checkpoint_root_path = os.path.join(model_root_path, 'checkpoints')
# Checkpoint *prefix* passed to save_weights / load_weights (not a single file).
model_checkpoint_path = os.path.join(checkpoint_root_path, 'my_checkpoint')


# --- Training hyper-parameters ----------------------------------------------
BATCH_SIZE = 32
# Label categories
label_list = [0, 1]
# maximum length of (token) input sequences
max_seq_len = 128
# initial learning rate handed to `optimization.create_optimizer`
init_lr = 2e-5

epochs = 2

In [7]:
# Resolve the TF Hub handles for the model selected in the config cell.
tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

print('BERT model selected           :', tfhub_handle_encoder)
print('Preprocessing model auto-selected:', tfhub_handle_preprocess)

# Load the encoder as a trainable Keras layer so fine-tuning updates its weights.
bert_layer = hub.KerasLayer(
    tfhub_handle_encoder,
    trainable=True,
    name='bert_encoder',
)

# Build the tokenizer from the vocabulary file and casing flag exposed by the
# loaded encoder object, so tokenization matches the encoder.
encoder_assets = bert_layer.resolved_object
vocab_file = encoder_assets.vocab_file.asset_path.numpy()
do_lower_case = encoder_assets.do_lower_case.numpy()
bert_tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

BERT model selected           : https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4
Preprocessing model auto-selected: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3


# Scripts

In [8]:
#@title "Utilities [RUN ME]"
def read_tfrecord(example):
    """Parse one serialized example into ``(text, class_num, label)``.

    Args:
        example: one serialized record, as yielded by ``tf.data.TFRecordDataset``.

    Returns:
        A 3-tuple of scalar tensors: the ``"text"`` feature (string), the
        ``"class"`` feature (int64) and the ``"label"`` feature (string),
        in that order.
    """
    feature_spec = {
        "class": tf.io.FixedLenFeature([], tf.int64),   # shape [] means scalar
        "text": tf.io.FixedLenFeature([], tf.string),
        "label": tf.io.FixedLenFeature([], tf.string)  # one bytestring
    }
    # decode the TFRecord
    parsed = tf.io.parse_single_example(example, feature_spec)
    return parsed['text'], parsed['class'], parsed['label']

def load_dataset(filenames):
  """Build a dataset of parsed ``(text, class_num, label)`` records.

  Args:
    filenames: TFRecord file path(s) to read.

  Returns:
    A ``tf.data.Dataset`` whose elements are produced by ``read_tfrecord``.
    Element order is not deterministic: files are read in parallel and the
    dataset is explicitly marked as non-deterministic.
  """
  # Allow elements to be yielded out of order in exchange for throughput.
  read_options = tf.data.Options()
  read_options.experimental_deterministic = False

  records = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)
  return records.with_options(read_options).map(read_tfrecord, num_parallel_calls=AUTO)

def get_batched_dataset(dataset, train=False, shuffle_buffer_size=None):
  """Turn a dataset of single examples into fixed-size, prefetched batches.

  Args:
    dataset: ``tf.data.Dataset`` of individual (features, label) examples.
    train: when True the examples are shuffled and repeated indefinitely,
      so the consumer must bound each epoch (e.g. ``steps_per_epoch``).
    shuffle_buffer_size: size of the shuffle buffer used when ``train`` is
      True. Defaults to the module-level ``num_train_examples``.

  Returns:
    A ``tf.data.Dataset`` yielding batches of exactly ``BATCH_SIZE`` examples
    (the incomplete final batch is dropped).
  """
  # Cache the finite, per-example dataset *before* shuffle/repeat. Caching
  # after `repeat()` would try to cache an infinite stream (the cache never
  # completes and keeps growing), and caching after `shuffle()` would capture
  # the first shuffled pass instead of caching the raw examples.
  dataset = dataset.cache()
  if train:
    if shuffle_buffer_size is None:
      shuffle_buffer_size = num_train_examples
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.repeat()
  dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
  # prefetch next batch while training (autotune prefetch buffer size)
  return dataset.prefetch(AUTO)

In [9]:
#@title "Utilities [RUN ME]"
def to_feature(text, label, label_list=label_list, max_seq_length=max_seq_len, tokenizer=bert_tokenizer):
  """Convert one eager (text, label) pair into BERT input features.

  Meant to be called through ``tf.py_function``: both ``text`` and ``label``
  must be eager tensors because ``.numpy()`` is called on them.

  Args:
    text: eager tensor holding the input sentence.
    label: eager tensor holding the class id.
    label_list: the allowed label values.
    max_seq_length: sequence length passed to the feature converter.
    tokenizer: tokenizer passed to the feature converter.

  Returns:
    Tuple ``(input_ids, input_mask, segment_ids, label_id)`` taken from the
    converted feature object.
  """
  raw_example = classifier_data_lib.InputExample(
      guid=None,
      text_a=text.numpy(),
      text_b=None,
      label=label.numpy(),
  )
  converted = classifier_data_lib.convert_single_example(
      0, raw_example, label_list, max_seq_length, tokenizer)

  return (converted.input_ids, converted.input_mask, converted.segment_ids, converted.label_id)
  
def to_feature_map(text, label):
  """Graph-compatible wrapper around ``to_feature`` for ``Dataset.map``.

  Args:
    text: string tensor with the input sentence.
    label: tensor with the class id.

  Returns:
    ``(features, label_id)`` where ``features`` is a dict with the keys
    ``input_word_ids``, ``input_mask`` and ``input_type_ids`` (each an int32
    tensor of static shape ``[max_seq_len]``) and ``label_id`` is a scalar
    int32 tensor.
  """
  output_dtypes = [tf.int32] * 4
  input_ids, input_mask, segment_ids, label_id = tf.py_function(
      to_feature, inp=[text, label], Tout=output_dtypes)

  # Declare the static shapes explicitly on the py_function outputs so that
  # downstream batching and the Keras inputs see fixed shapes.
  for sequence_tensor in (input_ids, segment_ids, input_mask):
    sequence_tensor.set_shape([max_seq_len])
  label_id.set_shape([])

  features = {
      'input_word_ids': input_ids,
      'input_mask': input_mask,
      'input_type_ids': segment_ids,
  }
  return (features, label_id)
  

In [10]:
#@title "Utilities [RUN ME]"
def create_model():
  """Assemble the binary classifier: BERT encoder followed by a dense head.

  The module-level ``bert_layer`` is used as the encoder; its ``pooled_output``
  goes through dropout, three dense layers (384, 192, 96 units) and a final
  single-unit sigmoid layer.

  Returns:
    An uncompiled ``tf.keras.Model`` named ``'prediction'`` that takes the
    dict of ``input_word_ids`` / ``input_mask`` / ``input_type_ids`` inputs
    (each int32 of length ``max_seq_len``) and outputs one probability.
  """
  def _token_input(name):
    # One int32 input of length max_seq_len per BERT input stream.
    return tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32, name=name)

  encoder_inputs = {
      'input_word_ids': _token_input("input_word_ids"),
      'input_mask': _token_input("input_mask"),
      'input_type_ids': _token_input("input_type_ids"),
  }

  hidden = bert_layer(encoder_inputs)['pooled_output']
  hidden = tf.keras.layers.Dropout(0.2)(hidden)

  # NOTE(review): 'ReLU' is not the usual lowercase identifier ('relu'); it
  # resolved when this notebook ran, so it is kept unchanged here.
  for units, layer_name in ((384, 'dense_384'), (192, 'dense_192'), (96, 'dense_96')):
    hidden = tf.keras.layers.Dense(units, activation='ReLU', name=layer_name)(hidden)

  output = tf.keras.layers.Dense(1, activation='sigmoid', name='classifier')(hidden)

  return tf.keras.Model(encoder_inputs, outputs=output, name='prediction')

# Read TFRecord data

In [11]:
#instantiate read_data utility
read_data = read_tfrec_data(tfrecFiles_path, VALIDATION_SPLIT=0.2, TESTING_SPLIT=0.2, MODE=1)

# splitting data files between training, validation and test
filenames, training_filenames, validation_filenames, testing_filenames = read_data.get_tfrec_files()
if len(filenames) == 0:
  # Fail with a clear message instead of a ZeroDivisionError further down.
  raise FileNotFoundError(f"No TFRecord files found under {tfrecFiles_path!r}")

# Presumably the total number of examples across all TFRecord files - TODO confirm.
TOTAL_EXAMPLES = 31986
# Per-file count used only to size the training shuffle buffer; presumably an
# upper bound on the number of examples stored in one file - TODO confirm.
SHUFFLE_EXAMPLES_PER_FILE = 500

# Examples are assumed to be spread evenly over the files (integer division).
examples_per_file = TOTAL_EXAMPLES // len(filenames)

# Shuffle buffer size read by get_batched_dataset(..., train=True).
num_train_examples = SHUFFLE_EXAMPLES_PER_FILE * len(training_filenames)

validation_steps = (examples_per_file * len(validation_filenames)) // BATCH_SIZE
steps_per_epoch = (examples_per_file * len(training_filenames)) // BATCH_SIZE

# Step counts handed to the optimizer: the first 10% of all steps are warm-up.
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = num_train_steps // 10

print("With a batch size of {}, there will be {} batches per training epoch and {} batch(es) per validation run.".format(BATCH_SIZE, steps_per_epoch, validation_steps))

Pattern matches 64 data files. Splitting dataset into 52 training files , 10 validation files and 2 test files
With a batch size of 32, there will be 810 batches per training epoch and 155 batch(es) per validation run.


# Load TFRecord into TF Dataset

In [12]:
# create the datasets
# One raw dataset per split, with the input pipeline pinned to the CPU.
with tf.device('/cpu:0'):
  train_ds, val_ds, test_ds = (
      load_dataset(split_files)
      for split_files in (training_filenames, validation_filenames, testing_filenames)
  )

In [13]:
# Peek at a few raw training records as a sanity check.
# The loop variables are deliberately not called `text`, which would overwrite
# the `tensorflow_text as text` module alias created in the imports cell.
for sample_text, sample_class, sample_label in train_ds.take(10):
  print(f"text : {sample_text.numpy()}, class : {sample_class.numpy()}, label : {sample_label.numpy()}")

text : b'american singer and actor al martino dies at age', class : 0, label : b'nonclickbaits'
text : b'suicide bomber kills at least eight in eastern afghanistan', class : 0, label : b'nonclickbaits'
text : b'of our favourite television moments in', class : 1, label : b'clickbaits'
text : b'bombings reported in bangkok', class : 0, label : b'nonclickbaits'
text : b'us x factor producers confirm line up changes', class : 0, label : b'nonclickbaits'
text : b'ad losses put squeeze on tv news', class : 0, label : b'nonclickbaits'
text : b'at least killed by hurricane katrina serious flooding across affected region', class : 0, label : b'nonclickbaits'
text : b'india announces lok sabha elections for', class : 0, label : b'nonclickbaits'
text : b'speedskating provided marsicano a refuge from bullying and depression', class : 0, label : b'nonclickbaits'
text : b'belgian ship hijacked off horn of africa by somali pirates', class : 0, label : b'nonclickbaits'


In [14]:
def _drop_label_name(text, class_num, label):
  """Keep only the (text, class id) pair; the string label is dropped."""
  return (text, class_num)

# NOTE: this cell rebinds train_ds / val_ds / test_ds, so it must run exactly
# once after the datasets are created.
with tf.device('/cpu:0'):
  train_ds = train_ds.map(_drop_label_name)
  val_ds = val_ds.map(_drop_label_name)
  test_ds = test_ds.map(_drop_label_name)

# Modeling

## BERT preprocessing

In [15]:
# Tokenize every split into BERT features and batch it. Only the training
# split is shuffled and repeated (train=True).
with tf.device('/cpu:0'):
  # train
  train_data = get_batched_dataset(
      train_ds.map(to_feature_map, num_parallel_calls=tf.data.experimental.AUTOTUNE),
      train=True)

  # valid
  val_data = get_batched_dataset(
      val_ds.map(to_feature_map, num_parallel_calls=tf.data.experimental.AUTOTUNE))

  # test
  test_data = get_batched_dataset(
      test_ds.map(to_feature_map, num_parallel_calls=tf.data.experimental.AUTOTUNE))
  

In [16]:
# Display the structure (shapes and dtypes) of one training batch as a sanity
# check that it matches the model's three named inputs plus the label.
train_data.element_spec

({'input_mask': TensorSpec(shape=(32, 128), dtype=tf.int32, name=None),
  'input_type_ids': TensorSpec(shape=(32, 128), dtype=tf.int32, name=None),
  'input_word_ids': TensorSpec(shape=(32, 128), dtype=tf.int32, name=None)},
 TensorSpec(shape=(32,), dtype=tf.int32, name=None))

# Build classifier

In [17]:
# Instantiate the BERT-based classifier and print its layer / parameter summary.
classifier_model = create_model()
classifier_model.summary()

Model: "prediction"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_mask (InputLayer)         [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_type_ids (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_word_ids (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
bert_encoder (KerasLayer)       {'encoder_outputs':  109482241   input_mask[0][0]                 
                                                                 input_type_ids[0][0]    

In [18]:
#Load weights if available
# Test for the checkpoint itself rather than its parent folder: weights saved
# in TensorFlow checkpoint format are stored as `<prefix>.index` plus data
# shards, so an existing but empty checkpoints folder must not trigger a load
# (load_weights would fail on the missing checkpoint).
if os.path.exists(model_checkpoint_path + '.index'):
  print('loading weight')
  classifier_model.load_weights(model_checkpoint_path)
else:
  print('no checkpoint found, starting from the TF Hub encoder weights')

loading weight


In [19]:
# Metrics reported during training / evaluation: raw confusion-matrix counts
# first, then the derived scores.
METRICS = [
    keras.metrics.TruePositives(name='tp'),
    keras.metrics.FalsePositives(name='fp'),
    keras.metrics.TrueNegatives(name='tn'),
    keras.metrics.FalseNegatives(name='fn'),
    keras.metrics.BinaryAccuracy(name='accuracy'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    keras.metrics.AUC(name='auc'),
    keras.metrics.AUC(name='prc', curve='PR'),  # precision-recall curve
]

# Stop when the validation loss has not improved for 5 epochs.
# NOTE(review): with `epochs = 2` in the config cell this patience can never
# be reached - confirm whether a smaller patience or more epochs was intended.
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

# AdamW optimizer from the TF Model Garden helper, configured with the total
# and warm-up step counts computed when the data files were split.
optimizer = optimization.create_optimizer(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw',
)

# Binary cross-entropy matches the single sigmoid output unit of the model.
classifier_model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=METRICS,
)

#tf.keras.utils.plot_model(model=classifier_model, show_shapes=True, dpi=60)

# Train classifier model

In [20]:
# Fine-tune the classifier. The training dataset repeats indefinitely, so the
# epoch length is bounded explicitly through steps_per_epoch.
history = classifier_model.fit(
    train_data,
    epochs=epochs,
    steps_per_epoch=steps_per_epoch,
    validation_data=val_data,
    validation_steps=validation_steps,
    callbacks=[es],
)

Epoch 1/2
Epoch 2/2


In [21]:
# Save model weights
# Written to the checkpoint prefix on Drive that the "Load weights if available"
# cell reads back, so a later session can continue from these weights.
classifier_model.save_weights(model_checkpoint_path)

# Evaluate model

In [26]:
# Score the model on the test split and print each compiled metric next to
# its name (the loss comes first, followed by the entries of METRICS).
results = classifier_model.evaluate(test_data)

for metric_name, metric_value in zip(classifier_model.metrics_names, results):
  print(metric_name, ': ', metric_value)
print()

loss :  0.11084893345832825
tp :  477.0
fp :  11.0
tn :  496.0
fn :  8.0
accuracy :  0.9808467626571655
precision :  0.9774590134620667
recall :  0.983505129814148
auc :  0.9886781573295593
prc :  0.9828189611434937



# Export for inference

In [None]:
# Export the complete model (architecture + weights) for inference. The optimizer
# state is left out because it is only needed to resume training.
# NOTE(review): the path has no file extension, so this presumably writes a
# SavedModel directory - confirm against the TF version in use.
classifier_model.save(saved_model_path, include_optimizer=False)