Follow these tutorials:

- For BERTweet: https://www.kaggle.com/code/tylerrosacker/bertweet-transfer-learning
- Class notebook: https://github.com/datasci-w266/2023-spring-main/blob/master/materials/walkthrough_notebooks/bert_as_black_box/Keras_HuggingFace_Transformers_BERT_notebook.ipynb

## Imports

In [1]:
!pip install -q transformers

In [7]:
pip install emoji==0.6.0

Collecting emoji==0.6.0
  Downloading emoji-0.6.0.tar.gz (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.0/51.0 kB[0m [31m325.9 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25ldone
[?25h  Created wheel for emoji: filename=emoji-0.6.0-py3-none-any.whl size=49720 sha256=2efb9c2c6724a99be3b6cf5cb19440b017df2a7c12fea85089439b7df62abf11
  Stored in directory: /Users/cabanela/Library/Caches/pip/wheels/1b/bd/d9/310c33c45a553798a714e27e3b8395d37128425442b8c78e07
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-0.6.0
Note: you may need to restart the kernel to use updated packages.


In [42]:
import pandas as pd
import numpy as np

import tensorflow as tf
# import tensorflow_addons as tfa
import transformers
from transformers import AutoTokenizer,TFRobertaModel
# from transformers import AutoTokenizer,AutoModel

In [2]:
print(transformers.__version__)

4.27.4


## Load Data

In [3]:
# Load data from csv files
disability_df_train = pd.read_csv('data/disability-dataset-train.csv')
disability_df_val = pd.read_csv('data/disability-dataset-val.csv')
disability_df_test = pd.read_csv('data/disability-dataset-test.csv')

In [4]:
# Shuffle the data
disability_df_train = disability_df_train.copy().sample(frac=1, random_state=266)
disability_df_val = disability_df_val.copy().sample(frac=1, random_state=266)
disability_df_test = disability_df_test.copy().sample(frac=1, random_state=266)

In [None]:
# # Form tensors of labels and features.
# disability_train_labels = tf.convert_to_tensor(disability_df_train['toxicity_binary'])
# disability_val_labels = tf.convert_to_tensor(disability_df_val['toxicity_binary'])
# disability_test_labels = tf.convert_to_tensor(disability_df_test['toxicity_binary'])

# disability_train_examples = tf.convert_to_tensor(disability_df_train['comment_text'])
# disability_val_examples = tf.convert_to_tensor(disability_df_val['comment_text'])
# disability_test_examples = tf.convert_to_tensor(disability_df_test['comment_text'])

## Tokenize Input

Download the tokenizer corresponding to BERTweet from HuggingFace:
**TODO: Evaluate whether we need to download emoji tokenizer**

In [7]:
# For transformers v4.x+:
bertweet_tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


EDA revealed that most comments have 128 words or less, so we'll set MAX_SEQUENCE_LENGTH to 128 here.

In [35]:
# MAX_SEQUENCE_LENGTH = 128
# batch_size = 32

In [36]:
# def tokenize_function(examples):
#     return bertweet_tokenizer(examples["comment_text"].tolist(), max_length = MAX_SEQUENCE_LENGTH, padding="max_length", truncation=True, return_tensors = 'tf')

# tf_train_dataset = disability_df_train.map(tokenize_function, batched=True)
# tf_val_dataset = disability_df_val.map(tokenize_function, batched=True)

In [37]:
MAX_SEQUENCE_LENGTH = 128
batch_size = 32

tf_train_dataset = bertweet_tokenizer(disability_df_train['comment_text'].tolist(), 
                    max_length = MAX_SEQUENCE_LENGTH,
                    padding="max_length", 
                    truncation=True,
                    return_tensors = 'tf').data
tf_val_dataset = bertweet_tokenizer(disability_df_val['comment_text'].tolist(), 
                    max_length = MAX_SEQUENCE_LENGTH,
                    padding="max_length", 
                    truncation=True,
                    return_tensors = 'tf').data
tf_test_dataset = bertweet_tokenizer(disability_df_test['comment_text'].tolist(), 
                    max_length = MAX_SEQUENCE_LENGTH,
                    padding="max_length", 
                    truncation=True,
                    return_tensors = 'tf').data

In [21]:
tf_train_dataset['input_ids'].shape

TensorShape([13438, 128])

## Encode Data

In [26]:
len(disability_df_train['toxicity_binary'])

13438

In [38]:
train_features = {x: tf_train_dataset[x] for x in bertweet_tokenizer.model_input_names}
train_tf_dataset = tf.data.Dataset.from_tensor_slices((train_features, disability_df_train['toxicity_binary']))
train_tf_dataset = train_tf_dataset.shuffle(len(tf_train_dataset)).batch(batch_size)

val_features = {x: tf_val_dataset[x] for x in bertweet_tokenizer.model_input_names}
val_tf_dataset = tf.data.Dataset.from_tensor_slices((val_features, disability_df_val['toxicity_binary']))
val_tf_dataset = val_tf_dataset.batch(batch_size)

## Transfer Learning by Freezing all BERTweet Layers, but training top layer

In [43]:
bertweet_model_base = TFRobertaModel.from_pretrained("vinai/bertweet-base")

Downloading tf_model.h5:   0%|          | 0.00/740M [00:00<?, ?B/s]

Some layers from the model checkpoint at vinai/bertweet-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at vinai/bertweet-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [44]:
bertweet_model_base.save('saved_models/bertweet_base')



INFO:tensorflow:Assets written to: saved_models/assets


INFO:tensorflow:Assets written to: saved_models/assets


In [45]:
bertweet_model_loaded = tf.keras.models.load_model('saved_models/bertweet_base')





### Build a Model

In [47]:
# Build a simple classification model with BERTweet. Use the CLS token for classification purposes
#     """

# Freeze all layers of pre-trained BERTweet model

hidden_size = 16, 
dropout=0.3,
# learning_rate=0.00005
bertweet_model_loaded.trainable = False


input_ids = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype=tf.int64, name='input_ids_layer')
# token_type_ids = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype=tf.int64, name='token_type_ids_layer')
attention_mask = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype=tf.int64, name='attention_mask_layer')

# bertweet_inputs = {'input_ids': input_ids,
#                'token_type_ids': token_type_ids,
#                'attention_mask': attention_mask}      

# BERTweet only accepts input_ids and attention_mask as input, not token_type_ids

bertweet_inputs = {'input_ids': input_ids,
               'attention_mask': attention_mask}      

# bertweet_out = bertweet_model_loaded(bertweet_inputs, training=False)
bertweet_out = bertweet_model_loaded(bertweet_inputs)

# pooler_token = bertweet_out[1]
cls_token = bertweet_out[0][:, 0, :]

hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(cls_token)


hidden = tf.keras.layers.Dropout(dropout)(hidden)  


classification = tf.keras.layers.Dense(1, activation='sigmoid',name='classification_layer')(hidden)

classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=[classification])
classification_model.summary()

ValueError: Exception encountered when calling layer "tf_roberta_model" (type TFRobertaModel).

Could not find matching concrete function to call loaded from the SavedModel. Got:
  Positional arguments (14 total):
    * {'attention_mask': <tf.Tensor 'input_ids_2:0' shape=(None, 128) dtype=int64>,
 'input_ids': <tf.Tensor 'input_ids:0' shape=(None, 128) dtype=int64>,
 'token_type_ids': <tf.Tensor 'input_ids_1:0' shape=(None, 128) dtype=int64>}
    * None
    * None
    * None
    * None
    * None
    * None
    * None
    * None
    * None
    * None
    * None
    * None
    * False
  Keyword arguments: {}

 Expected these arguments to match one of the following 4 option(s):

Option 1:
  Positional arguments (14 total):
    * {'attention_mask': TensorSpec(shape=(None, None), dtype=tf.int32, name='attention_mask'),
 'input_ids': TensorSpec(shape=(None, None), dtype=tf.int32, name='input_ids_input_ids')}
    * None
    * None
    * None
    * None
    * None
    * None
    * None
    * None
    * None
    * None
    * None
    * None
    * True
  Keyword arguments: {}

Option 2:
  Positional arguments (14 total):
    * {'attention_mask': TensorSpec(shape=(None, None), dtype=tf.int32, name='attention_mask'),
 'input_ids': TensorSpec(shape=(None, None), dtype=tf.int32, name='input_ids_input_ids')}
    * None
    * None
    * None
    * None
    * None
    * None
    * None
    * None
    * None
    * None
    * None
    * None
    * False
  Keyword arguments: {}

Option 3:
  Positional arguments (14 total):
    * {'attention_mask': TensorSpec(shape=(None, None), dtype=tf.int32, name='input_ids_attention_mask'),
 'input_ids': TensorSpec(shape=(None, None), dtype=tf.int32, name='input_ids_input_ids')}
    * None
    * None
    * None
    * None
    * None
    * None
    * None
    * None
    * None
    * None
    * None
    * None
    * True
  Keyword arguments: {}

Option 4:
  Positional arguments (14 total):
    * {'attention_mask': TensorSpec(shape=(None, None), dtype=tf.int32, name='input_ids_attention_mask'),
 'input_ids': TensorSpec(shape=(None, None), dtype=tf.int32, name='input_ids_input_ids')}
    * None
    * None
    * None
    * None
    * None
    * None
    * None
    * None
    * None
    * None
    * None
    * None
    * False
  Keyword arguments: {}

Call arguments received by layer "tf_roberta_model" (type TFRobertaModel):
  • attention_mask={'input_ids': 'tf.Tensor(shape=(None, 128), dtype=int64)', 'token_type_ids': 'tf.Tensor(shape=(None, 128), dtype=int64)', 'attention_mask': 'tf.Tensor(shape=(None, 128), dtype=int64)'}
  • token_type_ids=None
  • position_ids=None
  • head_mask=None
  • inputs_embeds=None
  • encoder_hidden_states=None
  • encoder_attention_mask=None
  • past_key_values=None
  • use_cache=None
  • output_attentions=None
  • output_hidden_states=None
  • return_dict=None
  • training=False

In [77]:
bertweet_tokenizer("This is a test sentence")

{'input_ids': [0, 126, 17, 11, 1156, 5199, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [None]:
bertweet_model_loaded(bertweet_inputs)

In [83]:
test_sentence_tokenized

{'input_ids': [0, 126, 17, 11, 1156, 5199, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [84]:
tf.convert_to_tensor(test_sentence_tokenized['input_ids'],None)

<tf.Tensor: shape=(7,), dtype=int32, numpy=array([   0,  126,   17,   11, 1156, 5199,    2], dtype=int32)>

In [85]:
test_sentence_tokenized = bertweet_tokenizer(["This is a test sentence", "This is another test sentence"])
test_sentence_inputs = {'input_ids': tf.convert_to_tensor(test_sentence_tokenized['input_ids']),
               'attention_mask': tf.convert_to_tensor(test_sentence_tokenized['attention_mask'])}
test_sentence_output = bertweet_model_loaded(test_sentence_inputs)

2023-04-08 18:27:19.905427: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


In [98]:
# test_sentence_output['last_hidden_state'][:, 0, :]
test_sentence_output['last_hidden_state'][:, 0, :]

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.06417338,  0.2017336 ,  0.1931608 , ..., -0.04359879,
        -0.00547554,  0.00307681],
       [-0.04666793,  0.2512347 ,  0.2019499 , ..., -0.02994854,
         0.01557533,  0.01622131]], dtype=float32)>

In [58]:
hidden_size = 16, 
dropout=0.3,

# Freeze all layers of pre-trained BERTweet model
bertweet_model_loaded.trainable = False

input_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_ids_input_ids')
attention_mask = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='attention_mask')

# input_ids = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_ids_input_ids')
# attention_mask = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='attention_mask')

# BERTweet only accepts input_ids and attention_mask as input, not token_type_ids
bertweet_inputs = {'input_ids': input_ids,
               'attention_mask': attention_mask}

# bertweet_out = bertweet_model_loaded(bertweet_inputs)
bertweet_out = bertweet_model_loaded(bertweet_inputs)

In [59]:
bertweet_out

{'last_hidden_state': <KerasTensor: shape=(None, None, 768) dtype=float32 (created by layer 'tf_roberta_model')>,
 'pooler_output': <KerasTensor: shape=(None, 768) dtype=float32 (created by layer 'tf_roberta_model')>}

In [72]:
bertweet_out['last_hidden_state'][:, 0, :][-1]

<KerasTensor: shape=(768,) dtype=float32 (created by layer 'tf.__operators__.getitem_12')>

In [63]:
# pooler_token = bertweet_out[1]
cls_token = bertweet_out['last_hidden_state'][:, 0, :]

hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(cls_token)

hidden = tf.keras.layers.Dropout(dropout)(hidden)  


classification = tf.keras.layers.Dense(1, activation='sigmoid',name='classification_layer')(hidden)

classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=[classification])
classification_model.summary()

TypeError: int() argument must be a string, a bytes-like object or a real number, not 'tuple'

### Train the top layer

In [None]:
classification_model.compile(optimizer=tf.keras.optimizers.Adam(),
                             loss=tf.keras.losses.BinaryCrossentropy(from_logits=False), 
                             metrics=['accuracy','precision','recall','f1-score'])
epochs = 10
model.fit(train_tf_dataset, epochs=epochs, validation_data=val_tf_dataset)

### Do a round of fine-tuning of the entire model

In [None]:
bertweet_model_loaded.trainable = True
classification_model.summary()

model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-5),  # Low learning rate
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False), 
    metrics=['accuracy','precision','recall','f1-score']
)

epochs = 5
model.fit(train_ds, epochs=epochs, validation_data=validation_ds)

## Fine-Tuning by Freezing only some Layers

## Save weights/model checkpoint