Follow these tutorials:

- For BERTweet: https://www.kaggle.com/code/tylerrosacker/bertweet-transfer-learning
- Class notebook: https://github.com/datasci-w266/2023-spring-main/blob/master/materials/walkthrough_notebooks/bert_as_black_box/Keras_HuggingFace_Transformers_BERT_notebook.ipynb

## Imports

In [1]:
!pip install -q transformers

In [7]:
pip install emoji==0.6.0

Collecting emoji==0.6.0
  Downloading emoji-0.6.0.tar.gz (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.0/51.0 kB[0m [31m325.9 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25ldone
[?25h  Created wheel for emoji: filename=emoji-0.6.0-py3-none-any.whl size=49720 sha256=2efb9c2c6724a99be3b6cf5cb19440b017df2a7c12fea85089439b7df62abf11
  Stored in directory: /Users/cabanela/Library/Caches/pip/wheels/1b/bd/d9/310c33c45a553798a714e27e3b8395d37128425442b8c78e07
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-0.6.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np

import tensorflow as tf
# import tensorflow_addons as tfa
import transformers
from transformers import AutoTokenizer,TFRobertaModel
# from transformers import AutoTokenizer,AutoModel

In [2]:
print(transformers.__version__)

4.27.4


## Load Data

In [3]:
# Load data from csv files
disability_df_train = pd.read_csv('data/disability-dataset-train.csv')
disability_df_val = pd.read_csv('data/disability-dataset-val.csv')
disability_df_test = pd.read_csv('data/disability-dataset-test.csv')

In [4]:
# Shuffle the data
disability_df_train = disability_df_train.copy().sample(frac=1, random_state=266)
disability_df_val = disability_df_val.copy().sample(frac=1, random_state=266)
disability_df_test = disability_df_test.copy().sample(frac=1, random_state=266)

In [5]:
# # Form tensors of labels and features.
# disability_train_labels = tf.convert_to_tensor(disability_df_train['toxicity_binary'])
# disability_val_labels = tf.convert_to_tensor(disability_df_val['toxicity_binary'])
# disability_test_labels = tf.convert_to_tensor(disability_df_test['toxicity_binary'])

# disability_train_examples = tf.convert_to_tensor(disability_df_train['comment_text'])
# disability_val_examples = tf.convert_to_tensor(disability_df_val['comment_text'])
# disability_test_examples = tf.convert_to_tensor(disability_df_test['comment_text'])

## Data Imbalance Stuff

Count number of positive and negative labels:

In [6]:
neg, pos = np.bincount(disability_df_train['toxicity_binary'])
total = neg + pos
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))

initial_bias = np.log([pos/neg])

Examples:
    Total: 13438
    Positive: 2831 (21.07% of total)



Calculate class weights:

In [7]:
# Scaling by total/2 helps keep the loss to a similar magnitude.
# The sum of the weights of all examples stays the same.
weight_for_0 = (1 / neg) * (total / 2.0)
weight_for_1 = (1 / pos) * (total / 2.0)

class_weight = {0: weight_for_0, 1: weight_for_1}

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

Weight for class 0: 0.63
Weight for class 1: 2.37


## Tokenize Input

Download the tokenizer corresponding to BERTweet from HuggingFace:
**TODO: Evaluate whether we need to download emoji tokenizer**

In [8]:
# For transformers v4.x+:
bertweet_tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


EDA revealed that most comments have 128 words or less, so we'll set MAX_SEQUENCE_LENGTH to 128 here.

In [9]:
MAX_SEQUENCE_LENGTH = 128
batch_size = 32

In [8]:
# def tokenize_function(examples):
#     return bertweet_tokenizer(examples["comment_text"].tolist(), max_length = MAX_SEQUENCE_LENGTH, padding="max_length", truncation=True, return_tensors = 'tf')

# tf_train_dataset = disability_df_train.map(tokenize_function, batched=True)
# tf_val_dataset = disability_df_val.map(tokenize_function, batched=True)

In [10]:
tf_train_dataset = bertweet_tokenizer(disability_df_train['comment_text'].tolist(), 
                    max_length = MAX_SEQUENCE_LENGTH,
                    padding="max_length", 
                    truncation=True,
                    return_tensors = 'tf').data
tf_val_dataset = bertweet_tokenizer(disability_df_val['comment_text'].tolist(), 
                    max_length = MAX_SEQUENCE_LENGTH,
                    padding="max_length", 
                    truncation=True,
                    return_tensors = 'tf').data
tf_test_dataset = bertweet_tokenizer(disability_df_test['comment_text'].tolist(), 
                    max_length = MAX_SEQUENCE_LENGTH,
                    padding="max_length", 
                    truncation=True,
                    return_tensors = 'tf').data

Metal device set to: Apple M1 Pro


In [11]:
tf_train_dataset['input_ids'].shape

TensorShape([13438, 128])

## Encode Data

In [12]:
len(disability_df_train['toxicity_binary'])

13438

In [13]:
train_features = {x: tf_train_dataset[x] for x in bertweet_tokenizer.model_input_names}
train_tf_dataset = tf.data.Dataset.from_tensor_slices((train_features, disability_df_train['toxicity_binary']))
train_tf_dataset = train_tf_dataset.shuffle(len(tf_train_dataset)).batch(batch_size)

val_features = {x: tf_val_dataset[x] for x in bertweet_tokenizer.model_input_names}
val_tf_dataset = tf.data.Dataset.from_tensor_slices((val_features, disability_df_val['toxicity_binary']))
val_tf_dataset = val_tf_dataset.batch(batch_size)

## Transfer Learning by Freezing all BERTweet Layers, but training top layer

In [14]:
bertweet_model_base = TFRobertaModel.from_pretrained("vinai/bertweet-base")

Some layers from the model checkpoint at vinai/bertweet-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at vinai/bertweet-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [44]:
# bertweet_model_base.save('saved_models/bertweet_base')



INFO:tensorflow:Assets written to: saved_models/assets


INFO:tensorflow:Assets written to: saved_models/assets


In [8]:
# DOESN'T WORK, GIVES WEIRD ERRORS, SOMETHING MISSING WHEN I TRY TO CALL THE LOADED MODEL
# bertweet_model_loaded = tf.keras.models.load_model('saved_models/bertweet_base')



### Build a Model

In [15]:
# Layer Hyperparameters
hidden_size = 16
dropout = 0.3

# Correct bias initialization to address data imbalance
output_bias = tf.keras.initializers.Constant(initial_bias)

# BERTweet Inputs
ids = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_ids')
att = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='token_type_ids')
tok = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='attention_mask')

bertweet_model_base.trainable = False

bertweet_output = bertweet_model_base(ids,attention_mask=att,token_type_ids=tok,training=False)

cls_token = bertweet_output['last_hidden_state'][:,0,:]

hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(cls_token)

hidden = tf.keras.layers.Dropout(dropout)(hidden)

classification = tf.keras.layers.Dense(1, activation='sigmoid',bias_initializer=output_bias,name='classification_layer')(hidden)

classification_model = tf.keras.Model(inputs=[ids, att, tok], outputs=[classification])
classification_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 128)]        0           []                               
                                                                                                  
 token_type_ids (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 tf_roberta_model (TFRobertaMod  TFBaseModelOutputWi  134899968  ['input_ids[0][0]',              
 el)                            thPoolingAndCrossAt               'token_type_ids[0][0]',     

In [15]:
# # Build a simple classification model with BERTweet. Use the CLS token for classification purposes
# #     """

# # Freeze all layers of pre-trained BERTweet model

# hidden_size = 16, 
# dropout=0.3,
# # learning_rate=0.00005
# bertweet_model_loaded.trainable = False


# input_ids = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype=tf.int64, name='input_ids_layer')
# # token_type_ids = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype=tf.int64, name='token_type_ids_layer')
# attention_mask = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype=tf.int64, name='attention_mask_layer')

# # bertweet_inputs = {'input_ids': input_ids,
# #                'token_type_ids': token_type_ids,
# #                'attention_mask': attention_mask}      

# # BERTweet only accepts input_ids and attention_mask as input, not token_type_ids

# bertweet_inputs = {'input_ids': input_ids,
#                'attention_mask': attention_mask}      

# # bertweet_out = bertweet_model_loaded(bertweet_inputs, training=False)
# bertweet_out = bertweet_model_loaded(bertweet_inputs)

# # pooler_token = bertweet_out[1]
# cls_token = bertweet_out[0][:, 0, :]

# hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(cls_token)


# hidden = tf.keras.layers.Dropout(dropout)(hidden)  


# classification = tf.keras.layers.Dense(1, activation='sigmoid',name='classification_layer')(hidden)

# classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=[classification])
# classification_model.summary()

In [16]:
# hidden_size = 16, 
# dropout=0.3,

# # Freeze all layers of pre-trained BERTweet model
# bertweet_model_loaded.trainable = False

# input_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_ids_input_ids')
# attention_mask = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='attention_mask')

# # input_ids = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_ids_input_ids')
# # attention_mask = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='attention_mask')

# # BERTweet only accepts input_ids and attention_mask as input, not token_type_ids
# bertweet_inputs = {'input_ids': input_ids,
#                'attention_mask': attention_mask}

# # bertweet_out = bertweet_model_loaded(bertweet_inputs)
# bertweet_out = bertweet_model_loaded(bertweet_inputs)

### Train the top layer

In [17]:
# WARNING:absl:At this time, the v2.11+ optimizer `tf.keras.optimizers.Adam` runs slowly on M1/M2 Macs,
# please use the legacy Keras optimizer instead, located at `tf.keras.optimizers.legacy.Adam`.
classification_model.compile(optimizer=tf.keras.optimizers.legacy.Adam(),
                             loss=tf.keras.losses.BinaryCrossentropy(from_logits=False), 
                             metrics=[tf.keras.metrics.BinaryAccuracy(),
                                     tf.keras.metrics.Precision(),
                                     tf.keras.metrics.Recall()])
# I tried 10 epochs and model wasn't really learning, already converged
batch_size = 32
epochs = 3
classification_model.fit(train_tf_dataset, batch_size=32, epochs=3, validation_data=val_tf_dataset,class_weight=class_weight)

Epoch 1/3


2023-04-09 00:51:56.372362: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x29ba9b9d0>

In [18]:
# What version of Python do you have?
import sys

import tensorflow.keras
import pandas as pd
import sklearn as sk
import tensorflow as tf
import platform

print(f"Python Platform: {platform.platform()}")
print(f"Tensor Flow Version: {tf.__version__}")
print(f"Keras Version: {tensorflow.keras.__version__}")
print()
print(f"Python {sys.version}")
print(f"Pandas {pd.__version__}")
print(f"Scikit-Learn {sk.__version__}")
gpu = len(tf.config.list_physical_devices('GPU'))>0
print("GPU is", "available" if gpu else "NOT AVAILABLE")


Python Platform: macOS-12.5-arm64-arm-64bit
Tensor Flow Version: 2.12.0
Keras Version: 2.12.0

Python 3.10.10 | packaged by conda-forge | (main, Mar 24 2023, 20:12:31) [Clang 14.0.6 ]
Pandas 2.0.0
Scikit-Learn 1.2.2
GPU is available


### Do a round of fine-tuning of the entire model

In [23]:
bertweet_model_base.trainable = True
classification_model.summary()

# WARNING:absl:At this time, the v2.11+ optimizer `tf.keras.optimizers.Adam` runs slowly on M1/M2 Macs,
# please use the legacy Keras optimizer instead, located at `tf.keras.optimizers.legacy.Adam`.
classification_model.compile(
    optimizer=tf.keras.optimizers.legacy.Adam(1e-5),  # Low learning rate
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False), 
    metrics=['accuracy']
)

epochs = 5
classification_model.fit(train_tf_dataset, epochs=epochs, validation_data=val_tf_dataset)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 128)]        0           []                               
                                                                                                  
 token_type_ids (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 tf_roberta_model (TFRobertaMod  TFBaseModelOutputWi  134899968  ['input_ids[0][0]',              
 el)                            thPoolingAndCrossAt               'token_type_ids[0][0]',     









KeyboardInterrupt: 

In [24]:
bertweet_model_base(ids,attention_mask=att,token_type_ids=tok, training=False)

TFBaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=<KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'tf_roberta_model')>, pooler_output=<KerasTensor: shape=(None, 768) dtype=float32 (created by layer 'tf_roberta_model')>, past_key_values=None, hidden_states=None, attentions=None, cross_attentions=None)

## Fine-Tuning by Freezing only some Layers

## Save weights/model checkpoint