In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_addons as tfa
import transformers
from transformers import AutoTokenizer,TFRobertaModel

import pytz
import datetime

In [27]:
# Detect hardware and pick the matching tf.distribute strategy.
try:
    # The ValueError branch below treats a failed resolver construction as
    # "no TPU attached" and falls back to the default strategy.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # tf.distribute.experimental.TPUStrategy is deprecated; use the stable
    # tf.distribute.TPUStrategy entry point instead.
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


In [28]:
# Report the platform, interpreter and library versions used for this run,
# and whether TensorFlow can see a GPU.
import sys
import platform
import sklearn as sk

print(f"Python Platform: {platform.platform()}")
print(f"Tensor Flow Version: {tf.__version__}")
print(f"Keras Version: {tf.keras.__version__}")
print()
print(f"Python {sys.version}")
print(f"Pandas {pd.__version__}")
print(f"Scikit-Learn {sk.__version__}")

# A non-empty list of physical GPU devices means a GPU is usable.
gpu = bool(tf.config.list_physical_devices('GPU'))
if gpu:
    print("GPU is", "available")
else:
    print("GPU is", "NOT AVAILABLE")

Python Platform: macOS-12.5-arm64-arm-64bit
Tensor Flow Version: 2.12.0
Keras Version: 2.12.0

Python 3.10.10 | packaged by conda-forge | (main, Mar 24 2023, 20:12:31) [Clang 14.0.6 ]
Pandas 2.0.0
Scikit-Learn 1.2.2
GPU is available


## Functions and Constants

In [29]:
def to_arrays(df):
  """Split a dataset frame into parallel NumPy arrays of texts and labels.

  Args:
    df: DataFrame with 'comment_text' and 'toxicity_binary' columns.

  Returns:
    Tuple (X, y): the 'comment_text' values and the 'toxicity_binary'
    values, each converted with ``to_numpy()``.
  """
  texts, labels = (df[column].to_numpy()
                   for column in ('comment_text', 'toxicity_binary'))
  return texts, labels

def load_data(group):
  """Read the train/val/test CSV splits for `group` and return them as arrays.

  The three files are read from
  'data/<group>-dataset-train.csv', 'data/<group>-dataset-val.csv' and
  'data/<group>-dataset-test.csv', then each is split with `to_arrays`.

  Args:
    group: Dataset name used as the file-name prefix.

  Returns:
    (X_train, y_train, X_test, y_test, X_val, y_val). Note that the test
    split comes BEFORE the validation split in the returned tuple.
  """
  prefix = 'data/' + group + '-dataset-'
  # Read every file first, then convert, in train / val / test order.
  frames = [pd.read_csv(prefix + split + '.csv')
            for split in ('train', 'val', 'test')]
  (X_train, y_train), (X_val, y_val), (X_test, y_test) = [
      to_arrays(frame) for frame in frames
  ]

  return X_train, y_train, X_test, y_test, X_val, y_val

In [30]:
# Token length every comment is padded/truncated to by tokenizing_pipeline,
# and the default length of the model's three input layers.
MAX_SEQUENCE_LENGTH = 128

In [31]:
def tokenizing_pipeline(X, tokenizer):
  """Tokenize a collection of texts into the three model input tensors.

  Every text is truncated or padded to MAX_SEQUENCE_LENGTH tokens and the
  tokenizer is asked for TensorFlow tensors.

  Args:
    X: Iterable of raw text strings (converted to a list before tokenizing).
    tokenizer: Callable tokenizer whose result exposes `input_ids`,
      `token_type_ids` and `attention_mask` attributes.

  Returns:
    List [input_ids, token_type_ids, attention_mask], in that order.
  """
  encoding_options = dict(max_length=MAX_SEQUENCE_LENGTH,
                          truncation=True,
                          padding='max_length',
                          return_tensors='tf')
  encoded = tokenizer(list(X), **encoding_options)
  return [encoded.input_ids,
          encoded.token_type_ids,
          encoded.attention_mask]

In [32]:
def build_bertweet_cls_model(max_sequence_length=MAX_SEQUENCE_LENGTH,
                             hidden_size=100,
                             dropout=0.3,
                             learning_rate=0.0001,
                             num_train_layers=0):
    """Build and compile a binary classifier on top of the shared BERTweet encoder.

    Three integer inputs (input ids, token type ids, attention mask) are fed
    through the module-level ``bertweet_model``. The hidden state at sequence
    position 0 of its first output is passed through a ReLU dense layer,
    dropout, and a single sigmoid unit.

    Every model built by this function wraps the SAME ``bertweet_model``
    instance, so the freeze state set here is a property of that shared
    encoder, not of the returned model.

    Args:
        max_sequence_length: Length of each of the three input sequences.
        hidden_size: Units in the dense layer between encoder and output.
        dropout: Dropout rate applied after the hidden layer.
        learning_rate: Learning rate of the Adam optimizer.
        num_train_layers: Despite its name, the number of encoder layers,
            counted from the first, that are FROZEN. ``0`` freezes the whole
            BERTweet model.

    Returns:
        A compiled ``tf.keras.Model`` with binary cross-entropy loss and
        binary accuracy, precision, recall and F1 (threshold 0.5) metrics.

    Raises:
        IndexError: If ``num_train_layers`` exceeds the number of encoder layers.
    """
    # bertweet_model is shared across calls. Reset it to fully trainable so a
    # freeze applied by an earlier call does not silently carry over.
    bertweet_model.trainable = True

    if num_train_layers == 0:
        # freeze all pre-trained BERTweet layers
        bertweet_model.trainable = False
    else:
        # partially freeze the first n pre-trained BERTweet layers
        for layer_num in range(num_train_layers):
            bertweet_model.roberta.encoder.layer[layer_num].trainable = False

    input_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}

    # Use the same bertweet model instance
    bert_out = bertweet_model(bert_inputs)

    # Keep only sequence position 0 of the first encoder output as the
    # sentence-level representation.
    cls_token = bert_out[0][:, 0, :]

    hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(cls_token)
    hidden = tf.keras.layers.Dropout(dropout)(hidden)

    classification = tf.keras.layers.Dense(1, activation='sigmoid', name='classification_layer')(hidden)

    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=[classification])

    f1_score = tfa.metrics.F1Score(1, threshold=0.5)

    # The output layer already applies a sigmoid, hence from_logits=False.
    classification_model.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=learning_rate),
                                 loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                                 metrics=[tf.keras.metrics.BinaryAccuracy(),
                                          tf.keras.metrics.Precision(),
                                          tf.keras.metrics.Recall(),
                                          f1_score])

    return classification_model

## Load Data

In [33]:
# Load the 'disability' splits. load_data returns the test split before the
# validation split, so the unpacking order below must stay train, test, val.
X_train_disability, y_train_disability, X_test_disability, y_test_disability, X_val_disability, y_val_disability = load_data('disability')

In [34]:
# Show the first raw training comment as a sanity check.
X_train_disability[0]

"Doesn't work? How do we know? When the country is at the point of legalizing silencers, and the right of the mentally ill to own assault weapons, it's laughable to think we have ANY gun controls."

In [35]:
# Show the second raw training comment as a sanity check.
X_train_disability[1]

'LoL. The mental retardation of the (d)onkeys is stunning.\nThey propose the craziest whackjob laws without one regard to the Constitution.'

## Load BERTweet Model from_pretrained() with normalization=True

In [36]:
# For transformers v4.x+:
# Alternative kept for reference: the same tokenizer without the extra options.
# bertweet_tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)
# Earlier name for the configuration that is active below.
# bertweet_tokenizer_heavy = AutoTokenizer.from_pretrained("vinai/bertweet-base",
#                                                     use_fast=False,
#                                                     normalization=True,
#                                                     add_special_tokens=True,
#                                                     return_attention_mask=True)
# Active configuration: slow tokenizer with normalization=True.
# NOTE(review): add_special_tokens / return_attention_mask are usually passed
# when calling the tokenizer, not to from_pretrained - confirm they have any
# effect here.
bertweet_tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base",
                                                    use_fast=False,
                                                    normalization=True,
                                                    add_special_tokens=True,
                                                    return_attention_mask=True)
# Shared pre-trained encoder; build_bertweet_cls_model reads this global.
bertweet_model = TFRobertaModel.from_pretrained("vinai/bertweet-base")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some layers from the model checkpoint at vinai/bertweet-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at vinai/bertweet-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


## Tokenize Data Splits

In [37]:
# Tokenize each split into [input_ids, token_type_ids, attention_mask],
# the input order expected by the model built in build_bertweet_cls_model.
bertweet_train_inputs_disability = tokenizing_pipeline(X_train_disability, bertweet_tokenizer)
bertweet_test_inputs_disability = tokenizing_pipeline(X_test_disability, bertweet_tokenizer)
bertweet_val_inputs_disability = tokenizing_pipeline(X_val_disability, bertweet_tokenizer)

## Calculate Class Weights for Disability

In [13]:
# Count the label-0 (negative) and label-1 (positive) training examples.
neg, pos = np.bincount(y_train_disability)
total = neg + pos
print(
    'Disability Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'
    .format(total, pos, 100 * pos / total)
)

# Inverse-frequency class weights. Scaling by total/2 helps keep the loss to
# a similar magnitude: the sum of the weights of all examples stays the same.
weight_for_0, weight_for_1 = ((1 / count) * (total / 2.0) for count in (neg, pos))

disability_class_weight = {0: weight_for_0, 1: weight_for_1}

print('Disability Weight for class 0: {:.2f}'.format(disability_class_weight[0]))
print('Disability Weight for class 1: {:.2f}'.format(disability_class_weight[1]))

Disability Examples:
    Total: 12798
    Positive: 2758 (21.55% of total)

Disability Weight for class 0: 0.64
Disability Weight for class 1: 2.32


In [14]:
# Build the classifier with the first 6 encoder layers frozen
# (num_train_layers counts frozen layers) and print its architecture.
# NOTE(review): the saved output of this cell is a ValueError traceback, while
# identical calls further down succeed - re-run to confirm the output is stale.
disability_model = build_bertweet_cls_model(num_train_layers=6, learning_rate=1e-4)
disability_model.summary()

ValueError: in user code:

    File "/Users/cabanela/anaconda3/envs/w266tensorflow/lib/python3.10/site-packages/transformers/modeling_tf_utils.py", line 942, in run_call_with_unpacked_inputs  *
        unpacked_inputs = input_processing(func, config, **fn_args_and_kwargs)
    File "/Users/cabanela/anaconda3/envs/w266tensorflow/lib/python3.10/site-packages/transformers/modeling_tf_utils.py", line 498, in input_processing  *
        raise ValueError(

    ValueError: The following keyword arguments are not supported by this model: ['name'].


## Train Disability Model 

In [None]:
# Train for a single epoch with batch size 64, validating on the validation
# split and re-weighting the loss per class with disability_class_weight.
disability_history = disability_model.fit(bertweet_train_inputs_disability,
                                          y_train_disability,
                                          validation_data=(bertweet_val_inputs_disability, y_val_disability),
                                          batch_size=64,
                                          epochs=1,
                                          class_weight=disability_class_weight)

## Evaluate Disability Model on Test Set

In [None]:
# Evaluate on the held-out test split.
# NOTE(review): despite the "history" name, evaluate() appears to return a
# plain list of loss/metric values (see the list outputs of the evaluate
# cells below), not a History object.
disability_test_history = disability_model.evaluate(bertweet_test_inputs_disability, y_test_disability)

# Save Model & Model Weights

In [38]:
from keras.models import load_model

In [None]:
# NOTE(review): `model` is not defined in the visible notebook, so this cell
# raises NameError on a fresh run - presumably disability_model was intended; confirm.
model.save('test_load_model_epoch1.h5')  # writes the HDF5 file 'test_load_model_epoch1.h5'

In [None]:
# Restore a full model (architecture + weights) from an HDF5 file.
# NOTE(review): this reads 'my_model.h5', but the save cell above writes
# 'test_load_model_epoch1.h5' - the file names need to agree.
loaded_model = load_model('my_model.h5')

In [None]:
# If you need to load the weights into a different architecture (with some layers in common),
# for instance for fine-tuning or transfer-learning, you can load them by layer name:
# NOTE(review): `loadmodel` is not defined in the visible notebook and
# 'my_model_weights.h5' is never written here - template code, confirm before running.
loadmodel.load_weights('my_model_weights.h5', by_name=True)

In [None]:
# Restore previously saved weights into the already-built disability model.
# NOTE(review): the architecture must match the one the file was saved from -
# presumably num_train_layers=6 ("half frozen"); confirm.
disability_model.load_weights('saved_weights/take2_disability_only_half_frozen.h5')

In [39]:
# Baseline: a freshly built model with no saved weights loaded, i.e. the
# hidden and classification layers are newly initialised on top of the
# shared BERTweet encoder.
random_model = build_bertweet_cls_model(num_train_layers=6, learning_rate=1e-5)
random_model.summary()

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 attention_mask_layer (InputLay  [(None, 128)]       0           []                               
 er)                                                                                              
                                                                                                  
 input_ids_layer (InputLayer)   [(None, 128)]        0           []                               
                                                                                                  
 token_type_ids_layer (InputLay  [(None, 128)]       0           []                               
 er)                                                                                              
                                                                                            

In [40]:
# Evaluate the baseline (no weights loaded) model on the test split.
# The two-value unpacking below is left disabled: the output of this cell
# shows evaluate() returning five values, so `loss, acc = ...` would not unpack.
# loss, acc = random_model.evaluate(bertweet_test_inputs_disability, y_test_disability, verbose=2)
# print("Restored model, accuracy: {:5.2f}%".format(100 * acc))
random_model.evaluate(bertweet_test_inputs_disability, y_test_disability)



[0.6317230463027954,
 0.770158052444458,
 0.08264463394880295,
 0.013210039585828781,
 array([0.02277905], dtype=float32)]

In [41]:
# Rebuild the same architecture and restore the saved disability-only weights.
# NOTE(review): every model built by build_bertweet_cls_model wraps the same
# global bertweet_model, so loading weights here presumably also changes the
# encoder weights seen by random_model - confirm before comparing the two.
loaded_model = build_bertweet_cls_model(num_train_layers=6, learning_rate=1e-5)
loaded_model.load_weights('saved_weights/take2_disability_only_half_frozen.h5')
loaded_model.summary()

Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 attention_mask_layer (InputLay  [(None, 128)]       0           []                               
 er)                                                                                              
                                                                                                  
 input_ids_layer (InputLayer)   [(None, 128)]        0           []                               
                                                                                                  
 token_type_ids_layer (InputLay  [(None, 128)]       0           []                               
 er)                                                                                              
                                                                                            

In [42]:
# Evaluate the model with restored weights on the test split.
# The two-value unpacking below is left disabled: the output of this cell
# shows evaluate() returning five values, so `loss, acc = ...` would not unpack.
# loss, acc = loaded_model.evaluate(bertweet_test_inputs_disability, y_test_disability, verbose=2)
# print("Restored model, accuracy: {:5.2f}%".format(100 * acc))
loaded_model.evaluate(bertweet_test_inputs_disability, y_test_disability)



[0.6017908453941345,
 0.7350656390190125,
 0.42918193340301514,
 0.928665816783905,
 array([0.5870564], dtype=float32)]

In [None]:
# NOTE(review): template code - `model2`, `checkpoint_path`, `test_images` and
# `test_labels` are not defined in the visible notebook, so this cell raises
# NameError on a fresh run.
# Loads the weights
model2.load_weights(checkpoint_path)

# Re-evaluate the model
# NOTE(review): the `loss, acc` unpacking only works for a model compiled with
# a single metric; the models built in this notebook return five values.
loss, acc = model2.evaluate(test_images, test_labels, verbose=2)
print("Restored model, accuracy: {:5.2f}%".format(100 * acc))

# do stuff with model2 (e.g. predict() to get y_pred)
# Then you can do stuff like keras.metrics.accuracy(y_true, y_pred)
# Then you can do stuff like keras.metrics.confusion_matrix(y_test, y_pred)

# HOW TO LOAD DISABILITY-ONLY WEIGHTS

In [None]:
def build_bertweet_cls_model(max_sequence_length=MAX_SEQUENCE_LENGTH,
                             hidden_size=100,
                             dropout=0.3,
                             learning_rate=0.0001,
                             num_train_layers=0,
                             weights_path="disability-only-weights-best.hdf5"):
    """Build the BERTweet classifier, restore saved weights, and compile it.

    This redefinition has the same name as the builder defined earlier in the
    notebook and therefore replaces it once this cell is run. The
    architecture is the same; the only functional difference is that saved
    weights are loaded before compiling.

    Every model built by this function wraps the SAME module-level
    ``bertweet_model`` instance, so both the freeze state set here and the
    weights restored here apply to that shared encoder.

    Args:
        max_sequence_length: Length of each of the three input sequences.
        hidden_size: Units in the dense layer between encoder and output.
        dropout: Dropout rate applied after the hidden layer.
        learning_rate: Learning rate of the Adam optimizer.
        num_train_layers: Despite its name, the number of encoder layers,
            counted from the first, that are FROZEN. ``0`` freezes the whole
            BERTweet model.
        weights_path: Weights file passed to ``load_weights``. Defaults to
            the best disability-only checkpoint; pass ``None`` to skip
            loading.

    Returns:
        A compiled ``tf.keras.Model`` with binary cross-entropy loss and
        binary accuracy, precision, recall and F1 (threshold 0.5) metrics.

    Raises:
        IndexError: If ``num_train_layers`` exceeds the number of encoder layers.
    """
    # bertweet_model is shared across calls. Reset it to fully trainable so a
    # freeze applied by an earlier call does not silently carry over.
    bertweet_model.trainable = True

    if num_train_layers == 0:
        # freeze all pre-trained BERTweet layers
        bertweet_model.trainable = False
    else:
        # partially freeze the first n pre-trained BERTweet layers
        for layer_num in range(num_train_layers):
            bertweet_model.roberta.encoder.layer[layer_num].trainable = False

    input_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}

    # Use the same bertweet model instance
    bert_out = bertweet_model(bert_inputs)

    # Keep only sequence position 0 of the first encoder output as the
    # sentence-level representation.
    cls_token = bert_out[0][:, 0, :]

    hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(cls_token)
    hidden = tf.keras.layers.Dropout(dropout)(hidden)

    classification = tf.keras.layers.Dense(1, activation='sigmoid', name='classification_layer')(hidden)

    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=[classification])

    # Before compiling, restore weights saved from a model with this same
    # architecture.
    # NOTE(review): load_weights is called on the whole model without
    # by_name, so it presumably restores the encoder as well as the
    # classification head - confirm this is intended.
    if weights_path is not None:
        classification_model.load_weights(weights_path)

    f1_score = tfa.metrics.F1Score(1, threshold=0.5)

    # The output layer already applies a sigmoid, hence from_logits=False.
    classification_model.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=learning_rate),
                                 loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                                 metrics=[tf.keras.metrics.BinaryAccuracy(),
                                          tf.keras.metrics.Precision(),
                                          tf.keras.metrics.Recall(),
                                          f1_score])

    return classification_model

In [None]:
# checkpoint
# NOTE(review): template code - `ModelCheckpoint` is not imported and `model`,
# `X`, `Y` are not defined in the visible notebook, so this cell raises
# NameError on a fresh run.
filepath="weights.best.hdf5"
# Keep only the weights of the epoch with the highest monitored value.
# NOTE(review): the models in this notebook are compiled with BinaryAccuracy,
# whose logged validation key is presumably 'val_binary_accuracy' rather than
# 'val_accuracy' - confirm before reusing.
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]
# Fit the model
model.fit(X, Y, validation_split=0.33, epochs=150, batch_size=10, callbacks=callbacks_list, verbose=0)