## Imports

In [1]:
import tensorflow as tf
# Reset Keras' global state so that re-running the notebook in the same kernel
# does not accumulate layers/models from a previous run.
tf.keras.backend.clear_session()

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# import tensorflow as tf
import tensorflow_addons as tfa
import transformers
from transformers import AutoTokenizer,TFRobertaModel

import pytz
import datetime

 The versions of TensorFlow you are currently using is 2.12.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


## Resources Configuration

In [3]:
# Detect hardware and pick the matching tf.distribute strategy.
try:
    # The except clause below treats ValueError as "no TPU available".
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # tf.distribute.experimental.TPUStrategy is deprecated; the stable
    # tf.distribute.TPUStrategy is the supported replacement.
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

# NOTE(review): no visible cell builds the model inside strategy.scope(), so the
# strategy is only reported here — confirm whether that is intended.
print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


In [4]:
# What version of Python do you have?
import sys
import platform
import sklearn as sk

# Environment report: platform, library versions and GPU visibility.
print(f"Python Platform: {platform.platform()}")
print(f"Tensor Flow Version: {tf.__version__}")
print(f"Keras Version: {tf.keras.__version__}")
print()
print(f"Python {sys.version}")
print(f"Pandas {pd.__version__}")
print(f"Scikit-Learn {sk.__version__}")
# A non-empty device list means TensorFlow can see at least one GPU.
gpu = bool(tf.config.list_physical_devices('GPU'))
if gpu:
    print("GPU is", "available")
else:
    print("GPU is", "NOT AVAILABLE")

Python Platform: macOS-12.5-arm64-arm-64bit
Tensor Flow Version: 2.12.0
Keras Version: 2.12.0

Python 3.10.10 | packaged by conda-forge | (main, Mar 24 2023, 20:12:31) [Clang 14.0.6 ]
Pandas 2.0.0
Scikit-Learn 1.2.2
GPU is available


## Functions and Constants

In [5]:
def to_arrays(df):
  """Split a dataset frame into a (texts, labels) pair of NumPy arrays.

  Args:
    df: DataFrame with a 'comment_text' column and a 'toxicity_binary' column.

  Returns:
    Tuple ``(X, y)`` where ``X`` holds the 'comment_text' values and ``y`` the
    'toxicity_binary' values, each converted with ``Series.to_numpy()``.
  """
  return tuple(df[column].to_numpy()
               for column in ('comment_text', 'toxicity_binary'))

def load_data(group):
  """Read the train/val/test CSVs for one identity group and convert them to arrays.

  Files are read from ``data/<group>-dataset-<split>.csv``.

  Args:
    group: Dataset name prefix used to build the three file names.

  Returns:
    ``(X_train, y_train, X_test, y_test, X_val, y_val)`` — note that the test
    split comes before the validation split in the returned tuple.
  """
  # Read every file first, then convert, in train -> val -> test order.
  frames = [pd.read_csv('data/' + group + '-dataset-' + split + '.csv')
            for split in ('train', 'val', 'test')]
  train_xy, val_xy, test_xy = [to_arrays(frame) for frame in frames]

  return train_xy[0], train_xy[1], test_xy[0], test_xy[1], val_xy[0], val_xy[1]

In [6]:
# Token length every comment is truncated/padded to; also the default input
# width of the classifier built by build_bertweet_cls_model.
MAX_SEQUENCE_LENGTH = 128

In [7]:
def tokenizing_pipeline(X, tokenizer):
  """Tokenize raw texts into the three input tensors the classifier expects.

  Every text is truncated or padded to MAX_SEQUENCE_LENGTH tokens and the
  result is returned as TensorFlow tensors.

  Args:
    X: Iterable of raw text strings.
    tokenizer: Hugging Face tokenizer, called on the list of texts.

  Returns:
    List ``[input_ids, token_type_ids, attention_mask]`` in that order.
  """
  bert_tokenized = tokenizer(list(X),
                max_length=MAX_SEQUENCE_LENGTH,
                truncation=True,
                padding='max_length',
                # Request both optional outputs explicitly: some tokenizers
                # (e.g. RoBERTa-style ones) omit token_type_ids by default,
                # which would make the attribute access below fail.
                return_token_type_ids=True,
                return_attention_mask=True,
                return_tensors='tf')
  bert_inputs = [bert_tokenized.input_ids,
                 bert_tokenized.token_type_ids,
                 bert_tokenized.attention_mask]
  return bert_inputs

In [8]:
def build_bertweet_cls_model(max_sequence_length=MAX_SEQUENCE_LENGTH,
                          hidden_size=100, 
                          dropout=0.3,
                          learning_rate=0.0001,
                          num_train_layers=0):
    """Build and compile a binary classifier on top of the shared BERTweet encoder.

    The module-level ``bertweet_model`` is used as the encoder (it is not
    copied), so its ``trainable`` flags are modified by this function.

    Args:
        max_sequence_length: Length of each of the three integer input sequences.
        hidden_size: Number of units in the ReLU hidden layer.
        dropout: Dropout rate applied after the hidden layer.
        learning_rate: Learning rate for the (legacy) Adam optimizer.
        num_train_layers: ``0`` freezes the whole BERTweet model. Any other
            value ``n`` freezes encoder layers ``0 .. n-1`` and leaves the rest
            of the model trainable. Despite its name, this is therefore the
            number of *frozen* encoder layers.

    Returns:
        A compiled ``tf.keras.Model`` taking
        ``[input_ids, token_type_ids, attention_mask]`` and producing one
        sigmoid output per example. It is compiled with binary cross-entropy
        and binary accuracy, precision, recall and F1 metrics.
    """
    # bertweet_model is shared between calls, so start from a fully trainable
    # encoder; otherwise layers frozen by a previous call would stay frozen.
    # Setting `trainable` on a Keras model propagates to its sub-layers.
    bertweet_model.trainable = True

    if num_train_layers == 0:
        # freeze all pre-trained BERTweet layers
        bertweet_model.trainable = False
    else:
        # partially freeze the first n pre-trained BERTweet layers
        for layer_num in range(num_train_layers):
            bertweet_model.roberta.encoder.layer[layer_num].trainable = False

    input_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}

    # Use the same bertweet model instance
    bert_out = bertweet_model(bert_inputs)

    # First encoder output, taken at sequence position 0, is used as the
    # sentence representation.
    cls_token = bert_out[0][:, 0, :]

    hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(cls_token)
    hidden = tf.keras.layers.Dropout(dropout)(hidden)

    classification = tf.keras.layers.Dense(1, activation='sigmoid', name='classification_layer')(hidden)

    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=[classification])

    # Single-class F1 computed on predictions thresholded at 0.5.
    f1_score = tfa.metrics.F1Score(1, threshold = 0.5)

    classification_model.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=learning_rate),
                                 loss=tf.keras.losses.BinaryCrossentropy(from_logits=False), 
                                 metrics=[tf.keras.metrics.BinaryAccuracy(),
                                        tf.keras.metrics.Precision(),
                                        tf.keras.metrics.Recall(),
                                        f1_score])

    return classification_model

## Load Data

In [9]:
# load_data returns the splits in train, test, val order (test before val).
X_train_disability, y_train_disability, X_test_disability, y_test_disability, X_val_disability, y_val_disability = load_data('disability')

In [10]:
# Peek at the first training comment.
X_train_disability[0]

"Doesn't work? How do we know? When the country is at the point of legalizing silencers, and the right of the mentally ill to own assault weapons, it's laughable to think we have ANY gun controls."

In [11]:
# Peek at the second training comment.
X_train_disability[1]

'LoL. The mental retardation of the (d)onkeys is stunning.\nThey propose the craziest whackjob laws without one regard to the Constitution.'

In [12]:
# Peek at another training comment (index 20).
X_train_disability[20]

'I can give examples of the peaceful Muslim missionaries murdering people 100 to 1 of your examples.  Islam is the religion of death.'

## Load BERTweet Model from_pretrained() with normalization=True

In [13]:
# Load the slow (use_fast=False) BERTweet tokenizer with normalization=True and
# the pre-trained "vinai/bertweet-base" weights as a TFRobertaModel.
# A plain variant without the extra keyword arguments
# (AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)) was
# previously kept here as commented-out code.
# NOTE(review): add_special_tokens and return_attention_mask are usually passed
# when calling the tokenizer rather than to from_pretrained — confirm they have
# any effect here.
bertweet_tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base",
                                                    use_fast=False,
                                                    normalization=True,
                                                    add_special_tokens=True,
                                                    return_attention_mask=True)
bertweet_model = TFRobertaModel.from_pretrained("vinai/bertweet-base")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Metal device set to: Apple M1 Pro


Some layers from the model checkpoint at vinai/bertweet-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at vinai/bertweet-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [14]:
# Inspect how the tokenizer splits one sample comment into tokens.
bertweet_tokenizer.tokenize(X_train_disability[1])

['LoL',
 '.',
 'The',
 'mental',
 'retar@@',
 'dation',
 'of',
 'the',
 '(',
 'd',
 ')',
 'on@@',
 'keys',
 'is',
 'stunning',
 '.',
 'They',
 'propose',
 'the',
 'craziest',
 'wha@@',
 'ck@@',
 'job',
 'laws',
 'without',
 'one',
 'regard',
 'to',
 'the',
 'Constitution',
 '.']

In [15]:
# bertweet_tokenizer_heavy.tokenize(X_train_disability[1])

In [16]:
# Show the layer/parameter summary of the pre-trained encoder.
bertweet_model.summary()

Model: "tf_roberta_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 roberta (TFRobertaMainLayer  multiple                 134899968 
 )                                                               
                                                                 
Total params: 134,899,968
Trainable params: 134,899,968
Non-trainable params: 0
_________________________________________________________________


## Tokenize Data Splits

In [None]:
# Tokenize each split once up front; every result is the list
# [input_ids, token_type_ids, attention_mask] returned by tokenizing_pipeline.
bertweet_train_inputs_disability = tokenizing_pipeline(X_train_disability, bertweet_tokenizer)
bertweet_test_inputs_disability = tokenizing_pipeline(X_test_disability, bertweet_tokenizer)
bertweet_val_inputs_disability = tokenizing_pipeline(X_val_disability, bertweet_tokenizer)

In [None]:
# Number of training examples.
len(X_train_disability)

In [None]:
# Number of validation examples.
len(X_val_disability)

In [None]:
# Number of test examples.
len(X_test_disability)

## Calculate Class Weights for Disability

Get class weights for disability train set:

In [None]:
# Count label occurrences; the two-value unpacking requires the training labels
# to contain exactly the values 0 and 1 (both present).
neg, pos = np.bincount(y_train_disability)
total = neg + pos
print('Disability Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))

# Weight each class inversely to its frequency.
# Scaling by total/2 helps keep the loss to a similar magnitude.
# The sum of the weights of all examples stays the same.
weight_for_0 = (1 / neg) * (total / 2.0)
weight_for_1 = (1 / pos) * (total / 2.0)

# Mapping from class label to weight, passed to fit() as class_weight.
disability_class_weight = {0: weight_for_0, 1: weight_for_1}

print('Disability Weight for class 0: {:.2f}'.format(weight_for_0))
print('Disability Weight for class 1: {:.2f}'.format(weight_for_1))

## Build Disability Model with half-frozen BERTweet layers

In [None]:
# num_train_layers=6 freezes encoder layers 0-5 (see build_bertweet_cls_model);
# the remaining layers are fine-tuned with a small learning rate.
disability_model = build_bertweet_cls_model(num_train_layers=6, learning_rate=1e-5)
disability_model.summary()

In [None]:
# Record the wall-clock start of training as an ISO-8601 string in Los Angeles time.
america_la_tz = pytz.timezone('America/Los_Angeles')
start_time = datetime.datetime.now(tz=america_la_tz).isoformat()
print(str(start_time))

## Define Checkpoint

In [None]:
# Keep only the weights of the epoch with the highest validation F1 score.
checkpoint_filepath = 'model_checkpoints/disability_only_best_weights.h5'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    verbose=1,
    monitor='val_f1_score',
    mode='max',
    # The target file is a weights file; without this flag the callback would
    # try to serialise the entire model (architecture + optimizer) to HDF5.
    save_weights_only=True,
    save_best_only=True)

## Train Disability Model 

In [None]:
# Train with class weighting and validate on the held-out validation split;
# the checkpoint callback is passed in so it can save during training.
disability_history = disability_model.fit(bertweet_train_inputs_disability,
                                          y_train_disability,
                                          validation_data=(bertweet_val_inputs_disability, y_val_disability),
                                          # Alternative setting kept for reference: batch_size=32
                                          batch_size=1024,
                                          # Alternative setting kept for reference: epochs=5
                                          epochs=1,
                                          class_weight=disability_class_weight,
                                          callbacks=[model_checkpoint_callback])

In [None]:
# Training vs. validation F1 score per epoch.
history = pd.DataFrame(disability_history.history)
# One tick per epoch actually trained (labelled 1..N) instead of assuming 5 epochs.
epoch_positions = list(range(len(history)))
plt.ylabel('F1 Score')
plt.xlabel('Epoch')
plt.title('Disability Only Train vs Val F1 Score for Half-Frozen Bertweet')
plt.xticks(epoch_positions, [str(position + 1) for position in epoch_positions])
# The F1 metric may be logged as a length-1 array per epoch; reduce each entry
# to a plain float so it plots like the other metrics.
plt.plot(history['f1_score'].map(lambda value: float(np.squeeze(value))),
         label="training", marker='o')
plt.plot(history['val_f1_score'].map(lambda value: float(np.squeeze(value))),
         label="validation", marker='o')
plt.legend()
plt.show()

In [None]:
# Training vs. validation loss per epoch.
history = pd.DataFrame(disability_history.history)
# One tick per epoch actually trained (labelled 1..N) instead of assuming 5 epochs.
epoch_positions = list(range(len(history)))
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.title('Disability Only Train vs Val Loss for Half-Frozen Bertweet')
plt.xticks(epoch_positions, [str(position + 1) for position in epoch_positions])
plt.plot(history['loss'], label="training", marker='o')
plt.plot(history['val_loss'], label="validation", marker='o')
plt.legend()
plt.show()

In [None]:
# Training vs. validation binary accuracy per epoch.
history = pd.DataFrame(disability_history.history)
# One tick per epoch actually trained (labelled 1..N) instead of assuming 5 epochs.
epoch_positions = list(range(len(history)))
# The y axis shows binary accuracy (the label previously said 'F1 Score').
plt.ylabel('Binary Accuracy')
plt.xlabel('Epoch')
plt.title('Disability Only Train vs Val Binary Accuracy for Half-Frozen Bertweet')
plt.xticks(epoch_positions, [str(position + 1) for position in epoch_positions])
plt.plot(history['binary_accuracy'], label="training", marker='o')
plt.plot(history['val_binary_accuracy'], label="validation", marker='o')
plt.legend()
plt.show()

In [None]:
# Record the wall-clock end time (same timezone as the start timestamp).
end_time = datetime.datetime.now(tz=america_la_tz).isoformat()
print(str(end_time))

## Evaluate Disability Model on Test Set

In [None]:
# Evaluate on the held-out test split. The result is consumed later by index as
# [loss, binary accuracy, precision, recall, F1 score].
disability_test_history = disability_model.evaluate(bertweet_test_inputs_disability, y_test_disability)

## Export Results to csv

In [None]:
# Put the test metrics into a one-row frame. The explicit index makes this work
# whether each metric is a plain scalar or a length-1 array; without it pandas
# raises "If using all scalar values, you must pass an index" when every value
# is a scalar.
disability_test_history_df = pd.DataFrame({
    'test_loss': disability_test_history[0],
    'test_binary_accuracy': disability_test_history[1],
    'test_precision': disability_test_history[2],
    'test_recall': disability_test_history[3],
    'test_f1_score': disability_test_history[4]},
    index=[0])
# Stack the per-epoch training history on top of the test row; columns that do
# not apply to a given row are left as NaN.
disability_results_df = pd.concat([pd.DataFrame(disability_history.history), disability_test_history_df], axis=0)
disability_results_df

In [None]:
# Persist the combined training/test metrics; the target directory must already exist.
disability_results_df.to_csv('experiment_results/BERTweet_Disability_Only_take5.csv')

## Save Model Weights and Full Model

In [None]:
# Save the final (last-epoch) weights to an HDF5 file.
disability_model.save_weights('saved_weights/BERTweet_Disability_Only_take5.h5')

In [None]:
# Save the full model to a directory path (no file extension given).
disability_model.save('saved_models/BERTweet_Disability_Only_take5')