In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
from nlp_helper_scripts.getData import (prepare_label, bert_glue_encode,pretrained_bert_model)
from bias_helpers.bias_functions import tprs, calculate_gaps
from bias_helpers.bias_deepview import DeepViewBias

import tensorflow as tf
import numpy as np
import matplotlib as mpl

# Identifiers read as globals by the preprocessing function defined below.
task = "bias_in_bios"
MODEL_CHECKPOINT = "bert-base-uncased"

# No `split` argument is given, so the result is indexed by split name further down.
dataset = load_dataset("LabHC/bias_in_bios")
# Tokenizers already loaded in this session, keyed by checkpoint name.
_TOKENIZER_CACHE = {}


def _get_tokenizer(checkpoint):
    """Return the tokenizer for `checkpoint`, loading it only on first use."""
    if checkpoint not in _TOKENIZER_CACHE:
        _TOKENIZER_CACHE[checkpoint] = AutoTokenizer.from_pretrained(checkpoint)
    return _TOKENIZER_CACHE[checkpoint]


def single_preprocess_function(examples):
    """Tokenize one batch of examples for the globally selected `task`.

    Intended to be passed to `dataset.map(..., batched=True)`.  The text
    column(s) to read are looked up from the module-level `task`; the
    tokenizer is the one belonging to the module-level `MODEL_CHECKPOINT`.

    Args:
        examples: Mapping from column name to a batch of values.

    Returns:
        The tokenizer output for the batch, with every sequence padded or
        truncated to exactly 128 tokens.

    Raises:
        KeyError: If `task` has no entry in the field mapping.
    """
    # Text column(s) per task: (first sentence, optional second sentence).
    task_to_keys = {
        "cola": ("sentence", None),
        "mnli": ("premise", "hypothesis"),
        "mnli-mm": ("premise", "hypothesis"),
        "mrpc": ("sentence1", "sentence2"),
        "qnli": ("question", "sentence"),
        "qqp": ("question1", "question2"),
        "rte": ("sentence1", "sentence2"),
        "sst2": ("sentence", None),
        "stsb": ("sentence1", "sentence2"),
        "wnli": ("sentence1", "sentence2"),
        "bias_in_bios": ("hard_text", None),
    }

    # This function runs once per batch, so the tokenizer is fetched from the
    # cache instead of being re-loaded from the checkpoint on every call.
    tokenizer = _get_tokenizer(MODEL_CHECKPOINT)

    sentence1_key, sentence2_key = task_to_keys[task]
    texts = (examples[sentence1_key],)
    if sentence2_key is not None:
        # Sentence-pair task: the second column becomes the tokenizer's pair input.
        texts += (examples[sentence2_key],)

    return tokenizer(*texts,
                     max_length=128,
                     padding='max_length',
                     truncation=True)

# Apply the tokenization to the loaded dataset; with `batched=True` the
# function is handed a batch of examples per call rather than a single row.
preprocessed_dataset = dataset.map(
    single_preprocess_function,
    batched=True,
)


In [None]:
import numpy as np

def _has_column(dataset, column):
    """Report whether `dataset` exposes a column named `column`."""
    column_names = getattr(dataset, "column_names", None)
    if column_names is not None:
        return column in column_names
    # Plain mappings (e.g. a dict of lists) have no `column_names` attribute.
    return column in dataset


def bert_glue_encode(dataset, label_column="profession"):
    """Convert a tokenized split into int32 NumPy arrays.

    NOTE: this definition shadows the `bert_glue_encode` imported from
    `nlp_helper_scripts.getData` at the top of the notebook.

    Args:
        dataset: Column-indexable container providing "input_ids",
            "attention_mask" and "token_type_ids", and optionally the
            label column.
        label_column: Name of the column holding the integer labels.

    Returns:
        `((input_ids, attention_masks, token_type_ids), labels)`, where each
        array has dtype int32.  `labels` is `None` when `dataset` has no
        `label_column` (e.g. an unlabeled test split).
    """
    input_ids = np.array(dataset["input_ids"], dtype="int32")
    attention_masks = np.array(dataset["attention_mask"], dtype="int32")
    token_type_ids = np.array(dataset["token_type_ids"], dtype="int32")
    features = (input_ids, attention_masks, token_type_ids)

    # Some splits may ship without labels; return the features alone then.
    if not _has_column(dataset, label_column):
        return features, None

    labels = np.array(dataset[label_column], dtype="int32")
    return features, labels

# Training split: int32 model inputs, integer labels, and the label form
# that is later passed to `fit`.
x_train, y_train = bert_glue_encode(preprocessed_dataset['train'])
model_y_train = prepare_label(y_train)

# NOTE(review): the 'test' split is what serves as validation data from here
# on — confirm this is intended rather than a dedicated validation split.
x_val, y_val = bert_glue_encode(preprocessed_dataset['test'])
model_y_val = prepare_label(y_val)

In [None]:
from getData import pretrained_bert_model, prepare_dataset, classifier_model, finetuned_bert_and_classifier
import tensorflow as tf

# Same task identifier as in the first cell; it is used again below to build
# the model save paths.
task = 'bias_in_bios'
# %matplotlib qt

# Three model objects come from the helper module:
#   pt_embed    - used below only via `predict` to turn token inputs into embeddings
#   pt_head     - classifier later trained on those embeddings
#   whole_model - later trained directly on the token inputs; `ft_bert` and
#                 `ft_classifier` are returned alongside it and saved at the end
# NOTE(review): 28 matches `num_classes` set in a later cell — presumably the
# number of profession labels; confirm against the helper signatures.
pt_embed = pretrained_bert_model()
pt_head = classifier_model(28)
whole_model, ft_bert, ft_classifier = finetuned_bert_and_classifier(28)


# Run the embedding model once over both splits so the head can be trained
# on the stored outputs instead of re-running `pt_embed` every epoch.
train_embeddings = pt_embed.predict(x_train, batch_size=64)
val_embeddings  = pt_embed.predict(x_val, batch_size=64)

2024-05-31 10:31:01.748648: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-31 10:31:01.777567: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-31 10:31:01.777593: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-31 10:31:01.778614: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-31 10:31:01.783526: I tensorflow/core/platform/cpu_feature_guar

 179/1548 [==>...........................] - ETA: 11:04

In [19]:
from transformers import create_optimizer
from sklearn.metrics import matthews_corrcoef
import numpy as np
import tensorflow as tf

num_classes = 28

# Exactly two classes -> binary loss/accuracy; any other count -> the
# categorical counterparts.
if num_classes != 2:
    loss = tf.keras.losses.CategoricalCrossentropy()
    metrics = tf.metrics.CategoricalAccuracy()
else:
    loss = tf.keras.losses.BinaryCrossentropy()
    metrics = tf.metrics.BinaryAccuracy()



def matthews_correlation(y_true, y_pred):
    """Matthews correlation coefficient between targets and model scores.

    Binary scores (1-D, or a single column) are thresholded at 0.5, as
    before.  Inputs with more than one column — per-class scores or one-hot
    targets — are reduced to class ids with argmax; thresholding them would
    produce a multilabel indicator matrix, which `matthews_corrcoef` does
    not accept.

    Args:
        y_true: Ground-truth labels, either class ids or one-hot rows.
        y_pred: Predicted scores, either binary scores or per-class rows.

    Returns:
        The coefficient computed by `sklearn.metrics.matthews_corrcoef`.
    """
    def _as_class_ids(values, threshold):
        arr = np.asarray(values)
        if arr.ndim > 1 and arr.shape[-1] > 1:
            # One row per example, one column per class: pick the top class.
            return arr.argmax(axis=-1)
        if threshold:
            return (arr > 0.5).astype("int32")
        return arr

    return matthews_corrcoef(_as_class_ids(y_true, threshold=False),
                             _as_class_ids(y_pred, threshold=True))


epochs = 3
batch_size = 16

# `x_train` is the (input_ids, attention_masks, token_type_ids) tuple; the
# length of any member is the number of training examples.
num_train_examples = len(x_train[1])
# The schedule advances once per batch, not once per example, so count
# batches (rounding up so a final partial batch is included).
steps_per_epoch = (num_train_examples + batch_size - 1) // batch_size
num_train_steps = steps_per_epoch * epochs
# Warm up over the first 10% of all optimizer steps.
num_warmup_steps = int(0.1 * num_train_steps)
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=num_warmup_steps,
    num_train_steps=num_train_steps,
)

In [20]:
# Fit the classifier head on the embeddings produced by `pt_embed`.
pt_head.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=[metrics],
)
_ = pt_head.fit(
    train_embeddings,
    model_y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(val_embeddings, model_y_val),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# `optimizer` has already been stepped while training `pt_head`; reusing it
# here would continue its warmup/decay schedule from that point instead of
# starting over.  Build a fresh optimizer with the same settings for this run.
ft_optimizer, ft_schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=num_warmup_steps,
    num_train_steps=num_train_steps,
)

# Train the full model directly on the token inputs.
whole_model.compile(optimizer=ft_optimizer, loss=loss, metrics=[metrics])
_ = whole_model.fit(
    x_train,
    model_y_train,
    validation_data=(x_val, model_y_val),
    batch_size=batch_size,
    epochs=epochs,
)

In [None]:
# Every artifact goes under ./models/<task>/.  Any '/' in the task name is
# replaced by '_' in the file stem only, not in the directory component.
saved_pretrained_classifier_only_path = f"./models/{task}/{task.replace('/', '_')}_pretrained_BERT_Classifier"

saved_finetuned_whole_model_path = f"./models/{task}/{task.replace('/', '_')}_finetuned_BERT"
saved_finetuned_embed_model_path = f"./models/{task}/{task.replace('/', '_')}_finetuned_BERT_Embeddings"
saved_finetuned_predict_model_path = f"./models/{task}/{task.replace('/', '_')}_finetuned_BERT_Predictor"


# Saving the head-only and whole-model variants is currently switched off:
# pt_head.save(saved_pretrained_classifier_only_path, include_optimizer=False)
# whole_model.save(saved_finetuned_whole_model_path, include_optimizer=False)

# Persist the two fine-tuned parts separately, without optimizer state.
ft_bert.save(
    saved_finetuned_embed_model_path,
    include_optimizer=False,
)
ft_classifier.save(
    saved_finetuned_predict_model_path,
    include_optimizer=False,
)