In [1]:
!pip install tensorflow==2.9.0

Collecting tensorflow==2.9.0
  Downloading tensorflow-2.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting flatbuffers<2,>=1.12 (from tensorflow==2.9.0)
  Downloading flatbuffers-1.12-py2.py3-none-any.whl.metadata (872 bytes)
Collecting gast<=0.4.0,>=0.2.1 (from tensorflow==2.9.0)
  Downloading gast-0.4.0-py3-none-any.whl.metadata (1.1 kB)
Collecting keras<2.10.0,>=2.9.0rc0 (from tensorflow==2.9.0)
  Downloading keras-2.9.0-py2.py3-none-any.whl.metadata (1.3 kB)
Collecting keras-preprocessing>=1.1.1 (from tensorflow==2.9.0)
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting tensorboard<2.10,>=2.9 (from tensorflow==2.9.0)
  Downloading tensorboard-2.9.1-py3-none-any.whl.metadata (1.9 kB)
Collecting tensorflow-estimator<2.10.0,>=2.9.0rc0 (from tensorflow==2.9.0)
  Downloading tensorflow_estimator-2.9.0-py2.py3-none-any.whl.metadata (1.3 kB)
Collecting google-auth-oauthlib<0.5,>=0.4.1 (from tensorboard<2.10,>=

In [2]:
import os
import re
import numpy as np
import pandas as pd
import ast

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from transformers import T5Tokenizer, TFT5ForConditionalGeneration


In [3]:
# Mount Google Drive at /content/drive; later cells read and write files
# under 'drive/MyDrive/'.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the three splits and turn each one into a Series of
# {'orig': ..., 'target': ...} dicts.
#
# Google Drive is already mounted by the earlier `drive.mount('/content/drive')`
# cell, so the duplicate mount that used to sit here was dropped (it only
# produced a "Drive already mounted" message).


def _build_text_pairs(pairs_df, prefix):
    """Convert one split into a Series of {'orig', 'target'} dicts.

    Args:
        pairs_df: DataFrame with `src_raw` and `tgt_raw` columns.
        prefix: Task prefix prepended to every `src_raw` value.

    Returns:
        A Series named 'text_pairs_dict' whose values are dicts with the
        prefixed source under 'orig' and the `tgt_raw` value under 'target'.
    """
    text_pairs = pairs_df.apply(
        lambda row: {'orig': prefix + row['src_raw'], 'target': row['tgt_raw']},
        axis=1,
    )
    # The Series name becomes the CSV column header that the data generator
    # later reads back as `df.text_pairs_dict`.
    return text_pairs.rename('text_pairs_dict')


prefix = 'Translate biased to unbiased: '

# Load files
train_pairs = _build_text_pairs(pd.read_csv('drive/MyDrive/bias_train_med.csv'), prefix)
valid_pairs = _build_text_pairs(pd.read_csv('drive/MyDrive/bias_val_med.csv'), prefix)
test_pairs = _build_text_pairs(pd.read_csv('drive/MyDrive/bias_test_full.csv'), prefix)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Destination CSV on Drive for each split.
train_file = 'drive/MyDrive/train_pairs.csv'
valid_file = 'drive/MyDrive/valid_pairs.csv'
test_file = 'drive/MyDrive/test_pairs.csv'

# Write every split to its own file so the data generators can read batches
# from disk by filename.
for split_pairs, split_file in (
    (train_pairs, train_file),
    (valid_pairs, valid_file),
    (test_pairs, test_file),
):
    pd.DataFrame(split_pairs).to_csv(split_file)

In [4]:
def preprocess_data(text_pairs, tokenizer, model, max_length=256):
    """Tokenize (source, target) text pairs into model inputs and labels.

    Args:
        text_pairs: Iterable of two-item (orig, target) pairs. It is
            materialized once, so one-shot iterables such as generators work.
        tokenizer: Object exposing `batch_encode_plus`; its result is indexed
            with "input_ids" and "attention_mask".
        model: Object exposing `_shift_right`, used to derive the decoder
            inputs from the label ids.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        A tuple `([orig_input_ids, orig_attention_masks, decoder_input_ids],
        label_ids)`. The first two entries are int32 NumPy arrays.
    """
    # Materialize once: the original iterated `text_pairs` twice, which
    # silently produced an empty target list for one-shot iterables.
    pairs = [tuple(pair) for pair in text_pairs]
    orig_text = [orig for orig, _ in pairs]
    target_text = [target for _, target in pairs]

    orig_encoded = tokenizer.batch_encode_plus(
        orig_text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='tf'
    )

    orig_input_ids = np.array(orig_encoded["input_ids"], dtype="int32")
    orig_attention_masks = np.array(orig_encoded["attention_mask"], dtype="int32")

    target_encoded = tokenizer.batch_encode_plus(
        target_text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='tf'
    )

    label_ids = np.array(target_encoded['input_ids'])
    # Decoder inputs are whatever the model's own `_shift_right` derives from
    # the labels.
    decoder_input_ids = model._shift_right(label_ids)

    return [orig_input_ids, orig_attention_masks, decoder_input_ids], label_ids

In [5]:
class TranslationDataGenerator(tf.keras.utils.Sequence):
    """Keras Sequence that reads one batch of text pairs from a CSV on demand.

    Each batch is loaded with `pd.read_csv`, skipping every CSV row that does
    not belong to the batch, then tokenized with `preprocess_data`.

    Args:
        tokenizer: Tokenizer forwarded to `preprocess_data`.
        model: Model forwarded to `preprocess_data`.
        n_examples: Number of data rows in the CSV (header excluded).
        data_filename: CSV path; must contain a `text_pairs_dict` column whose
            cells are dict literals.
        max_length: Sequence length forwarded to `preprocess_data`.
        batch_size: Number of rows per batch.
        shuffle: Whether to permute the row order at construction and at the
            end of every epoch.
    """

    def __init__(self,
                 tokenizer,
                 model,
                 n_examples,
                 data_filename,
                 max_length=256,
                 batch_size=16,
                 shuffle=True):
        # Newer Keras releases expect Sequence subclasses to call the base
        # initializer; it is harmless on older ones.
        super().__init__()

        self.tokenizer = tokenizer
        self.model = model
        self.n_examples = n_examples
        self.data_filename = data_filename
        self.max_length = max_length
        self.batch_size = batch_size
        self.shuffle = shuffle

        # Row 0 of the CSV is the header, so data rows are 1..n_examples.
        # Keep row_order a plain list: __getitem__ concatenates slices with
        # `+`, which on a NumPy array (the old shuffle=False path) would be
        # element-wise addition instead of concatenation.
        self.row_order = list(range(1, self.n_examples + 1))
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches; a trailing partial batch is dropped."""
        return self.n_examples // self.batch_size

    def __getitem__(self, idx):
        """Load, parse and tokenize batch number `idx`."""
        batch_start = idx * self.batch_size
        batch_end = (idx + 1) * self.batch_size

        # Indices to skip are the ones in the (possibly shuffled) row_order
        # before and after the chunk used for this batch.
        batch_idx_skip = self.row_order[:batch_start] + self.row_order[batch_end:]
        df = pd.read_csv(self.data_filename, skiprows=batch_idx_skip)

        # Each cell is the string form of a dict; parse it back and keep only
        # the values (orig, target).
        text_pairs = df.text_pairs_dict.apply(lambda x: ast.literal_eval(x).values())

        batch_data = preprocess_data(
            text_pairs,
            self.tokenizer,
            self.model,
            self.max_length
        )

        return batch_data

    def __call__(self):
        """Yield every batch once, then trigger the end-of-epoch reshuffle."""
        for i in range(self.__len__()):
            yield self.__getitem__(i)

            if i == self.__len__()-1:
                self.on_epoch_end()

    def on_epoch_end(self):
        """Permute the row order when shuffling is enabled."""
        if self.shuffle:
            self.row_order = list(np.random.permutation(self.row_order))

In [6]:
# Load the pretrained tensorflow model
# Both the tokenizer and the TF model come from the same 't5-base' checkpoint.

model_name = 't5-base'
t5_tokenizer = T5Tokenizer.from_pretrained(model_name)
t5_model = TFT5ForConditionalGeneration.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [7]:
# Build the train and validation data generators (TensorFlow version).
# This cell needs `train_pairs`, `valid_pairs`, `train_file` and `valid_file`
# from the earlier data-loading and CSV-writing cells.

max_length = 256
batch_size = 32

# Settings that are identical for both generators.
shared_generator_kwargs = dict(
    tokenizer=t5_tokenizer,
    model=t5_model,
    max_length=max_length,
    batch_size=batch_size,
)

train_data_generator = TranslationDataGenerator(
    n_examples=len(train_pairs),
    data_filename=train_file,
    **shared_generator_kwargs
)

valid_data_generator = TranslationDataGenerator(
    n_examples=len(valid_pairs),
    data_filename=valid_file,
    **shared_generator_kwargs
)

NameError: name 'train_pairs' is not defined

In [None]:
def build_t5_training_wrapper_model(t5_model, max_length):
    """Wrap a T5 model in a compiled Keras model that outputs its logits.

    Args:
        t5_model: Callable model taking `input_ids`, `attention_mask` and
            `decoder_input_ids`; element 0 of its output is used as logits.
        max_length: Sequence length of each of the three integer inputs.

    Returns:
        A compiled `tf.keras.Model` with inputs
        `[input_ids, attention_mask, decoder_input_ids]` and the logits as its
        single output.
    """
    # `shape` must be a tuple; `(max_length)` without the trailing comma is
    # just an int.
    input_shape = (max_length,)
    input_ids = layers.Input(shape=input_shape, dtype=tf.int32, name='input_ids')
    attention_mask = layers.Input(shape=input_shape, dtype=tf.int32, name='attention_mask')
    # This input carries the decoder input ids; the layer name 'labels' is
    # kept unchanged from the original code.
    decoder_input_ids = layers.Input(shape=input_shape, dtype=tf.int32, name='labels')

    t5_logits = t5_model(input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids)[0]

    model = tf.keras.models.Model(inputs=[input_ids, attention_mask, decoder_input_ids],
                                  outputs=[t5_logits])
    # NOTE(review): no padding mask or sample weights are supplied here, so
    # padded label positions presumably count toward both the loss and the
    # accuracy metric — confirm this is intended.
    model.compile(optimizer=tf.keras.optimizers.Adam(),
                  loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    return model

In [None]:
# Build the compiled Keras wrapper around t5_model, using max_length-token inputs.
model_wrapper = build_t5_training_wrapper_model(t5_model, max_length)

In [None]:
# As in the first notebook, we should add a model checkpoint callback to save
# the trained model weights after each epoch. Edit the filepath to where
# you want to save the weights in your own Drive
# The filename template carries {epoch} and {val_accuracy} placeholders, so it
# relies on validation data being passed to .fit.

checkpoint_dir = 'drive/MyDrive/'
checkpoint_filepath = checkpoint_dir + 't5_bias_weights.{epoch:02d}-{val_accuracy:.2f}.hdf5'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True)

In [None]:
# Now call .fit on the model_wrapper, passing in the data generators and the
# model checkpoint callback
# Trains for a single epoch; validation batches come from valid_data_generator.

model_wrapper.fit(train_data_generator,
                  validation_data=valid_data_generator,
                  epochs=1,
                  callbacks=[model_checkpoint_callback])



<keras.callbacks.History at 0x7974c7fc0e80>

In [None]:
# Quick sanity check: run a few hand-written sentences through the model and
# print the decoded generations, one list per input sentence.
prefix = 'Translate biased to unbiased: '

sample_sentences = [
    'A lead programmer usually spends his career mired in obscurity.',
    "The lyrics are about mankind's perceived idea of hell.",
    'Marriage is a holy union of individuals.',
    "Jewish forces overcome Arab militants.",
]

for test_input_text in sample_sentences:
    test_inputs = t5_tokenizer([prefix + test_input_text], return_tensors='tf')
    test_output_ids = t5_model.generate(test_inputs['input_ids'])

    # Decode every returned sequence for this input.
    decoded_outputs = []
    for out_ids in test_output_ids:
        decoded_outputs.append(
            t5_tokenizer.decode(out_ids,
                                skip_special_tokens=True,
                                clean_up_tokenization_spaces=False)
        )
    print(decoded_outputs)




['a lead programmer usually spends his career mired in obscurity.']
["the lyrics are about humanity's perceived idea of hell."]
['marriage is a union of individuals.']
['Jewish forces overcome arab militants.']


In [None]:
# SECURITY: an access token used to be hardcoded in this cell. Anyone with a
# copy of the notebook can read it, so that token should be revoked and
# replaced on huggingface.co.
import os
from getpass import getpass

# Take the token from the environment (e.g. a Colab secret exported as
# HF_TOKEN) and fall back to an interactive prompt, so the secret never lives
# in the notebook source or its outputs.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")

# Upload the fine-tuned model and its tokenizer to the same Hub repository.
t5_model.push_to_hub("violetdavis/finetuned_t5_bias", use_auth_token=hf_token)
t5_tokenizer.push_to_hub("violetdavis/finetuned_t5_bias", use_auth_token=hf_token)


In [11]:
!pip install --upgrade protobuf

Collecting protobuf
  Downloading protobuf-5.29.1-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Downloading protobuf-5.29.1-cp38-abi3-manylinux2014_x86_64.whl (319 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m319.7/319.7 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.19.6
    Uninstalling protobuf-3.19.6:
      Successfully uninstalled protobuf-3.19.6
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pandas-gbq 0.24.0 requires google-auth-oauthlib>=0.7.0, but you have google-auth-oauthlib 0.4.6 which is incompatible.
tensorboard 2.9.1 requires protobuf<3.20,>=3.9.2, but you have protobuf 5.29.1 which is incompatible.
tensorflow-metadata 1.13.1 requires protobuf<5,>=3.20.3, but you have protobuf 5.29.1 which 

In [None]:
!pip install --force-reinstall sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m52.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
  Attempting uninstall: sentencepiece
    Found existing installation: sentencepiece 0.2.0
    Uninstalling sentencepiece-0.2.0:
      Successfully uninstalled sentencepiece-0.2.0
Successfully installed sentencepiece-0.2.0


In [12]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Reload the tokenizer and model from the "violetdavis/finetuned_t5_bias" Hub
# repository. from_tf=True is passed because the repository holds TensorFlow
# weights (tf_model.h5 in the download log).
# Note: this rebinds t5_tokenizer and t5_model, replacing the TF objects
# created earlier in the notebook.
t5_tokenizer = AutoTokenizer.from_pretrained("violetdavis/finetuned_t5_bias")
t5_model = AutoModelForSeq2SeqLM.from_pretrained("violetdavis/finetuned_t5_bias", from_tf=True)


config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

All TF 2.0 model weights were used when initializing T5ForConditionalGeneration.

Some weights of T5ForConditionalGeneration were not initialized from the TF 2.0 model and are newly initialized: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [None]:
# Read the test CSV from Drive and keep only its `src_raw` column (the source
# sentences to rewrite).
test_source_sequences = pd.read_csv('drive/MyDrive/bias_test_med.csv')
test_source_sequences = test_source_sequences.src_raw

In [None]:
import torch

test_t5_sequences3 = []

# Prepare inputs for batch processing
prefix = 'Translate biased to unbiased: '
batch_size = 500
start_index = 0
num_batches = (len(test_source_sequences) - start_index + batch_size - 1) // batch_size

# Choose the device and move the model once, before the loop; repeating this
# for every batch was loop-invariant work.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
t5_model = t5_model.to(device)

# Process batches
for i in range(num_batches):
    start_idx = start_index + i * batch_size
    end_idx = min(start_index + (i + 1) * batch_size, len(test_source_sequences))
    batch_sequences = test_source_sequences[start_idx:end_idx]

    # Tokenize the prefixed batch and move the tensors to the model's device
    batch_inputs = [prefix + source_sequence for source_sequence in batch_sequences]
    test_inputs = t5_tokenizer(batch_inputs, padding=True, return_tensors='pt').to(device)

    # The batch is padded, so pass the attention mask explicitly instead of
    # leaving generate() to infer which positions are padding.
    test_output_ids = t5_model.generate(
        test_inputs.input_ids,
        attention_mask=test_inputs.attention_mask,
        max_length=256,
    )
    predicted_sequences = t5_tokenizer.batch_decode(test_output_ids, skip_special_tokens=True)

    # Append batch outputs to results list
    test_t5_sequences3.extend(predicted_sequences)

# Ensure the number of generated sequences matches the number of input sequences
assert len(test_t5_sequences3) == len(test_source_sequences) - start_index, (
    "number of generated sequences does not match number of inputs"
)

In [None]:
# Save the generated sequences to Drive as a single-column CSV, without the index.
df = pd.DataFrame(test_t5_sequences3)
df.to_csv('drive/MyDrive/test_t5.csv', index=False)

In [None]:
# Rebuild the Keras training wrapper and restore weights from a saved
# checkpoint file on Drive.
# NOTE(review): if the cell that rebinds `t5_model` to the
# AutoModelForSeq2SeqLM instance has already run, `t5_model` here is no longer
# the TFT5ForConditionalGeneration object — confirm which model is intended.
checkpoint_dir = 'drive/MyDrive/'
checkpoint_filepath = checkpoint_dir + 't5_bias_weights.01-0.99.hdf5'

model_wrapper = build_t5_training_wrapper_model(t5_model, max_length)
model_wrapper.load_weights(checkpoint_filepath)

In [8]:
# Hand-written sentences for an ad-hoc check of the model; this replaces any
# previously loaded `test_source_sequences`.
test_source_sequences = [
    "She’s always late because she doesn’t care.",
    "Let’s target high-energy people for this campaign.",
    "She’s just not a good cultural fit.",
    "He doesn’t seem very committed to the team.",
    "This role requires an energetic, young professional.",
    "Older employees may struggle with learning new technology.",
    "He’s a natural leader, so he’s the obvious choice for promotion.",
    "Our product is the best product money can buy.",
]

In [13]:
import torch

test_t5_sequences3 = []

# Prepare inputs for batch processing
prefix = 'Translate biased to unbiased: '
batch_size = 5
start_index = 0
num_batches = (len(test_source_sequences) - start_index + batch_size - 1) // batch_size

# Choose the device and move the model once, before the loop; repeating this
# for every batch was loop-invariant work.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
t5_model = t5_model.to(device)

# Process batches
for i in range(num_batches):
    start_idx = start_index + i * batch_size
    end_idx = min(start_index + (i + 1) * batch_size, len(test_source_sequences))
    batch_sequences = test_source_sequences[start_idx:end_idx]

    # Tokenize the prefixed batch and move the tensors to the model's device
    batch_inputs = [prefix + source_sequence for source_sequence in batch_sequences]
    test_inputs = t5_tokenizer(batch_inputs, padding=True, return_tensors='pt').to(device)

    # The batch is padded, so pass the attention mask explicitly instead of
    # leaving generate() to infer which positions are padding.
    test_output_ids = t5_model.generate(
        test_inputs.input_ids,
        attention_mask=test_inputs.attention_mask,
        max_length=256,
    )
    predicted_sequences = t5_tokenizer.batch_decode(test_output_ids, skip_special_tokens=True)

    # Append batch outputs to results list
    test_t5_sequences3.extend(predicted_sequences)

# Ensure the number of generated sequences matches the number of input sequences
assert len(test_t5_sequences3) == len(test_source_sequences) - start_index, (
    "number of generated sequences does not match number of inputs"
)

In [14]:
# Display the generated sentences as the cell's output.
test_t5_sequences3

['she’s always late because she doesn’t care.',
 'let’s target high-energy people for this campaign.',
 'she’s just not a good cultural fit.',
 'he doesn’t seem very committed to the team.',
 'this role requires an energetic, young professional.',
 'older employees may struggle with learning new technology.',
 'he’s a natural leader, so he’s the obvious choice for promotion.',
 'our product is the best product money can buy.']