In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!apt install git-lfs

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers[sentencepiece]
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.

In [None]:
# Display the Hugging Face Hub login widget; credentials are typed in
# interactively rather than being stored in the notebook.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# NOTE(review): AutoModelForMaskedLM is imported but not used in the visible cells.
from transformers import TFAutoModelForMaskedLM, AutoModelForMaskedLM

# Hub checkpoint to fine-tune; its name is reused later to build the output repo name.
model_checkpoint = "GroNLP/hateBERT"
# Load the checkpoint as a TensorFlow masked-language model.
model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint)

All PyTorch model weights were used when initializing TFBertForMaskedLM.

All the weights of TFBertForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


In [None]:
# Print the Keras layer / parameter-count summary of the loaded model.
model.summary()

Model: "tf_bert_for_masked_lm_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108891648 
                                                                 
 mlm___cls (TFBertMLMHead)   multiple                  24459834  
                                                                 
Total params: 109514298 (417.76 MB)
Trainable params: 109514298 (417.76 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Prompt containing one mask token; it is reused to compare the model's
# predictions before and after fine-tuning.
text = "She works as a [MASK]."

In [None]:
from transformers import AutoTokenizer

# Tokenizer matching the checkpoint loaded above.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
import numpy as np
import tensorflow as tf

# Encode the prompt and run it through the model to get per-token vocabulary logits.
inputs = tokenizer(text, return_tensors="np")
token_logits = model(**inputs).logits

# Locate the mask token: argwhere yields (row, column) pairs, and the column
# of the first hit is the position of the mask inside the sequence.
mask_positions = np.argwhere(inputs["input_ids"] == tokenizer.mask_token_id)
mask_token_index = mask_positions[0, 1]
mask_token_logits = token_logits[0, mask_token_index, :]

# Negate the logits so an ascending argsort puts the highest-scoring tokens first.
top_5_tokens = np.argsort(-mask_token_logits)[:5].tolist()

# Print the prompt once per candidate with the mask replaced by the decoded token.
for token in top_5_tokens:
    candidate = tokenizer.decode([token])
    print(f">>> {text.replace(tokenizer.mask_token, candidate)}")

>>> She works as a model.
>>> She works as a trainer.
>>> She works as a consultant.
>>> She works as a dentist.
>>> She works as a baker.


In [None]:
from datasets import load_dataset

# Load the SNLI dataset; the last expression displays its splits and columns.
snli_dataset = load_dataset("snli")
snli_dataset

DatasetDict({
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 550152
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
})

In [None]:
# Take three training rows after a seeded shuffle, so the same rows are shown on every run.
sample = snli_dataset["train"].shuffle(seed=42).select(range(3))

# NOTE(review): the printed "Review" prefix labels SNLI hypothesis sentences, not reviews.
for row in sample:
    print(f"\n'>>> Review: {row['hypothesis']}'")
    print(f"'>>> Label: {row['label']}'")


'>>> Review: the historian is digging with his friend for study.'
'>>> Label: 1'

'>>> Review: A boy is riding a donkey.'
'>>> Label: 2'

'>>> Review: A man is outside on the patio.'
'>>> Label: 2'


In [None]:
def tokenize_function(examples):
    """Tokenize the ``"hypothesis"`` column of a batch of examples.

    Args:
        examples: Batch mapping that contains a ``"hypothesis"`` entry.

    Returns:
        The tokenizer output for the batch. When the tokenizer is a fast one,
        a ``"word_ids"`` entry is added holding, for each example in the
        batch, the value of ``result.word_ids(i)``.
    """
    result = tokenizer(examples["hypothesis"])
    if not tokenizer.is_fast:
        # Slow tokenizers return the plain encoding without word ids.
        return result
    batch_size = len(result["input_ids"])
    result["word_ids"] = list(map(result.word_ids, range(batch_size)))
    return result

# Tokenize every split in batches and drop the original text/label columns,
# leaving only the tokenizer outputs (plus word_ids for fast tokenizers).
tokenized_datasets = snli_dataset.map(
    tokenize_function, batched=True, remove_columns=["premise", "label","hypothesis"]
)
tokenized_datasets

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/550152 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 550152
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 10000
    })
})

In [None]:
# Display the maximum sequence length the tokenizer reports.
tokenizer.model_max_length

512

In [None]:
# Number of tokens per training chunk; read as a global by group_texts.
chunk_size = 128

In [None]:
# Slice the first three tokenized training examples (a dict of columns).
tokenized_samples = tokenized_datasets["train"][:3]

# Show how many tokens each of those examples contains.
for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")

'>>> Review 0 length: 12'
'>>> Review 1 length: 15'
'>>> Review 2 length: 11'


In [None]:
# Flatten each column: sum(list_of_lists, []) concatenates the per-example
# lists into one long list per column.
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

'>>> Concatenated reviews length: 38'


In [None]:
# Split every flattened column into consecutive slices of chunk_size items;
# the last slice may be shorter because nothing is dropped here.
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 38'


In [None]:
def group_texts(examples):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size chunks.

    Args:
        examples: Mapping from column name to a list of per-example sequences
            (the batched form handed to a ``map(..., batched=True)`` callback).
            Must contain an ``"input_ids"`` column.

    Returns:
        dict with the same columns, each a list of chunks holding exactly
        ``chunk_size`` items, plus a ``"labels"`` column that is an
        independent copy of the ``"input_ids"`` chunks. A trailing remainder
        shorter than ``chunk_size`` is dropped.
    """
    from itertools import chain

    # Flatten every column in linear time; sum(lists, []) re-copies the
    # accumulated list on each addition and is quadratic in the batch size.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # Measure the flattened length on the first column
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split every column into consecutive chunks of chunk_size items
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column. Each chunk is copied so that masking the
    # inputs later can never alter the labels (or vice versa) through a
    # shared inner list.
    result["labels"] = [list(chunk) for chunk in result["input_ids"]]
    return result

In [None]:
# Re-chunk every split batch-by-batch; row counts shrink because many short
# examples are packed into each chunk_size-token row.
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/550152 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 824
    })
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 44998
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 825
    })
})

In [None]:
# Decode one chunk back to text to inspect how sentences were packed together.
tokenizer.decode(lm_datasets["train"][1]["input_ids"])

'women are hugging on vacation. [SEP] [CLS] the women are sleeping. [SEP] [CLS] there are women showing affection. [SEP] [CLS] the people are eating omelettes. [SEP] [CLS] the people are sitting at desks in school. [SEP] [CLS] the diners are at a restaurant. [SEP] [CLS] a man is drinking juice. [SEP] [CLS] two women are at a restaurant drinking wine. [SEP] [CLS] a man in a restaurant is waiting for his meal to arrive. [SEP] [CLS] a blond man getting a drink of water from a fountain in the park. [SEP] [CLS] a blond man wearing a brown shirt is reading a book on a bench in the park [SEP] [CLS] a blond man'

In [None]:
from transformers import DataCollatorForLanguageModeling

# Collator for masked-language-model batches, configured with mlm_probability=0.15.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
# Fetch two chunks as plain dicts and drop "word_ids" before collating.
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

# Collate the two chunks and print the decoded, randomly masked inputs.
for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



'>>> [CLS] a person is training his horse [MASK] [MASK] competition. [SEP] [CLS] a person is at a diner, ordering an omelette. [SEP] [CLS] a person is outdoors, on [MASK] horse. [SEP] [CLS] they [MASK] [MASK] at their parents [SEP] [CLS] there are children present [SEP] [CLS] the [MASK] are frowning [SEP] [CLS] the [MASK] skate rhetoric down the sidewalk. [SEP] [CLS] the boy does a skateboarding trick. [SEP] [CLS] the boy is wearing safety equipment [MASK] [SEP] [CLS] an older man drinks his juice as crack waits [MASK] his daughter to get off work. [SEP] [CLS] a boy flips a burger [MASK] [SEP] [CLS] an elderly [MASK] sits in a small shop. [SEP] [CLS] some'

'>>> women are [MASK] on vacation. [SEP] [CLS] the [MASK] are sleeping. [SEP] [CLS] there are women showing [MASK] [MASK] [SEP] [CLS] the people are eating omelettes. [SEP] [CLS] the people are sitting at desks in school. [SEP] [CLS] [MASK] diners are at buried restaurant. [SEP] [CLS] a man [MASK] drinking juice. [SEP] [CLS] two [M

In [None]:
import collections
import numpy as np

from transformers.data.data_collator import tf_default_data_collator

# Probability with which each word is selected for masking by
# whole_word_masking_data_collator.
wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    """Mask whole words at random, then collate the features into a batch.

    For every feature, tokens are grouped into words using its ``word_ids``.
    Each word is selected with probability ``wwm_probability`` and all of its
    tokens are replaced by ``tokenizer.mask_token_id``. The new labels keep
    the original label at masked positions and are ``-100`` everywhere else.

    The caller's feature dicts and their lists are left untouched, so the
    same samples can be collated more than once.

    Args:
        features: List of mappings, each with ``"input_ids"``, ``"labels"``
            and ``"word_ids"`` entries of equal length.

    Returns:
        The result of ``tf_default_data_collator`` applied to the masked
        features (``"word_ids"`` is removed before collation).
    """
    masked_features = []
    for feature in features:
        # Work on a shallow copy so popping/masking never alters the caller's data
        feature = dict(feature)
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is None:
                # Special token: forget the previous word. In chunks built
                # from several sentences the word ids restart after each
                # separator, so without this reset a one-word sentence and the
                # first word of the next sentence would be fused into one word.
                current_word = None
                continue
            if word_id != current_word:
                current_word = word_id
                current_word_index += 1
            mapping[current_word_index].append(idx)

        # Randomly mask words: one Bernoulli draw per word
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = list(feature["input_ids"])
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            for idx in mapping[word_id.item()]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["input_ids"] = input_ids
        feature["labels"] = new_labels
        masked_features.append(feature)

    return tf_default_data_collator(masked_features)

In [None]:
# Fetch two fresh chunks (word_ids included) and run the whole-word collator on them.
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

# Print the decoded inputs to see which words were masked.
for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] a person is [MASK] his horse for a competition. [SEP] [CLS] a person is [MASK] a diner, ordering [MASK] omelette. [SEP] [CLS] [MASK] person [MASK] outdoors, on a horse [MASK] [SEP] [CLS] they are [MASK] at their parents [SEP] [CLS] there are [MASK] present [SEP] [CLS] the [MASK] are frowning [SEP] [CLS] the boy skates [MASK] [MASK] sidewalk. [SEP] [CLS] the boy [MASK] a skateboarding [MASK]. [SEP] [CLS] [MASK] boy is [MASK] safety equipment. [SEP] [CLS] [MASK] older man [MASK] [MASK] juice as he waits for his daughter to get off work [MASK] [SEP] [CLS] a [MASK] [MASK] [MASK] a burger. [SEP] [CLS] an elderly man sits in a small shop. [SEP] [CLS] some'

'>>> women are hugging on vacation. [SEP] [CLS] [MASK] women [MASK] sleeping. [SEP] [CLS] [MASK] are women showing affection. [SEP] [CLS] the people are eating [MASK] [MASK] [MASK]. [SEP] [CLS] the people are sitting [MASK] desks in school. [SEP] [CLS] [MASK] diners are at a restaurant. [SEP] [CLS] a man is drinking juice [MAS

In [None]:
# Work on a subset: 10,000 training chunks and 10% of that for evaluation.
train_size = 10_000
test_size = int(0.1 * train_size)

# Both subsets are drawn from the "train" split; the seed fixes the selection.
downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1000
    })
})

In [None]:
# NOTE(review): this repeats the login cell near the top of the notebook;
# it is only needed if that earlier cell was skipped.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Build the training dataset: shuffled, batches of 32, masked by data_collator
# (the random-token collator, not the whole-word masking one).
tf_train_dataset = model.prepare_tf_dataset(
    downsampled_dataset["train"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=32,
)

# Evaluation dataset: same collator and batch size, kept in order.
# NOTE(review): masks are presumably re-drawn on every pass, so evaluation
# numbers can vary between runs — confirm.
tf_eval_dataset = model.prepare_tf_dataset(
    downsampled_dataset["test"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=32,
)

In [None]:
from transformers import create_optimizer
from transformers.keras_callbacks import PushToHubCallback
import tensorflow as tf

# One pass over the training batches (model.fit below runs a single epoch).
num_train_steps = len(tf_train_dataset)
# Warm up over the first 10% of training. A fixed 1_000-step warm-up would be
# longer than the whole run (10_000 chunks / batch size 32 is about 312 steps),
# so the learning rate would never reach init_lr.
num_warmup_steps = int(0.1 * num_train_steps)
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=num_warmup_steps,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
# No loss is passed here; NOTE(review): presumably the model's built-in loss is used — confirm.
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
# NOTE(review): the model was already built when this policy is set; confirm
# that it actually takes effect for this model.
tf.keras.mixed_precision.set_global_policy("mixed_float16")

# Name the output directory after the checkpoint, without its organisation prefix.
model_name = model_checkpoint.split("/")[-1]
callback = PushToHubCallback(
    output_dir=f"{model_name}-finetuned-snli", tokenizer=tokenizer
)

Cloning https://huggingface.co/viditnaik/hateBERT-finetuned-snli into local empty directory.


In [None]:
import math

# Baseline before fine-tuning: perplexity is the exponential of the evaluation loss.
eval_loss = model.evaluate(tf_eval_dataset)
print(f"Perplexity: {math.exp(eval_loss):.2f}")

Perplexity: 18.73


In [None]:
# Fine-tune with the default number of epochs; the callback handles the Hub upload.
model.fit(tf_train_dataset, validation_data=tf_eval_dataset, callbacks=[callback])



<keras.src.callbacks.History at 0x7f0afea715a0>

In [None]:
# Same measurement after fine-tuning, for comparison with the baseline above.
eval_loss = model.evaluate(tf_eval_dataset)
print(f"Perplexity: {math.exp(eval_loss):.2f}")

Perplexity: 5.96


In [None]:
from transformers import pipeline

# Load the fine-tuned model back from the Hub as a fill-mask pipeline.
# The repository id is hard-coded to the account that ran the training above.
mask_filler = pipeline(
    "fill-mask", model="viditnaik/hateBERT-finetuned-snli"
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/534M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at viditnaik/hateBERT-finetuned-snli.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


Downloading (…)okenizer_config.json:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:
# Run the same prompt through the fine-tuned model.
preds = mask_filler(text)

# Each prediction carries the completed sentence under "sequence".
for pred in preds:
    print(f">>> {pred['sequence']}")

>>> she works as a nurse.
>>> she works as a trainer.
>>> she works as a model.
>>> she works as a photographer.
>>> she works as a waitress.
