In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!apt install git-lfs

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading

In [None]:
import transformers

def get_model_checkpoint():
    """Return the Hub identifier of the checkpoint used throughout this notebook."""
    checkpoint_name = "Davlan/xlm-roberta-base-finetuned-english"
    return checkpoint_name

def load_model(checkpoint):
    """Load ``checkpoint`` through ``transformers.TFAutoModelForMaskedLM``.

    ``from_pt=True`` is forwarded to ``from_pretrained``; the log printed when
    this cell runs reports the weights being initialized from a PyTorch model.
    """
    model_cls = transformers.TFAutoModelForMaskedLM
    return model_cls.from_pretrained(checkpoint, from_pt=True)

# Resolve the checkpoint name once; the tokenizer later in this cell is loaded from the same name.
model_checkpoint = get_model_checkpoint()
model = load_model(model_checkpoint)
model.summary()

# NOTE(review): `masked_text` is not referenced again in the visible notebook;
# a later cell defines an identical string under the name `text`.
masked_text = "This is a great <mask>."

def load_tokenizer(checkpoint):
    """Load the tokenizer for ``checkpoint`` through ``transformers.AutoTokenizer``."""
    tokenizer_cls = transformers.AutoTokenizer
    return tokenizer_cls.from_pretrained(checkpoint)

tokenizer = load_tokenizer(model_checkpoint)
# Trailing expression: shown as the cell's output (the id of the mask token).
tokenizer.mask_token_id


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFXLMRobertaForMaskedLM: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFXLMRobertaForMaskedLM from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLMRobertaForMaskedLM from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFXLMRobertaForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaForMaskedLM for predictions without further training.


Model: "tfxlm_roberta_for_masked_lm_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 roberta (TFXLMRobertaMainL  multiple                  277453056 
 ayer)                                                           
                                                                 
 lm_head (TFXLMRobertaLMHea  multiple                  193240722 
 d)                                                              
                                                                 
Total params: 278295186 (1.04 GB)
Trainable params: 278295186 (1.04 GB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


tokenizer_config.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

250001

In [None]:
import numpy as np
import tensorflow as tf

def prepare_inputs(text, tokenizer):
    """Tokenize ``text`` by calling ``tokenizer`` with ``return_tensors="np"``."""
    encoded = tokenizer(text, return_tensors="np")
    return encoded

def find_mask_token_index(inputs, tokenizer):
    """Return the column index of the first mask token in ``inputs["input_ids"]``.

    Positions are found with ``np.argwhere`` and the second coordinate of the
    first match is returned. Indexing the first match raises ``IndexError``
    when no element equals ``tokenizer.mask_token_id``.
    """
    is_mask = inputs["input_ids"] == tokenizer.mask_token_id
    mask_positions = np.argwhere(is_mask)
    return mask_positions[0, 1]

def get_top_tokens(logits, mask_index, num_tokens=5):
    """Return the ids of the ``num_tokens`` highest-scoring entries at ``mask_index``.

    Reads ``logits[0, mask_index, :]`` (first item of the leading axis only)
    and orders indices by descending score via ``np.argsort`` on the negated
    scores. The result is a plain Python list.
    """
    scores_at_mask = logits[0, mask_index, :]
    ranked_ids = np.argsort(-scores_at_mask)
    return ranked_ids[:num_tokens].tolist()
# Example sentence containing the tokenizer's mask placeholder; `text` is reused
# by the fill-mask pipeline cell further down.
text = "This is a great <mask>."

inputs = prepare_inputs(text, tokenizer)
# Forward pass; only the `.logits` attribute of the model output is kept.
token_logits = model(**inputs).logits
mask_token_index = find_mask_token_index(inputs, tokenizer)
# Uses get_top_tokens' default num_tokens=5.
top_5_tokens = get_top_tokens(token_logits, mask_token_index)

def display_predictions(tokens, text, tokenizer):
    """Print ``text`` once per candidate id, with the mask placeholder filled in.

    Each id in ``tokens`` is decoded on its own and substituted for every
    occurrence of ``tokenizer.mask_token`` in ``text``.
    """
    for token_id in tokens:
        placeholder = tokenizer.mask_token
        candidate = tokenizer.decode([token_id])
        filled = text.replace(placeholder, candidate)
        print(f">>> {filled}")

display_predictions(top_5_tokens, text, tokenizer)

>>> This is a great idea.
>>> This is a great story.
>>> This is a great day.
>>> This is a great song.
>>> This is a great time.


In [None]:
from datasets import load_dataset

def load_and_prepare_dataset(dataset_name, num_samples=3, seed=42):
    """Load a dataset and draw a small shuffled preview from its train split.

    Returns:
        A ``(dataset, sample)`` pair: the full object returned by
        ``load_dataset`` and the first ``num_samples`` rows of the ``"train"``
        split after shuffling with ``seed``.
    """
    full_dataset = load_dataset(dataset_name)
    shuffled_train = full_dataset["train"].shuffle(seed=seed)
    preview = shuffled_train.select(range(num_samples))
    return full_dataset, preview

def display_samples(sample):
    """Print the ``hypothesis`` field of every row in ``sample``, one per line."""
    for example in sample:
        hypothesis = example['hypothesis']
        print(f"\n'>>> Review: {hypothesis}'")

def tokenize_dataset(dataset, tokenizer):
    """Tokenize the ``hypothesis`` column of ``dataset`` in batches.

    The ``premise``, ``hypothesis`` and ``label`` columns are removed by the
    ``map`` call. When ``tokenizer.is_fast`` is true, a ``word_ids`` entry is
    added with one list per row, taken from the encoding's ``word_ids(i)``.
    """
    columns_to_drop = ["premise", "hypothesis", "label"]

    def tokenize_function(examples):
        encoded = tokenizer(examples["hypothesis"])
        if not tokenizer.is_fast:
            return encoded
        row_count = len(encoded["input_ids"])
        encoded["word_ids"] = [encoded.word_ids(row) for row in range(row_count)]
        return encoded

    return dataset.map(tokenize_function, batched=True, remove_columns=columns_to_drop)

# Load SNLI and a 3-row shuffled preview (defaults of load_and_prepare_dataset).
snli_dataset, sample = load_and_prepare_dataset("snli")
display_samples(sample)
# Tokenize every split; only the tokenizer outputs (plus word_ids) remain as columns.
tokenized_datasets = tokenize_dataset(snli_dataset, tokenizer)
print(tokenized_datasets)


Downloading builder script:   0%|          | 0.00/3.82k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/14.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.93k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.26M [00:00<?, ?B/s]


'>>> Review: the historian is digging with his friend for study.'

'>>> Review: A boy is riding a donkey.'

'>>> Review: A man is outside on the patio.'


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/550152 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 550152
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 10000
    })
})


In [None]:
# Number of tokens per chunk; read as a global by split_into_chunks and passed
# explicitly to create_and_print_chunks.
chunk_size = 128

In [None]:
def print_review_lengths(tokenized_samples):
    """Print the token count of each entry in ``tokenized_samples["input_ids"]``."""
    for position, token_ids in enumerate(tokenized_samples["input_ids"]):
        token_count = len(token_ids)
        print(f"'>>> Review {position} length: {token_count}'")

def concatenate_and_print_length(tokenized_samples):
    """Flatten every column of ``tokenized_samples`` into one long list.

    Args:
        tokenized_samples: Mapping from column name to a list of per-example
            sequences. Must contain an ``"input_ids"`` key.

    Returns:
        ``(concatenated, total_length)`` where ``concatenated`` maps each key
        to the flattened list and ``total_length`` is the length of the
        flattened ``"input_ids"`` list. The length is also printed.
    """
    # Local import keeps the cell self-contained. chain.from_iterable flattens
    # in linear time, whereas sum(list_of_lists, []) re-copies the accumulator
    # on every addition and is quadratic in the total number of tokens.
    from itertools import chain

    concatenated = {
        key: list(chain.from_iterable(tokenized_samples[key]))
        for key in tokenized_samples.keys()
    }
    total_length = len(concatenated["input_ids"])
    print(f"'>>> Concatenated reviews length: {total_length}'")
    return concatenated, total_length

def create_and_print_chunks(concatenated, total_length, chunk_size):
    """Slice every column into ``chunk_size`` pieces and print the piece lengths.

    Slicing starts at offsets ``0, chunk_size, 2*chunk_size, ...`` below
    ``total_length``, so the last piece may be shorter. Only the lengths of the
    ``"input_ids"`` pieces are printed; nothing is returned.
    """
    chunks = {}
    for key, tokens in concatenated.items():
        pieces = []
        for start in range(0, total_length, chunk_size):
            pieces.append(tokens[start : start + chunk_size])
        chunks[key] = pieces

    for piece in chunks["input_ids"]:
        piece_length = len(piece)
        print(f"'>>> Chunk length: {piece_length}'")

# Take the first three tokenized training rows as a small demo batch.
tokenized_samples = tokenized_datasets["train"][:3]
print_review_lengths(tokenized_samples)

# The three rows total fewer tokens than chunk_size (see the printed lengths),
# so the chunking below yields a single short chunk.
concatenated, total_length = concatenate_and_print_length(tokenized_samples)
create_and_print_chunks(concatenated, total_length, chunk_size)


'>>> Review 0 length: 12'
'>>> Review 1 length: 17'
'>>> Review 2 length: 12'
'>>> Concatenated reviews length: 41'
'>>> Chunk length: 41'


In [None]:
# Inline version of the chunking done inside create_and_print_chunks: same
# slicing over `concatenated`, but the result is kept in `chunked_data`.
# NOTE(review): `chunked_data` is not used again in the visible notebook.
chunked_data = {key: [value[idx:idx + chunk_size] for idx in range(0, total_length, chunk_size)] for key, value in concatenated.items()}

for each_chunk in chunked_data["input_ids"]:
    print(f"'>>> Chunk length: {len(each_chunk)}'")


'>>> Chunk length: 41'


In [None]:
def split_into_chunks(data):
    """Concatenate a batch of tokenized examples and cut it into fixed-size chunks.

    Intended for use with ``Dataset.map(..., batched=True)``. Reads the
    module-level ``chunk_size``.

    Args:
        data: Mapping from column name to a list of per-example sequences.
            Must contain ``"input_ids"``.

    Returns:
        A dict with the same keys as ``data`` where each value is a list of
        chunks of exactly ``chunk_size`` items, plus a ``"labels"`` entry that
        is a copy of the ``"input_ids"`` chunk list. Items left over after the
        last full chunk are dropped.
    """
    # Local import keeps the cell self-contained. chain.from_iterable flattens
    # in linear time; sum(list_of_lists, []) is quadratic in the batch's token
    # count, and this function runs on every batch of the dataset.
    from itertools import chain

    # Combine all elements
    combined_data = {key: list(chain.from_iterable(data[key])) for key in data.keys()}
    # Calculate total combined length (all columns are assumed to have the same
    # flattened length, so the first key is used)
    combined_length = len(combined_data[next(iter(data))])
    # Adjust length to be a multiple of chunk_size
    adjusted_length = (combined_length // chunk_size) * chunk_size
    # Divide into chunks
    chunked_result = {
        key: [chunk[i:i + chunk_size] for i in range(0, adjusted_length, chunk_size)]
        for key, chunk in combined_data.items()
    }
    # Replicate input_ids to labels (shallow copy: the outer list is new, the
    # inner chunk lists are shared with input_ids)
    chunked_result["labels"] = chunked_result["input_ids"].copy()
    return chunked_result

# Chunk every split; with batched=True each call to split_into_chunks receives
# a batch of rows and may return a different number of rows than it was given.
processed_datasets = tokenized_datasets.map(split_into_chunks, batched=True)
processed_datasets


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/550152 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 901
    })
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 49186
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 904
    })
})

In [None]:
# Decode one chunk to inspect how sentences were concatenated across example boundaries.
decoded_text = tokenizer.decode(processed_datasets["train"][1]["input_ids"])
print(decoded_text)

from transformers import DataCollatorForLanguageModeling

# Collator configured with mlm_probability=0.15; it is reused later as the
# collate_fn of both tf.data pipelines.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

selected_samples = [processed_datasets["train"][index] for index in range(2)]
# Drop word_ids from the sampled rows before collating (pop with a default, so
# a missing key is not an error). Note this rebinds the name `sample`, which an
# earlier cell used for the SNLI preview.
for sample in selected_samples:
    sample.pop("word_ids", None)

# Print the collated input_ids decoded back to text; the captured output shows
# <mask> tokens inserted by the collator.
for batch in data_collator(selected_samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(batch)}'")


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


elderly man sits in a small shop.</s><s> Some women are hugging on vacation.</s><s> The women are sleeping.</s><s> There are women showing affection.</s><s> The people are eating omelettes.</s><s> The people are sitting at desks in school.</s><s> The diners are at a restaurant.</s><s> A man is drinking juice.</s><s> Two women are at a restaurant drinking wine.</s><s> A man in a restaurant is waiting for his meal to arrive.</s><s> A blond man getting a drink of water from a fountain in the park.</s><s> A blond man

'>>> <s> A person is training his horse for a competition.</s><s> A personbole at a diner, ordering an omelette.</s><s> A person is outdoors, on a horse.</s><s> They are smiling at their parents</s><s> There are<mask> present</s><s> The kids are<mask>wning</s><s> The boy skates<mask> the sidewalk.</s><s> The boy does a skateboarding trick.</s><s> The boy is wearing safety equipment.</s><s> An older man drinks his juice as he wait<mask><mask><mask> daughter to get off work<mas

In [None]:
import collections
import numpy as np
from transformers.data.data_collator import tf_default_data_collator

wwm_probability = 0.2

def apply_whole_word_masking(samples):
    """Collate ``samples`` after masking whole words at random.

    For every sample, ``word_ids`` is removed and used to group token
    positions into words. Each word is selected independently with probability
    ``wwm_probability`` (module-level). For a selected word, all of its tokens
    are replaced by ``tokenizer.mask_token_id`` in ``input_ids`` and keep their
    original id in ``labels``; every other label is set to ``-100``.

    Args:
        samples: List of dicts with ``"word_ids"``, ``"input_ids"`` and
            ``"labels"`` entries. The dicts and their ``input_ids`` lists are
            modified in place.

    Returns:
        The result of ``tf_default_data_collator(samples)``.
    """
    for sample in samples:
        word_ids = sample.pop("word_ids")

        # Mapping word occurrences to their token positions.
        # Groups are keyed by a running counter (0, 1, 2, ...) rather than by the
        # raw word_id: chunks are concatenations of several sentences, so raw
        # word ids repeat across sentences and need not start at 0. Keying by raw
        # id would merge unrelated words and leave gaps, so the positional
        # sampling below (indices 0..len-1) would miss some words entirely.
        token_to_word = collections.defaultdict(list)
        word_index = -1
        previous_word_id = None
        for idx, word_id in enumerate(word_ids):
            if word_id is None:
                # Special token: acts as a boundary, so the next word always
                # starts a new group even if its raw id repeats.
                previous_word_id = None
                continue
            if word_id != previous_word_id:
                previous_word_id = word_id
                word_index += 1
            token_to_word[word_index].append(idx)

        # Masking words based on probability
        random_mask = np.random.binomial(1, wwm_probability, len(token_to_word))
        input_ids = sample["input_ids"]
        labels = sample["labels"]
        updated_labels = [-100] * len(labels)
        for word_idx in np.nonzero(random_mask)[0]:
            for token_idx in token_to_word[word_idx.item()]:
                updated_labels[token_idx] = labels[token_idx]
                input_ids[token_idx] = tokenizer.mask_token_id
        sample["labels"] = updated_labels

    return tf_default_data_collator(samples)

# Fresh row dicts are fetched here because apply_whole_word_masking pops
# "word_ids" from (and otherwise mutates) the dicts it is given.
sampled_data = [processed_datasets["train"][i] for i in range(2)]
processed_batch = apply_whole_word_masking(sampled_data)


In [None]:
# Downsample the chunked train split: 10_000 chunks for training and 10% of
# that (1_000) for evaluation, both carved out of processed_datasets["train"].
train_size = 10_000
test_size = int(0.1 * train_size)

downsampled_dataset = processed_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1000
    })
})

In [None]:
# Both pipelines use `data_collator` (the DataCollatorForLanguageModeling built
# earlier), not apply_whole_word_masking.
tf_train_dataset = model.prepare_tf_dataset(
    downsampled_dataset["train"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=32
)

# NOTE(review): the evaluation set goes through the same random-masking
# collator, so the masked positions presumably differ between evaluate() calls
# and the reported perplexity will fluctuate slightly — confirm if that matters.
tf_test_dataset = model.prepare_tf_dataset(
    downsampled_dataset["test"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=32
)


In [None]:
from transformers import create_optimizer
from transformers.keras_callbacks import PushToHubCallback
import tensorflow as tf

# Calculate the number of training steps (number of batches in one pass over
# tf_train_dataset; model.fit below is called without an explicit epoch count)
num_training_steps = len(tf_train_dataset)
# Setting up the optimizer with warmup and weight decay
# NOTE(review): with train_size = 10_000 and batch_size=32 the step count looks
# to be roughly 312, which is smaller than num_warmup_steps=1_000 — if so, the
# learning rate never finishes warming up. Confirm this is intended.
# NOTE(review): `lr_schedule` is not used again in the visible notebook.
optimizer_config, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=1_000,
    num_train_steps=num_training_steps,
    weight_decay_rate=0.01
)
# Compiling the model with the configured optimizer (no loss argument is passed)
model.compile(optimizer=optimizer_config)

# Enabling mixed-precision training with float16
# NOTE(review): the policy is set after the model was constructed and compiled;
# verify that it actually applies to the already-built layers.
tf.keras.mixed_precision.set_global_policy('mixed_float16')


In [None]:
import math

# Evaluating the model on the evaluation dataset and calculating perplexity
# (perplexity is computed as exp of the value returned by evaluate;
# assumes evaluate returns a single scalar loss — TODO confirm)
initial_eval_loss = model.evaluate(tf_test_dataset)
print(f"Initial Perplexity: {math.exp(initial_eval_loss):.2f}")

# Training the model
model.fit(tf_train_dataset, validation_data=tf_test_dataset)

# Re-evaluating the model to see improvements
final_eval_loss = model.evaluate(tf_test_dataset)
print(f"Final Perplexity: {math.exp(final_eval_loss):.2f}")


Initial Perplexity: 5.03




Final Perplexity: 3.70


In [None]:
from transformers import pipeline

# Wrap the fine-tuned in-memory model and its tokenizer in a fill-mask pipeline.
mask_filler = pipeline(
    "fill-mask", model=model, tokenizer=tokenizer
)

# `text` is the masked example sentence defined in the earlier inference cell.
preds = mask_filler(text)

# Each prediction is a dict; only its filled-in 'sequence' is printed.
for pred in preds:
    print(f">>> {pred['sequence']}")

>>> This is a great idea.
>>> This is a great song.
>>> This is a great day.
>>> This is a great place.
>>> This is a great movie.


In [None]:
from transformers import AutoTokenizer, AutoModel , AutoModelForMaskedLM
from transformers import pipeline

! pip install honest
! pip install transformers
! pip install sentencepiece

from honest import honest

Collecting honest
  Downloading honest-0.2.1-py2.py3-none-any.whl (6.2 kB)
Collecting Unidecode==1.3.2 (from honest)
  Downloading Unidecode-1.3.2-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.7/235.7 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Unidecode, honest
Successfully installed Unidecode-1.3.2 honest-0.2.1


In [None]:
# NOTE(review): `name_model` is bound to the model object rather than a string,
# so the final `print(name_model, k, honest_score)` shows the object's repr.
name_model = model
lang = "en"
# Number of completions requested per template (passed as top_k to the
# fill-mask pipeline in the scoring cell).
k = 1

evaluator = honest.HonestEvaluator(lang)
# Mapping whose keys are template sentences containing the "[M]" placeholder
# (the scoring cell replaces "[M]" with the tokenizer's mask token).
masked_templates = evaluator.templates(data_set="binary")

  index_col=0, sep='\t').T.to_dict('dict')


In [None]:
# new_templates = {}

# for key, val in masked_templates.items():
#   new_key = key.replace("[M]", "<mask>")
#   new_templates[new_key] = val


# masked_templates = new_templates

In [None]:
def get_completion(sentence, template):
  """Extract the filled-in text that follows the template prefix in ``sentence``.

  The prefix is ``template`` without its last four characters. The completion
  runs from the end of the prefix up to the first occurrence of the global
  tokenizer's EOS token, else its SEP token, else up to (but excluding) the
  final character of ``sentence``.
  """
  prefix = template[:-4]
  begin = sentence.find(prefix) + len(prefix)

  eos = tokenizer.eos_token
  if eos is not None and eos in sentence:
    return sentence[begin:sentence.find(eos)]

  sep = tokenizer.sep_token
  if sep is not None and sep in sentence:
    return sentence[begin:sentence.find(sep)]

  return sentence[begin:len(sentence) - 1]

tokenizer = tokenizer

In [None]:
# Fill-mask pipeline over the in-memory model, returning the top `k` completions per input.
nlp_fill = pipeline('fill-mask', model=model, tokenizer=tokenizer, top_k=k)

# Sanity check: the "[M]" placeholder is swapped for the tokenizer's own mask token before filling.
print("FILL EXAMPLE:",nlp_fill('all women likes to [M].'.replace('[M]',tokenizer.mask_token)))


# Fill templates (please check if the filled words contain any special character)
# One inner list per template key: each pipeline result's 'sequence' is reduced
# to just the completion text by get_completion.
filled_templates = [[get_completion(fill['sequence'],masked_sentence) for fill in nlp_fill(masked_sentence.replace('[M]',tokenizer.mask_token))] for masked_sentence in masked_templates.keys()]

honest_score, honest_df = evaluator.honest_dataframe(filled_templates, masked_templates)
print(name_model, k, honest_score)

FILL EXAMPLE: [{'score': 0.12492531538009644, 'token': 1957, 'token_str': 'see', 'sequence': 'all women likes to see.'}]
<transformers.models.xlm_roberta.modeling_tf_xlm_roberta.TFXLMRobertaForMaskedLM object at 0x799c20464d60> 1 0.017955801104972375
