In [None]:
#installing necessary libraries
! pip install huggingface_hub
! pip install transformers
! pip install datasets

In [None]:
from huggingface_hub import notebook_login
# Log in to the Hugging Face Hub from the notebook environment.
# NOTE(review): presumably needed so the training step can push to the Hub
# (PushToHubCallback is imported below) — confirm.
notebook_login()

In [None]:
import transformers
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML
from transformers import AutoTokenizer, TFAutoModelForCausalLM
from transformers import TFAutoModelForMaskedLM
from transformers.keras_callbacks import PushToHubCallback
from transformers import create_optimizer, AdamWeightDecay
import tensorflow as tf
from transformers import DataCollatorForLanguageModeling
from transformers import pipeline
import math

In [5]:
# Send example telemetry, tagged with the example name
# ("language_modeling_notebook") and the framework ("tensorflow").
# NOTE(review): nothing else in this notebook reads a result from this call;
# presumably it can be removed without affecting training — confirm.
from transformers.utils import send_example_telemetry
send_example_telemetry("language_modeling_notebook", framework="tensorflow")

**Preparing the dataset**

In [None]:
# Load the "wikitext" dataset, configuration "wikitext-2-raw-v1", via the
# `datasets` library. The result is indexed by split name further down
# (e.g. datasets["train"]).
from datasets import load_dataset
datasets = load_dataset("wikitext", "wikitext-2-raw-v1")

In [7]:
def show_random_elements(dataset, num_examples=10):
    """Display `num_examples` distinct, randomly chosen rows of `dataset` as an HTML table.

    Args:
        dataset: Object that supports `len()`, indexing with a list of integer
            positions (the result is handed to `pd.DataFrame`), and exposes a
            `features` mapping of column name to feature type.
        num_examples: Number of distinct rows to display.

    Raises:
        AssertionError: If `num_examples` is larger than `len(dataset)`.
        ValueError: If `num_examples` is negative (raised by `random.sample`).
    """
    assert num_examples <= len(
        dataset
    ), "Can't pick more elements than there are in the dataset."

    # Sampling without replacement returns distinct indices in one call, which
    # avoids the retry loop and the repeated `pick in picks` list scans.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])

    # Columns typed as ClassLabel hold integer ids; show their names instead.
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            # Bind the names now so the lambda does not depend on the loop variable.
            df[column] = df[column].transform(lambda i, names=typ.names: names[i])

    display(HTML(df.to_html()))

In [None]:
# Preview random rows of the "train" split (show_random_elements defaults to 10 rows).
show_random_elements(datasets["train"])

In [18]:
block_size = 128   # Number of tokens per chunk produced by group_texts (read there as a notebook-level global)

# **Masked Language Modelling**

In [9]:
model_checkpoint = "distilroberta-base"   # Checkpoint name used to load both the tokenizer and the masked-LM model

In [13]:
def tokenize_function(examples):
    """Apply the notebook-level `tokenizer` to the "text" field of `examples`.

    Returns whatever the tokenizer produces for that input.
    """
    texts = examples["text"]
    return tokenizer(texts)

In [None]:
# Load the tokenizer that belongs to `model_checkpoint`.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Run `tokenize_function` over the loaded data in batches, using 4 worker
# processes, and drop the raw "text" column from the result.
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
)

In [16]:
def group_texts(examples, chunk_size=None):
    """Concatenate a batch of token sequences and re-split it into fixed-size chunks.

    Args:
        examples: Mapping from column name to a list of lists (one inner list
            per example in the batch). Must contain an "input_ids" column.
        chunk_size: Length of every output chunk. When omitted, the
            notebook-level `block_size` is used, so existing calls such as
            `dataset.map(group_texts, batched=True)` behave as before.

    Returns:
        A dict with the same keys as `examples`, each mapped to a list of
        `chunk_size`-long lists, plus a "labels" entry that is a shallow copy
        of the "input_ids" list. Trailing items that do not fill a complete
        chunk are dropped.
    """
    from itertools import chain

    if chunk_size is None:
        chunk_size = block_size

    # chain.from_iterable flattens in linear time; sum(lists, []) rebuilds the
    # accumulated list for every example and is quadratic in the batch size.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }

    # All columns are cut to the same length, taken from the first column and
    # rounded down to a multiple of chunk_size.
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // chunk_size) * chunk_size

    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Labels start out as a copy of the inputs.
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Re-chunk the tokenized data with `group_texts`: 1000 examples per batch,
# spread over 4 worker processes.
lm_datasets = tokenized_datasets.map(group_texts, batched=True, batch_size=1000, num_proc=4)

In [None]:
# Load the TensorFlow masked-language-model for `model_checkpoint`
# (the same checkpoint name the tokenizer was loaded from).
model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [None]:
# AdamWeightDecay optimizer with learning rate 2e-5 and weight decay rate 0.01.
# `learning_rate` is the supported keyword; `lr` is a deprecated Keras alias
# that, depending on the installed version, only triggers a warning, is
# ignored, or is rejected outright.
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
# Compile with the optimizer above and JIT compilation enabled.
# NOTE(review): no loss is passed here; presumably the model falls back to its
# built-in loss — confirm.
model.compile(optimizer=optimizer, jit_compile=True)

In [22]:
# Collator for language modeling: uses the notebook's tokenizer, an MLM
# probability of 0.15, and returns NumPy tensors.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    return_tensors="np",
    mlm_probability=0.15,
)

In [None]:
# Build the training and validation inputs with the model's
# `prepare_tf_dataset`. Both use batches of 16 and the same collator;
# only the training split is shuffled.
_tf_dataset_kwargs = {"batch_size": 16, "collate_fn": data_collator}

train_set = model.prepare_tf_dataset(
    lm_datasets["train"], shuffle=True, **_tf_dataset_kwargs
)
validation_set = model.prepare_tf_dataset(
    lm_datasets["validation"], shuffle=False, **_tf_dataset_kwargs
)

In [None]:
# Derive the Hub repository name from the last path component of the checkpoint.
model_name = model_checkpoint.split("/")[-1]
push_to_hub_model_id = f"{model_name}-finetuned-wikitext2"

# `callback` used to be referenced in fit() without ever being created, which
# raised NameError on a fresh kernel. Create the push-to-Hub callback here;
# it saves to `output_dir` and uploads the model together with the tokenizer.
callback = PushToHubCallback(
    output_dir="./mlm_model_save",
    tokenizer=tokenizer,
    hub_model_id=push_to_hub_model_id,
)

# Train for one epoch on the training set, validating on the validation set.
model.fit(train_set, validation_data=validation_set, epochs=1, callbacks=[callback])

In [None]:
# Evaluate on the validation set and report perplexity as exp(result).
# NOTE(review): this assumes evaluate() returns a single scalar loss (no extra
# metrics were passed to compile()); math.exp would fail on a list — confirm.
eval_results = model.evaluate(validation_set)
print(f"Perplexity: {math.exp(eval_results):.2f}")

**Inference**

In [None]:
# Create a fill-mask pipeline around the model that was fine-tuned above.
# The previous version loaded a fixed third-party Hub checkpoint
# ("Rocketknight1/distilroberta-base-finetuned-wikitext2"), so inference did
# not actually use the model trained in this notebook.
mask_filler = pipeline(
    "fill-mask",
    model=model,
    tokenizer=tokenizer,
    framework="tf",
)

Checking Examples

In [None]:
# Ask for only the single best completion (top_k=1) of the <mask> token.
mask_filler("The most common household pets are <mask> and dogs.", top_k=1)

In [None]:
# Ask for the three best completions (top_k=3) of the <mask> token.
mask_filler("The Gulf War was a conflict that took place in <mask> in 1990-1991.", top_k=3)

# Gradio

In [None]:
!pip install gradio

In [None]:
import gradio as gr

def generate_text(input_text):
    """Fill the masked token(s) in `input_text` using the `mask_filler` pipeline.

    Args:
        input_text: Sentence that should contain the tokenizer's mask token.

    Returns:
        The top-ranked filled-in sentence. If the input contains several mask
        tokens, the pipeline returns one candidate list per mask and the top
        sentence of each list is returned, one per line. If the input is empty
        or has no mask token, a short usage hint is returned instead of
        letting the pipeline raise.
    """
    mask_token = mask_filler.tokenizer.mask_token
    if not input_text or mask_token not in input_text:
        return f"Please include the mask token {mask_token} in your sentence."

    predictions = mask_filler(input_text)

    # Several masks: a list of candidate lists rather than a flat list of dicts.
    if isinstance(predictions[0], list):
        return "\n".join(candidates[0]["sequence"] for candidates in predictions)
    return predictions[0]["sequence"]

# Build the Gradio interface around `generate_text`.
# `gr.Textbox` is the current component class; the `gr.inputs` / `gr.outputs`
# namespaces are deprecated and no longer exist in recent Gradio releases,
# which is what the unpinned install above pulls in.
interface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(),
    outputs=gr.Textbox(),
    title="Language Model Text Generation",
    description="Enter a sentence with a masked word to see model predictions."
)

# Launch the Gradio interface
interface.launch()