# Model Finetuning and Batch Inference

<img src="https://technical-training-assets.s3.us-west-2.amazonaws.com/Generic/ray_logo.png" width="20%" loading="lazy">

## Getting started

### Set up imports and utilities

In [None]:
import random
import torch
import transformers
import warnings

import numpy as np
import pandas as pd

from IPython.display import display, HTML
from typing import Any, Dict, List, Optional

# Fix the random seed through the transformers helper so runs are repeatable.
transformers.set_seed(42)
# Silence Python warnings to keep the notebook output readable.
warnings.simplefilter("ignore")

### Initialize Ray runtime

In [None]:
import ray

In [None]:
# Initialize the Ray runtime with default settings (no arguments are passed).
ray.init()

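Open the Ray Dashboard (its URL is shown in the output of `ray.init()`) to monitor the cluster while the rest of this notebook runs.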

## Data ingest

### Load the dataset

In [None]:
from datasets import load_dataset
from utils import get_random_elements

In [None]:
# Load the "train" split of the tatsu-lab/alpaca dataset, then hold out 20% of
# it as a "test" split. The fixed seed keeps the split reproducible.
hf_dataset = load_dataset("tatsu-lab/alpaca", split="train").train_test_split(
    test_size=0.2, seed=57
)
# Bare expression: shows the resulting object as the cell output.
hf_dataset

### Display sample data

In [None]:
# `get_random_elements` comes from the local `utils` module (not shown here).
# Its result is rendered with `.to_html()`, so it presumably returns a pandas
# DataFrame holding `num_examples` rows -- verify against utils.py.
df = get_random_elements(dataset=hf_dataset["train"], num_examples=3)
display(HTML(df.to_html()))

### Convert to Ray Dataset

In [None]:
# Convert the Hugging Face dataset to Ray Data. The result is indexed by split
# name ("train" / "test") in the following cell.
ray_dataset = ray.data.from_huggingface(hf_dataset)
# Bare expression: shows the resulting object as the cell output.
ray_dataset

### Set up train and validation Ray datasets

In [None]:
# Toggle: True restricts both splits to a small sample for a quick run,
# False keeps the complete splits.
SMALL_DATA = True

# Begin with the complete splits ...
train_dataset = ray_dataset["train"]
validation_dataset = ray_dataset["test"]

# ... and cap each one at 100 rows when a quick run is requested.
if SMALL_DATA:
    train_dataset = train_dataset.limit(100)
    validation_dataset = validation_dataset.limit(100)

## Distributed preprocessing

### Implement preprocessing function

In [None]:
from ray.data.preprocessors import BatchMapper
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [None]:
def preprocess_function(batch: Dict[str, Any]) -> Dict[str, Any]:
    """
    Tokenizes a batch of Alpaca-style records with the T5 tokenizer of the
    google/flan-t5-base model.

    The "instruction" and "input" columns are encoded together as a text pair
    and become the model inputs. When the batch has an "output" column, it is
    tokenized separately and becomes the training labels, so the model learns
    to produce the reference response rather than to echo its own prompt.

    Args:
        batch: A dictionary or pandas DataFrame with at least the keys
        "instruction" and "input", whose values are sequences of strings.
        An optional "output" key holds the target responses.

    Returns:
        A dictionary containing the tokenizer's encoded inputs plus a
        "labels" entry. Labels are the token ids of "output" with padding
        positions set to -100; if "output" is absent they are a copy of the
        input token ids.
    """
    model_name = "google/flan-t5-base"
    # The tokenizer is created inside the function so that each call is
    # self-contained when it runs on a remote worker.
    tokenizer = T5Tokenizer.from_pretrained(model_name)

    encoded_inputs = tokenizer(
        list(batch["instruction"]),
        list(batch["input"]),
        padding="max_length",
        truncation=True,
        return_tensors="np",
    )

    if "output" in batch:
        labels = tokenizer(
            list(batch["output"]),
            padding="max_length",
            truncation=True,
            return_tensors="np",
        )["input_ids"]
        # -100 marks positions the loss must skip; without this the model is
        # also trained to predict the padding that fills each label row.
        labels = np.where(labels == tokenizer.pad_token_id, -100, labels)
    else:
        # No target text available: keep the earlier behavior of mirroring
        # the input ids so callers without an "output" column still work.
        labels = encoded_inputs["input_ids"].copy()

    encoded_inputs["labels"] = labels

    return dict(encoded_inputs)

In [None]:
# Wrap the tokenization function in a Ray preprocessor. `batch_format="pandas"`
# means each batch reaches `preprocess_function` as a pandas DataFrame, with
# `batch_size=4096` as the requested batch size.
batch_preprocessor = BatchMapper(preprocess_function, batch_format="pandas", batch_size=4096)

## Distributed finetuning

### Initialize training logic for each worker

In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
# Per-device batch size for both training and evaluation; read as a global by
# `trainer_init_per_worker`.
batch_size = 2
# Whether training workers and the batch predictor should request GPUs.
use_gpu = True

In [None]:
def trainer_init_per_worker(
    train_dataset: ray.data.Dataset,
    eval_dataset: Optional[ray.data.Dataset] = None,
    **config,
) -> Trainer:
    """
    Initializes a Hugging Face Trainer for finetuning google/flan-t5-base.

    Reads the module-level `batch_size` and `use_gpu` settings.

    Args:
        train_dataset (ray.data.Dataset): The dataset for training the model.
        eval_dataset (ray.data.Dataset, optional): The dataset for evaluating
        the model. Defaults to None, in which case per-epoch evaluation is
        turned off.
        config: Optional hyperparameters. Recognized keys are
        "learning_rate" (default 2e-5), "epochs" (default 4) and
        "weight_decay" (default 0.01).

    Returns:
        Trainer: A Hugging Face Trainer for training the T5 model.
    """
    # Informational only: the device is printed, not handed to the Trainer.
    device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    model_name = "google/flan-t5-base"

    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)

    # Per-epoch evaluation needs something to evaluate on. Requesting it
    # without an evaluation dataset makes the Trainer fail, so fall back to
    # "no" when the optional `eval_dataset` was not supplied.
    evaluation_strategy = "epoch" if eval_dataset is not None else "no"

    training_args = TrainingArguments(
        "flan-t5-base-finetuned-alpaca",
        evaluation_strategy=evaluation_strategy,
        save_strategy="epoch",
        logging_strategy="epoch",
        learning_rate=config.get("learning_rate", 2e-5),
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=config.get("epochs", 4),
        weight_decay=config.get("weight_decay", 0.01),
        push_to_hub=False,
        disable_tqdm=True,
    )

    hf_trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
    )

    print("Starting training...")
    return hf_trainer

### Define Trainer

In [None]:
from ray.air.config import RunConfig, ScalingConfig, CheckpointConfig
from ray.train.huggingface import HuggingFaceTrainer

In [None]:
# Number of distributed training workers, passed to ScalingConfig below.
num_workers = 2

In [None]:
# Distributed finetuning job: `trainer_init_per_worker` builds the Hugging Face
# Trainer, and the scaling config asks for `num_workers` workers (with GPUs
# when `use_gpu` is set).
trainer = HuggingFaceTrainer(
    trainer_init_per_worker=trainer_init_per_worker,
    scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    # Presumably "train" / "evaluation" arrive in `trainer_init_per_worker` as
    # `train_dataset` / `eval_dataset` -- confirm against the Ray docs.
    datasets={
        "train": train_dataset,
        "evaluation": validation_dataset,
    },
    run_config=RunConfig(
        # Keep a single checkpoint, ranked by the lowest "eval_loss".
        checkpoint_config=CheckpointConfig(
            num_to_keep=1,
            checkpoint_score_attribute="eval_loss",
            checkpoint_score_order="min",
        ),
    ),
    # Tokenization is attached as a preprocessor instead of being run by hand.
    preprocessor=batch_preprocessor,
)

### Run finetuning

In [None]:
# Run the finetuning job; `result.checkpoint` is used by the inference cells.
result = trainer.fit()

### Try the finetuned model

In [None]:
# Same base model identifier that preprocessing and training use.
model_name = "google/flan-t5-base"

In [None]:
# Load the base tokenizer and base model named by `model_name`.
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

In [None]:
# Take the checkpoint produced by `trainer.fit()` and obtain the finetuned
# model from it. The base `model` instance is passed in, presumably as the
# object the checkpointed weights are loaded into -- confirm with the Ray docs.
checkpoint = result.checkpoint
finetuned_model = checkpoint.get_model(model)

In [None]:
instruction = "How many bees do I have?"  # Enter your own instruction here.
input_query = (
    "I don't have enough bees."  # Write additional context for the model here.
)

# Encode instruction and context as a text pair, the same pairing that
# `preprocess_function` uses, and request PyTorch tensors for `generate`.
inputs = tokenizer(instruction, input_query, return_tensors="pt")
outputs = finetuned_model.generate(**inputs)

# Decode the generated token ids back to text, dropping special tokens.
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

## [Optional] Distributed hyperparameter tuning

In [None]:
from ray import tune
from ray.tune import Tuner
from ray.tune.schedulers.async_hyperband import ASHAScheduler

In [None]:
# Number of hyperparameter samples to try (TuneConfig `num_samples`).
total_num_trials = 4
# Largest epoch count in the search space; also the ASHA scheduler's `max_t`.
max_tune_epochs = 16

In [None]:
# Each tuning trial trains with a single worker.
num_workers = 1
# The ScalingConfig in the next cell reads `use_gpu` (singular). Assigning to
# a differently spelled name here would leave that setting untouched.
use_gpu = True

In [None]:
# Rebuild the trainer so it picks up the current `num_workers` / `use_gpu`
# values; this instance is what the Tuner runs once per trial.
trainer = HuggingFaceTrainer(
    trainer_init_per_worker=trainer_init_per_worker,
    scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    datasets={
        "train": train_dataset,
        "evaluation": validation_dataset,
    },
    run_config=RunConfig(
        # Keep a single checkpoint, ranked by the lowest "eval_loss".
        checkpoint_config=CheckpointConfig(
            num_to_keep=1,
            checkpoint_score_attribute="eval_loss",
            checkpoint_score_order="min",
        ),
    ),
    preprocessor=batch_preprocessor,
)

In [None]:
# Hyperparameter search over the trainer defined above.
tuner = Tuner(
    trainer,
    param_space={
        # The keys below match the ones `trainer_init_per_worker` reads from
        # its `**config` ("learning_rate", "epochs", "weight_decay").
        "trainer_init_config": {
            "learning_rate": tune.choice([2e-5, 2e-4, 2e-3, 2e-2]),
            "epochs": tune.choice([2, 4, 8, max_tune_epochs]),
            "weight_decay": tune.choice([0.01, 0.1, 1.0, 10.0]),
        }
    },
    tune_config=tune.TuneConfig(
        # Select the trial with the lowest evaluation loss.
        metric="eval_loss",
        mode="min",
        num_samples=total_num_trials,
        # ASHA scheduler, bounded by the largest epoch count being searched.
        scheduler=ASHAScheduler(
            max_t=max_tune_epochs,
        ),
    ),
    run_config=RunConfig(
        # Keep a single checkpoint, ranked by the lowest "eval_loss".
        checkpoint_config=CheckpointConfig(
            num_to_keep=1,
            checkpoint_score_attribute="eval_loss",
            checkpoint_score_order="min",
        )
    ),
)

In [None]:
# Run the hyperparameter search and keep the results it returns.
result_grid = tuner.fit()

## Distributed batch inference

In [None]:
from ray.train.predictor import Predictor
from ray.train.batch_predictor import BatchPredictor
from transformers import AutoTokenizer

In [None]:
class HuggingFaceModelPredictor(Predictor):
    """
    Ray Predictor that runs text generation with a HuggingFace model.

    Batches are expected to arrive already tokenized (tokenization runs as
    the preprocessor); the tokenizer held here is used only to decode the
    generated token ids back into text.

    Args:
        model (transformers.PreTrainedModel): A trained HuggingFace model.
        tokenizer (Optional[transformers.PreTrainedTokenizerBase]): Tokenizer
        used to decode generated token ids.
        preprocessor (Optional[Callable]): Preprocessor that turns raw input
        data into tokenized input data; handed to the base Predictor.
        use_gpu (bool): Whether a GPU is requested for prediction. The value
        is stored on the instance; tensors are placed on the model's device.
    """

    def __init__(
        self,
        model: Any,
        tokenizer: Optional[Any] = None,
        preprocessor: Optional[Any] = None,
        use_gpu: bool = False,
    ) -> None:
        # The base class keeps track of the preprocessor.
        super().__init__(preprocessor)
        self.model, self.tokenizer, self.use_gpu = model, tokenizer, use_gpu

    @classmethod
    def from_checkpoint(
        cls,
        checkpoint: Any,
        model_cls: Any,
        *,
        tokenizer: Optional[Any] = None,
        use_gpu: bool = False,
        **get_model_kwargs: Any,
    ) -> "HuggingFaceModelPredictor":
        """
        Build a HuggingFaceModelPredictor from a training checkpoint.

        Args:
            checkpoint (Any): Checkpoint holding a trained HuggingFace model.
            model_cls (Any): Model type to load from the checkpoint.
            tokenizer (Optional[Any]): Either a tokenizer class, which is
            loaded through the checkpoint, or a ready tokenizer instance,
            which is used as given. AutoTokenizer is used when omitted.
            use_gpu (bool): Whether a GPU is requested for prediction.
            **get_model_kwargs (Any): Extra keyword arguments passed to
            `checkpoint.get_model`.

        Returns:
            HuggingFaceModelPredictor: Predictor wrapping the loaded model,
            the resolved tokenizer and the checkpoint's preprocessor.
        """
        # Nothing (or something falsy) supplied: default to AutoTokenizer.
        resolved_tokenizer = tokenizer or AutoTokenizer
        # A class still has to be loaded; an instance is ready to use.
        if isinstance(resolved_tokenizer, type):
            resolved_tokenizer = checkpoint.get_tokenizer(resolved_tokenizer)

        loaded_model = checkpoint.get_model(model_cls, **get_model_kwargs)
        return cls(
            loaded_model,
            tokenizer=resolved_tokenizer,
            preprocessor=checkpoint.get_preprocessor(),
            use_gpu=use_gpu,
        )

    def _predict_numpy(
        self,
        data: Dict[str, Any],
        feature_columns: Optional[List[str]] = None,
        **generate_kwargs: Any,
    ) -> pd.DataFrame:
        """
        Generate text for one batch of tokenized input.

        Args:
            data (Dict[str, Any]): Column name to NumPy array mapping.
            feature_columns (Optional[List[str]]): When given and non-empty,
            only these columns of `data` are passed to the model.
            **generate_kwargs (Any): Extra keyword arguments for the model's
            `generate` call; on a name clash they override `data` columns.

        Returns:
            pd.DataFrame: A single column, "generated_output", holding the
            decoded text.
        """
        # The batch is already tokenized at this point: tokenization is done
        # by the AIR preprocessor before prediction.
        if feature_columns:
            data = {name: data[name] for name in data if name in feature_columns}

        # Turn every column into a tensor located on the model's device.
        tensors = {}
        for name, array in data.items():
            tensors[name] = torch.from_numpy(array).to(device=self.model.device)

        generated = self.model.generate(**{**tensors, **generate_kwargs})
        decoded = self.tokenizer.batch_decode(generated, skip_special_tokens=True)
        return pd.DataFrame(decoded, columns=["generated_output"])

In [None]:
# Build a batch predictor from the finetuning checkpoint (`result` comes from
# `trainer.fit()`). Arguments beyond `checkpoint` and `predictor_cls` are
# presumably forwarded to `HuggingFaceModelPredictor.from_checkpoint`, where
# `device_map` and `torch_dtype` would land in `**get_model_kwargs` -- confirm
# against the Ray docs.
predictor = BatchPredictor.from_checkpoint(
    checkpoint=result.checkpoint,
    predictor_cls=HuggingFaceModelPredictor,
    model_cls=T5ForConditionalGeneration,
    tokenizer=T5Tokenizer,
    use_gpu=use_gpu,
    device_map="auto",
    torch_dtype=torch.float16,
)

In [None]:
# Generate outputs for the validation split. `int(use_gpu)` evaluates to 1
# when GPUs are enabled and 0 otherwise. `max_new_tokens` is not a named
# parameter of `_predict_numpy`, so it presumably reaches the model's
# `generate` call through `**generate_kwargs` -- confirm against the Ray docs.
prediction = predictor.predict(
    validation_dataset,
    num_gpus_per_worker=int(use_gpu),
    batch_size=256,
    max_new_tokens=128,
)

In [None]:
# Convert the predictions to pandas; as a bare expression, the result is shown
# as the cell output.
prediction.to_pandas()

# Connect with the Ray community

You can learn and get more involved with the Ray community of developers and researchers:

* [**Ray documentation**](https://docs.ray.io/en/latest)

* [**Official Ray site**](https://www.ray.io/)  
Browse the ecosystem and use this site as a hub to get the information that you need to get going and building with Ray.

* [**Join the community on Slack**](https://forms.gle/9TSdDYUgxYs8SA9e8)  
Find friends to discuss your new learnings in our Slack space.

* [**Use the discussion board**](https://discuss.ray.io/)  
Ask questions, follow topics, and view announcements on this community forum.

* [**Join a meetup group**](https://www.meetup.com/Bay-Area-Ray-Meetup/)  
Tune in to meetups to listen to compelling talks, get to know other users, and meet the team behind Ray.

* [**Open an issue**](https://github.com/ray-project/ray/issues/new/choose)  
Ray is constantly evolving to improve developer experience. Submit feature requests and bug reports, and get help via GitHub issues.

* [**Become a Ray contributor**](https://docs.ray.io/en/latest/ray-contribute/getting-involved.html)  
We welcome community contributions to improve our documentation and the Ray framework.

<img src="https://technical-training-assets.s3.us-west-2.amazonaws.com/Generic/ray_logo.png" width="20%" loading="lazy">