In [None]:
# Setup Ray
import ray

# ignore_reinit_error=True keeps this cell re-runnable: calling a plain
# ray.init() a second time in the same kernel raises because Ray is already
# initialized.
ray.init(ignore_reinit_error=True)

In [None]:
# Preliminary imports
import transformers
import datasets

In [None]:
# Load the Alpaca instruction dataset ("tatsu-lab/alpaca") through the
# Hugging Face `datasets` library.
from datasets import load_dataset
# Later cells index `tatsu` by split name, e.g. tatsu['train'].
tatsu = load_dataset("tatsu-lab/alpaca")

In [None]:
# Bare expression: let the notebook display the loaded dataset object.
tatsu

In [None]:
# Inspect the first record of the 'train' split.
tatsu['train'][0]

In [None]:
# Display random samples from the dataset.

import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=2, seed=None):
    """Render a random selection of rows from ``dataset`` as an HTML table.

    Args:
        dataset: Object that supports ``len()`` and indexing with a list of
            integer positions (``dataset[picks]``); the indexed result is
            passed straight to ``pd.DataFrame``.
        num_examples: Number of distinct rows to show.
        seed: Optional seed. When given, the same rows are selected on every
            call, which keeps the preview reproducible across re-runs. When
            ``None``, the module-level ``random`` generator is used.

    Raises:
        AssertionError: If ``num_examples`` is larger than ``len(dataset)``.
        ValueError: If ``num_examples`` is negative.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."

    # random.sample draws without replacement, so no manual de-duplication
    # loop is needed and the number of draws is bounded.
    rng = random if seed is None else random.Random(seed)
    picks = rng.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

# Preview random rows of the 'train' split, using the function's default
# number of examples.
show_random_elements(tatsu['train'])

In [None]:
# HF Dataset to Ray Dataset conversion
import ray.data

# Convert the Hugging Face dataset loaded above into its Ray Data form.
# Later cells index the result by split name (ray_dataset["train"]).
ray_dataset = ray.data.from_huggingface(tatsu)

# Bare expression: let the notebook display the converted object.
ray_dataset

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [None]:
# Fetch the model and tokenizer for FLAN-T5 ("google/flan-t5-base").
# These notebook-level objects are used by preprocess_function and by the
# inference cells; trainer_init_per_worker loads its own copies.

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

In [None]:
# Playing around with the model and tokenizer:
# tokenize one prompt, generate with the loaded model, and print the decoded
# text with special tokens stripped.
inputs = tokenizer("Generate a plan for how to maximize return on $200", return_tensors="pt")
outputs = model.generate(**inputs)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

In [None]:
# Preprocess the dataset.
# Define a preprocessing function and wrap it in a BatchMapper so that it is applied to the dataset batch by batch.

import pandas as pd
from ray.data.preprocessors import BatchMapper

def preprocess_function(sample):
    """Tokenize a batch of Alpaca rows into seq2seq training features.

    Args:
        sample: A batch of rows exposing "instruction", "input" and "output"
            columns (a pandas DataFrame, given that the mapper is created
            with batch_format="pandas").

    Returns:
        A plain dict holding the tokenizer outputs for the
        (instruction, input) pairs plus a "labels" array built from the
        "output" column.
    """
    # Encoder side: each instruction is paired with its (possibly empty) input.
    encoded = tokenizer(
        list(sample["instruction"]),
        list(sample["input"]),
        return_tensors="np",
        padding=True,
        truncation=True,
    )

    # Decoder side: the reference answer is what the model must learn to
    # produce. Using a copy of the encoder input_ids as labels would only
    # teach the model to echo its prompt.
    targets = tokenizer(
        list(sample["output"]),
        return_tensors="np",
        padding=True,
        truncation=True,
    )
    label_ids = targets["input_ids"].copy()
    # -100 is the index the loss ignores, so padded positions contribute
    # nothing to training.
    label_ids[label_ids == tokenizer.pad_token_id] = -100

    features = dict(encoded)
    features["labels"] = label_ids
    return features

# Wrap preprocess_function in a BatchMapper; batch_format="pandas" asks for
# each batch to be handed to the function as a pandas DataFrame.
batch_encoder = BatchMapper(preprocess_function, batch_format="pandas")

In [None]:
# Finetuning the model with Ray AIR

from transformers import TrainingArguments, Trainer
import numpy as np
import torch

# Per-device batch size, used for both training and evaluation.
batch_size = 16
# Whether to train on GPUs; read by TrainingArguments (as no_cuda) and by the
# ScalingConfig in the trainer cell.
use_gpu = False

def trainer_init_per_worker(train_dataset, eval_dataset = None, **config):
    """Build the Hugging Face ``Trainer`` used to fine-tune FLAN-T5.

    A fresh tokenizer and model are loaded from "google/flan-t5-base" on
    every call, so the function does not rely on the notebook-level
    ``tokenizer`` / ``model`` objects.

    Args:
        train_dataset: Dataset handed to ``Trainer`` as ``train_dataset``.
        eval_dataset: Optional dataset handed to ``Trainer`` as
            ``eval_dataset``. When it is ``None``, per-epoch evaluation is
            switched off instead of being requested without data.
        **config: Optional hyperparameter overrides. Recognised keys:
            ``learning_rate`` (default 2e-5), ``epochs`` (default 2),
            ``weight_decay`` (default 0.01) and ``batch_size`` (default: the
            notebook-level ``batch_size``).

    Returns:
        The configured ``Trainer`` instance.
    """
    print(f"Is CUDA available: {torch.cuda.is_available()}")
    tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
    model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

    # Batch size follows the same override-with-default pattern as the other
    # hyperparameters read from config.
    per_device_batch_size = config.get("batch_size", batch_size)
    # Requesting per-epoch evaluation without an eval dataset makes the
    # Trainer fail, so only enable it when there is something to evaluate on.
    evaluation_strategy = "epoch" if eval_dataset is not None else "no"

    args = TrainingArguments(
        "flan-t5-base-finetuned-alpaca",
        evaluation_strategy=evaluation_strategy,
        save_strategy="epoch",
        logging_strategy="epoch",
        learning_rate=config.get("learning_rate", 2e-5),
        per_device_train_batch_size=per_device_batch_size,
        per_device_eval_batch_size=per_device_batch_size,
        num_train_epochs=config.get("epochs", 2),
        weight_decay=config.get("weight_decay", 0.01),
        push_to_hub=False,
        disable_tqdm=True,  # declutter the output a little
        no_cuda=not use_gpu,  # you need to explicitly set no_cuda if you want CPUs
    )

    hf_trainer = Trainer(
        model,
        args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
    )

    print("Starting training")
    return hf_trainer

In [None]:
from ray.train.huggingface import HuggingFaceTrainer
from ray.air.config import RunConfig, ScalingConfig, CheckpointConfig

# Number of Ray training workers.
num_workers = 2

# Hold out 10% of the "train" split for evaluation so that eval_loss (used
# below to choose which checkpoint to keep) is measured on rows the model
# was not trained on. The fixed seed keeps the split stable across re-runs.
train_split, eval_split = ray_dataset["train"].train_test_split(
    test_size=0.1, shuffle=True, seed=42
)

trainer = HuggingFaceTrainer(
    trainer_init_per_worker=trainer_init_per_worker,
    scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    datasets={"train": train_split, "evaluation": eval_split},
    run_config=RunConfig(
        # Keep only the single checkpoint with the lowest eval_loss.
        checkpoint_config=CheckpointConfig(num_to_keep=1, checkpoint_score_attribute="eval_loss", checkpoint_score_order="min"),
    ),
    preprocessor=batch_encoder,
)

In [None]:
# Run the HuggingFaceTrainer configured above and keep what fit() returns.
result = trainer.fit()

In [None]:
# Pick one record from the 'train' split to try the model on.
sample = tatsu['train'][4]

print(sample)

# Tokenize the instruction/input pair, generate, and print the decoded text.
# NOTE(review): `model` is the notebook-level object loaded from
# "google/flan-t5-base" in an earlier cell; nothing in this notebook loads
# weights from `result` into it, so this presumably shows the base model's
# output rather than the fine-tuned one — confirm intent.
inputs = tokenizer(sample['instruction'], sample['input'], return_tensors="pt")
outputs = model.generate(**inputs)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))