(dolly_v2_deepspeed_instruction_finetune)=

# Dolly-V2-3B Instruction Fine-Tuning with Ray AIR and DeepSpeed

In this demonstration, we'll show how to use Ray AIR to instruction fine-tune the Dolly V2 3B model with the DeepSpeed framework. Please uncomment the next two cells to install the required library dependencies.

This work builds upon [existing efforts](https://github.com/ray-project/ray/blob/master/doc/source/ray-air/examples/gptj_deepspeed_fine_tuning.ipynb) by incorporating an instruction fine-tuning component.

In [None]:
# conda install -c conda-forge mpi4py
# conda install gcc gxx_linux-64

In [None]:
# pip install "ray==2.5.1" "accelerate==0.16.0" "datasets==2.12.0" "evaluate==0.4.0" "transformers==4.26.0"  "torch==1.13.0" "deepspeed==0.9.2"

In [None]:
import numpy as np
import pandas as pd
import os
from transformers import AutoTokenizer
import ray.data
import ray
from datasets import load_dataset
import evaluate
from transformers import Trainer, TrainingArguments
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
)
from transformers.utils.logging import enable_progress_bar
import torch

import transformers
from ray.train.huggingface import TransformersTrainer
from ray.air.config import ScalingConfig

## Set up Ray <a name="setup"></a>

First, we will use 2 workers, each being assigned 1 GPU and 28 CPUs.

In [None]:
# Cluster sizing: how many training workers to launch and how many CPUs
# each of them reserves.
num_workers = 2
cpus_per_worker = 28
# Whether the workers should train on GPUs.
use_gpu = True
# Hugging Face model identifier of the checkpoint to fine-tune.
model_name = "databricks/dolly-v2-3b"

In [None]:
# Address of the Ray head node; it is interpolated into the `ray://` URL
# passed to `ray.init`. Replace the placeholder with your own server's
# IP address or hostname before running the notebook.
fq_ray_ip = "<replace-this-with-your-ray-server-ip-address>"

In [None]:
# Pinned pip requirements, keyed under "pip" so the mapping can be placed
# inside a conda dependency list.
pip_env = dict(
    pip=[
        "datasets==2.12.0",
        "evaluate==0.4.0",
        # accelerate pin: see https://github.com/OpenGVLab/InternImage/issues/111
        "accelerate==0.16.0",
        "transformers==4.26.0",
        "torch==1.13.0",
        "deepspeed==0.9.2",
        "ipython==8.14.0",
    ]
)

In [None]:
# Conda environment spec used as the Ray runtime environment.
# mpi4py is listed as a conda dependency because `pip install mpi4py`
# won't work; the pinned pip packages are nested as the last entry.
conda_dependencies = ["mpi4py", "pip", pip_env]
conda_env = {"conda": {"dependencies": conda_dependencies}}

In [None]:
# Connect to the Ray cluster through a `ray://` address and ship the conda
# environment to it as the runtime environment.
# Note: the port and ip-address depend on your ray server setup.
ray_client_address = f"ray://{fq_ray_ip}:10001"
ray.init(address=ray_client_address, runtime_env=conda_env)

## Loading the dataset <a name="load"></a>

We will be fine-tuning the model on the [`alpaca-cleaned` dataset](https://datasets-server.huggingface.co/splits?dataset=yahma%2Falpaca-cleaned), comprised of 51,000 lines of Q&A. The aim is to make the Databricks model better at generating answers that follow the given instruction.

We will use the `generate_prompt` function to prepare our dataset for instruction fine-tuning.

In [None]:
# Load the "yahma/alpaca-cleaned" dataset with Hugging Face `datasets`.
current_dataset = load_dataset("yahma/alpaca-cleaned")
# Bare expression: the notebook cell displays the loaded dataset object.
current_dataset

In [None]:
def generate_prompt(data_point):
    """Render one dataset record as an Alpaca-style training prompt.

    Two templates are used (ref: https://github.com/tloen/alpaca-lora):
    records with a non-empty ``"input"`` field get a prompt that contains
    an ``### Input:`` section, while records without extra context get the
    shorter instruction-only prompt.

    Args:
        data_point: Mapping with the keys ``"instruction"`` and ``"output"``
            and, optionally, ``"input"``.

    Returns:
        The full prompt text, ending with the expected response.
    """
    instruction = data_point["instruction"]
    response = data_point["output"]
    # Branch on the *input* field: branching on the instruction would emit
    # an empty "### Input:" section for every record that has no context.
    context = data_point.get("input")

    if context:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{context}

### Response:
{response}"""

    return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:
{response}"""

In [None]:
# Every prompt is truncated / padded to exactly this many tokens.
CUTOFF_LEN = 128

# Tokenizer of the base model, padding on the left with token id 0.
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
tokenizer.pad_token_id = 0


def _tokenize_prompt(data_point):
    """Build the prompt for one record and tokenize it to a fixed length."""
    prompt_text = generate_prompt(data_point)
    return tokenizer(
        prompt_text,
        truncation=True,
        max_length=CUTOFF_LEN,
        padding="max_length",
    )


# Shuffle the dataset, then tokenize each record's prompt.
shuffled_dataset = current_dataset.shuffle()
current_dataset = shuffled_dataset.map(_tokenize_prompt)

In [None]:
# Convert the "train" split of the Hugging Face dataset into a Ray dataset.
ray_datasets = ray.data.from_huggingface(current_dataset["train"])
# Bare expression: the notebook cell displays the resulting dataset object.
ray_datasets

## Instruction fine-tuning the model with Ray AIR

In [None]:
def trainer_init_per_worker(train_dataset, eval_dataset=None, **config):
    """Create the Hugging Face ``Trainer`` for one training worker.

    Args:
        train_dataset: Dataset handed to the ``Trainer`` for training.
        eval_dataset: Accepted for interface compatibility; this function
            does not use it.
        **config: Optional hyperparameters. Recognised keys and defaults:
            ``batch_size`` (1), ``epochs`` (1), ``warmup_steps`` (0),
            ``learning_rate`` (0.00002) and ``weight_decay`` (0.01).

    Returns:
        A ``Trainer`` configured with the DeepSpeed settings below.
    """
    # --- DeepSpeed configuration, assembled section by section -------------
    fp16_section = {
        "enabled": "auto",
        "initial_scale_power": 8,
    }
    optimizer_section = {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
        },
    }
    # ZeRO stage 3 with both optimizer state and parameters offloaded to CPU.
    cpu_offload = {"device": "cpu", "pin_memory": True}
    zero_section = {
        "stage": 3,
        "offload_optimizer": dict(cpu_offload),
        "offload_param": dict(cpu_offload),
        "overlap_comm": True,
        "contiguous_gradients": True,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "gather_16bit_weights_on_model_save": True,
        "round_robin_gradients": True,
    }
    deepspeed_config = {
        "fp16": fp16_section,
        "optimizer": optimizer_section,
        "zero_optimization": zero_section,
        "gradient_accumulation_steps": "auto",
        "gradient_clipping": "auto",
        "steps_per_print": 10,
        "train_batch_size": "auto",
        "train_micro_batch_size_per_gpu": "auto",
        "wall_clock_breakdown": False,
    }

    # --- Training arguments -------------------------------------------------
    per_device_batch = config.get("batch_size", 1)
    print(f"batch_size: {per_device_batch}")
    print("Preparing training arguments")
    hf_training_args = TrainingArguments(
        output_dir="deepspeed-dolly",
        per_device_train_batch_size=per_device_batch,
        logging_steps=1,
        learning_rate=config.get("learning_rate", 0.00002),
        weight_decay=config.get("weight_decay", 0.01),
        warmup_steps=config.get("warmup_steps", 0),
        num_train_epochs=config.get("epochs", 1),
        push_to_hub=False,
        disable_tqdm=False,
        fp16=True,
        gradient_accumulation_steps=16,
        deepspeed=deepspeed_config,
    )

    # --- Tokenizer and model, loaded by name on this worker -----------------
    worker_tokenizer = AutoTokenizer.from_pretrained(
        model_name, padding_side="left"
    )
    worker_tokenizer.pad_token_id = 0

    causal_lm = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
    )

    enable_progress_bar()

    # Language-modeling collator with masked-LM disabled.
    collator = transformers.DataCollatorForLanguageModeling(
        worker_tokenizer, mlm=False
    )
    return Trainer(
        model=causal_lm,
        args=hf_training_args,
        train_dataset=train_dataset,
        tokenizer=worker_tokenizer,
        data_collator=collator,
    )

In [None]:
# Resources per worker.
# NOTE: huggingface transformers only support 1 GPU per worker.
scaling_config = ScalingConfig(
    num_workers=num_workers,
    use_gpu=use_gpu,
    resources_per_worker={"GPU": 1, "CPU": cpus_per_worker},
)

# Do not sync on checkpoint.
# Note: one can also set up a storage path to persist the model checkpoint
# to a cloud bucket.
run_config = ray.air.RunConfig(
    sync_config=ray.tune.syncer.SyncConfig(sync_on_checkpoint=False),
)

# Hyperparameters forwarded to `trainer_init_per_worker` as its `config`.
init_config = {
    "batch_size": 16,  # batch_size per device
    "epochs": 1,
}

trainer = TransformersTrainer(
    trainer_init_per_worker=trainer_init_per_worker,
    trainer_init_config=init_config,
    scaling_config=scaling_config,
    run_config=run_config,
    datasets={"train": ray_datasets},
)

Finally, we call the {meth}`~ray.train.huggingface.TransformersTrainer.fit` method to start training with Ray AIR.

In [None]:
# Start the training run and keep the object returned by `fit()`.
results = trainer.fit()