# Llama model pre-training on HPU


In [None]:
#!/usr/bin/env python

import os
from typing import Any, Dict

from torch.utils.data import DataLoader

import transformers
from transformers import HfArgumentParser, default_data_collator

from megatron import get_args, print_rank_0
from megatron.core import mpu
from megatron.data import gpt_dataset
from megatron.initialize import initialize_megatron
from megatron.data.data_samplers import build_pretraining_data_loader
from megatron.training import build_train_valid_test_datasets, update_train_iters

from optimum.habana import GaudiConfig, GaudiTrainer, GaudiTrainingArguments



## Prepare environment
This example runs on a single node with 4 HPUs.

We recommend using a prebuilt container to run these examples. To run a container, you need Docker. See [Install Docker Engine](https://docs.docker.com/engine/install/) for installation instructions.

Next, follow [Run Using Containers](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html?highlight=installer#run-using-containers) to install the Habana drivers and container runtime.

### Get docker image
``` bash
docker pull vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
```
### Run docker image
``` bash
docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
# You may also need to mount your workspace directory into the container as a volume (docker run -v ...)
```
### Install dependency
``` bash
# Use "optimum-habana>1.11.1" if the execution mode is "eager" or "eager.compile"
# "ray>=2.20.0"
pip install ray[train] notebook transformers datasets evaluate peft accelerate scikit-learn optimum-habana

# install deepspeed
pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0

# This notebook was verified with the following package versions:
# transformers==4.38.2
# datasets==2.19.1
# evaluate==0.4.2
# peft==0.4.0
# accelerate==0.27.2
# scikit-learn==1.4.2
# optimum-habana==1.11.1

# deepspeed==0.12.4+hpu.synapse.v1.15.0
```

In [None]:
class MegatronDataset:
    """Callable that builds the Megatron GPT train/valid/test datasets.

    Every dataset setting is read from the global Megatron arguments returned
    by ``get_args()``. The ``config`` argument of ``__call__`` is accepted but
    not consulted.
    """

    def __call__(self, config):
        """Build the datasets through Megatron's dataset builder.

        Args:
            config: Unused; kept so the loader shares the call signature of
                the other ``load_*`` helpers in this notebook.

        Returns:
            Whatever ``build_train_valid_test_datasets`` returns for the
            provider defined below.
        """

        def _provide_gpt_datasets(train_val_test_num_samples):
            """Build train, valid, and test datasets."""
            megatron_args = get_args()
            print_rank_0("> building train, validation, and test datasets for GPT ...")
            builder_kwargs = {
                "data_prefix": megatron_args.data_path,
                "data_impl": megatron_args.data_impl,
                "splits_string": megatron_args.split,
                "train_valid_test_num_samples": train_val_test_num_samples,
                "seq_length": megatron_args.seq_length,
                "seed": megatron_args.seed,
                "skip_warmup": not megatron_args.mmap_warmup,
                "train_data_prefix": megatron_args.train_data_path,
                "valid_data_prefix": megatron_args.valid_data_path,
                "test_data_prefix": megatron_args.test_data_path,
                "data_cache_path": megatron_args.data_cache_path,
            }
            train_ds, valid_ds, test_ds = gpt_dataset.build_train_valid_test_datasets(
                **builder_kwargs
            )
            print_rank_0("> finished creating GPT datasets ...")
            return train_ds, valid_ds, test_ds

        # update_train_iters is applied to the global args before the
        # datasets are built.
        update_train_iters(get_args())
        built = build_train_valid_test_datasets(_provide_gpt_datasets)
        print_rank_0(built)
        return built


def load_datasets(config):
    """Build the datasets by invoking a fresh ``MegatronDataset`` with ``config``."""
    return MegatronDataset()(config)


In [None]:
class MegatronProcesser:
    """Turns Megatron datasets into Megatron pre-training data loaders."""

    def prepare(self, tokenizer, dataset, **kwargs):
        """Build the train/valid/test data loaders for ``dataset``.

        Args:
            tokenizer: Unused by this method.
            dataset: A ``(train_ds, valid_ds, test_ds)`` triple.
            **kwargs: ``step`` (optional) is the 1-based step training starts
                from. When it is non-zero, the consumed-sample counters on the
                global Megatron args are updated first so the loaders are
                built with the matching offsets.

        Returns:
            ``(train_dataloader, valid_dataloader, test_dataloader)``. All
            three are ``None`` on ranks that do not build loaders.
        """
        args = get_args()

        print_rank_0("> building train, validation, and test datasets ...")
        start_step = kwargs.get("step", 0)
        if start_step:
            # The passed value is the starting step, so one fewer iteration
            # has actually been completed.
            completed_iters = start_step - 1
            args.consumed_train_samples = completed_iters * args.global_batch_size
            # Nothing in this file assigns ``args.iteration``, so reading it
            # unconditionally can raise AttributeError on resume. Keep using
            # it when present and otherwise fall back to the resume iteration.
            # NOTE(review): confirm whether the surrounding setup ever sets
            # ``args.iteration`` before this call.
            eval_base_iters = getattr(args, "iteration", completed_iters)
            args.consumed_valid_samples = (
                (eval_base_iters // args.eval_interval) * args.eval_iters * args.global_batch_size
            )

        # Data loader only on rank 0 of each model parallel group.
        if not (args.use_dataset_only or mpu.get_tensor_model_parallel_rank() == 0):
            return None, None, None

        train_ds, valid_ds, test_ds = dataset

        # Build dataloaders.
        train_dataloader = build_pretraining_data_loader(train_ds, args.consumed_train_samples)
        valid_dataloader = build_pretraining_data_loader(valid_ds, args.consumed_valid_samples)
        test_dataloader = build_pretraining_data_loader(test_ds, 0)

        return train_dataloader, valid_dataloader, test_dataloader


In [None]:
def load_tokenizer(config):
    """Load a Hugging Face tokenizer described by ``config``.

    Args:
        config: Mapping with a required ``"name"`` entry (passed to
            ``AutoTokenizer.from_pretrained``) and an optional ``"config"``
            mapping of extra keyword arguments for that call.

    Returns:
        The tokenizer returned by ``transformers.AutoTokenizer.from_pretrained``.

    Raises:
        KeyError: If ``config`` has no ``"name"`` entry.
    """
    name = config["name"]
    # "config" is optional (and may be None), matching how
    # HuggingFaceModelFromConfig reads the same key with a default.
    load_kwargs = config.get("config") or {}
    return transformers.AutoTokenizer.from_pretrained(name, **load_kwargs)


In [None]:
class HuggingFaceModelFromConfig:
    """Callable that instantiates a causal LM from a configuration only.

    The model is created with ``AutoModelForCausalLM.from_config``, i.e. from
    an ``AutoConfig`` object rather than through ``from_pretrained``.
    """

    def __call__(self, config):
        """Create the model described by ``config``.

        Args:
            config: Mapping with a required ``"name"`` entry and an optional
                ``"config"`` mapping of keyword arguments for the AutoConfig
                factory. When ``"name"`` is ``None`` the configuration comes
                from ``AutoConfig.for_model``; otherwise it comes from
                ``AutoConfig.from_pretrained``.

        Returns:
            The created model. It is also stored on ``self.model``, alongside
            ``self.model_config`` and ``self.auto_config``.
        """
        model_name = config["name"]
        self.model_config = config.get("config", {})
        self.auto_config = None

        if model_name is None:
            self.auto_config = transformers.AutoConfig.for_model(**self.model_config)
        else:
            self.auto_config = transformers.AutoConfig.from_pretrained(
                pretrained_model_name_or_path=model_name, **self.model_config
            )

        self.model = transformers.AutoModelForCausalLM.from_config(self.auto_config)
        return self.model


def load_model(config):
    """Create the model by invoking a fresh ``HuggingFaceModelFromConfig`` with ``config``."""
    return HuggingFaceModelFromConfig()(config)


In [None]:
class HFCustomerSamplerTrainer(GaudiTrainer):  # type: ignore
    """GaudiTrainer whose training data loader comes from an injected sampler.

    ``set_sampler`` must be called before training starts, because
    ``get_train_dataloader`` delegates to the sampler's ``prepare`` method.
    """

    def set_sampler(self, sampler):
        """Store the object whose ``prepare`` builds the data loaders."""
        self.customer_sampler = sampler

    def get_train_dataloader(self) -> DataLoader:
        """Return the training data loader produced by the injected sampler.

        Raises:
            ValueError: If the trainer has no training dataset.
        """
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")

        # Only the training split is handed over; the valid/test slots are
        # None, and no tokenizer is passed.
        splits = (self.train_dataset, None, None)
        loader, _valid_loader, _test_loader = self.customer_sampler.prepare(None, splits)
        return loader


def get_trainer(config, training_args, datasets, tokenizer, model):
    """Assemble the ``HFCustomerSamplerTrainer`` used for pre-training.

    Args:
        config: Mapping optionally providing ``cache_dir``,
            ``model_revision`` and ``use_auth_token`` for loading the Gaudi
            configuration.
        training_args: Training arguments; ``gaudi_config_name`` selects the
            Gaudi configuration to load.
        datasets: A ``(train, eval, test)`` triple. Only the training split
            is passed to the trainer.
        tokenizer: Tokenizer handed to the trainer.
        model: Model to train.

    Returns:
        The configured trainer, with evaluation and metrics disabled.
    """
    # A truthy flag becomes True; anything else is passed as None.
    if config.get("use_auth_token"):
        auth_token = True
    else:
        auth_token = None

    hpu_config = GaudiConfig.from_pretrained(
        training_args.gaudi_config_name,
        cache_dir=config.get("cache_dir"),
        revision=config.get("model_revision"),
        use_auth_token=auth_token,
    )

    train_split, _eval_split, _test_split = datasets

    return HFCustomerSamplerTrainer(
        model=model,
        gaudi_config=hpu_config,
        args=training_args,
        train_dataset=train_split,
        eval_dataset=None,
        tokenizer=tokenizer,
        # The trainer would otherwise default to DataCollatorWithPadding.
        data_collator=default_data_collator,
        compute_metrics=None,
        preprocess_logits_for_metrics=None,
    )


In [None]:
def pretrain_llama(config: Dict[str, Any]):
    """Run the whole pre-training flow for one worker.

    Args:
        config: Mapping with the sections ``megatron_config``, ``datasets``,
            ``tokenizer``, ``training_args`` and ``model``.

    Megatron is initialised from ``megatron_config``, then the datasets,
    tokenizer, training arguments and model are created in that order. The
    model is trained, saved, and the training result is printed.
    """
    initialize_megatron(
        ignore_unknown_args=True,
        external_args=config["megatron_config"],
        allow_no_cuda=True,
    )

    datasets = load_datasets(config["datasets"])
    sampler = MegatronProcesser()
    tokenizer = load_tokenizer(config["tokenizer"])
    training_args = GaudiTrainingArguments(**config["training_args"])
    model = load_model(config["model"])

    trainer = get_trainer(config, training_args, datasets, tokenizer, model)
    # The trainer builds its training data loader through this sampler.
    trainer.set_sampler(sampler)

    train_output = trainer.train()
    trainer.save_model()
    print(train_output)


In [None]:
def main(num_workers):
    """Launch Llama pre-training on HPU workers through Ray Train.

    Args:
        num_workers: Number of Ray Train workers. Each worker requests one
            CPU and one HPU.

    A ``TorchTrainer`` using the ``hccl`` backend runs ``pretrain_llama`` on
    every worker with the configuration assembled below, and the fit result
    is printed.
    """
    import ray
    from ray.train import ScalingConfig
    from ray.train.torch import TorchConfig, TorchTrainer

    llama_checkpoint = "huggyllama/llama-7b"

    # Arguments handed to Megatron's initialisation (data and tokenizer
    # setup). "vocab_file" / "merge_file" are not set in this example.
    megatron_config = {
        "data_path": ["/root/workspace/bigscience/data/oscar/zh/tokenized_text_document"],
        "data_impl": "mmap",
        "micro_batch_size": 1,
        "global_batch_size": 4,
        "seq_length": 2048,
        "use_dataset_only": True,
        "tokenizer_type": "HFTokenizer",
        "tokenizer_model": llama_checkpoint,
        "eval_interval": 1000,
        "train_samples": 300_000_000,
        "split": "949,50,1",
    }

    # DeepSpeed settings embedded in the training arguments.
    deepspeed_config = {
        "steps_per_print": 64,
        "train_batch_size": "auto",
        "train_micro_batch_size_per_gpu": "auto",
        "gradient_accumulation_steps": "auto",
        "gradient_checkpoint": True,
        "memory_efficient_linear": False,
        "bf16": {"enabled": True},
        "gradient_clipping": 1.0,
        "zero_optimization": {
            "stage": 3,
            "overlap_comm": False,
            "reduce_scatter": False,
            "contiguous_gradients": False,
            "stage3_gather_16bit_weights_on_model_save": True,
        },
    }

    # Keyword arguments for GaudiTrainingArguments.
    training_args = {
        "per_device_train_batch_size": 1,
        "per_device_eval_batch_size": 1,
        "do_train": True,
        "do_eval": False,
        "save_strategy": "steps",
        "save_steps": 1000,
        "output_dir": "/tmp/pretrain-llama",
        "gaudi_config_name": "Habana/gpt2",
        "use_habana": True,
        "max_steps": 100000,
        "throughput_warmup_steps": 3,
        "use_lazy_mode": True,
        "overwrite_output_dir": True,
        "seed": 42,
        "bf16": True,
        "report_to": "tensorboard",
        "deepspeed": deepspeed_config,
    }

    pretrain_config = {
        "megatron_config": megatron_config,
        "datasets": {},
        "tokenizer": {"name": llama_checkpoint, "config": {}},
        "model": {
            "name": llama_checkpoint,
            "config": {"torch_dtype": "bfloat16"},
        },
        "training_args": training_args,
    }

    scaling_config = ScalingConfig(
        num_workers=num_workers,
        use_gpu=False,
        resources_per_worker={"CPU": 1, "HPU": 1},
    )

    # Set backend to hccl in TorchConfig
    torch_config = TorchConfig(backend="hccl")

    # No extra environment variables are forwarded to the workers.
    runtime_env = {"env_vars": {}}
    ray.init(runtime_env=runtime_env)

    # Initialize a Ray TorchTrainer
    ray_trainer = TorchTrainer(
        train_loop_per_worker=pretrain_llama,
        train_loop_config=pretrain_config,
        torch_config=torch_config,
        scaling_config=scaling_config,
    )

    fit_result = ray_trainer.fit()
    print(fit_result)

In [None]:
# Set the Ray/Habana environment flag before launching.
# NOTE(review): presumably this controls whether Ray manages
# HABANA_VISIBLE_MODULES for its workers - confirm against the Ray docs.
os.environ.update({"RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES": "0"})
# Four workers, one per HPU of the single node this example targets.
main(num_workers=4)