In [20]:
import kfp
from kfp.dsl import component, pipeline

In [23]:
@component(
    base_image='python:3.9',
    packages_to_install=["git+https://github.com/kubeflow/trainer.git@master#subdirectory=sdk"],
)
def deepspeed_training_job() -> str:
    """Submit a DeepSpeed T5 fine-tuning job through the Kubeflow Trainer SDK.

    Lists the available runtimes, picks the one named
    ``deepspeed-distributed``, submits ``deepspeed_train_t5`` as a two-node
    ``CustomTrainer`` job and prints the steps of the created job.

    Returns:
        The value returned by ``TrainerClient.train`` for the submitted job.

    Raises:
        RuntimeError: If no runtime named ``deepspeed-distributed`` is listed.
    """
    from kubeflow.trainer import CustomTrainer, TrainerClient

    def deepspeed_train_t5(args):
        """Fine-tune a T5 model on the WikiHow CSV with DeepSpeed.

        Every import lives inside the function body, so the function does not
        rely on any name from the enclosing scope.
        NOTE(review): presumably CustomTrainer ships this function to the
        training nodes on its own -- confirm against the SDK before adding
        references to outer variables.

        Args:
            args: Dict with the keys "DATASET_URL" (CSV read with
                ``Dataset.from_csv``) and "MODEL_NAME" (passed to
                ``from_pretrained``). "BUCKET" is only referenced by the
                commented-out S3 upload at the end.
        """
        import os
        import time
        import torch
        import torch.distributed as dist
        from torch.utils.data.distributed import DistributedSampler
        from transformers import T5Tokenizer, T5ForConditionalGeneration
        from datasets import Dataset
        import deepspeed
        import numpy as np

        # Initialize distributed environment.
        deepspeed.init_distributed(dist_backend="nccl")
        local_rank = int(os.environ["LOCAL_RANK"])
        # Only global rank 0 prints progress and summary output.
        is_main_rank = dist.get_rank() == 0

        class WikiHowDataset(torch.utils.data.Dataset):
            """Tokenized view over the first ``num_samples`` rows of the CSV."""

            def __init__(
                self,
                tokenizer,
                num_samples,
                input_length,
                output_length,
            ):
                self.dataset = Dataset.from_csv(args["DATASET_URL"])
                self.dataset = self.dataset.select(range(num_samples))
                self.input_length = input_length
                self.tokenizer = tokenizer
                self.output_length = output_length

            def __len__(self):
                return len(self.dataset)

            def clean_text(self, text):
                """Strip boilerplate markers, newlines and quotes from ``text``."""
                # Dataset contains empty values.
                if text is None:
                    return ""
                for unwanted in ("Example of text:", "Example of Summary:", "\n", "``", '"'):
                    text = text.replace(unwanted, "")
                return text

            def convert_to_features(self, example_batch):
                """Tokenize the "text" and "headline" fields of one row."""
                input_ = self.clean_text(example_batch["text"])
                target_ = self.clean_text(example_batch["headline"])

                source = self.tokenizer(
                    input_,
                    max_length=self.input_length,
                    padding="max_length",
                    truncation=True,
                    return_tensors="pt",
                )
                targets = self.tokenizer(
                    target_,
                    max_length=self.output_length,
                    padding="max_length",
                    truncation=True,
                    return_tensors="pt",
                )

                return source, targets

            def __getitem__(self, index):
                source, targets = self.convert_to_features(self.dataset[index])
                # NOTE(review): padded positions of target_ids are later used as
                # labels unchanged; Hugging Face seq2seq examples usually mask
                # them with -100 -- confirm whether that is wanted here.
                return {
                    "source_ids": source["input_ids"].squeeze(),
                    "source_mask": source["attention_mask"].squeeze(),
                    "target_ids": targets["input_ids"].squeeze(),
                    "target_mask": targets["attention_mask"].squeeze(),
                }

        # Download model and tokenizer
        if is_main_rank:
            print("-" * 100)
            print("Downloading T5 Model")
            print("-" * 100)

        model = T5ForConditionalGeneration.from_pretrained(args["MODEL_NAME"])
        tokenizer = T5Tokenizer.from_pretrained(args["MODEL_NAME"])

        # Download dataset.
        dataset = WikiHowDataset(tokenizer, num_samples=1500, input_length=512, output_length=150)
        # Keep a handle on the sampler: it needs set_epoch() each epoch and its
        # length is the number of samples this rank actually processes.
        sampler = DistributedSampler(dataset)
        # NOTE(review): the DataLoader batches 4 samples while ds_config below
        # declares train_micro_batch_size_per_gpu=2 -- confirm which is intended.
        train_loader = torch.utils.data.DataLoader(dataset, batch_size=4, sampler=sampler)

        # Define DeepSpeed configuration.
        # Train batch size = micro batch size * gradient steps * GPUs (e.g. 2 x 1 x 8 = 16).
        ds_config = {
            "train_micro_batch_size_per_gpu": 2,
            "gradient_accumulation_steps": 1,
            "fp16": {"enabled": True},  # Enable mixed precision
            "optimizer": {
                "type": "AdamW",
                "params": {"lr": 0.002},
            },
            "scheduler": {
                "type": "WarmupLR",
                "params": {
                    "warmup_min_lr": 0,
                    "warmup_max_lr": 0.001,
                    "warmup_num_steps": 1000,
                },
            },
        }

        # Initialize model with DeepSpeed.
        model, _, _, _ = deepspeed.initialize(
            config=ds_config,
            model=model,
            model_parameters=model.parameters(),
        )

        # Start training process.
        if is_main_rank:
            print("-" * 100)
            print("Starting DeepSpeed distributed training...")
            print("-" * 100)

        t0 = time.time()
        for epoch in range(1, 3):
            # Without set_epoch() the DistributedSampler reuses the same
            # shuffle order in every epoch.
            sampler.set_epoch(epoch)
            losses = []
            for batch_idx, batch in enumerate(train_loader):
                batch = {key: tensor.to(local_rank) for key, tensor in batch.items()}
                # Forward pass.
                output = model(
                    input_ids=batch["source_ids"],
                    attention_mask=batch["source_mask"],
                    labels=batch["target_ids"],
                )
                loss = output.loss

                # Run backpropagation.
                model.backward(loss)
                # Weight updates.
                model.step()

                loss_value = loss.item()
                losses.append(loss_value)
                if batch_idx % 10 == 0 and is_main_rank:
                    print(
                        "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                            epoch,
                            # Samples seen so far: use the tensor's batch
                            # dimension, not the number of keys in the dict.
                            batch_idx * len(batch["source_ids"]),
                            # Per-rank sample count, consistent with the
                            # per-rank percentage computed below.
                            len(sampler),
                            100.0 * batch_idx / len(train_loader),
                            loss_value,
                        )
                    )

            if is_main_rank:
                print("-" * 100)
                print("Average Train Loss: {0:.4f}".format(np.mean(losses)))
                print("-" * 100)

        # Save a DeepSpeed checkpoint to local disk; this runs on every rank.
        # The tag is passed explicitly so the path printed below always matches
        # the directory that was written, whatever the number of ranks or steps.
        HOME_PATH = "/home/mpiuser"
        checkpoint_tag = f"global_step{model.global_steps}"
        model.save_checkpoint(save_dir=HOME_PATH, tag=checkpoint_tag)

        if is_main_rank:
            print("-" * 100)
            print(f"DeepSpeed training time: {int(time.time() - t0)} seconds")
            print("-" * 100)

            print("Upload T5 model to S3??")
            file_path = os.path.join(HOME_PATH, checkpoint_tag, "mp_rank_00_model_states.pt")
            print(file_path)
            # The S3 upload is currently disabled:
            # bucket = boto3.resource("s3").Bucket(args["BUCKET"])
            # bucket.upload_file(file_path, f"deepspeed/{file_path}")

        # Tear the process group down on every rank, not only on rank 0.
        dist.destroy_process_group()

    MODEL_NAME = "t5-base"
    BUCKET_NAME = "deepseek-t5-base"
    args = {
        "DATASET_URL": "https://public-nlp-datasets.s3.us-west-2.amazonaws.com/wikihowAll.csv",
        "MODEL_NAME": MODEL_NAME,
        "BUCKET": BUCKET_NAME
    }

    # One client is enough for listing runtimes, submitting and inspecting the job.
    client = TrainerClient()

    # Print every runtime and remember the DeepSpeed one.
    deepspeed_runtime = None
    for r in client.list_runtimes():
        print(f"Name: {r.name}, Framework: {r.trainer.framework.value}, Trainer Type: {r.trainer.trainer_type.value}\n")
        print(f"Entrypoint: {r.trainer.entrypoint[:3]}\n")
        print(f"Runtime Accelerators: {r.trainer.accelerator_count} x {r.trainer.accelerator}")

        if r.name == "deepspeed-distributed":
            deepspeed_runtime = r

    # Fail with a clear message instead of an UnboundLocalError further down.
    if deepspeed_runtime is None:
        raise RuntimeError(
            "Runtime 'deepspeed-distributed' was not found among the runtimes "
            "returned by TrainerClient.list_runtimes()"
        )

    job_id = client.train(
        trainer=CustomTrainer(
            func=deepspeed_train_t5,
            func_args=args,
            packages_to_install=["boto3"], # Custom packages to install at runtime.
            num_nodes=2,
            resources_per_node={
                "cpu": 5,
                "memory": "16Gi",
                "gpu": 1, # Comment this line if you don't have GPUs.
            },
        ),
        runtime=deepspeed_runtime,
    )

    print(job_id)

    for s in client.get_job(name=job_id).steps:
        print(f"Step: {s.name}, Status: {s.status}, Devices: {s.device} x {s.device_count}")

    return job_id

In [24]:
@pipeline(name='deepspeed-training-job-pipeline')
def deepspeed_training_job_pipeline():
    # Single-step pipeline: its only task is the component that submits the
    # DeepSpeed training job.
    training_task = deepspeed_training_job()
    # This print runs while the pipeline is being defined, so it shows the
    # task's output placeholder rather than a concrete job id.
    print(training_task.output)

# Create a Kubeflow Pipelines client with its default settings.
client = kfp.Client()

# Write the compiled pipeline definition to a YAML file (relative path).
kfp.compiler.Compiler().compile(
    pipeline_func=deepspeed_training_job_pipeline,
    package_path='deepspeed_training_job_pipeline.yaml',
)

# Submit a run directly from the pipeline function, with caching disabled.
run = client.create_run_from_pipeline_func(
    pipeline_func=deepspeed_training_job_pipeline,
    arguments={},
    enable_caching=False,
)

{{channel:task=deepspeed-training-job;name=Output;type=String;}}
