From 269bedc2cd8f0d5fe0861e7b011fcbd3ff779e0d Mon Sep 17 00:00:00 2001 From: woshiyyya Date: Mon, 21 Aug 2023 13:01:51 -0700 Subject: [PATCH 1/3] wip Signed-off-by: woshiyyya --- doc/source/ray-overview/examples.rst | 7 - doc/source/train/examples.rst | 8 - .../transformers/transformers_example.rst | 8 - python/ray/train/BUILD | 24 - .../train/examples/transformers/README.rst | 57 -- .../train/examples/transformers/__init__.py | 0 .../train/examples/transformers/cluster.yaml | 58 -- .../transformers/transformers_example.py | 629 ------------------ 8 files changed, 791 deletions(-) delete mode 100644 doc/source/train/examples/transformers/transformers_example.rst delete mode 100644 python/ray/train/examples/transformers/README.rst delete mode 100644 python/ray/train/examples/transformers/__init__.py delete mode 100644 python/ray/train/examples/transformers/cluster.yaml delete mode 100644 python/ray/train/examples/transformers/transformers_example.py diff --git a/doc/source/ray-overview/examples.rst b/doc/source/ray-overview/examples.rst index 5b1614b963c99..8ab9a46020f61 100644 --- a/doc/source/ray-overview/examples.rst +++ b/doc/source/ray-overview/examples.rst @@ -564,13 +564,6 @@ Ray Examples PyTorch Fashion MNIST Training Example - .. grid-item-card:: :bdg-secondary:`Code example` - :class-item: gallery-item pytorch training train - :link: train_transformers_example - :link-type: ref - - Transformers with PyTorch Training Example - .. grid-item-card:: :bdg-secondary:`Code example` :class-item: gallery-item tensorflow training train :link: tensorflow_mnist_example diff --git a/doc/source/train/examples.rst b/doc/source/train/examples.rst index 66808789518d6..01637764619ac 100644 --- a/doc/source/train/examples.rst +++ b/doc/source/train/examples.rst @@ -25,14 +25,6 @@ Distributed Training Examples using Ray Train PyTorch Fashion MNIST Training Example - .. grid-item-card:: - :img-top: /images/hugging.png - :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img - - .. button-ref:: train_transformers_example - - Transformers with PyTorch Training Example - .. grid-item-card:: :img-top: /images/tf_logo.png :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img diff --git a/doc/source/train/examples/transformers/transformers_example.rst b/doc/source/train/examples/transformers/transformers_example.rst deleted file mode 100644 index 7f7eeb4547fc6..0000000000000 --- a/doc/source/train/examples/transformers/transformers_example.rst +++ /dev/null @@ -1,8 +0,0 @@ -:orphan: - -.. _train_transformers_example : - -Ray Train Example for HuggingFace Transformers with PyTorch -=========================================================== - -.. literalinclude:: /../../python/ray/train/examples/transformers/transformers_example.py diff --git a/python/ray/train/BUILD b/python/ray/train/BUILD index 3dda10edc6865..5dfa3c52ab6ea 100644 --- a/python/ray/train/BUILD +++ b/python/ray/train/BUILD @@ -75,30 +75,6 @@ py_test( deps = [":train_lib"] ) -py_test( - name = "transformers_example_gpu", - size = "medium", - main = "examples/transformers/transformers_example.py", - srcs = ["examples/transformers/transformers_example.py"], - tags = ["team:ml", "exclusive", "tune", "gpu_only"], - deps = [":train_lib"], - args = ["--model_name_or_path=bert-base-cased", "--task_name=mrpc", - "--max_length=32", "--per_device_train_batch_size=64", - "--max_train_steps=2", "--start_local", "--num_workers=2", "--use_gpu"] -) - -py_test( - name = "transformers_example_cpu", - size = "medium", - main = "examples/transformers/transformers_example.py", - srcs = ["examples/transformers/transformers_example.py"], - tags = ["team:ml", "exclusive", "tune"], - deps = [":train_lib"], - args = ["--model_name_or_path=bert-base-cased", "--task_name=mrpc", - "--max_length=32", "--per_device_train_batch_size=64", - "--max_train_steps=2", "--start_local", "--num_workers=2"] -) - py_test( name = "tune_cifar_torch_pbt_example", size = "medium", diff --git a/python/ray/train/examples/transformers/README.rst b/python/ray/train/examples/transformers/README.rst deleted file mode 100644 index f0435e4f4474c..0000000000000 --- a/python/ray/train/examples/transformers/README.rst +++ /dev/null @@ -1,57 +0,0 @@ -HuggingFace Transformers Glue Fine-tuning Example -================================================= - -We've ported the ``huggingface/transformers/examples/pytorch/text-classification/run_glue_no_trainer.py`` example to -Ray Train. This example enables fine-tuning the library models for sequence classification on the GLUE benchmark: General Language Understanding Evaluation. - -This script can fine-tune the following models: - CoLA, SST-2, MRPC, STS-B, QQP, MNLI, QNLI, RTE, WNLI. - -Additional information can be found at the `HuggingFace Repository -`_. - -Local process training ----------------------- - -To run an example tuning MRPC locally, without Ray: - -.. code-block:: bash - - export TASK_NAME=mrpc - - python transformers_example.py \ - --model_name_or_path bert-base-cased \ - --task_name $TASK_NAME \ - --max_length 128 \ - --per_device_train_batch_size 32 \ - --learning_rate 2e-5 \ - --num_train_epochs 3 \ - --output_dir /tmp/$TASK_NAME/ - -This is the same as running `run_glue_no_trainer.py `_. - -Distributed multi-node GPU training ------------------------------------ - -To run an example tuning MRPC on AWS with 8 GPUs across multiple nodes: - -.. code-block:: bash - - export TASK_NAME=mrpc - - ray up cluster.yaml - # (Optional) ray monitor cluster.yaml - ray submit cluster.yaml transformers_example.py \ - --model_name_or_path bert-base-cased \ - --task_name $TASK_NAME \ - --max_length 128 \ - --per_device_train_batch_size 32 \ - --learning_rate 2e-5 \ - --num_train_epochs 3 \ - --output_dir /tmp/$TASK_NAME/ \ - --address auto \ - --num_workers 8 \ - --use_gpu - -The example can also be run using :ref:`Ray Job Submission `, which is in beta starting with Ray 1.12. \ No newline at end of file diff --git a/python/ray/train/examples/transformers/__init__.py b/python/ray/train/examples/transformers/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/python/ray/train/examples/transformers/cluster.yaml b/python/ray/train/examples/transformers/cluster.yaml deleted file mode 100644 index 72e8676e01982..0000000000000 --- a/python/ray/train/examples/transformers/cluster.yaml +++ /dev/null @@ -1,58 +0,0 @@ -# An unique identifier for the head node and workers of this cluster. -cluster_name: transformer-cluster - -# The maximum number of workers nodes to launch in addition to the head -# node. This takes precedence over min_workers. min_workers default to 0. -min_workers: 3 -max_workers: 3 - -# Cloud-provider specific configuration. -provider: - type: aws - region: us-west-2 - -# How Ray will authenticate with newly launched nodes. -auth: - ssh_user: ubuntu - -available_node_types: - ray.head.default: - min_workers: 0 - max_workers: 0 - resources: {} - node_config: - InstanceType: g3.8xlarge - ImageId: latest_dlami - InstanceMarketOptions: - MarketType: spot - BlockDeviceMappings: - - DeviceName: /dev/sda1 - Ebs: - VolumeSize: 300 - - - ray.worker.default: - min_workers: 3 - max_workers: 3 - resources: {} - node_config: - InstanceType: g3.8xlarge - ImageId: latest_dlami - InstanceMarketOptions: - MarketType: spot - BlockDeviceMappings: - - DeviceName: /dev/sda1 - Ebs: - VolumeSize: 300 - - -setup_commands: - # This replaces the standard anaconda Ray installation - - pip install ray[tune] - - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl - - # Install Transformers - - git clone https://github.com/huggingface/transformers || true - - cd transformers && - pip install -U . && - pip install -r ./examples/pytorch/text-classification/requirements.txt diff --git a/python/ray/train/examples/transformers/transformers_example.py b/python/ray/train/examples/transformers/transformers_example.py deleted file mode 100644 index 48d2e3ca8a9ec..0000000000000 --- a/python/ray/train/examples/transformers/transformers_example.py +++ /dev/null @@ -1,629 +0,0 @@ -# coding=utf-8 -# This is a modified example originally from The HuggingFace Inc. team. -# Modified by Matthew Deng. -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Finetuning a 🤗 Transformers model for sequence classification on GLUE.""" -import argparse -import logging -import math -import os -import random -from typing import Any, Dict - -import datasets -import transformers -from accelerate import Accelerator -from datasets import load_dataset, load_metric -from torch.utils.data.dataloader import DataLoader -from tqdm.auto import tqdm -from transformers import ( - AdamW, - AutoConfig, - AutoModelForSequenceClassification, - AutoTokenizer, - DataCollatorWithPadding, - PretrainedConfig, - SchedulerType, - default_data_collator, - get_scheduler, - set_seed, -) -from transformers.utils.versions import require_version - -import ray -from ray.train.huggingface import AccelerateTrainer -from ray.train import ScalingConfig - -logger = logging.getLogger(__name__) - -require_version( - "datasets>=1.8.0", - "To fix: pip install -r examples/pytorch/text-classification/requirements.txt", -) - -task_to_keys = { - "cola": ("sentence", None), - "mnli": ("premise", "hypothesis"), - "mrpc": ("sentence1", "sentence2"), - "qnli": ("question", "sentence"), - "qqp": ("question1", "question2"), - "rte": ("sentence1", "sentence2"), - "sst2": ("sentence", None), - "stsb": ("sentence1", "sentence2"), - "wnli": ("sentence1", "sentence2"), -} - - -def parse_args(): - parser = argparse.ArgumentParser( - description="Finetune a transformers model on a text classification task" - ) - parser.add_argument( - "--task_name", - type=str, - default=None, - help="The name of the glue task to train on.", - choices=list(task_to_keys.keys()), - ) - parser.add_argument( - "--train_file", - type=str, - default=None, - help="A csv or a json file containing the training data.", - ) - parser.add_argument( - "--validation_file", - type=str, - default=None, - help="A csv or a json file containing the validation data.", - ) - parser.add_argument( - "--max_length", - type=int, - default=128, - help=( - "The maximum total input sequence length after tokenization. " - "Sequences longer than this will be truncated, sequences shorter " - "will be padded if `--pad_to_max_lengh` is passed." - ), - ) - parser.add_argument( - "--pad_to_max_length", - action="store_true", - help="If passed, pad all samples to `max_length`. Otherwise, dynamic " - "padding is used.", - ) - parser.add_argument( - "--model_name_or_path", - type=str, - help="Path to pretrained model or model identifier from " - "huggingface.co/models.", - required=True, - ) - parser.add_argument( - "--use_slow_tokenizer", - action="store_true", - help="If passed, will use a slow tokenizer (not backed by the 🤗 " - "Tokenizers library).", - ) - parser.add_argument( - "--per_device_train_batch_size", - type=int, - default=8, - help="Batch size (per device) for the training dataloader.", - ) - parser.add_argument( - "--per_device_eval_batch_size", - type=int, - default=8, - help="Batch size (per device) for the evaluation dataloader.", - ) - parser.add_argument( - "--learning_rate", - type=float, - default=5e-5, - help="Initial learning rate (after the potential warmup period) to use.", - ) - parser.add_argument( - "--weight_decay", type=float, default=0.0, help="Weight decay to use." - ) - parser.add_argument( - "--num_train_epochs", - type=int, - default=3, - help="Total number of training epochs to perform.", - ) - parser.add_argument( - "--max_train_steps", - type=int, - default=None, - help="Total number of training steps to perform. If provided, " - "overrides num_train_epochs.", - ) - parser.add_argument( - "--gradient_accumulation_steps", - type=int, - default=1, - help="Number of updates steps to accumulate before performing a " - "backward/update pass.", - ) - parser.add_argument( - "--lr_scheduler_type", - type=SchedulerType, - default="linear", - help="The scheduler type to use.", - choices=[ - "linear", - "cosine", - "cosine_with_restarts", - "polynomial", - "constant", - "constant_with_warmup", - ], - ) - parser.add_argument( - "--num_warmup_steps", - type=int, - default=0, - help="Number of steps for the warmup in the lr scheduler.", - ) - parser.add_argument( - "--output_dir", type=str, default=None, help="Where to store the final model." - ) - parser.add_argument( - "--seed", type=int, default=None, help="A seed for reproducible training." - ) - - # Ray arguments. - parser.add_argument( - "--start_local", action="store_true", help="Starts Ray on local machine." - ) - parser.add_argument( - "--address", type=str, default=None, help="Ray address to connect to." - ) - parser.add_argument( - "--num_workers", type=int, default=1, help="Number of workers to use." - ) - parser.add_argument( - "--use_gpu", action="store_true", help="If training should be done on GPUs." - ) - - args = parser.parse_args() - - # Sanity checks - if ( - args.task_name is None - and args.train_file is None - and args.validation_file is None - ): - raise ValueError("Need either a task name or a training/validation file.") - else: - if args.train_file is not None: - extension = args.train_file.split(".")[-1] - assert extension in [ - "csv", - "json", - ], "`train_file` should be a csv or a json file." - if args.validation_file is not None: - extension = args.validation_file.split(".")[-1] - assert extension in [ - "csv", - "json", - ], "`validation_file` should be a csv or a json file." - - if args.output_dir is not None: - os.makedirs(args.output_dir, exist_ok=True) - - return args - - -def train_func(config: Dict[str, Any]): - args = config["args"] - # Initialize the accelerator. We will let the accelerator handle device - # placement for us in this example. - accelerator = Accelerator(cpu=not args.use_gpu) - # Make one log on every process with the configuration for debugging. - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO, - ) - logger.info(accelerator.state) - - # Setup logging, we only want one process per machine to log things on - # the screen. accelerator.is_local_main_process is only True for one - # process per machine. - logger.setLevel( - logging.INFO if accelerator.is_local_main_process else logging.ERROR - ) - if accelerator.is_local_main_process: - datasets.utils.logging.set_verbosity_warning() - transformers.utils.logging.set_verbosity_info() - else: - datasets.utils.logging.set_verbosity_error() - transformers.utils.logging.set_verbosity_error() - - # If passed along, set the training seed now. - if args.seed is not None: - set_seed(args.seed) - - # Get the datasets: you can either provide your own CSV/JSON training and - # evaluation files (see below) or specify a GLUE benchmark task (the - # dataset will be downloaded automatically from the datasets Hub). - - # For CSV/JSON files, this script will use as labels the column called - # 'label' and as pair of sentences the sentences in columns called - # 'sentence1' and 'sentence2' if such column exists or the first two - # columns not named label if at least two columns are provided. - - # If the CSVs/JSONs contain only one non-label column, the script does - # single sentence classification on this single column. You can easily - # tweak this behavior (see below) - - # In distributed training, the load_dataset function guarantee that only - # one local process can concurrently download the dataset. - if args.task_name is not None: - # Downloading and loading a dataset from the hub. - raw_datasets = load_dataset("glue", args.task_name) - else: - # Loading the dataset from local csv or json file. - data_files = {} - if args.train_file is not None: - data_files["train"] = args.train_file - if args.validation_file is not None: - data_files["validation"] = args.validation_file - extension = ( - args.train_file if args.train_file is not None else args.valid_file - ).split(".")[-1] - raw_datasets = load_dataset(extension, data_files=data_files) - # See more about loading any type of standard or custom dataset at - # https://huggingface.co/docs/datasets/loading_datasets.html. - - # Labels - if args.task_name is not None: - is_regression = args.task_name == "stsb" - if not is_regression: - label_list = raw_datasets["train"].features["label"].names - num_labels = len(label_list) - else: - num_labels = 1 - else: - # Trying to have good defaults here, don't hesitate to tweak to your - # needs. - is_regression = raw_datasets["train"].features["label"].dtype in [ - "float32", - "float64", - ] - if is_regression: - num_labels = 1 - else: - # A useful fast method: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique # noqa:E501 - label_list = raw_datasets["train"].unique("label") - label_list.sort() # Let's sort it for determinism - num_labels = len(label_list) - - # Load pretrained model and tokenizer - # - # In distributed training, the .from_pretrained methods guarantee that - # only one local process can concurrently download model & vocab. - config = AutoConfig.from_pretrained( - args.model_name_or_path, num_labels=num_labels, finetuning_task=args.task_name - ) - tokenizer = AutoTokenizer.from_pretrained( - args.model_name_or_path, use_fast=not args.use_slow_tokenizer - ) - model = AutoModelForSequenceClassification.from_pretrained( - args.model_name_or_path, - from_tf=bool(".ckpt" in args.model_name_or_path), - config=config, - ) - - # Preprocessing the datasets - if args.task_name is not None: - sentence1_key, sentence2_key = task_to_keys[args.task_name] - else: - # Again, we try to have some nice defaults but don't hesitate to - # tweak to your use case. - non_label_column_names = [ - name for name in raw_datasets["train"].column_names if name != "label" - ] - if ( - "sentence1" in non_label_column_names - and "sentence2" in non_label_column_names - ): - sentence1_key, sentence2_key = "sentence1", "sentence2" - else: - if len(non_label_column_names) >= 2: - sentence1_key, sentence2_key = non_label_column_names[:2] - else: - sentence1_key, sentence2_key = non_label_column_names[0], None - - # Some models have set the order of the labels to use, - # so let's make sure we do use it. - label_to_id = None - if ( - model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id - and args.task_name is not None - and not is_regression - ): - # Some have all caps in their config, some don't. - label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()} - if list(sorted(label_name_to_id.keys())) == list( # noqa:C413 - sorted(label_list) - ): # noqa:C413 - logger.info( - f"The configuration of the model provided the following label " - f"correspondence: {label_name_to_id}. Using it!" - ) - label_to_id = { - i: label_name_to_id[label_list[i]] for i in range(num_labels) - } - else: - logger.warning( - "Your model seems to have been trained with labels, " - "but they don't match the dataset: ", - f"model labels: {list(sorted(label_name_to_id.keys()))}, " # noqa:C413,E501 - f"dataset labels: {list(sorted(label_list))}." # noqa:C413 - "\nIgnoring the model labels as a result.", - ) - elif args.task_name is None: - label_to_id = {v: i for i, v in enumerate(label_list)} - - if label_to_id is not None: - model.config.label2id = label_to_id - model.config.id2label = {id: label for label, id in config.label2id.items()} - - padding = "max_length" if args.pad_to_max_length else False - - def preprocess_function(examples): - # Tokenize the texts - texts = ( - (examples[sentence1_key],) - if sentence2_key is None - else (examples[sentence1_key], examples[sentence2_key]) - ) - result = tokenizer( - *texts, padding=padding, max_length=args.max_length, truncation=True - ) - - if "label" in examples: - if label_to_id is not None: - # Map labels to IDs (not necessary for GLUE tasks) - result["labels"] = [ - label_to_id[l] for l in examples["label"] # noqa:E741 - ] - else: - # In all cases, rename the column to labels because the model - # will expect that. - result["labels"] = examples["label"] - return result - - processed_datasets = raw_datasets.map( - preprocess_function, - batched=True, - remove_columns=raw_datasets["train"].column_names, - desc="Running tokenizer on dataset", - ) - - train_dataset = processed_datasets["train"] - eval_dataset = processed_datasets[ - "validation_matched" if args.task_name == "mnli" else "validation" - ] - - # Log a few random samples from the training set: - for index in random.sample(range(len(train_dataset)), 3): - logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") - - # DataLoaders creation: - if args.pad_to_max_length: - # If padding was already done ot max length, we use the default data - # collator that will just convert everything to tensors. - data_collator = default_data_collator - else: - # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for - # us (by padding to the maximum length of the samples passed). When - # using mixed precision, we add `pad_to_multiple_of=8` to pad all - # tensors to multiple of 8s, which will enable the use of Tensor - # Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). - data_collator = DataCollatorWithPadding( - tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None) - ) - - train_dataloader = DataLoader( - train_dataset, - shuffle=True, - collate_fn=data_collator, - batch_size=args.per_device_train_batch_size, - ) - eval_dataloader = DataLoader( - eval_dataset, - collate_fn=data_collator, - batch_size=args.per_device_eval_batch_size, - ) - - # Optimizer - # Split weights in two groups, one with weight decay and the other not. - no_decay = ["bias", "LayerNorm.weight"] - optimizer_grouped_parameters = [ - { - "params": [ - p - for n, p in model.named_parameters() - if not any(nd in n for nd in no_decay) - ], - "weight_decay": args.weight_decay, - }, - { - "params": [ - p - for n, p in model.named_parameters() - if any(nd in n for nd in no_decay) - ], - "weight_decay": 0.0, - }, - ] - optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) - - # Prepare everything with our `accelerator`. - model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( - model, optimizer, train_dataloader, eval_dataloader - ) - - # Note -> the training dataloader needs to be prepared before we grab - # his length below (cause its length will be shorter in multiprocess) - - # Scheduler and math around the number of training steps. - num_update_steps_per_epoch = math.ceil( - len(train_dataloader) / args.gradient_accumulation_steps - ) - if args.max_train_steps is None: - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - else: - args.num_train_epochs = math.ceil( - args.max_train_steps / num_update_steps_per_epoch - ) - - lr_scheduler = get_scheduler( - name=args.lr_scheduler_type, - optimizer=optimizer, - num_warmup_steps=args.num_warmup_steps, - num_training_steps=args.max_train_steps, - ) - - # Get the metric function - if args.task_name is not None: - metric = load_metric("glue", args.task_name) - else: - metric = load_metric("accuracy") - - # Train! - total_batch_size = ( - args.per_device_train_batch_size - * accelerator.num_processes - * args.gradient_accumulation_steps - ) - - logger.info("***** Running training *****") - logger.info(f" Num examples = {len(train_dataset)}") - logger.info(f" Num Epochs = {args.num_train_epochs}") - logger.info( - f" Instantaneous batch size per device =" - f" {args.per_device_train_batch_size}" - ) - logger.info( - f" Total train batch size (w. parallel, distributed & accumulation) " - f"= {total_batch_size}" - ) - logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") - logger.info(f" Total optimization steps = {args.max_train_steps}") - # Only show the progress bar once on each machine. - progress_bar = tqdm( - range(args.max_train_steps), disable=not accelerator.is_local_main_process - ) - completed_steps = 0 - - for epoch in range(args.num_train_epochs): - model.train() - for step, batch in enumerate(train_dataloader): - outputs = model(**batch) - loss = outputs.loss - loss = loss / args.gradient_accumulation_steps - accelerator.backward(loss) - if ( - step % args.gradient_accumulation_steps == 0 - or step == len(train_dataloader) - 1 - ): - optimizer.step() - lr_scheduler.step() - optimizer.zero_grad() - progress_bar.update(1) - completed_steps += 1 - - if completed_steps >= args.max_train_steps: - break - - model.eval() - for step, batch in enumerate(eval_dataloader): - outputs = model(**batch) - predictions = ( - outputs.logits.argmax(dim=-1) - if not is_regression - else outputs.logits.squeeze() - ) - metric.add_batch( - predictions=accelerator.gather(predictions), - references=accelerator.gather(batch["labels"]), - ) - - eval_metric = metric.compute() - logger.info(f"epoch {epoch}: {eval_metric}") - - if args.output_dir is not None: - accelerator.wait_for_everyone() - unwrapped_model = accelerator.unwrap_model(model) - unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save) - - if args.task_name == "mnli": - # Final evaluation on mismatched validation set - eval_dataset = processed_datasets["validation_mismatched"] - eval_dataloader = DataLoader( - eval_dataset, - collate_fn=data_collator, - batch_size=args.per_device_eval_batch_size, - ) - eval_dataloader = accelerator.prepare(eval_dataloader) - - model.eval() - for step, batch in enumerate(eval_dataloader): - outputs = model(**batch) - predictions = outputs.logits.argmax(dim=-1) - metric.add_batch( - predictions=accelerator.gather(predictions), - references=accelerator.gather(batch["labels"]), - ) - - eval_metric = metric.compute() - logger.info(f"mnli-mm: {eval_metric}") - - -def main(): - args = parse_args() - config = {"args": args} - - if args.start_local or args.address or args.num_workers > 1 or args.use_gpu: - if args.start_local: - # Start a local Ray runtime. - ray.init(num_cpus=args.num_workers + 2) - else: - # Connect to a Ray cluster for distributed training. - ray.init(address=args.address) - trainer = AccelerateTrainer( - train_func, - train_loop_config=config, - accelerate_config={}, - scaling_config=ScalingConfig( - num_workers=args.num_workers, use_gpu=args.use_gpu - ), - ) - results = trainer.fit() - print(results.metrics) - else: - # Run training locally. - train_func(config) - - -if __name__ == "__main__": - main() From a1145735e298aebec6122ac1d217bfa9610afe8b Mon Sep 17 00:00:00 2001 From: woshiyyya Date: Mon, 21 Aug 2023 16:39:46 -0700 Subject: [PATCH 2/3] update toc Signed-off-by: woshiyyya --- doc/source/_toc.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/doc/source/_toc.yml b/doc/source/_toc.yml index 5b6f0118cea2a..858914c5edae0 100644 --- a/doc/source/_toc.yml +++ b/doc/source/_toc.yml @@ -92,8 +92,6 @@ parts: title: "PyTorch Lightning Advanced Example" - file: train/examples/lightning/lightning_exp_tracking title: "PyTorch Lightning with Experiment Tracking Tools" - - file: train/examples/transformers/transformers_example - title: "HF Transformers Example" - file: train/examples/tf/tensorflow_mnist_example title: "TensorFlow MNIST Example" - file: train/examples/horovod/horovod_example From b1795d511db596f45bba95a8efdd68c40eb8eba0 Mon Sep 17 00:00:00 2001 From: woshiyyya Date: Tue, 22 Aug 2023 14:36:32 -0700 Subject: [PATCH 3/3] comment out persistence test suite Signed-off-by: woshiyyya --- .buildkite/pipeline.ml.yml | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/.buildkite/pipeline.ml.yml b/.buildkite/pipeline.ml.yml index 3ac57323299d5..7dcc6a59636ef 100644 --- a/.buildkite/pipeline.ml.yml +++ b/.buildkite/pipeline.ml.yml @@ -356,17 +356,18 @@ --test_env=RAY_AIR_NEW_PERSISTENCE_MODE=1 python/ray/train/... -- label: ":steam_locomotive: :octopus: :floppy_disk: New persistence mode: Train + Tune tests and examples" - conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TRAIN_AFFECTED"] - instance_size: medium - commands: - - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT - - TRAIN_TESTING=1 TUNE_TESTING=1 ./ci/env/install-dependencies.sh - - ./ci/env/env_info.sh - - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only - --test_tag_filters=tune,-gpu_only,-ray_air,-gpu,-doctest,-no_new_storage - --test_env=RAY_AIR_NEW_PERSISTENCE_MODE=1 - python/ray/train/... +# TODO(krfricke): Add new test for this suite +# - label: ":steam_locomotive: :octopus: :floppy_disk: New persistence mode: Train + Tune tests and examples" +# conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TRAIN_AFFECTED"] +# instance_size: medium +# commands: +# - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT +# - TRAIN_TESTING=1 TUNE_TESTING=1 ./ci/env/install-dependencies.sh +# - ./ci/env/env_info.sh +# - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only +# --test_tag_filters=tune,-gpu_only,-ray_air,-gpu,-doctest,-no_new_storage +# --test_env=RAY_AIR_NEW_PERSISTENCE_MODE=1 +# python/ray/train/... - label: ":octopus: :floppy_disk: New persistence mode: Tune tests and examples (small)"