From 269bedc2cd8f0d5fe0861e7b011fcbd3ff779e0d Mon Sep 17 00:00:00 2001
From: woshiyyya <xiaoyunxuan1998@gmail.com>
Date: Mon, 21 Aug 2023 13:01:51 -0700
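Subject: [PATCH 1/3] Remove the Ray Train HF Transformers example

Delete the example under python/ray/train/examples/transformers/
(script, README, cluster.yaml, __init__.py), its documentation page and
gallery cards, and the transformers_example_gpu / transformers_example_cpu
Bazel test targets.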

Signed-off-by: woshiyyya <xiaoyunxuan1998@gmail.com>
---
 doc/source/ray-overview/examples.rst          |   7 -
 doc/source/train/examples.rst                 |   8 -
 .../transformers/transformers_example.rst     |   8 -
 python/ray/train/BUILD                        |  24 -
 .../train/examples/transformers/README.rst    |  57 --
 .../train/examples/transformers/__init__.py   |   0
 .../train/examples/transformers/cluster.yaml  |  58 --
 .../transformers/transformers_example.py      | 629 ------------------
 8 files changed, 791 deletions(-)
 delete mode 100644 doc/source/train/examples/transformers/transformers_example.rst
 delete mode 100644 python/ray/train/examples/transformers/README.rst
 delete mode 100644 python/ray/train/examples/transformers/__init__.py
 delete mode 100644 python/ray/train/examples/transformers/cluster.yaml
 delete mode 100644 python/ray/train/examples/transformers/transformers_example.py

diff --git a/doc/source/ray-overview/examples.rst b/doc/source/ray-overview/examples.rst
index 5b1614b963c99..8ab9a46020f61 100644
--- a/doc/source/ray-overview/examples.rst
+++ b/doc/source/ray-overview/examples.rst
@@ -564,13 +564,6 @@ Ray Examples
 
         PyTorch Fashion MNIST Training Example
 
-    .. grid-item-card:: :bdg-secondary:`Code example`
-        :class-item: gallery-item pytorch training train
-        :link: train_transformers_example
-        :link-type: ref
-
-        Transformers with PyTorch Training Example
-
     .. grid-item-card:: :bdg-secondary:`Code example`
         :class-item: gallery-item tensorflow training train
         :link: tensorflow_mnist_example
diff --git a/doc/source/train/examples.rst b/doc/source/train/examples.rst
index 66808789518d6..01637764619ac 100644
--- a/doc/source/train/examples.rst
+++ b/doc/source/train/examples.rst
@@ -25,14 +25,6 @@ Distributed Training Examples using Ray Train
 
             PyTorch Fashion MNIST Training Example
 
-    .. grid-item-card::
-        :img-top: /images/hugging.png
-        :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img
-
-        .. button-ref:: train_transformers_example
-
-            Transformers with PyTorch Training Example
-
     .. grid-item-card::
         :img-top: /images/tf_logo.png
         :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img
diff --git a/doc/source/train/examples/transformers/transformers_example.rst b/doc/source/train/examples/transformers/transformers_example.rst
deleted file mode 100644
index 7f7eeb4547fc6..0000000000000
--- a/doc/source/train/examples/transformers/transformers_example.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-:orphan:
-
-.. _train_transformers_example :
-
-Ray Train Example for HuggingFace Transformers with PyTorch
-===========================================================
-
-.. literalinclude:: /../../python/ray/train/examples/transformers/transformers_example.py
diff --git a/python/ray/train/BUILD b/python/ray/train/BUILD
index 3dda10edc6865..5dfa3c52ab6ea 100644
--- a/python/ray/train/BUILD
+++ b/python/ray/train/BUILD
@@ -75,30 +75,6 @@ py_test(
     deps = [":train_lib"]
 )
 
-py_test(
-    name = "transformers_example_gpu",
-    size = "medium",
-    main = "examples/transformers/transformers_example.py",
-    srcs = ["examples/transformers/transformers_example.py"],
-    tags = ["team:ml", "exclusive", "tune", "gpu_only"],
-    deps = [":train_lib"],
-    args = ["--model_name_or_path=bert-base-cased", "--task_name=mrpc",
-    "--max_length=32", "--per_device_train_batch_size=64",
-    "--max_train_steps=2", "--start_local", "--num_workers=2", "--use_gpu"]
-)
-
-py_test(
-    name = "transformers_example_cpu",
-    size = "medium",
-    main = "examples/transformers/transformers_example.py",
-    srcs = ["examples/transformers/transformers_example.py"],
-    tags = ["team:ml", "exclusive", "tune"],
-    deps = [":train_lib"],
-    args = ["--model_name_or_path=bert-base-cased", "--task_name=mrpc",
-    "--max_length=32", "--per_device_train_batch_size=64",
-    "--max_train_steps=2", "--start_local", "--num_workers=2"]
-)
-
 py_test(
     name = "tune_cifar_torch_pbt_example",
     size = "medium",
diff --git a/python/ray/train/examples/transformers/README.rst b/python/ray/train/examples/transformers/README.rst
deleted file mode 100644
index f0435e4f4474c..0000000000000
--- a/python/ray/train/examples/transformers/README.rst
+++ /dev/null
@@ -1,57 +0,0 @@
-HuggingFace Transformers Glue Fine-tuning Example
-=================================================
-
-We've ported the ``huggingface/transformers/examples/pytorch/text-classification/run_glue_no_trainer.py`` example to
-Ray Train. This example enables fine-tuning the library models for sequence classification on the GLUE benchmark: General Language Understanding Evaluation.
-
-This script can fine-tune the following models:
- CoLA, SST-2, MRPC, STS-B, QQP, MNLI, QNLI, RTE, WNLI.
-
-Additional information can be found at the `HuggingFace Repository
-<https://github.com/huggingface/transformers/tree/master/examples/pytorch/text-classification>`_.
-
-Local process training
-----------------------
-
-To run an example tuning MRPC locally, without Ray:
-
-.. code-block:: bash
-
-    export TASK_NAME=mrpc
-
-    python transformers_example.py \
-      --model_name_or_path bert-base-cased \
-      --task_name $TASK_NAME \
-      --max_length 128 \
-      --per_device_train_batch_size 32 \
-      --learning_rate 2e-5 \
-      --num_train_epochs 3 \
-      --output_dir /tmp/$TASK_NAME/
-
-This is the same as running `run_glue_no_trainer.py <https://github
-.com/huggingface/transformers/blob/master/examples/pytorch/text-classification/run_glue_no_trainer.py>`_.
-
-Distributed multi-node GPU training
------------------------------------
-
-To run an example tuning MRPC on AWS with 8 GPUs across multiple nodes:
-
-.. code-block:: bash
-
-    export TASK_NAME=mrpc
-
-    ray up cluster.yaml
-    # (Optional) ray monitor cluster.yaml
-    ray submit cluster.yaml transformers_example.py \
-      --model_name_or_path bert-base-cased \
-      --task_name $TASK_NAME \
-      --max_length 128 \
-      --per_device_train_batch_size 32 \
-      --learning_rate 2e-5 \
-      --num_train_epochs 3 \
-      --output_dir /tmp/$TASK_NAME/ \
-      --address auto \
-      --num_workers 8 \
-      --use_gpu
-
-The example can also be run using :ref:`Ray Job Submission <jobs-overview>`, which is in beta starting with Ray 1.12.
\ No newline at end of file
diff --git a/python/ray/train/examples/transformers/__init__.py b/python/ray/train/examples/transformers/__init__.py
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/python/ray/train/examples/transformers/cluster.yaml b/python/ray/train/examples/transformers/cluster.yaml
deleted file mode 100644
index 72e8676e01982..0000000000000
--- a/python/ray/train/examples/transformers/cluster.yaml
+++ /dev/null
@@ -1,58 +0,0 @@
-# An unique identifier for the head node and workers of this cluster.
-cluster_name: transformer-cluster
-
-# The maximum number of workers nodes to launch in addition to the head
-# node. This takes precedence over min_workers. min_workers default to 0.
-min_workers: 3
-max_workers: 3
-
-# Cloud-provider specific configuration.
-provider:
-    type: aws
-    region: us-west-2
-
-# How Ray will authenticate with newly launched nodes.
-auth:
-    ssh_user: ubuntu
-
-available_node_types:
-    ray.head.default:
-        min_workers: 0
-        max_workers: 0
-        resources: {}
-        node_config:
-            InstanceType: g3.8xlarge
-            ImageId: latest_dlami
-            InstanceMarketOptions:
-                MarketType: spot
-            BlockDeviceMappings:
-                - DeviceName: /dev/sda1
-                  Ebs:
-                      VolumeSize: 300
-
-
-    ray.worker.default:
-        min_workers: 3
-        max_workers: 3
-        resources: {}
-        node_config:
-            InstanceType: g3.8xlarge
-            ImageId: latest_dlami
-            InstanceMarketOptions:
-                MarketType: spot
-            BlockDeviceMappings:
-                - DeviceName: /dev/sda1
-                  Ebs:
-                      VolumeSize: 300
-
-
-setup_commands:
-    # This replaces the standard anaconda Ray installation
-    - pip install ray[tune]
-    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
-
-    # Install Transformers
-    - git clone https://github.com/huggingface/transformers || true
-    - cd transformers &&
-      pip install -U . &&
-      pip install -r ./examples/pytorch/text-classification/requirements.txt
diff --git a/python/ray/train/examples/transformers/transformers_example.py b/python/ray/train/examples/transformers/transformers_example.py
deleted file mode 100644
index 48d2e3ca8a9ec..0000000000000
--- a/python/ray/train/examples/transformers/transformers_example.py
+++ /dev/null
@@ -1,629 +0,0 @@
-# coding=utf-8
-# This is a modified example originally from The HuggingFace Inc. team.
-# Modified by Matthew Deng.
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Finetuning a 🤗 Transformers model for sequence classification on GLUE."""
-import argparse
-import logging
-import math
-import os
-import random
-from typing import Any, Dict
-
-import datasets
-import transformers
-from accelerate import Accelerator
-from datasets import load_dataset, load_metric
-from torch.utils.data.dataloader import DataLoader
-from tqdm.auto import tqdm
-from transformers import (
-    AdamW,
-    AutoConfig,
-    AutoModelForSequenceClassification,
-    AutoTokenizer,
-    DataCollatorWithPadding,
-    PretrainedConfig,
-    SchedulerType,
-    default_data_collator,
-    get_scheduler,
-    set_seed,
-)
-from transformers.utils.versions import require_version
-
-import ray
-from ray.train.huggingface import AccelerateTrainer
-from ray.train import ScalingConfig
-
-logger = logging.getLogger(__name__)
-
-require_version(
-    "datasets>=1.8.0",
-    "To fix: pip install -r examples/pytorch/text-classification/requirements.txt",
-)
-
-task_to_keys = {
-    "cola": ("sentence", None),
-    "mnli": ("premise", "hypothesis"),
-    "mrpc": ("sentence1", "sentence2"),
-    "qnli": ("question", "sentence"),
-    "qqp": ("question1", "question2"),
-    "rte": ("sentence1", "sentence2"),
-    "sst2": ("sentence", None),
-    "stsb": ("sentence1", "sentence2"),
-    "wnli": ("sentence1", "sentence2"),
-}
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(
-        description="Finetune a transformers model on a text classification task"
-    )
-    parser.add_argument(
-        "--task_name",
-        type=str,
-        default=None,
-        help="The name of the glue task to train on.",
-        choices=list(task_to_keys.keys()),
-    )
-    parser.add_argument(
-        "--train_file",
-        type=str,
-        default=None,
-        help="A csv or a json file containing the training data.",
-    )
-    parser.add_argument(
-        "--validation_file",
-        type=str,
-        default=None,
-        help="A csv or a json file containing the validation data.",
-    )
-    parser.add_argument(
-        "--max_length",
-        type=int,
-        default=128,
-        help=(
-            "The maximum total input sequence length after tokenization. "
-            "Sequences longer than this will be truncated, sequences shorter "
-            "will be padded if `--pad_to_max_lengh` is passed."
-        ),
-    )
-    parser.add_argument(
-        "--pad_to_max_length",
-        action="store_true",
-        help="If passed, pad all samples to `max_length`. Otherwise, dynamic "
-        "padding is used.",
-    )
-    parser.add_argument(
-        "--model_name_or_path",
-        type=str,
-        help="Path to pretrained model or model identifier from "
-        "huggingface.co/models.",
-        required=True,
-    )
-    parser.add_argument(
-        "--use_slow_tokenizer",
-        action="store_true",
-        help="If passed, will use a slow tokenizer (not backed by the 🤗 "
-        "Tokenizers library).",
-    )
-    parser.add_argument(
-        "--per_device_train_batch_size",
-        type=int,
-        default=8,
-        help="Batch size (per device) for the training dataloader.",
-    )
-    parser.add_argument(
-        "--per_device_eval_batch_size",
-        type=int,
-        default=8,
-        help="Batch size (per device) for the evaluation dataloader.",
-    )
-    parser.add_argument(
-        "--learning_rate",
-        type=float,
-        default=5e-5,
-        help="Initial learning rate (after the potential warmup period) to use.",
-    )
-    parser.add_argument(
-        "--weight_decay", type=float, default=0.0, help="Weight decay to use."
-    )
-    parser.add_argument(
-        "--num_train_epochs",
-        type=int,
-        default=3,
-        help="Total number of training epochs to perform.",
-    )
-    parser.add_argument(
-        "--max_train_steps",
-        type=int,
-        default=None,
-        help="Total number of training steps to perform. If provided, "
-        "overrides num_train_epochs.",
-    )
-    parser.add_argument(
-        "--gradient_accumulation_steps",
-        type=int,
-        default=1,
-        help="Number of updates steps to accumulate before performing a "
-        "backward/update pass.",
-    )
-    parser.add_argument(
-        "--lr_scheduler_type",
-        type=SchedulerType,
-        default="linear",
-        help="The scheduler type to use.",
-        choices=[
-            "linear",
-            "cosine",
-            "cosine_with_restarts",
-            "polynomial",
-            "constant",
-            "constant_with_warmup",
-        ],
-    )
-    parser.add_argument(
-        "--num_warmup_steps",
-        type=int,
-        default=0,
-        help="Number of steps for the warmup in the lr scheduler.",
-    )
-    parser.add_argument(
-        "--output_dir", type=str, default=None, help="Where to store the final model."
-    )
-    parser.add_argument(
-        "--seed", type=int, default=None, help="A seed for reproducible training."
-    )
-
-    # Ray arguments.
-    parser.add_argument(
-        "--start_local", action="store_true", help="Starts Ray on local machine."
-    )
-    parser.add_argument(
-        "--address", type=str, default=None, help="Ray address to connect to."
-    )
-    parser.add_argument(
-        "--num_workers", type=int, default=1, help="Number of workers to use."
-    )
-    parser.add_argument(
-        "--use_gpu", action="store_true", help="If training should be done on GPUs."
-    )
-
-    args = parser.parse_args()
-
-    # Sanity checks
-    if (
-        args.task_name is None
-        and args.train_file is None
-        and args.validation_file is None
-    ):
-        raise ValueError("Need either a task name or a training/validation file.")
-    else:
-        if args.train_file is not None:
-            extension = args.train_file.split(".")[-1]
-            assert extension in [
-                "csv",
-                "json",
-            ], "`train_file` should be a csv or a json file."
-        if args.validation_file is not None:
-            extension = args.validation_file.split(".")[-1]
-            assert extension in [
-                "csv",
-                "json",
-            ], "`validation_file` should be a csv or a json file."
-
-    if args.output_dir is not None:
-        os.makedirs(args.output_dir, exist_ok=True)
-
-    return args
-
-
-def train_func(config: Dict[str, Any]):
-    args = config["args"]
-    # Initialize the accelerator. We will let the accelerator handle device
-    # placement for us in this example.
-    accelerator = Accelerator(cpu=not args.use_gpu)
-    # Make one log on every process with the configuration for debugging.
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO,
-    )
-    logger.info(accelerator.state)
-
-    # Setup logging, we only want one process per machine to log things on
-    # the screen. accelerator.is_local_main_process is only True for one
-    # process per machine.
-    logger.setLevel(
-        logging.INFO if accelerator.is_local_main_process else logging.ERROR
-    )
-    if accelerator.is_local_main_process:
-        datasets.utils.logging.set_verbosity_warning()
-        transformers.utils.logging.set_verbosity_info()
-    else:
-        datasets.utils.logging.set_verbosity_error()
-        transformers.utils.logging.set_verbosity_error()
-
-    # If passed along, set the training seed now.
-    if args.seed is not None:
-        set_seed(args.seed)
-
-    # Get the datasets: you can either provide your own CSV/JSON training and
-    # evaluation files (see below) or specify a GLUE benchmark task (the
-    # dataset will be downloaded automatically from the datasets Hub).
-
-    # For CSV/JSON files, this script will use as labels the column called
-    # 'label' and as pair of sentences the sentences in columns called
-    # 'sentence1' and 'sentence2' if such column exists or the first two
-    # columns not named label if at least two columns are provided.
-
-    # If the CSVs/JSONs contain only one non-label column, the script does
-    # single sentence classification on this single column. You can easily
-    # tweak this behavior (see below)
-
-    # In distributed training, the load_dataset function guarantee that only
-    # one local process can concurrently download the dataset.
-    if args.task_name is not None:
-        # Downloading and loading a dataset from the hub.
-        raw_datasets = load_dataset("glue", args.task_name)
-    else:
-        # Loading the dataset from local csv or json file.
-        data_files = {}
-        if args.train_file is not None:
-            data_files["train"] = args.train_file
-        if args.validation_file is not None:
-            data_files["validation"] = args.validation_file
-        extension = (
-            args.train_file if args.train_file is not None else args.valid_file
-        ).split(".")[-1]
-        raw_datasets = load_dataset(extension, data_files=data_files)
-    # See more about loading any type of standard or custom dataset at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
-
-    # Labels
-    if args.task_name is not None:
-        is_regression = args.task_name == "stsb"
-        if not is_regression:
-            label_list = raw_datasets["train"].features["label"].names
-            num_labels = len(label_list)
-        else:
-            num_labels = 1
-    else:
-        # Trying to have good defaults here, don't hesitate to tweak to your
-        # needs.
-        is_regression = raw_datasets["train"].features["label"].dtype in [
-            "float32",
-            "float64",
-        ]
-        if is_regression:
-            num_labels = 1
-        else:
-            # A useful fast method:
-            # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique # noqa:E501
-            label_list = raw_datasets["train"].unique("label")
-            label_list.sort()  # Let's sort it for determinism
-            num_labels = len(label_list)
-
-    # Load pretrained model and tokenizer
-    #
-    # In distributed training, the .from_pretrained methods guarantee that
-    # only one local process can concurrently download model & vocab.
-    config = AutoConfig.from_pretrained(
-        args.model_name_or_path, num_labels=num_labels, finetuning_task=args.task_name
-    )
-    tokenizer = AutoTokenizer.from_pretrained(
-        args.model_name_or_path, use_fast=not args.use_slow_tokenizer
-    )
-    model = AutoModelForSequenceClassification.from_pretrained(
-        args.model_name_or_path,
-        from_tf=bool(".ckpt" in args.model_name_or_path),
-        config=config,
-    )
-
-    # Preprocessing the datasets
-    if args.task_name is not None:
-        sentence1_key, sentence2_key = task_to_keys[args.task_name]
-    else:
-        # Again, we try to have some nice defaults but don't hesitate to
-        # tweak to your use case.
-        non_label_column_names = [
-            name for name in raw_datasets["train"].column_names if name != "label"
-        ]
-        if (
-            "sentence1" in non_label_column_names
-            and "sentence2" in non_label_column_names
-        ):
-            sentence1_key, sentence2_key = "sentence1", "sentence2"
-        else:
-            if len(non_label_column_names) >= 2:
-                sentence1_key, sentence2_key = non_label_column_names[:2]
-            else:
-                sentence1_key, sentence2_key = non_label_column_names[0], None
-
-    # Some models have set the order of the labels to use,
-    # so let's make sure we do use it.
-    label_to_id = None
-    if (
-        model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id
-        and args.task_name is not None
-        and not is_regression
-    ):
-        # Some have all caps in their config, some don't.
-        label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()}
-        if list(sorted(label_name_to_id.keys())) == list(  # noqa:C413
-            sorted(label_list)
-        ):  # noqa:C413
-            logger.info(
-                f"The configuration of the model provided the following label "
-                f"correspondence: {label_name_to_id}. Using it!"
-            )
-            label_to_id = {
-                i: label_name_to_id[label_list[i]] for i in range(num_labels)
-            }
-        else:
-            logger.warning(
-                "Your model seems to have been trained with labels, "
-                "but they don't match the dataset: ",
-                f"model labels: {list(sorted(label_name_to_id.keys()))}, "  # noqa:C413,E501
-                f"dataset labels: {list(sorted(label_list))}."  # noqa:C413
-                "\nIgnoring the model labels as a result.",
-            )
-    elif args.task_name is None:
-        label_to_id = {v: i for i, v in enumerate(label_list)}
-
-    if label_to_id is not None:
-        model.config.label2id = label_to_id
-        model.config.id2label = {id: label for label, id in config.label2id.items()}
-
-    padding = "max_length" if args.pad_to_max_length else False
-
-    def preprocess_function(examples):
-        # Tokenize the texts
-        texts = (
-            (examples[sentence1_key],)
-            if sentence2_key is None
-            else (examples[sentence1_key], examples[sentence2_key])
-        )
-        result = tokenizer(
-            *texts, padding=padding, max_length=args.max_length, truncation=True
-        )
-
-        if "label" in examples:
-            if label_to_id is not None:
-                # Map labels to IDs (not necessary for GLUE tasks)
-                result["labels"] = [
-                    label_to_id[l] for l in examples["label"]  # noqa:E741
-                ]
-            else:
-                # In all cases, rename the column to labels because the model
-                # will expect that.
-                result["labels"] = examples["label"]
-        return result
-
-    processed_datasets = raw_datasets.map(
-        preprocess_function,
-        batched=True,
-        remove_columns=raw_datasets["train"].column_names,
-        desc="Running tokenizer on dataset",
-    )
-
-    train_dataset = processed_datasets["train"]
-    eval_dataset = processed_datasets[
-        "validation_matched" if args.task_name == "mnli" else "validation"
-    ]
-
-    # Log a few random samples from the training set:
-    for index in random.sample(range(len(train_dataset)), 3):
-        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
-
-    # DataLoaders creation:
-    if args.pad_to_max_length:
-        # If padding was already done ot max length, we use the default data
-        # collator that will just convert everything to tensors.
-        data_collator = default_data_collator
-    else:
-        # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for
-        # us (by padding to the maximum length of the samples passed). When
-        # using mixed precision, we add `pad_to_multiple_of=8` to pad all
-        # tensors to multiple of 8s, which will enable the use of Tensor
-        # Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
-        data_collator = DataCollatorWithPadding(
-            tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)
-        )
-
-    train_dataloader = DataLoader(
-        train_dataset,
-        shuffle=True,
-        collate_fn=data_collator,
-        batch_size=args.per_device_train_batch_size,
-    )
-    eval_dataloader = DataLoader(
-        eval_dataset,
-        collate_fn=data_collator,
-        batch_size=args.per_device_eval_batch_size,
-    )
-
-    # Optimizer
-    # Split weights in two groups, one with weight decay and the other not.
-    no_decay = ["bias", "LayerNorm.weight"]
-    optimizer_grouped_parameters = [
-        {
-            "params": [
-                p
-                for n, p in model.named_parameters()
-                if not any(nd in n for nd in no_decay)
-            ],
-            "weight_decay": args.weight_decay,
-        },
-        {
-            "params": [
-                p
-                for n, p in model.named_parameters()
-                if any(nd in n for nd in no_decay)
-            ],
-            "weight_decay": 0.0,
-        },
-    ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
-
-    # Prepare everything with our `accelerator`.
-    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
-        model, optimizer, train_dataloader, eval_dataloader
-    )
-
-    # Note -> the training dataloader needs to be prepared before we grab
-    # his length below (cause its length will be shorter in multiprocess)
-
-    # Scheduler and math around the number of training steps.
-    num_update_steps_per_epoch = math.ceil(
-        len(train_dataloader) / args.gradient_accumulation_steps
-    )
-    if args.max_train_steps is None:
-        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
-    else:
-        args.num_train_epochs = math.ceil(
-            args.max_train_steps / num_update_steps_per_epoch
-        )
-
-    lr_scheduler = get_scheduler(
-        name=args.lr_scheduler_type,
-        optimizer=optimizer,
-        num_warmup_steps=args.num_warmup_steps,
-        num_training_steps=args.max_train_steps,
-    )
-
-    # Get the metric function
-    if args.task_name is not None:
-        metric = load_metric("glue", args.task_name)
-    else:
-        metric = load_metric("accuracy")
-
-    # Train!
-    total_batch_size = (
-        args.per_device_train_batch_size
-        * accelerator.num_processes
-        * args.gradient_accumulation_steps
-    )
-
-    logger.info("***** Running training *****")
-    logger.info(f"  Num examples = {len(train_dataset)}")
-    logger.info(f"  Num Epochs = {args.num_train_epochs}")
-    logger.info(
-        f"  Instantaneous batch size per device ="
-        f" {args.per_device_train_batch_size}"
-    )
-    logger.info(
-        f"  Total train batch size (w. parallel, distributed & accumulation) "
-        f"= {total_batch_size}"
-    )
-    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
-    logger.info(f"  Total optimization steps = {args.max_train_steps}")
-    # Only show the progress bar once on each machine.
-    progress_bar = tqdm(
-        range(args.max_train_steps), disable=not accelerator.is_local_main_process
-    )
-    completed_steps = 0
-
-    for epoch in range(args.num_train_epochs):
-        model.train()
-        for step, batch in enumerate(train_dataloader):
-            outputs = model(**batch)
-            loss = outputs.loss
-            loss = loss / args.gradient_accumulation_steps
-            accelerator.backward(loss)
-            if (
-                step % args.gradient_accumulation_steps == 0
-                or step == len(train_dataloader) - 1
-            ):
-                optimizer.step()
-                lr_scheduler.step()
-                optimizer.zero_grad()
-                progress_bar.update(1)
-                completed_steps += 1
-
-            if completed_steps >= args.max_train_steps:
-                break
-
-        model.eval()
-        for step, batch in enumerate(eval_dataloader):
-            outputs = model(**batch)
-            predictions = (
-                outputs.logits.argmax(dim=-1)
-                if not is_regression
-                else outputs.logits.squeeze()
-            )
-            metric.add_batch(
-                predictions=accelerator.gather(predictions),
-                references=accelerator.gather(batch["labels"]),
-            )
-
-        eval_metric = metric.compute()
-        logger.info(f"epoch {epoch}: {eval_metric}")
-
-    if args.output_dir is not None:
-        accelerator.wait_for_everyone()
-        unwrapped_model = accelerator.unwrap_model(model)
-        unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
-
-    if args.task_name == "mnli":
-        # Final evaluation on mismatched validation set
-        eval_dataset = processed_datasets["validation_mismatched"]
-        eval_dataloader = DataLoader(
-            eval_dataset,
-            collate_fn=data_collator,
-            batch_size=args.per_device_eval_batch_size,
-        )
-        eval_dataloader = accelerator.prepare(eval_dataloader)
-
-        model.eval()
-        for step, batch in enumerate(eval_dataloader):
-            outputs = model(**batch)
-            predictions = outputs.logits.argmax(dim=-1)
-            metric.add_batch(
-                predictions=accelerator.gather(predictions),
-                references=accelerator.gather(batch["labels"]),
-            )
-
-        eval_metric = metric.compute()
-        logger.info(f"mnli-mm: {eval_metric}")
-
-
-def main():
-    args = parse_args()
-    config = {"args": args}
-
-    if args.start_local or args.address or args.num_workers > 1 or args.use_gpu:
-        if args.start_local:
-            # Start a local Ray runtime.
-            ray.init(num_cpus=args.num_workers + 2)
-        else:
-            # Connect to a Ray cluster for distributed training.
-            ray.init(address=args.address)
-        trainer = AccelerateTrainer(
-            train_func,
-            train_loop_config=config,
-            accelerate_config={},
-            scaling_config=ScalingConfig(
-                num_workers=args.num_workers, use_gpu=args.use_gpu
-            ),
-        )
-        results = trainer.fit()
-        print(results.metrics)
-    else:
-        # Run training locally.
-        train_func(config)
-
-
-if __name__ == "__main__":
-    main()

From a1145735e298aebec6122ac1d217bfa9610afe8b Mon Sep 17 00:00:00 2001
From: woshiyyya <xiaoyunxuan1998@gmail.com>
Date: Mon, 21 Aug 2023 16:39:46 -0700
Subject: [PATCH 2/3] Remove the HF Transformers example from the docs TOC

Signed-off-by: woshiyyya <xiaoyunxuan1998@gmail.com>
---
 doc/source/_toc.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/doc/source/_toc.yml b/doc/source/_toc.yml
index 5b6f0118cea2a..858914c5edae0 100644
--- a/doc/source/_toc.yml
+++ b/doc/source/_toc.yml
@@ -92,8 +92,6 @@ parts:
                 title: "PyTorch Lightning Advanced Example"
               - file: train/examples/lightning/lightning_exp_tracking
                 title: "PyTorch Lightning with Experiment Tracking Tools"
-              - file: train/examples/transformers/transformers_example
-                title: "HF Transformers Example"
               - file: train/examples/tf/tensorflow_mnist_example
                 title: "TensorFlow MNIST Example"
               - file: train/examples/horovod/horovod_example

From b1795d511db596f45bba95a8efdd68c40eb8eba0 Mon Sep 17 00:00:00 2001
From: woshiyyya <xiaoyunxuan1998@gmail.com>
Date: Tue, 22 Aug 2023 14:36:32 -0700
Subject: [PATCH 3/3] Comment out the new persistence mode Train + Tune CI suite

Signed-off-by: woshiyyya <xiaoyunxuan1998@gmail.com>
---
 .buildkite/pipeline.ml.yml | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/.buildkite/pipeline.ml.yml b/.buildkite/pipeline.ml.yml
index 3ac57323299d5..7dcc6a59636ef 100644
--- a/.buildkite/pipeline.ml.yml
+++ b/.buildkite/pipeline.ml.yml
@@ -356,17 +356,18 @@
       --test_env=RAY_AIR_NEW_PERSISTENCE_MODE=1
       python/ray/train/...
 
-- label: ":steam_locomotive: :octopus: :floppy_disk: New persistence mode: Train + Tune tests and examples"
-  conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TRAIN_AFFECTED"]
-  instance_size: medium
-  commands:
-    - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
-    - TRAIN_TESTING=1 TUNE_TESTING=1 ./ci/env/install-dependencies.sh
-    - ./ci/env/env_info.sh
-    - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only 
-      --test_tag_filters=tune,-gpu_only,-ray_air,-gpu,-doctest,-no_new_storage
-      --test_env=RAY_AIR_NEW_PERSISTENCE_MODE=1
-      python/ray/train/...
+# TODO(krfricke): The suite below is commented out for now; add a new test for it and re-enable.
+# - label: ":steam_locomotive: :octopus: :floppy_disk: New persistence mode: Train + Tune tests and examples"
+#   conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TRAIN_AFFECTED"]
+#   instance_size: medium
+#   commands:
+#     - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
+#     - TRAIN_TESTING=1 TUNE_TESTING=1 ./ci/env/install-dependencies.sh
+#     - ./ci/env/env_info.sh
+#     - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only 
+#       --test_tag_filters=tune,-gpu_only,-ray_air,-gpu,-doctest,-no_new_storage
+#       --test_env=RAY_AIR_NEW_PERSISTENCE_MODE=1
+#       python/ray/train/...
 
 
 - label: ":octopus: :floppy_disk: New persistence mode: Tune tests and examples (small)"