Commit ae63cd1

Add simple example for how to use torch_xla (#7048)
1 parent b64d8a2 commit ae63cd1

File tree

examples/README.md
examples/train_resnet_base.py
examples/train_resnet_ddp.py
examples/train_resnet_profile.py
examples/train_resnet_xla_ddp.py

5 files changed: +139 −0


examples/README.md

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
## Overview

This repo aims to provide some basic examples of how to run an existing PyTorch model with PyTorch/XLA. `train_resnet_base.py` is a minimal trainer that runs ResNet50 with fake data on a single device. Other examples will import `train_resnet_base` and demonstrate how to enable different features (distributed training, profiling, dynamo, etc.) on PyTorch/XLA.
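Each feature example follows the same pattern: subclass the base trainer and override a hook. A minimal sketch of that pattern (`MyTrainer` is a placeholder name; see `train_resnet_xla_ddp.py` below for a real instance):

```python
from train_resnet_base import TrainResNetBase


class MyTrainer(TrainResNetBase):

  def run_optimizer(self):
    # Override this hook to change how the optimizer step runs, e.g.
    # train_resnet_xla_ddp.py calls xm.optimizer_step() here instead.
    super().run_optimizer()


if __name__ == '__main__':
  MyTrainer().start_training()
```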

examples/train_resnet_base.py

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
from torch_xla import runtime as xr
import torch_xla.utils.utils as xu
import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as pl

import time
import itertools

import torch
import torch_xla
import torchvision
import torch.optim as optim
import torch.nn as nn


def _train_update(step, loss, tracker, epoch):
  print(f'epoch: {epoch}, step: {step}, loss: {loss}, rate: {tracker.rate()}')


class TrainResNetBase():

  def __init__(self):
    img_dim = 224
    self.batch_size = 128
    self.num_steps = 300
    self.num_epochs = 1
    train_dataset_len = 1200000  # Roughly the size of Imagenet dataset.
    # For the purpose of this example, we are going to use fake data.
    train_loader = xu.SampleGenerator(
        data=(torch.zeros(self.batch_size, 3, img_dim, img_dim),
              torch.zeros(self.batch_size, dtype=torch.int64)),
        sample_count=train_dataset_len // self.batch_size // xr.world_size())

    self.device = torch_xla.device()
    # MpDeviceLoader preloads each batch onto the XLA device in the background.
    self.train_device_loader = pl.MpDeviceLoader(train_loader, self.device)
    self.model = torchvision.models.resnet50().to(self.device)
    self.optimizer = optim.SGD(self.model.parameters(), weight_decay=1e-4)
    self.loss_fn = nn.CrossEntropyLoss()

  def run_optimizer(self):
    self.optimizer.step()

  def start_training(self):

    def train_loop_fn(loader, epoch):
      tracker = xm.RateTracker()
      self.model.train()
      loader = itertools.islice(loader, self.num_steps)
      for step, (data, target) in enumerate(loader):
        self.optimizer.zero_grad()
        output = self.model(data)
        loss = self.loss_fn(output, target)
        loss.backward()
        self.run_optimizer()
        tracker.add(self.batch_size)
        if step % 10 == 0:
          # add_step_closure defers the host-side print until the step's
          # tensors are materialized, instead of forcing an early execution.
          xm.add_step_closure(_train_update, args=(step, loss, tracker, epoch))

    for epoch in range(1, self.num_epochs + 1):
      xm.master_print('Epoch {} train begin {}'.format(
          epoch, time.strftime('%l:%M%p %Z on %b %d, %Y')))
      train_loop_fn(self.train_device_loader, epoch)
      xm.master_print('Epoch {} train end {}'.format(
          epoch, time.strftime('%l:%M%p %Z on %b %d, %Y')))
    # Block until all pending XLA device operations have finished.
    xm.wait_device_ops()


if __name__ == '__main__':
  base = TrainResNetBase()
  base.start_training()
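Since the hyperparameters above are plain attributes, you can shrink the run for a quick smoke test before launching the full 300 steps. A short, illustrative sketch:

    trainer = TrainResNetBase()
    trainer.num_steps = 20  # run 20 steps instead of 300 for a quick check
    trainer.start_training()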

examples/train_resnet_ddp.py

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
from train_resnet_base import TrainResNetBase
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.optim as optim
import torch_xla.distributed.xla_multiprocessing as xmp


class TrainResNetDDP(TrainResNetBase):

  def __init__(self):
    super().__init__()
    # Use the 'xla' backend so DDP's collectives run through the XLA runtime.
    dist.init_process_group('xla', init_method='xla://')
    self.model = DDP(
        self.model, gradient_as_bucket_view=True, broadcast_buffers=False)
    # Recreate the optimizer so it holds the DDP-wrapped model's parameters.
    self.optimizer = optim.SGD(self.model.parameters(), weight_decay=1e-4)


def _mp_fn(index):
  ddp = TrainResNetDDP()
  ddp.start_training()


if __name__ == '__main__':
  xmp.spawn(_mp_fn, args=())
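In multiprocess runs it can be handy to confirm each replica's ordinal at startup. A small, hypothetical variant of _mp_fn using the runtime module the base trainer already imports:

    from torch_xla import runtime as xr

    def _mp_fn(index):
      # index is the ordinal xmp.spawn assigns to this process.
      print(f'replica {index} of {xr.world_size()} starting')
      TrainResNetDDP().start_training()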

examples/train_resnet_profile.py

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
import os

from train_resnet_base import TrainResNetBase
import torch_xla.debug.profiler as xp

# check https://github.com/pytorch/xla/blob/master/TROUBLESHOOTING.md#environment-variables
os.environ["XLA_IR_DEBUG"] = "1"
os.environ["XLA_HLO_DEBUG"] = "1"

if __name__ == '__main__':
  base = TrainResNetBase()
  profile_port = 9012
  profile_logdir = "/tmp/profile/"
  duration_ms = 30000
  assert os.path.exists(profile_logdir)
  server = xp.start_server(profile_port)
  # Ideally you want to start the profile tracing after the initial
  # compilation, for example at step 5.
  xp.trace_detached(
      f'localhost:{profile_port}', profile_logdir, duration_ms=duration_ms)
  base.start_training()
  # You can view the profile in TensorBoard by
  # 1. pip install tensorflow tensorboard-plugin-profile
  # 2. tensorboard --logdir /tmp/profile/ --port 6006
  # For more detail please take a look at
  # https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm
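As the comment in the script notes, tracing ideally starts after the initial compilation. One way to approximate that without touching the training loop is to fire trace_detached from a background timer while training runs on the main thread (a sketch; the 60-second warm-up delay is an assumption, not a measured value):

    import threading

    import torch_xla.debug.profiler as xp
    from train_resnet_base import TrainResNetBase

    if __name__ == '__main__':
      base = TrainResNetBase()
      server = xp.start_server(9012)
      # Assumed warm-up: give the first steps (and compilation) ~60s, then trace.
      threading.Timer(
          60.0,
          xp.trace_detached,
          args=('localhost:9012', '/tmp/profile/'),
          kwargs={'duration_ms': 30000}).start()
      base.start_training()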

examples/train_resnet_xla_ddp.py

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
1+
from train_resnet_base import TrainResNetBase
2+
import torch_xla.distributed.xla_multiprocessing as xmp
3+
import torch_xla.core.xla_model as xm
4+
5+
6+
class TrainResNetXLADDP(TrainResNetBase):
7+
8+
def run_optimizer(self):
9+
xm.optimizer_step(self.optimizer)
10+
11+
12+
def _mp_fn(index):
13+
xla_ddp = TrainResNetXLADDP()
14+
xla_ddp.start_training()
15+
16+
17+
if __name__ == '__main__':
18+
xmp.spawn(_mp_fn, args=())
