
Commit 8ade3c4
Merge branch 'main' into add_quant_saving
2 parents: 763a9ce + 70260eb

10 files changed: +1013 -40 lines

dist_run.py

Lines changed: 7 additions & 3 deletions
@@ -20,14 +20,14 @@
 from torch.distributed.pipelining import PipelineStage, ScheduleGPipe
 from torchchat.cli.builder import _initialize_tokenizer, TokenizerArgs

-from torchchat.distributed.logging_utils import SingletonLogger
-
 # TODO - these are not distributed specific, consider moving to new package
 from torchchat.distributed.checkpoint_utils import (
     get_hf_config_file,
     load_weights_from_hf_format,
     load_weights_from_torchchat_format,
 )
+
+from torchchat.distributed.logging_utils import SingletonLogger
 from torchchat.distributed.utils import (
     bytes_to_readable,
     Color as color,
@@ -153,7 +153,9 @@ def _load_model_weights(
         # This format stands for:
         # single binary file, OR
         # multiple binary files without index files.
-        load_weights_from_torchchat_format(stage_module, distribution, device, model_config)
+        load_weights_from_torchchat_format(
+            stage_module, distribution, device, model_config
+        )
     else:
         raise ValueError(f"Unknown checkpoint format: {chpt_from}")

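For context, a minimal sketch of how a format switch like the one in this hunk can be written. It is not the code in dist_run.py itself, and it assumes load_weights_from_hf_format takes the same arguments as the torchchat-format loader (both imports appear in the first hunk above).

```python
from torchchat.distributed.checkpoint_utils import (
    load_weights_from_hf_format,
    load_weights_from_torchchat_format,
)


def load_weights_by_format(stage_module, distribution, device, model_config, chpt_from="hf"):
    # Hypothetical helper: pick a weight loader based on the checkpoint format string.
    if chpt_from == "hf":
        # Assumed to mirror the torchchat-format loader's signature.
        load_weights_from_hf_format(stage_module, distribution, device, model_config)
    elif chpt_from == "torchchat":
        load_weights_from_torchchat_format(
            stage_module, distribution, device, model_config
        )
    else:
        raise ValueError(f"Unknown checkpoint format: {chpt_from}")
```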

@@ -593,9 +595,11 @@ def get_example_ins_outs(seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]:
     parser.add_argument(
         "model_name",
         type=str,
+        default="llama3",
         help="Name of the model to load",
         choices=NAME_TO_DISTRIBUTION_AND_DTYPE.keys(),
     )
+
     parser.add_argument("--pp", type=int, default=1, help="Pipeline parallel degree")
     parser.add_argument(
         "--ntokens",

docs/quantization.md

Lines changed: 2 additions & 0 deletions
@@ -120,6 +120,8 @@ python3 torchchat.py generate llama3 --pte-path llama3.pte --prompt "Hello my n

 ## Experimental TorchAO lowbit kernels

+WARNING: These kernels only work on devices with ARM CPUs, for example on Mac computers with Apple Silicon.
+
 ### Use

 #### linear:a8wxdq

install/requirements.txt

Lines changed: 3 additions & 0 deletions
@@ -30,3 +30,6 @@ streamlit

 # Server mode
 flask
+
+# eval
+lm_eval==0.4.2

torchchat/cli/builder.py

Lines changed: 19 additions & 8 deletions
@@ -17,6 +17,8 @@
 import torch.nn as nn

 from torch.distributed.device_mesh import DeviceMesh
+from torch.distributed.elastic.multiprocessing.errors import record
+from torch.distributed.elastic.utils.distributed import get_free_port

 from torchchat.distributed import launch_distributed, ParallelDims, parallelize_llama
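The hunk above only pulls in these torchelastic helpers. For context, a brief sketch of their typical use (not code from this commit): record is a decorator that captures and reports failures from the entrypoint it wraps, and get_free_port() returns an unused local TCP port.

```python
from torch.distributed.elastic.multiprocessing.errors import record
from torch.distributed.elastic.utils.distributed import get_free_port


@record  # record failures so torchelastic can surface the traceback
def main() -> None:
    port = get_free_port()  # an unused local TCP port, e.g. for MASTER_PORT
    print(f"picked free port {port}")


if __name__ == "__main__":
    main()
```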

@@ -55,7 +57,10 @@ class BuilderArgs:
     device: Optional[str] = None
     precision: torch.dtype = torch.float32
     setup_caches: bool = False
-    use_distributed: bool = False
+    distributed: bool = False
+    pp: int = 1
+    tp: int = 1
+    chpt_from: str = "hf"
     is_chat_model: bool = False
     prefill_possible: bool = False
     dynamic_shapes: bool = False
@@ -157,7 +162,11 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs":
             dtype = torch.float16
         else:
             dtype = name_to_dtype(args.dtype, args.device)
-
+        # distributed args
+        distributed = getattr(args, "distributed", False)
+        pp = getattr(args, "pp", 1)
+        tp = getattr(args, "tp", 1)
+        chpt_from = getattr(args, "chpt_from", "hf")
         return cls(
             checkpoint_dir=checkpoint_dir,
             checkpoint_path=checkpoint_path,
@@ -171,7 +180,10 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs":
             device=args.device,
             precision=dtype,
             setup_caches=(output_dso_path or output_pte_path),
-            use_distributed=args.distributed,
+            distributed=distributed,
+            pp=pp,
+            tp=tp,
+            chpt_from=chpt_from,
             is_chat_model=is_chat_model,
             dynamic_shapes=getattr(args, "dynamic_shapes", False),
             max_seq_length=getattr(args, "max_seq_length", None),
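Putting the two from_args hunks together with the dataclass hunk, a trimmed-down sketch of the new plumbing. The class name here is hypothetical and carries only the four new fields; the real BuilderArgs has many more.

```python
import argparse
from dataclasses import dataclass


@dataclass
class DistributedBuilderArgs:  # hypothetical, trimmed-down stand-in for BuilderArgs
    distributed: bool = False
    pp: int = 1
    tp: int = 1
    chpt_from: str = "hf"

    @classmethod
    def from_args(cls, args: argparse.Namespace) -> "DistributedBuilderArgs":
        # getattr with a default keeps construction working for entry points
        # whose parsers never register the distributed flags.
        return cls(
            distributed=getattr(args, "distributed", False),
            pp=getattr(args, "pp", 1),
            tp=getattr(args, "tp", 1),
            chpt_from=getattr(args, "chpt_from", "hf"),
        )


print(DistributedBuilderArgs.from_args(argparse.Namespace(distributed=True, pp=2, tp=2)))
```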
@@ -481,14 +493,14 @@ def _maybe_parallelize_model(


 def _load_model(builder_args: BuilderArgs) -> Model:
-    world_mesh, parallel_dims = _maybe_init_distributed(builder_args)
+    # world_mesh, parallel_dims = _maybe_init_distributed(builder_args)
     if builder_args.gguf_path:
         model = _load_model_gguf(builder_args)
-    elif builder_args.use_distributed:
-        model = _init_model_on_meta_device(builder_args)
+    # elif builder_args.use_distributed:
+    #     model = _init_model_on_meta_device(builder_args)
     else:
         model = _load_model_default(builder_args)
-    model = _maybe_parallelize_model(model, builder_args, world_mesh, parallel_dims)
+    # model = _maybe_parallelize_model(model, builder_args, world_mesh, parallel_dims)

     model = model.to(device=builder_args.device, dtype=builder_args.precision)
     return model.eval()
@@ -502,7 +514,6 @@ def _initialize_model(
     support_tensor_subclass: bool = True,
 ) -> Model:
     print("Loading model...")
-
     if builder_args.gguf_path and (builder_args.dso_path or builder_args.pte_path):
         print("Setting gguf_kwargs for generate.")
         is_dso = builder_args.dso_path is not None

torchchat/cli/cli.py

Lines changed: 22 additions & 2 deletions
@@ -405,8 +405,7 @@ def _add_distributed_args(parser) -> None:
     parser.add_argument(
         "--distributed",
         action="store_true",
-        help=argparse.SUPPRESS,
-        # "Whether to enable distributed inference",
+        help="Whether to enable distributed inference",
     )
     parser.add_argument(
         "--dcp-dir",
@@ -415,6 +414,27 @@
         help=argparse.SUPPRESS,
         # "Use the specified model checkpoint directory",
     )
+    parser.add_argument(
+        "--pp",
+        "--pipeline-parallel",
+        type=int,
+        default=1,
+        help="Pipeline parallel degree",
+    )
+    parser.add_argument(
+        "--tp",
+        "--tensor-parallel",
+        type=int,
+        default=2,
+        help="Tensor parallel degree",
+    )
+    parser.add_argument(
+        "--chpt-from",
+        type=str,
+        default="hf",  # TODO: change to torchchat once we support it well
+        help="Checkpoint format to load from",
+        choices=["hf", "torchchat"],
+    )


 # Add CLI Args related to custom model inputs
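A minimal reproduction of the flags this hunk adds, showing how the long aliases and defaults resolve when parsed. Illustrative only; the real parser lives in torchchat/cli/cli.py.

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--distributed", action="store_true",
                    help="Whether to enable distributed inference")
parser.add_argument("--pp", "--pipeline-parallel", type=int, default=1,
                    help="Pipeline parallel degree")
parser.add_argument("--tp", "--tensor-parallel", type=int, default=2,
                    help="Tensor parallel degree")
parser.add_argument("--chpt-from", type=str, default="hf",
                    choices=["hf", "torchchat"],
                    help="Checkpoint format to load from")

# argparse derives dest names from the first long option, so the values land
# on args.pp, args.tp, and args.chpt_from.
args = parser.parse_args(["--distributed", "--pipeline-parallel", "2", "--tp", "4"])
print(args.distributed, args.pp, args.tp, args.chpt_from)  # True 2 4 hf
```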
