docs/torchft.md (2 changes: 1 addition & 1 deletion)

@@ -57,7 +57,7 @@ In a real-world scenario, `torchft_lighthouse` would likely be on a different ma
 ### Using semi-synchronous training (Example 2)
 
 TorchFT provides algorithms that do not require per-step synchronization and
-the replica groups can sychronize weights every N steps.
+the replica groups can synchronize weights every N steps.
 
 **Note on Batch Sizes**: For DiLoCo, there's an important distinction in batch size terminology:

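For intuition, here is a minimal, framework-free sketch of the every-N-steps weight synchronization described above. This is not torchft's API: `sync_every` and `average_weights` are illustrative assumptions, and a real run would go through torchft's managed process groups for fault tolerance.

```python
import torch
import torch.distributed as dist

def average_weights(model: torch.nn.Module) -> None:
    # All-reduce each parameter across replicas and average in place.
    world = dist.get_world_size()
    for p in model.parameters():
        dist.all_reduce(p.data, op=dist.ReduceOp.SUM)
        p.data /= world

def train(model, optimizer, loader, sync_every: int = 32) -> None:
    # Replicas take `sync_every` independent local steps, then
    # synchronize weights once (semi-synchronous training).
    for step, (x, y) in enumerate(loader):
        loss = torch.nn.functional.cross_entropy(model(x), y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if (step + 1) % sync_every == 0:
            average_weights(model)
```
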
torchtitan/components/metrics.py (2 changes: 1 addition & 1 deletion)

@@ -319,7 +319,7 @@ def _build_metric_logger(
         logger_container.add_logger(tensorboard_logger)
 
     if logger_container.number_of_loggers == 0:
-        logger.debug("No loggers enabled, returning an emtpy LoggerContainer")
+        logger.debug("No loggers enabled, returning an empty LoggerContainer")
     return logger_container
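The `LoggerContainer` used here fans metrics out to zero or more backends. A minimal sketch of that composite pattern follows; everything beyond `add_logger` and `number_of_loggers` is an assumption, not torchtitan's actual class.

```python
class LoggerContainer:
    """Illustrative composite over metric loggers (not torchtitan's code)."""

    def __init__(self) -> None:
        self._loggers: list = []

    @property
    def number_of_loggers(self) -> int:
        return len(self._loggers)

    def add_logger(self, logger) -> None:
        self._loggers.append(logger)

    def log(self, metrics: dict, step: int) -> None:
        # An empty container is still valid: logging becomes a no-op,
        # which is why the debug message above is harmless.
        for logger in self._loggers:
            logger.log(metrics, step)
```
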
torchtitan/components/quantization/__init__.py (2 changes: 1 addition & 1 deletion)

@@ -25,7 +25,7 @@
 
 class QuantizationConverter(ModelConverter):
     """
-    Base class for quantization converters, which implements generic validation re-usable across all quantization converters.
+    Base class for quantization converters, which implements generic validation reusable across all quantization converters.
     """
 
     enabled: bool = False
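A sketch of the shared-validation idea: subclasses inherit one generic check rather than duplicating it. Apart from the two class names and `enabled`, everything here (the method name, the specific check) is an assumption, not torchtitan's implementation.

```python
class ModelConverter:
    def convert(self, model):
        raise NotImplementedError

class QuantizationConverter(ModelConverter):
    enabled: bool = False

    def _validate(self, job_config) -> None:
        # Hypothetical generic validation that every quantization
        # converter subclass reuses before touching the model.
        if not self.enabled:
            raise RuntimeError("quantization converter is not enabled")
```
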
torchtitan/config/manager.py (2 changes: 1 addition & 1 deletion)

@@ -224,7 +224,7 @@ def _validate_config(self) -> None:
 def register_tyro_rules(registry: tyro.constructors.ConstructorRegistry) -> None:
     @registry.primitive_rule
     def list_str_rule(type_info: tyro.constructors.PrimitiveTypeInfo):
-        """Support for comma separate string parsing"""
+        """Support for comma separated string parsing"""
         if type_info.type != list[str]:
             return None
         return tyro.constructors.PrimitiveConstructorSpec(
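The effect of a rule like this is that a `list[str]` config field can be passed as a single comma-separated CLI token. A standalone sketch of the conversion it implies; `parse_comma_list` is a hypothetical helper, not torchtitan code.

```python
def parse_comma_list(raw: str) -> list[str]:
    # "tensorboard,wandb" -> ["tensorboard", "wandb"]
    return raw.split(",") if raw else []

assert parse_comma_list("tensorboard,wandb") == ["tensorboard", "wandb"]
assert parse_comma_list("") == []
```
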
torchtitan/experiments/simple_fsdp/README.md (2 changes: 1 addition & 1 deletion)

@@ -53,7 +53,7 @@ SimpleFSDP relies on compiler backend to perform optimizations (i.e., bucketing
 - "aot_eager_autobucketing": perform autobucketing at aten fx-level, and perform code execution with aot_eager backend.
 
 
-users can specify the pass (e.g., "aot_eager_autobucketing") via addtional configs:
+users can specify the pass (e.g., "aot_eager_autobucketing") via additional configs:
 
 ```bash
 --job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config --compile.model_backend_override "aot_eager_autobucketing"
 ```
torchtitan/experiments/vlm/README.md (4 changes: 2 additions & 2 deletions)

@@ -34,10 +34,10 @@ This results in a very simple and general interface to train modern VLM with int
 ### Dataloader
 This approach requires the dataloader to handle the following aspect:
 - [x] Interleave the correct precise numbers of image tokens in the inputs token based on encoder's patch size and input images' size
-- [x] Convert images/videos to 1D sequence of patchs:
+- [x] Convert images/videos to 1D sequence of patches:
   - `rearrange(pixels, 'n (t pt) (h ph) (w pw) c -> n (t h w) (pt p pw c)', pt=temporal_ps, ph=patch_size, pw=patch_size)`
   - Pad all image patches sequence to a fixed length and return `pixel_values.shape == [N, L, D]`
-- [x] Return a `grid_thw.shape == [N, L, 3]` to keep track of the location indicies of each patches in the images. Padding image can be tracked in the same tensors with values `-1`.
+- [x] Return a `grid_thw.shape == [N, L, 3]` to keep track of the location indices of each patches in the images. Padding image can be tracked in the same tensors with values `-1`.
 - [x] LLM Sample / Document Packing.
 - [x] Captioning dataset: CC12M
 - [x] Interleaved dataset: Obelics
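For concreteness, a small sketch of the patchification step from that README list, using `einops.rearrange`. The tensor sizes are made up, and the output grouping is written `(pt ph pw c)` on the assumption that the `p` in the README's pattern is a typo for `ph`.

```python
import torch
from einops import rearrange

# Toy video batch: N=2 clips, 4 frames, 32x32 pixels, RGB.
pixels = torch.randn(2, 4, 32, 32, 3)
temporal_ps, patch_size = 2, 16

# Flatten each (2, 16, 16, 3) spatiotemporal patch into one token.
patches = rearrange(
    pixels,
    "n (t pt) (h ph) (w pw) c -> n (t h w) (pt ph pw c)",
    pt=temporal_ps, ph=patch_size, pw=patch_size,
)
print(patches.shape)  # torch.Size([2, 8, 1536]), i.e. [N, L, D]
```
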
torchtitan/experiments/vlm/job_config.py (2 changes: 1 addition & 1 deletion)

@@ -25,7 +25,7 @@ class Data:
     """
     packing_buffer_size: int = 0
     """ Set to a value >0 to enable sample packing.
-    This control the buffer uses to store training samples avaliable for packing.
+    This control the buffer uses to store training samples available for packing.
     """
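A hypothetical illustration of what such a packing buffer does: hold up to `buffer_size` tokenized samples and greedily pack those that fit into each fixed-length sequence. This is an assumption-level sketch, not the torchtitan implementation.

```python
from collections.abc import Iterable, Iterator

def pack_samples(
    samples: Iterable[list[int]], seq_len: int, buffer_size: int
) -> Iterator[list[int]]:
    buffer: list[list[int]] = []
    for tokens in samples:
        buffer.append(tokens[:seq_len])  # clip over-long samples
        if len(buffer) < buffer_size:
            continue  # keep filling the buffer
        packed: list[int] = []
        rest: list[list[int]] = []
        for s in buffer:  # greedy first-fit into one sequence
            if len(packed) + len(s) <= seq_len:
                packed.extend(s)
            else:
                rest.append(s)
        buffer = rest
        yield packed
    for s in buffer:  # flush leftovers at end of stream
        yield s
```

A larger buffer gives the packer more candidates to fill each sequence (less padding waste) at the cost of holding more samples in memory.
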
torchtitan/models/attention.py (2 changes: 1 addition & 1 deletion)

@@ -182,6 +182,6 @@ def create_attention_mask(*args, **kwargs):
     """Create an attention mask using compiled create_block_mask.
 
     This function is cached to avoid recreating BlockMasks for the same
-    argumens.
+    arguments.
     """
     return _compiled_create_block_mask(*args, **kwargs)
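A sketch of the caching pattern this docstring describes, assuming `functools.lru_cache` over a `torch.compile`d `create_block_mask` from FlexAttention; torchtitan's actual wrapper may differ in details.

```python
from functools import lru_cache

import torch
from torch.nn.attention.flex_attention import create_block_mask

_compiled_create_block_mask = torch.compile(create_block_mask)

@lru_cache(maxsize=None)
def create_attention_mask(mask_mod, b, h, q_len, kv_len):
    # Repeated calls with identical (mask_mod, shape) arguments return
    # the cached BlockMask instead of rebuilding it every step.
    return _compiled_create_block_mask(mask_mod, b, h, q_len, kv_len)

def causal(b, h, q_idx, kv_idx):
    return q_idx >= kv_idx

mask = create_attention_mask(causal, None, None, 2048, 2048)
```

Note the cache key is the argument tuple, so `mask_mod` must be a stable function object, not a fresh closure per step.
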
torchtitan/models/qwen3/model/state_dict_adapter.py (2 changes: 1 addition & 1 deletion)

@@ -141,7 +141,7 @@ def from_hf(self, hf_state_dict: dict[str, Any]) -> dict[str, Any]:
                     layer_num,
                     value.device_mesh,
                 )
-            else:  # keep this path to be compatibile with offline conversion
+            else:  # keep this path to be compatible with offline conversion
                 stacked_value = self._concatenate_expert_weights(
                     expert_weights_by_layer,
                     titan_abstract_key,
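The surrounding code turns per-expert HuggingFace weights into one stacked tensor per layer. A hypothetical standalone sketch of that concatenation step; the key pattern and `[num_experts, out, in]` layout are assumptions for illustration, not Qwen3's exact checkpoint schema.

```python
import re
import torch

def stack_expert_weights(hf_state_dict: dict[str, torch.Tensor]) -> torch.Tensor:
    # Collect "...experts.{i}.w1.weight" entries and stack them in
    # expert order, as offline MoE checkpoint conversion typically does.
    pattern = re.compile(r"experts\.(\d+)\.w1\.weight$")
    experts = {
        int(m.group(1)): v
        for k, v in hf_state_dict.items()
        if (m := pattern.search(k))
    }
    return torch.stack([experts[i] for i in sorted(experts)], dim=0)
```
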
torchtitan/models/utils.py (6 changes: 3 additions & 3 deletions)

@@ -48,7 +48,7 @@ def _calculate_strided_shard_shard_indices(
     dim_size_to_split: int,
 ) -> tuple[int, int]:
     """
-    Given a [StridedShard(dim=i), Shard(dim=i)] placement, caculate the start index
+    Given a [StridedShard(dim=i), Shard(dim=i)] placement, calculate the start index
     and end index on dim-i for GPU rank (strided_shard_dim_degree, shard_dim_rank)
 
     GPU Layout (strided_shard_rank, shard_rank):
@@ -68,12 +68,12 @@ def _calculate_strided_shard_shard_indices(
      2 │    GPU(2, 1)    │
        └─────────────────┘
 
-    Calulate the start_index from inner dimesion (Shard(dim=i)) to outer demension (StridedShard(dim=i)).
+    Calculate the start_index from inner dimension (Shard(dim=i)) to outer dimension (StridedShard(dim=i)).
     """
 
     block_size = dim_size_to_split // (strided_shard_dim_degree * shard_dim_degree)
 
-    # Error out if can not evenly divded
+    # Error out if can not evenly divided
     if (
         block_size * (strided_shard_dim_degree * shard_dim_degree)
         != dim_size_to_split
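To make the arithmetic concrete, a small sketch consistent with the docstring's layout: the dimension splits into `strided_degree * shard_degree` equal blocks, ordered so the StridedShard rank varies fastest. The ordering formula is inferred from the diagram, so treat it as an assumption rather than the function's verified behavior.

```python
def strided_shard_shard_indices(
    strided_rank: int, strided_degree: int,
    shard_rank: int, shard_degree: int,
    dim_size: int,
) -> tuple[int, int]:
    # Illustrative reconstruction: block index = inner (Shard) rank
    # major, outer (StridedShard) rank minor, per the GPU layout above.
    block_size, rem = divmod(dim_size, strided_degree * shard_degree)
    if rem != 0:
        raise ValueError("dim_size must divide evenly across all shards")
    start = block_size * (shard_rank * strided_degree + strided_rank)
    return start, start + block_size

# With degrees (3, 2) and dim_size 12, GPU(2, 1) owns the last block:
print(strided_shard_shard_indices(2, 3, 1, 2, 12))  # (10, 12)
```
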