diff --git a/docs/torchft.md b/docs/torchft.md index b39eddb6db..afcb50eee6 100644 --- a/docs/torchft.md +++ b/docs/torchft.md @@ -57,7 +57,7 @@ In a real-world scenario, `torchft_lighthouse` would likely be on a different ma ### Using semi-synchronous training (Example 2) TorchFT provides algorithms that do not require per-step synchronization and -the replica groups can sychronize weights every N steps. +the replica groups can synchronize weights every N steps. **Note on Batch Sizes**: For DiLoCo, there's an important distinction in batch size terminology: diff --git a/torchtitan/components/metrics.py b/torchtitan/components/metrics.py index 878b2b315d..6905fb5b53 100644 --- a/torchtitan/components/metrics.py +++ b/torchtitan/components/metrics.py @@ -319,7 +319,7 @@ def _build_metric_logger( logger_container.add_logger(tensorboard_logger) if logger_container.number_of_loggers == 0: - logger.debug("No loggers enabled, returning an emtpy LoggerContainer") + logger.debug("No loggers enabled, returning an empty LoggerContainer") return logger_container diff --git a/torchtitan/components/quantization/__init__.py b/torchtitan/components/quantization/__init__.py index fb0935d006..de94c37b3e 100644 --- a/torchtitan/components/quantization/__init__.py +++ b/torchtitan/components/quantization/__init__.py @@ -25,7 +25,7 @@ class QuantizationConverter(ModelConverter): """ - Base class for quantization converters, which implements generic validation re-usable across all quantization converters. + Base class for quantization converters, which implements generic validation reusable across all quantization converters. 
""" enabled: bool = False diff --git a/torchtitan/config/manager.py b/torchtitan/config/manager.py index 314a2f2926..10f4440a4c 100644 --- a/torchtitan/config/manager.py +++ b/torchtitan/config/manager.py @@ -224,7 +224,7 @@ def _validate_config(self) -> None: def register_tyro_rules(registry: tyro.constructors.ConstructorRegistry) -> None: @registry.primitive_rule def list_str_rule(type_info: tyro.constructors.PrimitiveTypeInfo): - """Support for comma separate string parsing""" + """Support for comma separated string parsing""" if type_info.type != list[str]: return None return tyro.constructors.PrimitiveConstructorSpec( diff --git a/torchtitan/experiments/simple_fsdp/README.md b/torchtitan/experiments/simple_fsdp/README.md index a6d2f72dda..4b4aa85b18 100644 --- a/torchtitan/experiments/simple_fsdp/README.md +++ b/torchtitan/experiments/simple_fsdp/README.md @@ -53,7 +53,7 @@ SimpleFSDP relies on compiler backend to perform optimizations (i.e., bucketing - "aot_eager_autobucketing": perform autobucketing at aten fx-level, and perform code execution with aot_eager backend. 
-users can specify the pass (e.g., "aot_eager_autobucketing") via addtional configs: +users can specify the pass (e.g., "aot_eager_autobucketing") via additional configs: ```bash --job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config --compile.model_backend_override "aot_eager_autobucketing" ``` diff --git a/torchtitan/experiments/vlm/README.md b/torchtitan/experiments/vlm/README.md index 13704fba46..3e9d121560 100644 --- a/torchtitan/experiments/vlm/README.md +++ b/torchtitan/experiments/vlm/README.md @@ -34,10 +34,10 @@ This results in a very simple and general interface to train modern VLM with int ### Dataloader This approach requires the dataloader to handle the following aspect: - [x] Interleave the correct precise numbers of image tokens in the inputs token based on encoder's patch size and input images' size -- [x] Convert images/videos to 1D sequence of patchs: +- [x] Convert images/videos to 1D sequence of patches: - `rearrange(pixels, 'n (t pt) (h ph) (w pw) c -> n (t h w) (pt p pw c)', pt=temporal_ps, ph=patch_size, pw=patch_size)` - Pad all image patches sequence to a fixed length and return `pixel_values.shape == [N, L, D]` -- [x] Return a `grid_thw.shape == [N, L, 3]` to keep track of the location indicies of each patches in the images. Padding image can be tracked in the same tensors with values `-1`. +- [x] Return a `grid_thw.shape == [N, L, 3]` to keep track of the location indices of each patch in the images. Padding image can be tracked in the same tensors with values `-1`. - [x] LLM Sample / Document Packing. - [x] Captioning dataset: CC12M - [x] Interleaved dataset: Obelics diff --git a/torchtitan/experiments/vlm/job_config.py b/torchtitan/experiments/vlm/job_config.py index 48be31e923..ba0c10a2b8 100644 --- a/torchtitan/experiments/vlm/job_config.py +++ b/torchtitan/experiments/vlm/job_config.py @@ -25,7 +25,7 @@ class Data: """ packing_buffer_size: int = 0 """ Set to a value >0 to enable sample packing. 
- This control the buffer uses to store training samples avaliable for packing. + This controls the buffer used to store training samples available for packing. """ diff --git a/torchtitan/models/attention.py b/torchtitan/models/attention.py index bf963a5b5f..f941430e31 100644 --- a/torchtitan/models/attention.py +++ b/torchtitan/models/attention.py @@ -182,6 +182,6 @@ def create_attention_mask(*args, **kwargs): """Create an attention mask using compiled create_block_mask. This function is cached to avoid recreating BlockMasks for the same - argumens. + arguments. """ return _compiled_create_block_mask(*args, **kwargs) diff --git a/torchtitan/models/qwen3/model/state_dict_adapter.py b/torchtitan/models/qwen3/model/state_dict_adapter.py index edc15d9da0..bc55f53c6a 100644 --- a/torchtitan/models/qwen3/model/state_dict_adapter.py +++ b/torchtitan/models/qwen3/model/state_dict_adapter.py @@ -141,7 +141,7 @@ def from_hf(self, hf_state_dict: dict[str, Any]) -> dict[str, Any]: layer_num, value.device_mesh, ) - else: # keep this path to be compatibile with offline conversion + else: # keep this path to be compatible with offline conversion stacked_value = self._concatenate_expert_weights( expert_weights_by_layer, titan_abstract_key, diff --git a/torchtitan/models/utils.py b/torchtitan/models/utils.py index f978bbdf17..6b2b404218 100644 --- a/torchtitan/models/utils.py +++ b/torchtitan/models/utils.py @@ -48,7 +48,7 @@ def _calculate_strided_shard_shard_indices( dim_size_to_split: int, ) -> tuple[int, int]: """ - Given a [StridedShard(dim=i), Shard(dim=i)] placement, caculate the start index + Given a [StridedShard(dim=i), Shard(dim=i)] placement, calculate the start index and end index on dim-i for GPU rank (strided_shard_dim_degree, shard_dim_rank) GPU Layout (strided_shard_rank, shard_rank): @@ -68,12 +68,12 @@ def _calculate_strided_shard_shard_indices( 2 │ GPU(2, 1) │ └─────────────────┘ - Calulate the start_index from inner dimesion (Shard(dim=i)) to outer demension 
(StridedShard(dim=i)). + Calculate the start_index from inner dimension (Shard(dim=i)) to outer dimension (StridedShard(dim=i)). """ block_size = dim_size_to_split // (strided_shard_dim_degree * shard_dim_degree) - # Error out if can not evenly divded + # Error out if it cannot be evenly divided if ( block_size * (strided_shard_dim_degree * shard_dim_degree) != dim_size_to_split