docs/torchft.md (2 changes: 1 addition & 1 deletion)

@@ -57,7 +57,7 @@ In a real-world scenario, `torchft_lighthouse` would likely be on a different ma
 ### Using semi-synchronous training (Example 2)
 
 TorchFT provides algorithms that do not require per-step synchronization and
-the replica groups can sychronize weights every N steps.
+the replica groups can synchronize weights every N steps.
 
 **Note on Batch Sizes**: For DiLoCo, there's an important distinction in batch size terminology:

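For intuition, here is a minimal, framework-free sketch of the every-N-steps weight synchronization described above. This is not torchft's API: `sync_every` and `average_weights` are illustrative assumptions, and a real run would go through torchft's managed process groups for fault tolerance.

```python
import torch
import torch.distributed as dist

def average_weights(model: torch.nn.Module) -> None:
    # All-reduce each parameter across replicas and average in place.
    world = dist.get_world_size()
    for p in model.parameters():
        dist.all_reduce(p.data, op=dist.ReduceOp.SUM)
        p.data /= world

def train(model, optimizer, loader, sync_every: int = 32) -> None:
    # Replicas take `sync_every` independent local steps, then
    # synchronize weights once (semi-synchronous training).
    for step, (x, y) in enumerate(loader):
        loss = torch.nn.functional.cross_entropy(model(x), y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if (step + 1) % sync_every == 0:
            average_weights(model)
```
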
torchtitan/components/metrics.py (2 changes: 1 addition & 1 deletion)

@@ -319,7 +319,7 @@ def _build_metric_logger(
         logger_container.add_logger(tensorboard_logger)
 
     if logger_container.number_of_loggers == 0:
-        logger.debug("No loggers enabled, returning an emtpy LoggerContainer")
+        logger.debug("No loggers enabled, returning an empty LoggerContainer")
     return logger_container
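The `LoggerContainer` used here fans metrics out to zero or more backends. A minimal sketch of that composite pattern follows; everything beyond `add_logger` and `number_of_loggers` is an assumption, not torchtitan's actual class.

```python
class LoggerContainer:
    """Illustrative composite over metric loggers (not torchtitan's code)."""

    def __init__(self) -> None:
        self._loggers: list = []

    @property
    def number_of_loggers(self) -> int:
        return len(self._loggers)

    def add_logger(self, logger) -> None:
        self._loggers.append(logger)

    def log(self, metrics: dict, step: int) -> None:
        # An empty container is still valid: logging becomes a no-op,
        # which is why the debug message above is harmless.
        for logger in self._loggers:
            logger.log(metrics, step)
```
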
torchtitan/components/quantization/__init__.py (2 changes: 1 addition & 1 deletion)

@@ -25,7 +25,7 @@
 
 class QuantizationConverter(ModelConverter):
     """
-    Base class for quantization converters, which implements generic validation re-usable across all quantization converters.
+    Base class for quantization converters, which implements generic validation reusable across all quantization converters.
     """
 
     enabled: bool = False
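A sketch of the shared-validation idea: subclasses inherit one generic check rather than duplicating it. Apart from the two class names and `enabled`, everything here (the method name, the specific check) is an assumption, not torchtitan's implementation.

```python
class ModelConverter:
    def convert(self, model):
        raise NotImplementedError

class QuantizationConverter(ModelConverter):
    enabled: bool = False

    def _validate(self, job_config) -> None:
        # Hypothetical generic validation that every quantization
        # converter subclass reuses before touching the model.
        if not self.enabled:
            raise RuntimeError("quantization converter is not enabled")
```
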
torchtitan/config/manager.py (2 changes: 1 addition & 1 deletion)

@@ -224,7 +224,7 @@ def _validate_config(self) -> None:
 def register_tyro_rules(registry: tyro.constructors.ConstructorRegistry) -> None:
     @registry.primitive_rule
     def list_str_rule(type_info: tyro.constructors.PrimitiveTypeInfo):
-        """Support for comma separate string parsing"""
+        """Support for comma separated string parsing"""
         if type_info.type != list[str]:
             return None
         return tyro.constructors.PrimitiveConstructorSpec(
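The effect of a rule like this is that a `list[str]` config field can be passed as a single comma-separated CLI token. A standalone sketch of the conversion it implies; `parse_comma_list` is a hypothetical helper, not torchtitan code.

```python
def parse_comma_list(raw: str) -> list[str]:
    # "tensorboard,wandb" -> ["tensorboard", "wandb"]
    return raw.split(",") if raw else []

assert parse_comma_list("tensorboard,wandb") == ["tensorboard", "wandb"]
assert parse_comma_list("") == []
```
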
torchtitan/experiments/simple_fsdp/README.md (2 changes: 1 addition & 1 deletion)

@@ -53,7 +53,7 @@ SimpleFSDP relies on compiler backend to perform optimizations (i.e., bucketing
 - "aot_eager_autobucketing": perform autobucketing at aten fx-level, and perform code execution with aot_eager backend.
 
 
-users can specify the pass (e.g., "aot_eager_autobucketing") via addtional configs:
+users can specify the pass (e.g., "aot_eager_autobucketing") via additional configs:
 
 ```bash
 --job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config --compile.model_backend_override "aot_eager_autobucketing"
 ```
torchtitan/experiments/vlm/README.md (4 changes: 2 additions & 2 deletions)

@@ -34,10 +34,10 @@ This results in a very simple and general interface to train modern VLM with int
 ### Dataloader
 This approach requires the dataloader to handle the following aspect:
 - [x] Interleave the correct precise numbers of image tokens in the inputs token based on encoder's patch size and input images' size
-- [x] Convert images/videos to 1D sequence of patchs:
+- [x] Convert images/videos to 1D sequence of patches:
   - `rearrange(pixels, 'n (t pt) (h ph) (w pw) c -> n (t h w) (pt p pw c)', pt=temporal_ps, ph=patch_size, pw=patch_size)`
   - Pad all image patches sequence to a fixed length and return `pixel_values.shape == [N, L, D]`
-- [x] Return a `grid_thw.shape == [N, L, 3]` to keep track of the location indicies of each patches in the images. Padding image can be tracked in the same tensors with values `-1`.
+- [x] Return a `grid_thw.shape == [N, L, 3]` to keep track of the location indices of each patches in the images. Padding image can be tracked in the same tensors with values `-1`.
 - [x] LLM Sample / Document Packing.
 - [x] Captioning dataset: CC12M
 - [x] Interleaved dataset: Obelics
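For concreteness, a small sketch of the patchification step from that README list, using `einops.rearrange`. The tensor sizes are made up, and the output grouping is written `(pt ph pw c)` on the assumption that the `p` in the README's pattern is a typo for `ph`.

```python
import torch
from einops import rearrange

# Toy video batch: N=2 clips, 4 frames, 32x32 pixels, RGB.
pixels = torch.randn(2, 4, 32, 32, 3)
temporal_ps, patch_size = 2, 16

# Flatten each (2, 16, 16, 3) spatiotemporal patch into one token.
patches = rearrange(
    pixels,
    "n (t pt) (h ph) (w pw) c -> n (t h w) (pt ph pw c)",
    pt=temporal_ps, ph=patch_size, pw=patch_size,
)
print(patches.shape)  # torch.Size([2, 8, 1536]), i.e. [N, L, D]
```
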
torchtitan/experiments/vlm/job_config.py (2 changes: 1 addition & 1 deletion)

@@ -25,7 +25,7 @@ class Data:
     """
     packing_buffer_size: int = 0
     """ Set to a value >0 to enable sample packing.
-    This control the buffer uses to store training samples avaliable for packing.
+    This control the buffer uses to store training samples available for packing.
     """
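A hypothetical illustration of what such a packing buffer does: hold up to `buffer_size` tokenized samples and greedily pack those that fit into each fixed-length sequence. This is an assumption-level sketch, not the torchtitan implementation.

```python
from collections.abc import Iterable, Iterator

def pack_samples(
    samples: Iterable[list[int]], seq_len: int, buffer_size: int
) -> Iterator[list[int]]:
    buffer: list[list[int]] = []
    for tokens in samples:
        buffer.append(tokens[:seq_len])  # clip over-long samples
        if len(buffer) < buffer_size:
            continue  # keep filling the buffer
        packed: list[int] = []
        rest: list[list[int]] = []
        for s in buffer:  # greedy first-fit into one sequence
            if len(packed) + len(s) <= seq_len:
                packed.extend(s)
            else:
                rest.append(s)
        buffer = rest
        yield packed
    for s in buffer:  # flush leftovers at end of stream
        yield s
```

A larger buffer gives the packer more candidates to fill each sequence (less padding waste) at the cost of holding more samples in memory.
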
torchtitan/models/attention.py (2 changes: 1 addition & 1 deletion)

@@ -182,6 +182,6 @@ def create_attention_mask(*args, **kwargs):
     """Create an attention mask using compiled create_block_mask.
 
     This function is cached to avoid recreating BlockMasks for the same
-    argumens.
+    arguments.
     """
     return _compiled_create_block_mask(*args, **kwargs)
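A sketch of the caching pattern this docstring describes, assuming `functools.lru_cache` over a `torch.compile`d `create_block_mask` from FlexAttention; torchtitan's actual wrapper may differ in details.

```python
from functools import lru_cache

import torch
from torch.nn.attention.flex_attention import create_block_mask

_compiled_create_block_mask = torch.compile(create_block_mask)

@lru_cache(maxsize=None)
def create_attention_mask(mask_mod, b, h, q_len, kv_len):
    # Repeated calls with identical (mask_mod, shape) arguments return
    # the cached BlockMask instead of rebuilding it every step.
    return _compiled_create_block_mask(mask_mod, b, h, q_len, kv_len)

def causal(b, h, q_idx, kv_idx):
    return q_idx >= kv_idx

mask = create_attention_mask(causal, None, None, 2048, 2048)
```

Note the cache key is the argument tuple, so `mask_mod` must be a stable function object, not a fresh closure per step.
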
torchtitan/models/qwen3/model/state_dict_adapter.py (2 changes: 1 addition & 1 deletion)

@@ -141,7 +141,7 @@ def from_hf(self, hf_state_dict: dict[str, Any]) -> dict[str, Any]:
                     layer_num,
                     value.device_mesh,
                 )
-            else:  # keep this path to be compatibile with offline conversion
+            else:  # keep this path to be compatible with offline conversion
                 stacked_value = self._concatenate_expert_weights(
                     expert_weights_by_layer,
                     titan_abstract_key,
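The surrounding code turns per-expert HuggingFace weights into one stacked tensor per layer. A hypothetical standalone sketch of that concatenation step; the key pattern and `[num_experts, out, in]` layout are assumptions for illustration, not Qwen3's exact checkpoint schema.

```python
import re
import torch

def stack_expert_weights(hf_state_dict: dict[str, torch.Tensor]) -> torch.Tensor:
    # Collect "...experts.{i}.w1.weight" entries and stack them in
    # expert order, as offline MoE checkpoint conversion typically does.
    pattern = re.compile(r"experts\.(\d+)\.w1\.weight$")
    experts = {
        int(m.group(1)): v
        for k, v in hf_state_dict.items()
        if (m := pattern.search(k))
    }
    return torch.stack([experts[i] for i in sorted(experts)], dim=0)
```
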
torchtitan/models/utils.py (6 changes: 3 additions & 3 deletions)

@@ -48,7 +48,7 @@ def _calculate_strided_shard_shard_indices(
     dim_size_to_split: int,
 ) -> tuple[int, int]:
     """
-    Given a [StridedShard(dim=i), Shard(dim=i)] placement, caculate the start index
+    Given a [StridedShard(dim=i), Shard(dim=i)] placement, calculate the start index
     and end index on dim-i for GPU rank (strided_shard_dim_degree, shard_dim_rank)
 
     GPU Layout (strided_shard_rank, shard_rank):
@@ -68,12 +68,12 @@ def _calculate_strided_shard_shard_indices(
      2 │    GPU(2, 1)    │
        └─────────────────┘
 
-    Calulate the start_index from inner dimesion (Shard(dim=i)) to outer demension (StridedShard(dim=i)).
+    Calculate the start_index from inner dimension (Shard(dim=i)) to outer dimension (StridedShard(dim=i)).
     """
 
     block_size = dim_size_to_split // (strided_shard_dim_degree * shard_dim_degree)
 
-    # Error out if can not evenly divded
+    # Error out if can not evenly divided
     if (
         block_size * (strided_shard_dim_degree * shard_dim_degree)
         != dim_size_to_split
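To make the arithmetic concrete, a small sketch consistent with the docstring's layout: the dimension splits into `strided_degree * shard_degree` equal blocks, ordered so the StridedShard rank varies fastest. The ordering formula is inferred from the diagram, so treat it as an assumption rather than the function's verified behavior.

```python
def strided_shard_shard_indices(
    strided_rank: int, strided_degree: int,
    shard_rank: int, shard_degree: int,
    dim_size: int,
) -> tuple[int, int]:
    # Illustrative reconstruction: block index = inner (Shard) rank
    # major, outer (StridedShard) rank minor, per the GPU layout above.
    block_size, rem = divmod(dim_size, strided_degree * shard_degree)
    if rem != 0:
        raise ValueError("dim_size must divide evenly across all shards")
    start = block_size * (shard_rank * strided_degree + strided_rank)
    return start, start + block_size

# With degrees (3, 2) and dim_size 12, GPU(2, 1) owns the last block:
print(strided_shard_shard_indices(2, 3, 1, 2, 12))  # (10, 12)
```
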