Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 25 additions & 47 deletions torchrec/distributed/sharding/rw_sharding.py
Original file line number Diff line number Diff line change
Expand Up @@ -580,61 +580,44 @@ def create_output_dist(
def get_total_num_buckets_runtime_device(
total_num_buckets: Optional[List[int]],
runtime_device: torch.device,
tensor_cache: Dict[
str,
Tuple[torch.Tensor, List[torch.Tensor]],
],
dtype: torch.dtype = torch.int32,
) -> Optional[torch.Tensor]:
if total_num_buckets is None:
return None
cache_key: str = "__total_num_buckets"
if cache_key not in tensor_cache:
tensor_cache[cache_key] = (
torch.tensor(
total_num_buckets,
device=runtime_device,
dtype=dtype,
),
[],
)
return tensor_cache[cache_key][0]

return torch.tensor(
total_num_buckets,
device=runtime_device,
dtype=dtype,
)


@torch.fx.wrap
def get_block_sizes_runtime_device(
block_sizes: List[int],
runtime_device: torch.device,
tensor_cache: Dict[
str,
Tuple[torch.Tensor, List[torch.Tensor]],
],
embedding_shard_metadata: Optional[List[List[int]]] = None,
dtype: torch.dtype = torch.int32,
) -> Tuple[torch.Tensor, List[torch.Tensor]]:
cache_key: str = "__block_sizes"
if cache_key not in tensor_cache:
tensor_cache[cache_key] = (
torch.tensor(
block_sizes,
device=runtime_device,
dtype=dtype,
),
(
[]
if embedding_shard_metadata is None
else [
torch.tensor(
row_pos,
device=runtime_device,
dtype=dtype,
)
for row_pos in embedding_shard_metadata
]
),
)

return tensor_cache[cache_key]
return (
torch.tensor(
block_sizes,
device=runtime_device,
dtype=dtype,
),
(
[]
if embedding_shard_metadata is None
else [
torch.tensor(
row_pos,
device=runtime_device,
dtype=dtype,
)
for row_pos in embedding_shard_metadata
]
),
)


class InferRwSparseFeaturesDist(BaseSparseFeaturesDist[InputDistOutputs]):
Expand Down Expand Up @@ -671,9 +654,6 @@ def __init__(
self.feature_block_sizes.append(
(hash_size + block_divisor - 1) // block_divisor
)
self.tensor_cache: Dict[
str, Tuple[torch.Tensor, Optional[List[torch.Tensor]]]
] = {}

self._dist = KJTOneToAll(
splits=self._world_size * [self._num_features],
Expand All @@ -693,14 +673,12 @@ def forward(self, sparse_features: KeyedJaggedTensor) -> InputDistOutputs:
block_sizes, block_bucketize_row_pos = get_block_sizes_runtime_device(
self.feature_block_sizes,
sparse_features.device(),
self.tensor_cache,
self._embedding_shard_metadata,
sparse_features.values().dtype,
)
total_num_buckets = get_total_num_buckets_runtime_device(
self._feature_total_num_buckets,
sparse_features.device(),
self.tensor_cache,
sparse_features.values().dtype,
)

Expand Down
Loading