Skip to content

Commit

Permalink
Fixes vllm-project#1556 double free (vllm-project#3347)
Browse files Browse the repository at this point in the history
  • Loading branch information
br3no authored and starmpcc committed Mar 14, 2024
1 parent 93d7317 commit 9d61785
Show file tree
Hide file tree
Showing 2 changed files with 102 additions and 2 deletions.
87 changes: 87 additions & 0 deletions tests/core/test_block_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,3 +274,90 @@ def test_reset():
# Resetting block manager frees all allocated blocks.
block_manager.reset()
assert block_manager.get_num_free_gpu_blocks() == original_blocks


def test_sliding_window_multi_seq():
"""
Tests that memory allocation and deallocation is handled
correctly with multiple sequences that exceed the sliding
window's capacity.
"""
block_size = 1
num_cpu_blocks = 8
num_gpu_blocks = 8
sliding_window = 2
block_manager = BlockSpaceManager(block_size,
num_cpu_blocks,
num_gpu_blocks,
sliding_window=sliding_window,
watermark=0)

assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks

parent = Sequence(1, "one two three", [0, 1, 2], block_size)
seq_group = SequenceGroup("1", [parent], SamplingParams(), time.time(),
None)
block_manager.allocate(seq_group)

# assert the number of blocks allocated is correct
# the parent seq has len 3, but since sliding_window is 2,
# we will use at most 2 blocks
assert block_manager.get_num_free_gpu_blocks(
) == num_gpu_blocks - sliding_window

# Fork prompt and copy block tables.
child = parent.fork(2)
block_manager.fork(parent, child)

# assert the number of blocks allocated is correct
# forking does not increase memory consumption
assert block_manager.get_num_free_gpu_blocks(
) == num_gpu_blocks - sliding_window

# assert both parent and child share all blocks
assert block_manager.get_block_table(
parent) == block_manager.get_block_table(child)

token_id = 4
# Append token to child. Block is shared so copy on write occurs.
child.append_token_id(token_id, {token_id: Logprob(0.0)})
block_manager.append_slot(child)

# assert the number of blocks allocated is correct
# we will use now one block more. Each seq will use 2 blocks,
# but only one can be shared
assert block_manager.get_num_free_gpu_blocks(
) == num_gpu_blocks - sliding_window - 1

token_id = 5
parent.append_token_id(token_id, {token_id: Logprob(0.0)})
block_manager.append_slot(parent)

# assert the number of blocks allocated is correct
# no change, because both sequences are still just sharing one block
assert block_manager.get_num_free_gpu_blocks(
) == num_gpu_blocks - sliding_window - 1

block_table_parent = block_manager.get_block_table(parent)
block_table_child = block_manager.get_block_table(child)

assert block_table_parent != block_table_child

# assert both blocks are sharing the second-last block
assert block_table_parent[-2] == block_table_child[-2]

# now let's clean up...
block_manager.free(parent)

# assert the number of blocks allocated is correct
# We have freed one seq, reducing the ref count of two blocks by one.
# One of the two was only used by the parent seq, so this is now free.
# The child seq still consumes sliding_window blocks
assert block_manager.get_num_free_gpu_blocks(
) == num_gpu_blocks - sliding_window

# free all blocks
block_manager.free(child)

# assert all blocks are free now
assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks
17 changes: 15 additions & 2 deletions vllm/core/block_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,12 @@ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
# Thus, it is always safe from OOM.
src_block_table = self.block_tables[parent_seq.seq_id]
self.block_tables[child_seq.seq_id] = src_block_table.copy()
for block in src_block_table:
# When using a sliding window, blocks will be eventually reused.
# In this case the block tables will contain repeated blocks.
# When forking, we must make sure that each block's `ref_count`
# is only incremented by one, so we deduplicate them by wrapping
# them in a set.
for block in set(src_block_table):
block.ref_count += 1

def _get_physical_blocks(
Expand Down Expand Up @@ -393,7 +398,15 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]:
return block_number_mapping

def _free_block_table(self, block_table: BlockTable) -> None:
for block in set(block_table):
# when using a sliding window, each seq will only use up
# to `self.block_sliding_window` blocks. When freeing
# the block table, we must make sure to not free blocks more
# than once. If no sliding window is used, there is no block
# reuse in the block table, so we must free all blocks.
blocks_to_free = (block_table[-self.block_sliding_window:]
if self.block_sliding_window is not None else
block_table)
for block in set(blocks_to_free):
if block.device == Device.GPU:
self.gpu_allocator.free(block)
else:
Expand Down

0 comments on commit 9d61785

Please sign in to comment.