This repository was archived by the owner on Sep 10, 2025. It is now read-only.

Commit b4630f2

[Distributed] Separate prefill and decode
1 parent 3b896db commit b4630f2

File tree: 1 file changed (+96 -44 lines)

dist_run.py

Lines changed: 96 additions & 44 deletions
@@ -187,8 +187,8 @@ def _create_padded_prompts(
 
 def _batch_decode_next_tokens(
     output: torch.Tensor,
-    prompt_lengths: List[int],
     tokenizer,
+    prompt_lengths: Optional[List[int]] = None,
 ) -> List[Tuple[int, str]]:
     """
     Decode the next token for each prompt in the batch.
@@ -201,7 +201,8 @@ def _batch_decode_next_tokens(
     results = []
 
     for i in range(batch_size):
-        next_token_logits = output[i, prompt_lengths[i] - 1, :]
+        pos = prompt_lengths[i] - 1 if prompt_lengths is not None else 0
+        next_token_logits = output[i, pos, :]
 
         # Argmax (deterministic) TODO: add temperature
         next_token = torch.argmax(next_token_logits, dim=-1)
@@ -293,7 +294,7 @@ def main(args):
     logger.info(f"Model: {model}")
 
     mbs = 1  # number of micro-batches
-    mb_size = 5  # micro-batch size
+    mb_size = 4  # micro-batch size
     batch_size = mbs * mb_size  # total batch size
 
     seqlen = 4096  # sequence length
@@ -309,13 +310,14 @@ def main(args):
     activation = torch.rand(
         mb_size, seqlen, dim, device=device, dtype=model_dtype
     )
-    example_args = mb_ids if pp_rank == 0 else activation
+    example_inputs = (mb_ids if pp_rank == 0 else activation,)
+    example_outputs = (activation,)
 
     # Load weights
     logger.info(f"Loading weights for {pp_rank=} on {device=}")
-
     with CUDATrackTime() as timer:
         _load_model_weights(model, distribution, device=device, model_config=config)
+    model.to(device)
 
     logger.info(
         f"{color.green}Total weight loading time: {timer.get_time()} {timer.unit} for stage {rank}{color.reset}"
@@ -334,22 +336,24 @@ def main(args):
     model.setup_input_pos(input_pos)
     model.eval()
 
-    logger.info(f"Creating pipeline stage {pp_rank=}, {pp_degree=}")
-    stage = PipelineStage(
+    logger.info(f"Creating pipeline stage for prefill {pp_rank=}, {pp_degree=}")
+    prefill_stage = PipelineStage(
         model,
         pp_rank,
         pp_degree,
         device,
-        input_args=(example_args,),
+        input_args=example_inputs,
+        output_args=example_outputs,
         group=pp_group,
     )
+    # create schedule
+    prefill_schedule = ScheduleGPipe(prefill_stage, mbs)
 
     prompt = [
-        "What is snow?",
-        "Where does Santa Claus live?",
-        "What is PyTorch?",
-        "Write a poem about the beauty of the night sky.",
-        "What is the capital of France, Germany and Switzerland?",
+        "What is a computer?",
+        "Where does Santa live?",
+        "Who is Abraham Lincoln?",
+        "How are models trained?",
     ]
 
     """
@@ -390,9 +394,10 @@ def main(args):
     padded_sequence, prompt_lengths = _create_padded_prompts(
         input_ids, tokenizer, seqlen, start_pos, device
     )
-
-    # create schedule
-    schedule = ScheduleGPipe(stage, mbs)
+    # TODO: figure out how to set input_pos for each prompt in the batch then we
+    # can remove this limitation.
+    s = set(prompt_lengths)
+    assert len(s) == 1, f"prompt_lengths should be the same, got {s}"
 
     # with CUDATrackTime() as timer:
     first_pp_rank = 0
@@ -408,25 +413,92 @@ def main(args):
     res = [[] for _ in range(total_prompts)]
     num_tokens = 40
 
+    # Prefill phase
+    # Run context input through pipeline, in 1 step
+    with torch.no_grad():
+        if pp_rank == first_pp_rank:
+            output = prefill_schedule.step(padded_sequence)
+        elif pp_rank == last_pp_rank:
+            output = prefill_schedule.step()
+        else:  # middle pp ranks
+            prefill_schedule.step()
+
+    # Decode the output -- first generated token
+    if pp_rank == last_pp_rank:
+        decode_results = _batch_decode_next_tokens(
+            output=output,
+            tokenizer=tokenizer,
+            prompt_lengths=prompt_lengths,
+        )
+        for i in range(len(decode_results)):
+            new_token[i, 0] = torch.tensor(
+                [decode_results[i][0]], device=device
+            )  # token_id in int form
+        if tp_rank == 0:
+            logger.info(
+                f"{color.green} {'* Prefill *'} "
+                f"responses ====>>>> {color.blue} {decode_results=}{color.reset}"
+            )
+
+    # seqlen = 1 now
+    seqlen_decode = 1
+    mb_ids = torch.randint(0, config.vocab_size, (mb_size, seqlen_decode), device=device)
+    activation = torch.rand(
+        mb_size, seqlen_decode, dim, device=device, dtype=model_dtype
+    )
+    example_inputs = (mb_ids if pp_rank == 0 else activation,)
+    example_outputs = (activation,)
+
+    input_pos = torch.tensor([prompt_lengths[0]], device=device)
+    model.setup_input_pos(input_pos)
+
+    logger.info(f"Creating pipeline stage for decode {pp_rank=}, {pp_degree=}")
+    decode_stage = PipelineStage(
+        model,
+        pp_rank,
+        pp_degree,
+        device,
+        input_args=example_inputs,
+        output_args=example_outputs,
+        group=pp_group,
+    )
+    # create schedule
+    decode_schedule = ScheduleGPipe(decode_stage, mbs)
+
     # Decoding
     with torch.no_grad():
-        for step in range(num_tokens):
+        for step in range(num_tokens - 1):
+            # sendrecv between last and first ranks, only if:
+            # first_pp_rank != last_pp_rank.
+            if pp_rank == last_pp_rank and pp_rank != first_pp_rank:
+                dist.send(
+                    new_token,
+                    dst=first_pp_rank_global_id,
+                    group=pp_group,
+                )
+            elif pp_rank == first_pp_rank and pp_rank != last_pp_rank:
+                dist.recv(
+                    new_token,
+                    src=last_pp_rank_global_id,
+                    group=pp_group,
+                )
+
             # Run data through pipeline
             if pp_rank == first_pp_rank:
-                output = schedule.step(padded_sequence)
+                output = decode_schedule.step(new_token)
             elif pp_rank == last_pp_rank:
-                output = schedule.step()
+                output = decode_schedule.step()
             else:  # middle pp ranks
-                schedule.step()
+                decode_schedule.step()
 
             # Decode the output
             if pp_rank == last_pp_rank:
                 decode_results = _batch_decode_next_tokens(
-                    output=output, prompt_lengths=prompt_lengths, tokenizer=tokenizer
+                    output=output, tokenizer=tokenizer
                 )
                 if tp_rank == 0:
                     logger.info(
-                        f"{color.green} {'Prefill' if step == 0 else '* Decode *'} "
+                        f"{color.green} {'* Decode *'} "
                        f"responses ====>>>> {color.blue} {decode_results=}{color.reset}"
                     )
                 # decode results returns both token_id (int) and token_str (readable), hence [0] and [1]
@@ -436,28 +508,8 @@ def main(args):
                         [decode_results[i][0]], device=device
                     )  # decode_results[i][0]
 
-            # sendrecv between last and first ranks, only if:
-            # first_pp_rank != last_pp_rank.
-            if pp_rank == last_pp_rank and pp_rank != first_pp_rank:
-                dist.send(
-                    new_token,
-                    dst=first_pp_rank_global_id,
-                    group=pp_group,
-                )
-            elif pp_rank == first_pp_rank and pp_rank != last_pp_rank:
-                dist.recv(
-                    new_token,
-                    src=last_pp_rank_global_id,
-                    group=pp_group,
-                )
-
-            # Update input sequence with new token
-            if pp_rank == first_pp_rank:
-                _update_padded_sequence(padded_sequence, new_token, prompt_lengths)
-
-            # increment prompt lengths for next token
-            for i in range(len(prompt_lengths)):
-                prompt_lengths[i] += 1
+            input_pos += 1
+            model.setup_input_pos(input_pos)
 
     # Display the decoding results
 
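For readers skimming the diff: the commit replaces the single pipeline schedule that previously served every step with two PipelineStage/ScheduleGPipe pairs, one traced at the full prompt length for prefill and a second traced at sequence length 1 for token-by-token decode, with input_pos advanced between decode steps. The sketch below is illustrative only, not part of the commit: the make_schedule helper and the placeholder variable names are hypothetical, and the import path plus the input_args/output_args keywords are assumed to match the torch.distributed.pipelining version this repo targets (they mirror the calls in dist_run.py above).

import torch
from torch.distributed.pipelining import PipelineStage, ScheduleGPipe


def make_schedule(model, pp_rank, pp_degree, device,
                  example_inputs, example_outputs, pp_group, mbs):
    # One stage + schedule per phase; only the example tensor shapes differ
    # between the prefill pair (mb_size x seqlen) and the decode pair (mb_size x 1).
    stage = PipelineStage(
        model,
        pp_rank,
        pp_degree,
        device,
        input_args=example_inputs,    # tuple of example input tensors used for tracing
        output_args=example_outputs,  # tuple of example output tensors
        group=pp_group,
    )
    return ScheduleGPipe(stage, mbs)


# Prefill: the padded prompts flow through the pipeline once.
#   prefill_schedule = make_schedule(..., example_inputs=(padded_ids_or_activation,), ...)
#   output = prefill_schedule.step(padded_sequence)   # on the first pp rank
#
# Decode: a separate stage is traced with length-1 examples and stepped once per
# generated token, advancing the KV-cache position (input_pos) between steps.
#   decode_schedule = make_schedule(..., example_inputs=(new_token_or_activation,), ...)
#   output = decode_schedule.step(new_token)          # on the first pp rank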