diff --git a/.ci/scripts/validate.sh b/.ci/scripts/validate.sh
index bc8146d03..12107f8d3 100644
--- a/.ci/scripts/validate.sh
+++ b/.ci/scripts/validate.sh
@@ -284,7 +284,7 @@ function eval_model_sanity_check() {
         echo "*************************************************"
         if [ "$DTYPE" != "float16" ]; then
             python3 -W ignore export.py --dtype ${DTYPE} --quant "$QUANT_OPTIONS" --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1
-            python3 -W ignore eval.py --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" --limit 5 > "$MODEL_DIR/output_eval_aoti" || exit 1
+            python3 -W ignore eval.py --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" --limit 5 > "$MODEL_DIR/output_eval_aoti" || exit 1
             cat "$MODEL_DIR/output_eval_aoti"
         fi;

diff --git a/cli.py b/cli.py
index a06ff5de1..f36d93346 100644
--- a/cli.py
+++ b/cli.py
@@ -61,27 +61,14 @@ def add_arguments_for_verb(parser, verb: str) -> None:
         help="Model name for well-known models",
     )

-    parser.add_argument(
-        "--chat",
-        action="store_true",
-        help="Whether to start an interactive chat session",
-    )
+    if verb in ["browser", "chat", "generate"]:
+        _add_generation_args(parser)
+
     parser.add_argument(
         "--distributed",
         action="store_true",
         help="Whether to enable distributed inference",
     )
-    parser.add_argument(
-        "--gui",
-        action="store_true",
-        help="Whether to use a web UI for an interactive chat session",
-    )
-    parser.add_argument(
-        "--prompt",
-        type=str,
-        default="Hello, my name is",
-        help="Input prompt",
-    )
     parser.add_argument(
         "--is-chat-model",
         action="store_true",
@@ -93,54 +80,17 @@ def add_arguments_for_verb(parser, verb: str) -> None:
         default=None,
         help="Initialize torch seed",
     )
-    parser.add_argument(
-        "--num-samples",
-        type=int,
-        default=1,
-        help="Number of samples",
-    )
-    parser.add_argument(
-        "--max-new-tokens",
-        type=int,
-        default=200,
-        help="Maximum number of new tokens",
-    )
-    parser.add_argument(
-        "--top-k",
-        type=int,
-        default=200,
-        help="Top-k for sampling",
-    )
-    parser.add_argument(
-        "--temperature", type=float, default=0.8, help="Temperature for sampling"
-    )
     parser.add_argument(
         "--compile",
         action="store_true",
         help="Whether to compile the model with torch.compile",
     )
-    parser.add_argument(
-        "--compile-prefill",
-        action="store_true",
-        help="Whether to compile the prefill. Improves prefill perf, but has higher compile times.",
-    )
-    parser.add_argument(
-        "--sequential-prefill",
-        action="store_true",
-        help="Whether to perform prefill sequentially. Only used for model debug.",
-    )
     parser.add_argument(
         "--profile",
         type=Path,
         default=None,
         help="Profile path.",
     )
-    parser.add_argument(
-        "--speculate-k",
-        type=int,
-        default=5,
-        help="Speculative execution depth",
-    )
     parser.add_argument(
         "--draft-checkpoint-path",
         type=Path,
@@ -171,30 +121,10 @@ def add_arguments_for_verb(parser, verb: str) -> None:
         default=None,
         help="Use the specified model tokenizer file",
     )
-    parser.add_argument(
-        "--output-pte-path",
-        type=str,
-        default=None,
-        help="Output to the specified ExecuTorch .pte model file",
-    )
-    parser.add_argument(
-        "--output-dso-path",
-        type=str,
-        default=None,
-        help="Output to the specified AOT Inductor .dso model file",
-    )
-    parser.add_argument(
-        "--dso-path",
-        type=Path,
-        default=None,
-        help="Use the specified AOT Inductor .dso model file",
-    )
-    parser.add_argument(
-        "--pte-path",
-        type=Path,
-        default=None,
-        help="Use the specified ExecuTorch .pte model file",
-    )
+
+    _add_exported_model_input_args(parser)
+    _add_export_output_path_args(parser)
+
     parser.add_argument(
         "--dtype",
         default="fast",
@@ -259,6 +189,40 @@ def add_arguments_for_verb(parser, verb: str) -> None:
     _add_cli_metadata_args(parser)


+# Add CLI Args representing user provided exported model files
+def _add_export_output_path_args(parser) -> None:
+    output_path_parser = parser.add_argument_group("Export Output Path Args", "Specify the output path for the exported model files")
+    output_path_parser.add_argument(
+        "--output-pte-path",
+        type=str,
+        default=None,
+        help="Output to the specified ExecuTorch .pte model file",
+    )
+    output_path_parser.add_argument(
+        "--output-dso-path",
+        type=str,
+        default=None,
+        help="Output to the specified AOT Inductor .dso model file",
+    )
+
+
+# Add CLI Args representing user provided exported model files
+def _add_exported_model_input_args(parser) -> None:
+    exported_model_path_parser = parser.add_argument_group("Exported Model Path Args", "Specify the path of the exported model files to ingest")
+    exported_model_path_parser.add_argument(
+        "--dso-path",
+        type=Path,
+        default=None,
+        help="Use the specified AOT Inductor .dso model file",
+    )
+    exported_model_path_parser.add_argument(
+        "--pte-path",
+        type=Path,
+        default=None,
+        help="Use the specified ExecuTorch .pte model file",
+    )
+
+
 # Add CLI Args that are relevant to any subcommand execution
 def _add_cli_metadata_args(parser) -> None:
     parser.add_argument(
@@ -297,22 +261,81 @@ def _configure_artifact_inventory_args(parser, verb: str) -> None:
     )


+# Add CLI Args specific to user prompted generation
+def _add_generation_args(parser) -> None:
+    generator_parser = parser.add_argument_group("Generation Args", "Configs for generating output based on provided prompt")
+    generator_parser.add_argument(
+        "--prompt",
+        type=str,
+        default="Hello, my name is",
+        help="Input prompt for manual output generation",
+    )
+    generator_parser.add_argument(
+        "--chat",
+        action="store_true",
+        help="Whether to start an interactive chat session",
+    )
+    generator_parser.add_argument(
+        "--gui",
+        action="store_true",
+        help="Whether to use a web UI for an interactive chat session",
+    )
+    generator_parser.add_argument(
+        "--num-samples",
+        type=int,
+        default=1,
+        help="Number of samples",
+    )
+    generator_parser.add_argument(
+        "--max-new-tokens",
+        type=int,
+        default=200,
+        help="Maximum number of new tokens",
+    )
+    generator_parser.add_argument(
+        "--top-k",
+        type=int,
+        default=200,
+        help="Top-k for sampling",
+    )
+    generator_parser.add_argument(
+        "--temperature", type=float, default=0.8, help="Temperature for sampling"
+    )
+    generator_parser.add_argument(
+        "--compile-prefill",
+        action="store_true",
+        help="Whether to compile the prefill. Improves prefill perf, but has higher compile times.",
+    )
+    generator_parser.add_argument(
+        "--sequential-prefill",
+        action="store_true",
+        help="Whether to perform prefill sequentially. Only used for model debug.",
+    )
+    generator_parser.add_argument(
+        "--speculate-k",
+        type=int,
+        default=5,
+        help="Speculative execution depth",
+    )
+
+
 # Add CLI Args specific to Model Evaluation
 def _add_evaluation_args(parser) -> None:
-    parser.add_argument(
+    eval_parser = parser.add_argument_group("Evaluation Args", "Configs for evaluating model performance")
+    eval_parser.add_argument(
         "--tasks",
         nargs="+",
         type=str,
         default=["wikitext"],
         help="List of lm-eluther tasks to evaluate. Usage: --tasks task1 task2",
     )
-    parser.add_argument(
+    eval_parser.add_argument(
         "--limit",
         type=int,
         default=None,
         help="Number of samples to evaluate",
     )
-    parser.add_argument(
+    eval_parser.add_argument(
         "--max-seq-length",
         type=int,
         default=None,