You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
在调用
sh run_sft.sh
时报错。但是使用python supervised_finetuning.py 加参数时可以运行。
WARNING:torch.distributed.run:
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
usage: supervised_finetuning.py [-h] [--model_type MODEL_TYPE] [--model_name_or_path MODEL_NAME_OR_PATH] [--tokenizer_name_or_path TOKENIZER_NAME_OR_PATH] [--load_in_8bit [LOAD_IN_8BIT]]
[--cache_dir CACHE_DIR] [--use_fast_tokenizer [USE_FAST_TOKENIZER]] [--torch_dtype {auto,bfloat16,float16,float32}] [--device_map DEVICE_MAP]
[--trust_remote_code [TRUST_REMOTE_CODE]] [--no_trust_remote_code] [--dataset_name DATASET_NAME] [--dataset_config_name DATASET_CONFIG_NAME]
[--train_file_dir TRAIN_FILE_DIR] [--validation_file_dir VALIDATION_FILE_DIR] [--max_source_length MAX_SOURCE_LENGTH] [--max_target_length MAX_TARGET_LENGTH]
[--max_train_samples MAX_TRAIN_SAMPLES] [--max_eval_samples MAX_EVAL_SAMPLES] [--overwrite_cache [OVERWRITE_CACHE]]
[--validation_split_percentage VALIDATION_SPLIT_PERCENTAGE] [--preprocessing_num_workers PREPROCESSING_NUM_WORKERS] --output_dir OUTPUT_DIR
[--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]] [--do_train [DO_TRAIN]] [--do_eval [DO_EVAL]] [--do_predict [DO_PREDICT]] [--evaluation_strategy {no,steps,epoch}]
[--prediction_loss_only [PREDICTION_LOSS_ONLY]] [--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE] [--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE]
[--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE] [--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE] [--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS]
[--eval_accumulation_steps EVAL_ACCUMULATION_STEPS] [--eval_delay EVAL_DELAY] [--learning_rate LEARNING_RATE] [--weight_decay WEIGHT_DECAY] [--adam_beta1 ADAM_BETA1]
[--adam_beta2 ADAM_BETA2] [--adam_epsilon ADAM_EPSILON] [--max_grad_norm MAX_GRAD_NORM] [--num_train_epochs NUM_TRAIN_EPOCHS] [--max_steps MAX_STEPS]
[--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau}] [--warmup_ratio WARMUP_RATIO]
[--warmup_steps WARMUP_STEPS] [--log_level {debug,info,warning,error,critical,passive}] [--log_level_replica {debug,info,warning,error,critical,passive}]
[--log_on_each_node [LOG_ON_EACH_NODE]] [--no_log_on_each_node] [--logging_dir LOGGING_DIR] [--logging_strategy {no,steps,epoch}]
[--logging_first_step [LOGGING_FIRST_STEP]] [--logging_steps LOGGING_STEPS] [--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]] [--no_logging_nan_inf_filter]
[--save_strategy {no,steps,epoch}] [--save_steps SAVE_STEPS] [--save_total_limit SAVE_TOTAL_LIMIT] [--save_safetensors [SAVE_SAFETENSORS]]
[--save_on_each_node [SAVE_ON_EACH_NODE]] [--no_cuda [NO_CUDA]] [--use_mps_device [USE_MPS_DEVICE]] [--seed SEED] [--data_seed DATA_SEED] [--jit_mode_eval [JIT_MODE_EVAL]]
[--use_ipex [USE_IPEX]] [--bf16 [BF16]] [--fp16 [FP16]] [--fp16_opt_level FP16_OPT_LEVEL] [--half_precision_backend {auto,cuda_amp,apex,cpu_amp}]
[--bf16_full_eval [BF16_FULL_EVAL]] [--fp16_full_eval [FP16_FULL_EVAL]] [--tf32 TF32] [--local_rank LOCAL_RANK] [--ddp_backend {nccl,gloo,mpi,ccl}]
[--tpu_num_cores TPU_NUM_CORES] [--tpu_metrics_debug [TPU_METRICS_DEBUG]] [--debug DEBUG] [--dataloader_drop_last [DATALOADER_DROP_LAST]] [--eval_steps EVAL_STEPS]
[--dataloader_num_workers DATALOADER_NUM_WORKERS] [--past_index PAST_INDEX] [--run_name RUN_NAME] [--disable_tqdm DISABLE_TQDM]
[--remove_unused_columns [REMOVE_UNUSED_COLUMNS]] [--no_remove_unused_columns] [--label_names LABEL_NAMES [LABEL_NAMES ...]]
[--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]] [--metric_for_best_model METRIC_FOR_BEST_MODEL] [--greater_is_better GREATER_IS_BETTER]
[--ignore_data_skip [IGNORE_DATA_SKIP]] [--sharded_ddp SHARDED_DDP] [--fsdp FSDP] [--fsdp_min_num_params FSDP_MIN_NUM_PARAMS] [--fsdp_config FSDP_CONFIG]
[--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP] [--deepspeed DEEPSPEED] [--label_smoothing_factor LABEL_SMOOTHING_FACTOR]
[--optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit}]
[--optim_args OPTIM_ARGS] [--adafactor [ADAFACTOR]] [--group_by_length [GROUP_BY_LENGTH]] [--length_column_name LENGTH_COLUMN_NAME] [--report_to REPORT_TO [REPORT_TO ...]]
[--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS] [--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB] [--dataloader_pin_memory [DATALOADER_PIN_MEMORY]]
[--no_dataloader_pin_memory] [--skip_memory_metrics [SKIP_MEMORY_METRICS]] [--no_skip_memory_metrics] [--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]]
[--push_to_hub [PUSH_TO_HUB]] [--resume_from_checkpoint RESUME_FROM_CHECKPOINT] [--hub_model_id HUB_MODEL_ID] [--hub_strategy {end,every_save,checkpoint,all_checkpoints}]
[--hub_token HUB_TOKEN] [--hub_private_repo [HUB_PRIVATE_REPO]] [--gradient_checkpointing [GRADIENT_CHECKPOINTING]]
[--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]] [--fp16_backend {auto,cuda_amp,apex,cpu_amp}] [--push_to_hub_model_id PUSH_TO_HUB_MODEL_ID]
[--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION] [--push_to_hub_token PUSH_TO_HUB_TOKEN] [--mp_parameters MP_PARAMETERS]
[--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]] [--full_determinism [FULL_DETERMINISM]] [--torchdynamo TORCHDYNAMO] [--ray_scope RAY_SCOPE] [--ddp_timeout DDP_TIMEOUT]
[--torch_compile [TORCH_COMPILE]] [--torch_compile_backend TORCH_COMPILE_BACKEND] [--torch_compile_mode TORCH_COMPILE_MODE] [--xpu_backend {mpi,ccl,gloo}]
[--use_peft [USE_PEFT]] [--no_use_peft] [--target_modules TARGET_MODULES] [--lora_rank LORA_RANK] [--lora_dropout LORA_DROPOUT] [--lora_alpha LORA_ALPHA]
[--modules_to_save MODULES_TO_SAVE] [--peft_path PEFT_PATH]
supervised_finetuning.py: error: the following arguments are required: --output_dir
usage: supervised_finetuning.py [-h] [--model_type MODEL_TYPE] [--model_name_or_path MODEL_NAME_OR_PATH] [--tokenizer_name_or_path TOKENIZER_NAME_OR_PATH] [--load_in_8bit [LOAD_IN_8BIT]]
[--cache_dir CACHE_DIR] [--use_fast_tokenizer [USE_FAST_TOKENIZER]] [--torch_dtype {auto,bfloat16,float16,float32}] [--device_map DEVICE_MAP]
[--trust_remote_code [TRUST_REMOTE_CODE]] [--no_trust_remote_code] [--dataset_name DATASET_NAME] [--dataset_config_name DATASET_CONFIG_NAME]
[--train_file_dir TRAIN_FILE_DIR] [--validation_file_dir VALIDATION_FILE_DIR] [--max_source_length MAX_SOURCE_LENGTH] [--max_target_length MAX_TARGET_LENGTH]
[--max_train_samples MAX_TRAIN_SAMPLES] [--max_eval_samples MAX_EVAL_SAMPLES] [--overwrite_cache [OVERWRITE_CACHE]]
[--validation_split_percentage VALIDATION_SPLIT_PERCENTAGE] [--preprocessing_num_workers PREPROCESSING_NUM_WORKERS] --output_dir OUTPUT_DIR
[--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]] [--do_train [DO_TRAIN]] [--do_eval [DO_EVAL]] [--do_predict [DO_PREDICT]] [--evaluation_strategy {no,steps,epoch}]
[--prediction_loss_only [PREDICTION_LOSS_ONLY]] [--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE] [--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE]
[--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE] [--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE] [--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS]
[--eval_accumulation_steps EVAL_ACCUMULATION_STEPS] [--eval_delay EVAL_DELAY] [--learning_rate LEARNING_RATE] [--weight_decay WEIGHT_DECAY] [--adam_beta1 ADAM_BETA1]
[--adam_beta2 ADAM_BETA2] [--adam_epsilon ADAM_EPSILON] [--max_grad_norm MAX_GRAD_NORM] [--num_train_epochs NUM_TRAIN_EPOCHS] [--max_steps MAX_STEPS]
[--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau}] [--warmup_ratio WARMUP_RATIO]
[--warmup_steps WARMUP_STEPS] [--log_level {debug,info,warning,error,critical,passive}] [--log_level_replica {debug,info,warning,error,critical,passive}]
[--log_on_each_node [LOG_ON_EACH_NODE]] [--no_log_on_each_node] [--logging_dir LOGGING_DIR] [--logging_strategy {no,steps,epoch}]
[--logging_first_step [LOGGING_FIRST_STEP]] [--logging_steps LOGGING_STEPS] [--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]] [--no_logging_nan_inf_filter]
[--save_strategy {no,steps,epoch}] [--save_steps SAVE_STEPS] [--save_total_limit SAVE_TOTAL_LIMIT] [--save_safetensors [SAVE_SAFETENSORS]]
[--save_on_each_node [SAVE_ON_EACH_NODE]] [--no_cuda [NO_CUDA]] [--use_mps_device [USE_MPS_DEVICE]] [--seed SEED] [--data_seed DATA_SEED] [--jit_mode_eval [JIT_MODE_EVAL]]
[--use_ipex [USE_IPEX]] [--bf16 [BF16]] [--fp16 [FP16]] [--fp16_opt_level FP16_OPT_LEVEL] [--half_precision_backend {auto,cuda_amp,apex,cpu_amp}]
[--bf16_full_eval [BF16_FULL_EVAL]] [--fp16_full_eval [FP16_FULL_EVAL]] [--tf32 TF32] [--local_rank LOCAL_RANK] [--ddp_backend {nccl,gloo,mpi,ccl}]
[--tpu_num_cores TPU_NUM_CORES] [--tpu_metrics_debug [TPU_METRICS_DEBUG]] [--debug DEBUG] [--dataloader_drop_last [DATALOADER_DROP_LAST]] [--eval_steps EVAL_STEPS]
[--dataloader_num_workers DATALOADER_NUM_WORKERS] [--past_index PAST_INDEX] [--run_name RUN_NAME] [--disable_tqdm DISABLE_TQDM]
[--remove_unused_columns [REMOVE_UNUSED_COLUMNS]] [--no_remove_unused_columns] [--label_names LABEL_NAMES [LABEL_NAMES ...]]
[--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]] [--metric_for_best_model METRIC_FOR_BEST_MODEL] [--greater_is_better GREATER_IS_BETTER]
[--ignore_data_skip [IGNORE_DATA_SKIP]] [--sharded_ddp SHARDED_DDP] [--fsdp FSDP] [--fsdp_min_num_params FSDP_MIN_NUM_PARAMS] [--fsdp_config FSDP_CONFIG]
[--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP] [--deepspeed DEEPSPEED] [--label_smoothing_factor LABEL_SMOOTHING_FACTOR]
[--optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit}]
[--optim_args OPTIM_ARGS] [--adafactor [ADAFACTOR]] [--group_by_length [GROUP_BY_LENGTH]] [--length_column_name LENGTH_COLUMN_NAME] [--report_to REPORT_TO [REPORT_TO ...]]
[--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS] [--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB] [--dataloader_pin_memory [DATALOADER_PIN_MEMORY]]
[--no_dataloader_pin_memory] [--skip_memory_metrics [SKIP_MEMORY_METRICS]] [--no_skip_memory_metrics] [--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]]
[--push_to_hub [PUSH_TO_HUB]] [--resume_from_checkpoint RESUME_FROM_CHECKPOINT] [--hub_model_id HUB_MODEL_ID] [--hub_strategy {end,every_save,checkpoint,all_checkpoints}]
[--hub_token HUB_TOKEN] [--hub_private_repo [HUB_PRIVATE_REPO]] [--gradient_checkpointing [GRADIENT_CHECKPOINTING]]
[--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]] [--fp16_backend {auto,cuda_amp,apex,cpu_amp}] [--push_to_hub_model_id PUSH_TO_HUB_MODEL_ID]
[--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION] [--push_to_hub_token PUSH_TO_HUB_TOKEN] [--mp_parameters MP_PARAMETERS]
[--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]] [--full_determinism [FULL_DETERMINISM]] [--torchdynamo TORCHDYNAMO] [--ray_scope RAY_SCOPE] [--ddp_timeout DDP_TIMEOUT]
[--torch_compile [TORCH_COMPILE]] [--torch_compile_backend TORCH_COMPILE_BACKEND] [--torch_compile_mode TORCH_COMPILE_MODE] [--xpu_backend {mpi,ccl,gloo}]
[--use_peft [USE_PEFT]] [--no_use_peft] [--target_modules TARGET_MODULES] [--lora_rank LORA_RANK] [--lora_dropout LORA_DROPOUT] [--lora_alpha LORA_ALPHA]
[--modules_to_save MODULES_TO_SAVE] [--peft_path PEFT_PATH]
supervised_finetuning.py: error: the following arguments are required: --output_dir
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 2) local_rank: 0 (pid: 1740890) of binary: /usr/local/anaconda3/envs/hj-glm6b2/bin/python
Traceback (most recent call last):
File "/usr/local/anaconda3/envs/hj-glm6b2/bin/torchrun", line 8, in
sys.exit(main())
File "/usr/local/anaconda3/envs/hj-glm6b2/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/init.py", line 346, in wrapper
return f(*args, **kwargs)
File "/usr/local/anaconda3/envs/hj-glm6b2/lib/python3.10/site-packages/torch/distributed/run.py", line 794, in main
run(args)
File "/usr/local/anaconda3/envs/hj-glm6b2/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run
elastic_launch(
File "/usr/local/anaconda3/envs/hj-glm6b2/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/usr/local/anaconda3/envs/hj-glm6b2/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
supervised_finetuning.py FAILED
Failures:
[1]:
time : 2023-06-29_18:21:05
host : JoinShareAIPC
rank : 1 (local_rank: 1)
exitcode : 2 (pid: 1740891)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
Root Cause (first observed failure):
[0]:
time : 2023-06-29_18:21:05
host : JoinShareAIPC
rank : 0 (local_rank: 0)
exitcode : 2 (pid: 1740890)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
run_sft.sh: 2: --model_type: not found
run_sft.sh: 3: --model_name_or_path: not found
run_sft.sh: 4: --train_file_dir: not found
run_sft.sh: 5: --validation_file_dir: not found
run_sft.sh: 6: --per_device_train_batch_size: not found
run_sft.sh: 7: --per_device_eval_batch_size: not found
run_sft.sh: 8: --do_train: not found
run_sft.sh: 9: --do_eval: not found
run_sft.sh: 10: --use_peft: not found
run_sft.sh: 11: --fp16: not found
run_sft.sh: 12: --max_train_samples: not found
run_sft.sh: 13: --max_eval_samples: not found
run_sft.sh: 14: --num_train_epochs: not found
run_sft.sh: 15: --learning_rate: not found
run_sft.sh: 16: --warmup_ratio: not found
run_sft.sh: 17: --weight_decay: not found
run_sft.sh: 18: --logging_strategy: not found
run_sft.sh: 19: --logging_steps: not found
run_sft.sh: 20: --eval_steps: not found
run_sft.sh: 21: --evaluation_strategy: not found
run_sft.sh: 22: --save_steps: not found
run_sft.sh: 23: --save_strategy: not found
run_sft.sh: 24: --save_total_limit: not found
run_sft.sh: 25: --gradient_accumulation_steps: not found
run_sft.sh: 26: --preprocessing_num_workers: not found
run_sft.sh: 27: --max_source_length: not found
run_sft.sh: 28: --max_target_length: not found
run_sft.sh: 29: --output_dir: not found
run_sft.sh: 30: --overwrite_output_dir: not found
run_sft.sh: 31: --ddp_timeout: not found
run_sft.sh: 32: --logging_first_step: not found
run_sft.sh: 33: --target_modules: not found
run_sft.sh: 34: --lora_rank: not found
run_sft.sh: 35: --lora_alpha: not found
run_sft.sh: 36: --lora_dropout: not found
run_sft.sh: 37: --torch_dtype: not found
run_sft.sh: 38: --device_map: not found
run_sft.sh: 39: --report_to: not found
run_sft.sh: 40: --ddp_find_unused_parameters: not found
run_sft.sh: 41: --gradient_checkpointing: not found
The text was updated successfully, but these errors were encountered:
在调用
sh run_sft.sh
时报错。但是使用python supervised_finetuning.py 加参数时可以运行。
WARNING:torch.distributed.run:
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
usage: supervised_finetuning.py [-h] [--model_type MODEL_TYPE] [--model_name_or_path MODEL_NAME_OR_PATH] [--tokenizer_name_or_path TOKENIZER_NAME_OR_PATH] [--load_in_8bit [LOAD_IN_8BIT]]
[--cache_dir CACHE_DIR] [--use_fast_tokenizer [USE_FAST_TOKENIZER]] [--torch_dtype {auto,bfloat16,float16,float32}] [--device_map DEVICE_MAP]
[--trust_remote_code [TRUST_REMOTE_CODE]] [--no_trust_remote_code] [--dataset_name DATASET_NAME] [--dataset_config_name DATASET_CONFIG_NAME]
[--train_file_dir TRAIN_FILE_DIR] [--validation_file_dir VALIDATION_FILE_DIR] [--max_source_length MAX_SOURCE_LENGTH] [--max_target_length MAX_TARGET_LENGTH]
[--max_train_samples MAX_TRAIN_SAMPLES] [--max_eval_samples MAX_EVAL_SAMPLES] [--overwrite_cache [OVERWRITE_CACHE]]
[--validation_split_percentage VALIDATION_SPLIT_PERCENTAGE] [--preprocessing_num_workers PREPROCESSING_NUM_WORKERS] --output_dir OUTPUT_DIR
[--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]] [--do_train [DO_TRAIN]] [--do_eval [DO_EVAL]] [--do_predict [DO_PREDICT]] [--evaluation_strategy {no,steps,epoch}]
[--prediction_loss_only [PREDICTION_LOSS_ONLY]] [--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE] [--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE]
[--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE] [--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE] [--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS]
[--eval_accumulation_steps EVAL_ACCUMULATION_STEPS] [--eval_delay EVAL_DELAY] [--learning_rate LEARNING_RATE] [--weight_decay WEIGHT_DECAY] [--adam_beta1 ADAM_BETA1]
[--adam_beta2 ADAM_BETA2] [--adam_epsilon ADAM_EPSILON] [--max_grad_norm MAX_GRAD_NORM] [--num_train_epochs NUM_TRAIN_EPOCHS] [--max_steps MAX_STEPS]
[--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau}] [--warmup_ratio WARMUP_RATIO]
[--warmup_steps WARMUP_STEPS] [--log_level {debug,info,warning,error,critical,passive}] [--log_level_replica {debug,info,warning,error,critical,passive}]
[--log_on_each_node [LOG_ON_EACH_NODE]] [--no_log_on_each_node] [--logging_dir LOGGING_DIR] [--logging_strategy {no,steps,epoch}]
[--logging_first_step [LOGGING_FIRST_STEP]] [--logging_steps LOGGING_STEPS] [--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]] [--no_logging_nan_inf_filter]
[--save_strategy {no,steps,epoch}] [--save_steps SAVE_STEPS] [--save_total_limit SAVE_TOTAL_LIMIT] [--save_safetensors [SAVE_SAFETENSORS]]
[--save_on_each_node [SAVE_ON_EACH_NODE]] [--no_cuda [NO_CUDA]] [--use_mps_device [USE_MPS_DEVICE]] [--seed SEED] [--data_seed DATA_SEED] [--jit_mode_eval [JIT_MODE_EVAL]]
[--use_ipex [USE_IPEX]] [--bf16 [BF16]] [--fp16 [FP16]] [--fp16_opt_level FP16_OPT_LEVEL] [--half_precision_backend {auto,cuda_amp,apex,cpu_amp}]
[--bf16_full_eval [BF16_FULL_EVAL]] [--fp16_full_eval [FP16_FULL_EVAL]] [--tf32 TF32] [--local_rank LOCAL_RANK] [--ddp_backend {nccl,gloo,mpi,ccl}]
[--tpu_num_cores TPU_NUM_CORES] [--tpu_metrics_debug [TPU_METRICS_DEBUG]] [--debug DEBUG] [--dataloader_drop_last [DATALOADER_DROP_LAST]] [--eval_steps EVAL_STEPS]
[--dataloader_num_workers DATALOADER_NUM_WORKERS] [--past_index PAST_INDEX] [--run_name RUN_NAME] [--disable_tqdm DISABLE_TQDM]
[--remove_unused_columns [REMOVE_UNUSED_COLUMNS]] [--no_remove_unused_columns] [--label_names LABEL_NAMES [LABEL_NAMES ...]]
[--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]] [--metric_for_best_model METRIC_FOR_BEST_MODEL] [--greater_is_better GREATER_IS_BETTER]
[--ignore_data_skip [IGNORE_DATA_SKIP]] [--sharded_ddp SHARDED_DDP] [--fsdp FSDP] [--fsdp_min_num_params FSDP_MIN_NUM_PARAMS] [--fsdp_config FSDP_CONFIG]
[--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP] [--deepspeed DEEPSPEED] [--label_smoothing_factor LABEL_SMOOTHING_FACTOR]
[--optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit}]
[--optim_args OPTIM_ARGS] [--adafactor [ADAFACTOR]] [--group_by_length [GROUP_BY_LENGTH]] [--length_column_name LENGTH_COLUMN_NAME] [--report_to REPORT_TO [REPORT_TO ...]]
[--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS] [--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB] [--dataloader_pin_memory [DATALOADER_PIN_MEMORY]]
[--no_dataloader_pin_memory] [--skip_memory_metrics [SKIP_MEMORY_METRICS]] [--no_skip_memory_metrics] [--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]]
[--push_to_hub [PUSH_TO_HUB]] [--resume_from_checkpoint RESUME_FROM_CHECKPOINT] [--hub_model_id HUB_MODEL_ID] [--hub_strategy {end,every_save,checkpoint,all_checkpoints}]
[--hub_token HUB_TOKEN] [--hub_private_repo [HUB_PRIVATE_REPO]] [--gradient_checkpointing [GRADIENT_CHECKPOINTING]]
[--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]] [--fp16_backend {auto,cuda_amp,apex,cpu_amp}] [--push_to_hub_model_id PUSH_TO_HUB_MODEL_ID]
[--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION] [--push_to_hub_token PUSH_TO_HUB_TOKEN] [--mp_parameters MP_PARAMETERS]
[--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]] [--full_determinism [FULL_DETERMINISM]] [--torchdynamo TORCHDYNAMO] [--ray_scope RAY_SCOPE] [--ddp_timeout DDP_TIMEOUT]
[--torch_compile [TORCH_COMPILE]] [--torch_compile_backend TORCH_COMPILE_BACKEND] [--torch_compile_mode TORCH_COMPILE_MODE] [--xpu_backend {mpi,ccl,gloo}]
[--use_peft [USE_PEFT]] [--no_use_peft] [--target_modules TARGET_MODULES] [--lora_rank LORA_RANK] [--lora_dropout LORA_DROPOUT] [--lora_alpha LORA_ALPHA]
[--modules_to_save MODULES_TO_SAVE] [--peft_path PEFT_PATH]
supervised_finetuning.py: error: the following arguments are required: --output_dir
usage: supervised_finetuning.py [-h] [--model_type MODEL_TYPE] [--model_name_or_path MODEL_NAME_OR_PATH] [--tokenizer_name_or_path TOKENIZER_NAME_OR_PATH] [--load_in_8bit [LOAD_IN_8BIT]]
[--cache_dir CACHE_DIR] [--use_fast_tokenizer [USE_FAST_TOKENIZER]] [--torch_dtype {auto,bfloat16,float16,float32}] [--device_map DEVICE_MAP]
[--trust_remote_code [TRUST_REMOTE_CODE]] [--no_trust_remote_code] [--dataset_name DATASET_NAME] [--dataset_config_name DATASET_CONFIG_NAME]
[--train_file_dir TRAIN_FILE_DIR] [--validation_file_dir VALIDATION_FILE_DIR] [--max_source_length MAX_SOURCE_LENGTH] [--max_target_length MAX_TARGET_LENGTH]
[--max_train_samples MAX_TRAIN_SAMPLES] [--max_eval_samples MAX_EVAL_SAMPLES] [--overwrite_cache [OVERWRITE_CACHE]]
[--validation_split_percentage VALIDATION_SPLIT_PERCENTAGE] [--preprocessing_num_workers PREPROCESSING_NUM_WORKERS] --output_dir OUTPUT_DIR
[--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]] [--do_train [DO_TRAIN]] [--do_eval [DO_EVAL]] [--do_predict [DO_PREDICT]] [--evaluation_strategy {no,steps,epoch}]
[--prediction_loss_only [PREDICTION_LOSS_ONLY]] [--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE] [--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE]
[--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE] [--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE] [--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS]
[--eval_accumulation_steps EVAL_ACCUMULATION_STEPS] [--eval_delay EVAL_DELAY] [--learning_rate LEARNING_RATE] [--weight_decay WEIGHT_DECAY] [--adam_beta1 ADAM_BETA1]
[--adam_beta2 ADAM_BETA2] [--adam_epsilon ADAM_EPSILON] [--max_grad_norm MAX_GRAD_NORM] [--num_train_epochs NUM_TRAIN_EPOCHS] [--max_steps MAX_STEPS]
[--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau}] [--warmup_ratio WARMUP_RATIO]
[--warmup_steps WARMUP_STEPS] [--log_level {debug,info,warning,error,critical,passive}] [--log_level_replica {debug,info,warning,error,critical,passive}]
[--log_on_each_node [LOG_ON_EACH_NODE]] [--no_log_on_each_node] [--logging_dir LOGGING_DIR] [--logging_strategy {no,steps,epoch}]
[--logging_first_step [LOGGING_FIRST_STEP]] [--logging_steps LOGGING_STEPS] [--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]] [--no_logging_nan_inf_filter]
[--save_strategy {no,steps,epoch}] [--save_steps SAVE_STEPS] [--save_total_limit SAVE_TOTAL_LIMIT] [--save_safetensors [SAVE_SAFETENSORS]]
[--save_on_each_node [SAVE_ON_EACH_NODE]] [--no_cuda [NO_CUDA]] [--use_mps_device [USE_MPS_DEVICE]] [--seed SEED] [--data_seed DATA_SEED] [--jit_mode_eval [JIT_MODE_EVAL]]
[--use_ipex [USE_IPEX]] [--bf16 [BF16]] [--fp16 [FP16]] [--fp16_opt_level FP16_OPT_LEVEL] [--half_precision_backend {auto,cuda_amp,apex,cpu_amp}]
[--bf16_full_eval [BF16_FULL_EVAL]] [--fp16_full_eval [FP16_FULL_EVAL]] [--tf32 TF32] [--local_rank LOCAL_RANK] [--ddp_backend {nccl,gloo,mpi,ccl}]
[--tpu_num_cores TPU_NUM_CORES] [--tpu_metrics_debug [TPU_METRICS_DEBUG]] [--debug DEBUG] [--dataloader_drop_last [DATALOADER_DROP_LAST]] [--eval_steps EVAL_STEPS]
[--dataloader_num_workers DATALOADER_NUM_WORKERS] [--past_index PAST_INDEX] [--run_name RUN_NAME] [--disable_tqdm DISABLE_TQDM]
[--remove_unused_columns [REMOVE_UNUSED_COLUMNS]] [--no_remove_unused_columns] [--label_names LABEL_NAMES [LABEL_NAMES ...]]
[--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]] [--metric_for_best_model METRIC_FOR_BEST_MODEL] [--greater_is_better GREATER_IS_BETTER]
[--ignore_data_skip [IGNORE_DATA_SKIP]] [--sharded_ddp SHARDED_DDP] [--fsdp FSDP] [--fsdp_min_num_params FSDP_MIN_NUM_PARAMS] [--fsdp_config FSDP_CONFIG]
[--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP] [--deepspeed DEEPSPEED] [--label_smoothing_factor LABEL_SMOOTHING_FACTOR]
[--optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit}]
[--optim_args OPTIM_ARGS] [--adafactor [ADAFACTOR]] [--group_by_length [GROUP_BY_LENGTH]] [--length_column_name LENGTH_COLUMN_NAME] [--report_to REPORT_TO [REPORT_TO ...]]
[--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS] [--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB] [--dataloader_pin_memory [DATALOADER_PIN_MEMORY]]
[--no_dataloader_pin_memory] [--skip_memory_metrics [SKIP_MEMORY_METRICS]] [--no_skip_memory_metrics] [--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]]
[--push_to_hub [PUSH_TO_HUB]] [--resume_from_checkpoint RESUME_FROM_CHECKPOINT] [--hub_model_id HUB_MODEL_ID] [--hub_strategy {end,every_save,checkpoint,all_checkpoints}]
[--hub_token HUB_TOKEN] [--hub_private_repo [HUB_PRIVATE_REPO]] [--gradient_checkpointing [GRADIENT_CHECKPOINTING]]
[--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]] [--fp16_backend {auto,cuda_amp,apex,cpu_amp}] [--push_to_hub_model_id PUSH_TO_HUB_MODEL_ID]
[--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION] [--push_to_hub_token PUSH_TO_HUB_TOKEN] [--mp_parameters MP_PARAMETERS]
[--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]] [--full_determinism [FULL_DETERMINISM]] [--torchdynamo TORCHDYNAMO] [--ray_scope RAY_SCOPE] [--ddp_timeout DDP_TIMEOUT]
[--torch_compile [TORCH_COMPILE]] [--torch_compile_backend TORCH_COMPILE_BACKEND] [--torch_compile_mode TORCH_COMPILE_MODE] [--xpu_backend {mpi,ccl,gloo}]
[--use_peft [USE_PEFT]] [--no_use_peft] [--target_modules TARGET_MODULES] [--lora_rank LORA_RANK] [--lora_dropout LORA_DROPOUT] [--lora_alpha LORA_ALPHA]
[--modules_to_save MODULES_TO_SAVE] [--peft_path PEFT_PATH]
supervised_finetuning.py: error: the following arguments are required: --output_dir
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 2) local_rank: 0 (pid: 1740890) of binary: /usr/local/anaconda3/envs/hj-glm6b2/bin/python
Traceback (most recent call last):
File "/usr/local/anaconda3/envs/hj-glm6b2/bin/torchrun", line 8, in
sys.exit(main())
File "/usr/local/anaconda3/envs/hj-glm6b2/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/init.py", line 346, in wrapper
return f(*args, **kwargs)
File "/usr/local/anaconda3/envs/hj-glm6b2/lib/python3.10/site-packages/torch/distributed/run.py", line 794, in main
run(args)
File "/usr/local/anaconda3/envs/hj-glm6b2/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run
elastic_launch(
File "/usr/local/anaconda3/envs/hj-glm6b2/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/usr/local/anaconda3/envs/hj-glm6b2/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
supervised_finetuning.py FAILED
Failures:
[1]:
time : 2023-06-29_18:21:05
host : JoinShareAIPC
rank : 1 (local_rank: 1)
exitcode : 2 (pid: 1740891)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
Root Cause (first observed failure):
[0]:
time : 2023-06-29_18:21:05
host : JoinShareAIPC
rank : 0 (local_rank: 0)
exitcode : 2 (pid: 1740890)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
run_sft.sh: 2: --model_type: not found
run_sft.sh: 3: --model_name_or_path: not found
run_sft.sh: 4: --train_file_dir: not found
run_sft.sh: 5: --validation_file_dir: not found
run_sft.sh: 6: --per_device_train_batch_size: not found
run_sft.sh: 7: --per_device_eval_batch_size: not found
run_sft.sh: 8: --do_train: not found
run_sft.sh: 9: --do_eval: not found
run_sft.sh: 10: --use_peft: not found
run_sft.sh: 11: --fp16: not found
run_sft.sh: 12: --max_train_samples: not found
run_sft.sh: 13: --max_eval_samples: not found
run_sft.sh: 14: --num_train_epochs: not found
run_sft.sh: 15: --learning_rate: not found
run_sft.sh: 16: --warmup_ratio: not found
run_sft.sh: 17: --weight_decay: not found
run_sft.sh: 18: --logging_strategy: not found
run_sft.sh: 19: --logging_steps: not found
run_sft.sh: 20: --eval_steps: not found
run_sft.sh: 21: --evaluation_strategy: not found
run_sft.sh: 22: --save_steps: not found
run_sft.sh: 23: --save_strategy: not found
run_sft.sh: 24: --save_total_limit: not found
run_sft.sh: 25: --gradient_accumulation_steps: not found
run_sft.sh: 26: --preprocessing_num_workers: not found
run_sft.sh: 27: --max_source_length: not found
run_sft.sh: 28: --max_target_length: not found
run_sft.sh: 29: --output_dir: not found
run_sft.sh: 30: --overwrite_output_dir: not found
run_sft.sh: 31: --ddp_timeout: not found
run_sft.sh: 32: --logging_first_step: not found
run_sft.sh: 33: --target_modules: not found
run_sft.sh: 34: --lora_rank: not found
run_sft.sh: 35: --lora_alpha: not found
run_sft.sh: 36: --lora_dropout: not found
run_sft.sh: 37: --torch_dtype: not found
run_sft.sh: 38: --device_map: not found
run_sft.sh: 39: --report_to: not found
run_sft.sh: 40: --ddp_find_unused_parameters: not found
run_sft.sh: 41: --gradient_checkpointing: not found
The text was updated successfully, but these errors were encountered: