
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0! (when checking argument for argument target in method wrapper_CUDA_nll_loss_forward). How can the training code be adjusted to support multi-GPU training? #38

Open
jhonffe opened this issue Sep 17, 2023 · 1 comment

Comments


jhonffe commented Sep 17, 2023

Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-bc6f7aa3b7d4f48f/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-8d547816b9814051.arrow
0%| | 0/3581 [00:00<?, ?it/s]use_cache=True is incompatible with gradient checkpointing. Setting use_cache=False...
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /mnt/disk_data/LLM/chatGLM-6B-QLoRA-main/train_qlora.py:229 in <module> │
│ │
│ 226 │
│ 227 if __name__ == "__main__": │
│ 228 │ args = parse_args() │
│ ❱ 229 │ train(args) │
│ 230 │
│ 231 │
│ │
│ /mnt/disk_data/LLM/chatGLM-6B-QLoRA-main/train_qlora.py:223 in train │
│ │
│ 220 │ │ data_collator=data_collator │
│ 221 │ ) │
│ 222 │ │
│ ❱ 223 │ trainer.train(resume_from_checkpoint=resume_from_checkpoint) │
│ 224 │ trainer.model.save_pretrained(hf_train_args.output_dir) │
│ 225 │
│ 226 │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/transform │
│ ers/trainer.py:1645 in train │
│ │
│ 1642 │ │ inner_training_loop = find_executable_batch_size( │
│ 1643 │ │ │ self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size │
│ 1644 │ │ ) │
│ ❱ 1645 │ │ return inner_training_loop( │
│ 1646 │ │ │ args=args, │
│ 1647 │ │ │ resume_from_checkpoint=resume_from_checkpoint, │
│ 1648 │ │ │ trial=trial, │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/transform │
│ ers/trainer.py:1938 in _inner_training_loop │
│ │
│ 1935 │ │ │ │ │ self.control = self.callback_handler.on_step_begin(args, self.state, │
│ 1936 │ │ │ │ │
│ 1937 │ │ │ │ with self.accelerator.accumulate(model): │
│ ❱ 1938 │ │ │ │ │ tr_loss_step = self.training_step(model, inputs) │
│ 1939 │ │ │ │ │
│ 1940 │ │ │ │ if ( │
│ 1941 │ │ │ │ │ args.logging_nan_inf_filter │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/transform │
│ ers/trainer.py:2759 in training_step │
│ │
│ 2756 │ │ │ return loss_mb.reduce_mean().detach().to(self.args.device) │
│ 2757 │ │ │
│ 2758 │ │ with self.compute_loss_context_manager(): │
│ ❱ 2759 │ │ │ loss = self.compute_loss(model, inputs) │
│ 2760 │ │ │
│ 2761 │ │ if self.args.n_gpu > 1: │
│ 2762 │ │ │ loss = loss.mean() # mean() to average on multi-gpu parallel training │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/transform │
│ ers/trainer.py:2784 in compute_loss │
│ │
│ 2781 │ │ │ labels = inputs.pop("labels") │
│ 2782 │ │ else: │
│ 2783 │ │ │ labels = None │
│ ❱ 2784 │ │ outputs = model(**inputs) │
│ 2785 │ │ # Save past state if it exists │
│ 2786 │ │ # TODO: this needs to be fixed and made cleaner later. │
│ 2787 │ │ if self.args.past_index >= 0: │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/nn/ │
│ modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/peft/peft │
│ _model.py:922 in forward │
│ │
│ 919 │ │ │ │ │ **kwargs, │
│ 920 │ │ │ │ ) │
│ 921 │ │ │ │
│ ❱ 922 │ │ │ return self.base_model( │
│ 923 │ │ │ │ input_ids=input_ids, │
│ 924 │ │ │ │ attention_mask=attention_mask, │
│ 925 │ │ │ │ inputs_embeds=inputs_embeds, │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/nn/ │
│ modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/accelerat │
│ e/hooks.py:165 in new_forward │
│ │
│ 162 │ │ │ with torch.no_grad(): │
│ 163 │ │ │ │ output = old_forward(*args, **kwargs) │
│ 164 │ │ else: │
│ ❱ 165 │ │ │ output = old_forward(*args, **kwargs) │
│ 166 │ │ return module._hf_hook.post_forward(module, output) │
│ 167 │ │
│ 168 │ module.forward = new_forward │
│ │
│ /root/.cache/huggingface/modules/transformers_modules/ChatGLM2-6B/modeling_chatglm.py:960 in │
│ forward │
│ │
│ 957 │ │ │ shift_labels = labels[..., 1:].contiguous() │
│ 958 │ │ │ # Flatten the tokens │
│ 959 │ │ │ loss_fct = CrossEntropyLoss(ignore_index=-100) │
│ ❱ 960 │ │ │ loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.v │
│ 961 │ │ │ │
│ 962 │ │ │ lm_logits = lm_logits.to(hidden_states.dtype) │
│ 963 │ │ │ loss = loss.to(hidden_states.dtype) │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/nn/ │
│ modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/nn/ │
│ modules/loss.py:1174 in forward │
│ │
│ 1171 │ │ self.label_smoothing = label_smoothing │
│ 1172 │ │
│ 1173 │ def forward(self, input: Tensor, target: Tensor) -> Tensor: │
│ ❱ 1174 │ │ return F.cross_entropy(input, target, weight=self.weight, │
│ 1175 │ │ │ │ │ │ │ ignore_index=self.ignore_index, reduction=self.reduction, │
│ 1176 │ │ │ │ │ │ │ label_smoothing=self.label_smoothing) │
│ 1177 │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/nn/ │
│ functional.py:3029 in cross_entropy │
│ │
│ 3026 │ │ ) │
│ 3027 │ if size_average is not None or reduce is not None: │
│ 3028 │ │ reduction = _Reduction.legacy_get_string(size_average, reduce) │
│ ❱ 3029 │ return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(re │
│ 3030 │
│ 3031 │
│ 3032 def binary_cross_entropy( │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0! (when checking
argument for argument target in method wrapper_CUDA_nll_loss_forward)
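
For context on where the devices diverge: the failing frame is the loss block in modeling_chatglm.py (lines 957-960 above), where the logits of a model sharded with device_map="auto" can sit on cuda:1 while the labels batch is still on cuda:0. Below is a minimal sketch of that block with the labels moved onto the logits' device before the loss call; the shift_logits computation and the lm_logits/labels names are assumed from the traceback, not verified against the repo.

```python
import torch
from torch.nn import CrossEntropyLoss

# Hypothetical stand-in for the loss block at modeling_chatglm.py:957-960
# shown in the traceback; lm_logits and labels would come from the model's
# forward pass.
def lm_loss(lm_logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    shift_logits = lm_logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()
    # Key change: put the targets on the same device as the logits, which
    # may live on a different GPU when the model is sharded across cards.
    shift_labels = shift_labels.to(shift_logits.device)
    loss_fct = CrossEntropyLoss(ignore_index=-100)
    return loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                    shift_labels.view(-1))
```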

@sevenandseven

Hi, has your issue been resolved? I would also like to fine-tune with QLoRA on a single machine with multiple GPUs. How should the code be modified?
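
One common pattern (not confirmed by the repo author) for single-machine multi-GPU QLoRA is to run one process per GPU with torchrun and load the whole quantized model onto that process's card, instead of letting device_map="auto" shard it across GPUs. A minimal sketch follows; the model id and quantization settings are illustrative and may differ from what train_qlora.py actually uses.

```python
import os
import torch
from transformers import AutoModel, BitsAndBytesConfig

# One process per GPU, launched e.g. with:
#   torchrun --nproc_per_node=2 train_qlora.py ...
local_rank = int(os.environ.get("LOCAL_RANK", 0))

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModel.from_pretrained(
    "THUDM/chatglm2-6b",            # placeholder model id
    quantization_config=bnb_config,
    trust_remote_code=True,
    # Keep the whole model on this process's GPU so logits and labels
    # never end up on different devices.
    device_map={"": local_rank},
)
```

Launched under torchrun, the HF Trainer detects the distributed environment and wraps each replica in DDP, so every process keeps its inputs, logits, and labels on a single device; the LoRA configuration itself should not need changes.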
