-
Notifications
You must be signed in to change notification settings - Fork 46
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0! (when checking argument for argument target in method wrapper_CUDA_nll_loss_forward),请问如何调整训练代码,适配多卡训练?? #38
Comments
你好,请问你的问题解决了吗?我也想使用单机多卡微调qlora怎么修改代码?
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-bc6f7aa3b7d4f48f/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-8d547816b9814051.arrow
0%| | 0/3581 [00:00<?, ?it/s]
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`
...╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /mnt/disk_data/LLM/chatGLM-6B-QLoRA-main/train_qlora.py:229 in <module> │
│ │
│ 226 │
│ 227 if __name__ == "__main__": │
│ 228 │ args = parse_args() │
│ ❱ 229 │ train(args) │
│ 230 │
│ 231 │
│ │
│ /mnt/disk_data/LLM/chatGLM-6B-QLoRA-main/train_qlora.py:223 in train │
│ │
│ 220 │ │ data_collator=data_collator │
│ 221 │ ) │
│ 222 │ │
│ ❱ 223 │ trainer.train(resume_from_checkpoint=resume_from_checkpoint) │
│ 224 │ trainer.model.save_pretrained(hf_train_args.output_dir) │
│ 225 │
│ 226 │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/transform │
│ ers/trainer.py:1645 in train │
│ │
│ 1642 │ │ inner_training_loop = find_executable_batch_size( │
│ 1643 │ │ │ self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size │
│ 1644 │ │ ) │
│ ❱ 1645 │ │ return inner_training_loop( │
│ 1646 │ │ │ args=args, │
│ 1647 │ │ │ resume_from_checkpoint=resume_from_checkpoint, │
│ 1648 │ │ │ trial=trial, │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/transform │
│ ers/trainer.py:1938 in _inner_training_loop │
│ │
│ 1935 │ │ │ │ │ self.control = self.callback_handler.on_step_begin(args, self.state, │
│ 1936 │ │ │ │ │
│ 1937 │ │ │ │ with self.accelerator.accumulate(model): │
│ ❱ 1938 │ │ │ │ │ tr_loss_step = self.training_step(model, inputs) │
│ 1939 │ │ │ │ │
│ 1940 │ │ │ │ if ( │
│ 1941 │ │ │ │ │ args.logging_nan_inf_filter │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/transform │
│ ers/trainer.py:2759 in training_step │
│ │
│ 2756 │ │ │ return loss_mb.reduce_mean().detach().to(self.args.device) │
│ 2757 │ │ │
│ 2758 │ │ with self.compute_loss_context_manager(): │
│ ❱ 2759 │ │ │ loss = self.compute_loss(model, inputs) │
│ 2760 │ │ │
│ 2761 │ │ if self.args.n_gpu > 1: │
│ 2762 │ │ │ loss = loss.mean() # mean() to average on multi-gpu parallel training │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/transform │
│ ers/trainer.py:2784 in compute_loss │
│ │
│ 2781 │ │ │ labels = inputs.pop("labels") │
│ 2782 │ │ else: │
│ 2783 │ │ │ labels = None │
│ ❱ 2784 │ │ outputs = model(**inputs) │
│ 2785 │ │ # Save past state if it exists │
│ 2786 │ │ # TODO: this needs to be fixed and made cleaner later. │
│ 2787 │ │ if self.args.past_index >= 0: │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/nn/ │
│ modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/peft/peft │
│ _model.py:922 in forward │
│ │
│ 919 │ │ │ │ │ **kwargs, │
│ 920 │ │ │ │ ) │
│ 921 │ │ │ │
│ ❱ 922 │ │ │ return self.base_model( │
│ 923 │ │ │ │ input_ids=input_ids, │
│ 924 │ │ │ │ attention_mask=attention_mask, │
│ 925 │ │ │ │ inputs_embeds=inputs_embeds, │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/nn/ │
│ modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/accelerat │
│ e/hooks.py:165 in new_forward │
│ │
│ 162 │ │ │ with torch.no_grad(): │
│ 163 │ │ │ │ output = old_forward(*args, **kwargs) │
│ 164 │ │ else: │
│ ❱ 165 │ │ │ output = old_forward(*args, **kwargs) │
│ 166 │ │ return module._hf_hook.post_forward(module, output) │
│ 167 │ │
│ 168 │ module.forward = new_forward │
│ │
│ /root/.cache/huggingface/modules/transformers_modules/ChatGLM2-6B/modeling_chatglm.py:960 in │
│ forward │
│ │
│ 957 │ │ │ shift_labels = labels[..., 1:].contiguous() │
│ 958 │ │ │ # Flatten the tokens │
│ 959 │ │ │ loss_fct = CrossEntropyLoss(ignore_index=-100) │
│ ❱ 960 │ │ │ loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.v │
│ 961 │ │ │ │
│ 962 │ │ │ lm_logits = lm_logits.to(hidden_states.dtype) │
│ 963 │ │ │ loss = loss.to(hidden_states.dtype) │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/nn/ │
│ modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/nn/ │
│ modules/loss.py:1174 in forward │
│ │
│ 1171 │ │ self.label_smoothing = label_smoothing │
│ 1172 │ │
│ 1173 │ def forward(self, input: Tensor, target: Tensor) -> Tensor: │
│ ❱ 1174 │ │ return F.cross_entropy(input, target, weight=self.weight, │
│ 1175 │ │ │ │ │ │ │ ignore_index=self.ignore_index, reduction=self.reduction, │
│ 1176 │ │ │ │ │ │ │ label_smoothing=self.label_smoothing) │
│ 1177 │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/nn/ │
│ functional.py:3029 in cross_entropy │
│ │
│ 3026 │ │ ) │
│ 3027 │ if size_average is not None or reduce is not None: │
│ 3028 │ │ reduction = _Reduction.legacy_get_string(size_average, reduce) │
│ ❱ 3029 │ return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(re │
│ 3030 │
│ 3031 │
│ 3032 def binary_cross_entropy( │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0! (when checking
argument for argument target in method wrapper_CUDA_nll_loss_forward)
The text was updated successfully, but these errors were encountered: