RuntimeError: expected scalar type BFloat16 but found Float #6

geoffrey-hin-wong opened this issue Apr 19, 2023 · 2 comments

@geoffrey-hin-wong


RuntimeError Traceback (most recent call last)
Cell In[16], line 3
1 generate_text = InstructionTextGenerationPipeline(model=model, tokenizer=tokenizer)
----> 3 generate_text("Look up the boiling point of water.")

File /opt/conda/envs/textgen/lib/python3.10/site-packages/transformers/pipelines/base.py:1109, in Pipeline.__call__(self, inputs, num_workers, batch_size, *args, **kwargs)
1101 return next(
1102 iter(
1103 self.get_iterator(
(...)
1106 )
1107 )
1108 else:
-> 1109 return self.run_single(inputs, preprocess_params, forward_params, postprocess_params)

File /opt/conda/envs/textgen/lib/python3.10/site-packages/transformers/pipelines/base.py:1116, in Pipeline.run_single(self, inputs, preprocess_params, forward_params, postprocess_params)
1114 def run_single(self, inputs, preprocess_params, forward_params, postprocess_params):
1115 model_inputs = self.preprocess(inputs, **preprocess_params)
-> 1116 model_outputs = self.forward(model_inputs, **forward_params)
1117 outputs = self.postprocess(model_outputs, **postprocess_params)
1118 return outputs

File /opt/conda/envs/textgen/lib/python3.10/site-packages/transformers/pipelines/base.py:1015, in Pipeline.forward(self, model_inputs, **forward_params)
1013 with inference_context():
1014 model_inputs = self._ensure_tensor_on_device(model_inputs, device=self.device)
-> 1015 model_outputs = self._forward(model_inputs, **forward_params)
1016 model_outputs = self._ensure_tensor_on_device(model_outputs, device=torch.device("cpu"))
1017 else:

Cell In[1], line 98, in InstructionTextGenerationPipeline._forward(self, model_inputs, **generate_kwargs)
96 input_ids = model_inputs["input_ids"]
97 attention_mask = model_inputs.get("attention_mask", None)
---> 98 generated_sequence = self.model.generate(
99 input_ids=input_ids.to(self.model.device),
100 attention_mask=attention_mask,
101 pad_token_id=self.tokenizer.pad_token_id,
102 **generate_kwargs,
103 )[0].cpu()
104 instruction_text = model_inputs.pop("instruction_text")
105 return {"generated_sequence": generated_sequence, "input_ids": input_ids, "instruction_text": instruction_text}

File /opt/conda/envs/textgen/lib/python3.10/site-packages/peft/peft_model.py:627, in PeftModelForCausalLM.generate(self, **kwargs)
625 try:
626 if not isinstance(self.peft_config, PromptLearningConfig):
--> 627 outputs = self.base_model.generate(**kwargs)
628 else:
629 if "input_ids" not in kwargs:

File /opt/conda/envs/textgen/lib/python3.10/site-packages/peft/peft_model.py:627, in PeftModelForCausalLM.generate(self, **kwargs)
625 try:
626 if not isinstance(self.peft_config, PromptLearningConfig):
--> 627 outputs = self.base_model.generate(**kwargs)
628 else:
629 if "input_ids" not in kwargs:

File /opt/conda/envs/textgen/lib/python3.10/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
112 @functools.wraps(func)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)

File /opt/conda/envs/textgen/lib/python3.10/site-packages/transformers/generation/utils.py:1508, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, streamer, **kwargs)
1500 input_ids, model_kwargs = self._expand_inputs_for_generation(
1501 input_ids=input_ids,
1502 expand_size=generation_config.num_return_sequences,
1503 is_encoder_decoder=self.config.is_encoder_decoder,
1504 **model_kwargs,
1505 )
1507 # 13. run sample
-> 1508 return self.sample(
1509 input_ids,
1510 logits_processor=logits_processor,
1511 logits_warper=logits_warper,
1512 stopping_criteria=stopping_criteria,
1513 pad_token_id=generation_config.pad_token_id,
1514 eos_token_id=generation_config.eos_token_id,
1515 output_scores=generation_config.output_scores,
1516 return_dict_in_generate=generation_config.return_dict_in_generate,
1517 synced_gpus=synced_gpus,
1518 streamer=streamer,
1519 **model_kwargs,
1520 )
1522 elif is_beam_gen_mode:
1523 if generation_config.num_return_sequences > generation_config.num_beams:

File /opt/conda/envs/textgen/lib/python3.10/site-packages/transformers/generation/utils.py:2547, in GenerationMixin.sample(self, input_ids, logits_processor, stopping_criteria, logits_warper, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus, streamer, **model_kwargs)
2544 model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
2546 # forward pass to get next token
-> 2547 outputs = self(
2548 **model_inputs,
2549 return_dict=True,
2550 output_attentions=output_attentions,
2551 output_hidden_states=output_hidden_states,
2552 )
2554 if synced_gpus and this_peer_finished:
2555 continue # don't waste resources running the code we don't need

File /opt/conda/envs/textgen/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

File /opt/conda/envs/textgen/lib/python3.10/site-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
163 output = old_forward(*args, **kwargs)
164 else:
--> 165 output = old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)

File /opt/conda/envs/textgen/lib/python3.10/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:662, in GPTNeoXForCausalLM.forward(self, input_ids, attention_mask, position_ids, inputs_embeds, head_mask, past_key_values, labels, use_cache, output_attentions, output_hidden_states, return_dict)
621 r"""
622 past_key_values (tuple(tuple(torch.FloatTensor)), optional, returned when use_cache=True is passed or when config.use_cache=True):
623 Tuple of tuple(torch.FloatTensor) of length config.n_layers, with each tuple having 2 tensors of shape
(...)
658 >>> prediction_logits = outputs.logits
659 ```"""
660 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
--> 662 outputs = self.gpt_neox(
663 input_ids,
664 attention_mask=attention_mask,
665 position_ids=position_ids,
666 head_mask=head_mask,
667 inputs_embeds=inputs_embeds,
668 past_key_values=past_key_values,
669 use_cache=use_cache,
670 output_attentions=output_attentions,
671 output_hidden_states=output_hidden_states,
672 return_dict=return_dict,
673 )
675 hidden_states = outputs[0]
676 lm_logits = self.embed_out(hidden_states)

File /opt/conda/envs/textgen/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

File /opt/conda/envs/textgen/lib/python3.10/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:553, in GPTNeoXModel.forward(self, input_ids, attention_mask, position_ids, head_mask, inputs_embeds, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)
545 outputs = torch.utils.checkpoint.checkpoint(
546 create_custom_forward(layer),
547 hidden_states,
(...)
550 head_mask[i],
551 )
552 else:
--> 553 outputs = layer(
554 hidden_states,
555 attention_mask=attention_mask,
556 position_ids=position_ids,
557 head_mask=head_mask[i],
558 layer_past=layer_past,
559 use_cache=use_cache,
560 output_attentions=output_attentions,
561 )
562 hidden_states = outputs[0]
563 if use_cache is True:

File /opt/conda/envs/textgen/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

File /opt/conda/envs/textgen/lib/python3.10/site-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
163 output = old_forward(*args, **kwargs)
164 else:
--> 165 output = old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)

File /opt/conda/envs/textgen/lib/python3.10/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:320, in GPTNeoXLayer.forward(self, hidden_states, attention_mask, position_ids, head_mask, use_cache, layer_past, output_attentions)
310 def forward(
311 self,
312 hidden_states: Optional[torch.FloatTensor],
(...)
318 output_attentions: Optional[bool] = False,
319 ):
--> 320 attention_layer_outputs = self.attention(
321 self.input_layernorm(hidden_states),
322 attention_mask=attention_mask,
323 position_ids=position_ids,
324 layer_past=layer_past,
325 head_mask=head_mask,
326 use_cache=use_cache,
327 output_attentions=output_attentions,
328 )
329 attn_output = attention_layer_outputs[0] # output_attn: attn_output, present, (attn_weights)
330 outputs = attention_layer_outputs[1:]

File /opt/conda/envs/textgen/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

File /opt/conda/envs/textgen/lib/python3.10/site-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
163 output = old_forward(*args, **kwargs)
164 else:
--> 165 output = old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)

File /opt/conda/envs/textgen/lib/python3.10/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:116, in GPTNeoXAttention.forward(self, hidden_states, attention_mask, position_ids, head_mask, layer_past, use_cache, output_attentions)
111 has_layer_past = layer_past is not None
113 # Compute QKV
114 # Attention heads [batch, seq_len, hidden_size]
115 # --> [batch, seq_len, (np * 3 * head_size)]
--> 116 qkv = self.query_key_value(hidden_states)
118 # [batch, seq_len, (num_heads * 3 * head_size)]
119 # --> [batch, seq_len, num_heads, 3 * head_size]
120 new_qkv_shape = qkv.size()[:-1] + (self.num_attention_heads, 3 * self.head_size)

File /opt/conda/envs/textgen/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

File /opt/conda/envs/textgen/lib/python3.10/site-packages/peft/tuners/lora.py:530, in MergedLinear.forward(self, x)
528 result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)
529 if self.r > 0:
--> 530 after_A = self.lora_A(self.lora_dropout(x))
531 after_B = self.lora_B(after_A.transpose(-2, -1)).transpose(-2, -1)
532 result += self.zero_pad(after_B) * self.scaling

File /opt/conda/envs/textgen/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

File /opt/conda/envs/textgen/lib/python3.10/site-packages/torch/nn/modules/linear.py:114, in Linear.forward(self, input)
113 def forward(self, input: Tensor) -> Tensor:
--> 114 return F.linear(input, self.weight, self.bias)

RuntimeError: expected scalar type BFloat16 but found Float

@wendilinplay

You could load the model in float16 or even int8 instead, for example:
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True)
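
For the int8 option, a minimal sketch of the load (this assumes the bitsandbytes package is installed; model_name is just a placeholder for whichever checkpoint you are using):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "..."  # placeholder: the checkpoint you are loading
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    load_in_8bit=True,       # quantize weights to int8 via bitsandbytes
    trust_remote_code=True,
)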

@GuevaraDev

Try wrapping the call in with torch.autocast("cuda"):

I was able to get the generated text using it like this:

with torch.autocast("cuda"):
  response = generate_text("Give three tips for staying healthy.")
  print(response)
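
A variant sketch with an explicit dtype, if you would rather match the base model's bfloat16 weights than autocast's default float16 on CUDA (same generate_text pipeline as above):

with torch.autocast("cuda", dtype=torch.bfloat16):
    response = generate_text("Give three tips for staying healthy.")
    print(response)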
