2023-12-17T14:25:01.295949Z ERROR lorax_launcher: interceptor.py:41 Method Warmup encountered an error.
Traceback (most recent call last):
File "/opt/conda/lib/python3.10/site-packages/lorax_server/utils/gptq/custom_autotune.py", line 90, in _bench
return triton.testing.do_bench(
TypeError: do_bench() got an unexpected keyword argument 'percentiles'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/conda/bin/lorax-server", line 8, in <module>
sys.exit(app())
File "/opt/conda/lib/python3.10/site-packages/typer/main.py", line 311, in __call__
return get_command(self)(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 1157, in __call__
return self.main(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/typer/core.py", line 778, in main
return _main(
File "/opt/conda/lib/python3.10/site-packages/typer/core.py", line 216, in _main
rv = self.invoke(ctx)
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 1688, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 1434, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 783, in invoke
return __callback(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/typer/main.py", line 683, in wrapper
return callback(**use_params) # type: ignore
File "/opt/conda/lib/python3.10/site-packages/lorax_server/cli.py", line 84, in serve
server.serve(
File "/opt/conda/lib/python3.10/site-packages/lorax_server/server.py", line 277, in serve
asyncio.run(
File "/opt/conda/lib/python3.10/asyncio/runners.py", line 44, in run
return loop.run_until_complete(main)
File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 636, in run_until_complete
self.run_forever()
File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 603, in run_forever
self._run_once()
File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once
handle._run()
File "/opt/conda/lib/python3.10/asyncio/events.py", line 80, in _run
self._context.run(self._callback, *self._args)
File "/opt/conda/lib/python3.10/site-packages/grpc_interceptor/server.py", line 165, in invoke_intercept_method
return await self.intercept(
> File "/opt/conda/lib/python3.10/site-packages/lorax_server/interceptor.py", line 38, in intercept
return await response
File "/opt/conda/lib/python3.10/site-packages/opentelemetry/instrumentation/grpc/_aio_server.py", line 82, in _unary_interceptor
raise error
File "/opt/conda/lib/python3.10/site-packages/opentelemetry/instrumentation/grpc/_aio_server.py", line 73, in _unary_interceptor
return await behavior(request_or_iterator, context)
File "/opt/conda/lib/python3.10/site-packages/lorax_server/server.py", line 74, in Warmup
max_supported_total_tokens = self.model.warmup(batch)
File "/opt/conda/lib/python3.10/site-packages/lorax_server/models/flash_causal_lm.py", line 864, in warmup
_, batch = self.generate_token(batch)
File "/opt/conda/lib/python3.10/contextlib.py", line 79, in inner
return func(*args, **kwds)
File "/opt/conda/lib/python3.10/site-packages/lorax_server/models/flash_causal_lm.py", line 963, in generate_token
raise e
File "/opt/conda/lib/python3.10/site-packages/lorax_server/models/flash_causal_lm.py", line 960, in generate_token
out = self.forward(batch, adapter_data)
File "/opt/conda/lib/python3.10/site-packages/lorax_server/models/flash_causal_lm.py", line 919, in forward
return self.model.forward(
File "/opt/conda/lib/python3.10/site-packages/lorax_server/models/custom_modeling/flash_qwen_modeling.py", line 475, in forward
hidden_states = self.transformer(
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/lorax_server/models/custom_modeling/flash_qwen_modeling.py", line 432, in forward
hidden_states, residual = layer(
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/lorax_server/models/custom_modeling/flash_qwen_modeling.py", line 357, in forward
attn_output = self.attn(
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/lorax_server/models/custom_modeling/flash_qwen_modeling.py", line 226, in forward
qkv = self.c_attn(hidden_states, adapter_data)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/lorax_server/utils/layers.py", line 481, in forward
result = self.base_layer(input)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/lorax_server/utils/layers.py", line 285, in forward
return self.linear.forward(x)
File "/opt/conda/lib/python3.10/site-packages/lorax_server/utils/gptq/quant_linear.py", line 349, in forward
out = QuantLinearFunction.apply(
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 539, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/opt/conda/lib/python3.10/site-packages/torch/cuda/amp/autocast_mode.py", line 121, in decorate_fwd
return fwd(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/lorax_server/utils/gptq/quant_linear.py", line 244, in forward
output = matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq)
File "/opt/conda/lib/python3.10/site-packages/lorax_server/utils/gptq/quant_linear.py", line 216, in matmul248
matmul_248_kernel[grid](
File "/opt/conda/lib/python3.10/site-packages/lorax_server/utils/gptq/custom_autotune.py", line 110, in run
timings = {
File "/opt/conda/lib/python3.10/site-packages/lorax_server/utils/gptq/custom_autotune.py", line 111, in <dictcomp>
config: self._bench(*args, config=config, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/lorax_server/utils/gptq/custom_autotune.py", line 93, in _bench
except triton.compiler.OutOfResources:
AttributeError: module 'triton.compiler' has no attribute 'OutOfResources'
2023-12-17T14:25:01.296225Z ERROR warmup{max_input_length=1024 max_prefill_tokens=4096}:warmup: lorax_client: router/client/src/lib.rs:33: Server error: module 'triton.compiler' has no attribute 'OutOfResources'
Error: Warmup(Generation("module 'triton.compiler' has no attribute 'OutOfResources'"))
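Both tracebacks point at the same root cause: custom_autotune.py in lorax_server was written against the Triton 2.0 API, while the 0.4 image ships a newer Triton (pulled in by the PyTorch 2.1 upgrade). Triton 2.1 renamed do_bench's percentiles keyword to quantiles, which raises the initial TypeError; the except triton.compiler.OutOfResources: handler then fails as well, because that exception class no longer lives under triton.compiler. A version-tolerant benchmark wrapper could paper over the rename like this (a minimal sketch, not LoRAX's actual code; bench_kernel and kernel_call are hypothetical names standing in for the autotuner's kernel-launch closure):

    import triton
    import triton.testing

    def bench_kernel(kernel_call, warmup=25, rep=100):
        # Triton >= 2.1 renamed do_bench's `percentiles` kwarg to `quantiles`.
        # Try the new spelling first and fall back on TypeError, so the same
        # helper works against both APIs.
        try:
            return triton.testing.do_bench(
                kernel_call, quantiles=(0.5, 0.2, 0.8), warmup=warmup, rep=rep
            )
        except TypeError:
            # Triton 2.0 spelling of the same argument.
            return triton.testing.do_bench(
                kernel_call, percentiles=(0.5, 0.2, 0.8), warmup=warmup, rep=rep
            )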
Information
- Docker
- The CLI directly

Tasks
- An officially supported command
- My own modifications
Reproduction
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
ghcr.io/predibase/lorax:0.4 --model-id Qwen/Qwen-14B-Chat-Int4
Expected behavior
It runs ok with ghcr.io/predibase/lorax:0.3
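One way to confirm the API difference between the two image tags is to inspect the Triton bundled in each container (a quick diagnostic sketch, assuming a Python interpreter is available inside the image):

    import inspect
    import triton
    import triton.testing

    # The 0.3 image should report a `percentiles` parameter here,
    # while the 0.4 image reports `quantiles` instead.
    print("triton", triton.__version__)
    print(inspect.signature(triton.testing.do_bench))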
Thanks for reporting this, @pandada8. It looks like this was due to a Triton version change after upgrading PyTorch to 2.1. I just put up PR #140, which should address the issue.
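For anyone patching a local checkout before the fix ships, the shape of the change is a guarded import plus the keyword rename shown above (a sketch of the general approach, not the actual PR #140 diff; the Triton >= 2.1 location of the class is assumed to be triton.runtime.errors):

    # Alias OutOfResources from wherever the installed Triton defines it,
    # then catch the bare name in `except OutOfResources:` inside _bench.
    try:
        from triton.compiler import OutOfResources        # Triton 2.0
    except ImportError:
        from triton.runtime.errors import OutOfResources  # Triton >= 2.1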
System Info
Image: ghcr.io/predibase/lorax:0.4
Failed to load a GPTQ model.
Command: --model-id /mnt/local-model/Qwen-14B-Chat-Int4/ --quantize gptq --trust-remote-code
Using model: Qwen-14B-Chat-Int4
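Until an image containing the fix is published, the reports themselves suggest the interim workaround: pin the previous tag, which bundles the older Triton. For the original reproduction that would be:

    docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
        ghcr.io/predibase/lorax:0.3 --model-id Qwen/Qwen-14B-Chat-Int4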