predibase · tgaddair · Nov 22, 2023 · Nov 22, 2023 · Nov 22, 2023
diff --git a/server/Makefile b/server/Makefile
@@ -26,7 +26,7 @@ install: gen-server install-torch
 run-dev:
 	# SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=1 lorax_server/cli.py serve meta-llama/Llama-2-7b-hf --sharded
 	SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=2 lorax_server/cli.py serve mistralai/Mistral-7B-Instruct-v0.1 --sharded
-	# SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=1 lorax_server/cli.py serve alexsherstinsky/Mistral-7B-v0.1-sharded --sharded
+	# SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=1 lorax_server/cli.py serve flozi00/Mistral-7B-german-assistant-v5-4bit-autogptq --quantize gptq
 
 export-requirements:
 	poetry export -o requirements.txt -E bnb -E quantize --without-hashes
diff --git a/server/lorax_server/utils/adapter.py b/server/lorax_server/utils/adapter.py
@@ -3,13 +3,15 @@
 from functools import lru_cache
 from pathlib import Path
 from typing import List, Dict, Set, Tuple
+import warnings
 
 import torch
 from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
 from loguru import logger
 from peft import LoraConfig
 from peft.utils import transpose
 from safetensors.torch import load_file, save_file
+from transformers import AutoConfig
 from tqdm import tqdm
 from filelock import FileLock
 
@@ -27,8 +29,18 @@ def load_module_map(model_id, adapter_id, adapter_source, weight_names):
     config_path = get_config_path(adapter_id, adapter_source)
     adapter_config = LoraConfig.from_pretrained(config_path)
     if adapter_config.base_model_name_or_path != model_id:
-        raise ValueError(f"Adapter '{adapter_id}' is not compatible with model '{model_id}'. "
-                            f"Use --model-id '{adapter_config.base_model_name_or_path}' instead.")
+        expected_config = AutoConfig.from_pretrained(model_id)
+        model_config = AutoConfig.from_pretrained(adapter_config.base_model_name_or_path)
+        if model_config.architectures == expected_config.architectures:
+            warnings.warn(
+                f"Adapter '{adapter_id}' was not trained on base model '{model_id}'. "
+                f"If you encounter issues, use --model-id '{adapter_config.base_model_name_or_path}' instead."
+            )
+        else:
+            # TODO(travis): revisit this when we support classification heads, which will not use CausalLM
+            raise ValueError(f"Adapter '{adapter_id}' is not compatible with model '{model_id}'. "
+                             f"Architectures differ: {model_config.architectures} != {expected_config.architectures}. "
+                             f"Use --model-id '{adapter_config.base_model_name_or_path}' instead.")
 
     # load adapter weights from all shards (should have relatively small memory footprint)
     adapter_filenames = source.weight_files()

diff --git a/server/lorax_server/utils/gptq/quant_linear.py b/server/lorax_server/utils/gptq/quant_linear.py
@@ -357,3 +357,7 @@ def forward(self, x):
         )
         out = out + self.bias if self.bias is not None else out
         return out.reshape(out_shape)
+
+    @property
+    def weight(self) -> torch.Tensor:
+        return self.qweight  # alias: exposes `self.qweight` under the attribute name `weight`