In [None]:
from transformers import AutoConfig, AutoTokenizer
from huggingface_hub import model_info

model_id = "mistralai/Mistral-7B-Instruct-v0.2"

# Get HF model info
info = model_info(model_id)
print("📌 pipeline_tag:", info.pipeline_tag)

# Load model config
config = AutoConfig.from_pretrained(model_id)
auto_map = getattr(config, "auto_map", None)
print("📌 auto_map:", auto_map)

# Load tokenizer to check chat template
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
chat_template = getattr(tokenizer, "chat_template", None)
print("📌 Uses chat template:", "✅" if chat_template else "❌")

# Final decision: is it usable with AutoModelForCausalLM?
if auto_map and "AutoModelForCausalLM" in auto_map:
    print("✅ Can use with AutoModelForCausalLM (trust_remote_code=True)")
else:
    print("⚠️ Might require specific model class or loading logic")


📌 pipeline_tag: text-generation
📌 auto_map: None
📌 Uses chat template: ✅
⚠️ Might require specific model class or loading logic


In [3]:
import torch
torch.cuda.get_device_name(1)

'NVIDIA T1000 8GB'

In [5]:
from transformers import AutoTokenizer,AutoModelForCausalLM
tokenizer=AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
model= AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B-Instruct",
    device_map="auto",
    torch_dtype=torch.float32,
    max_memory={
        0:'5GB',
        1:"5GB",
        'cpu':"16GB"
    },
    offload_folder="/offload_nvm",
    offload_state_dict=True,
    
    )

Loading checkpoint shards: 100%|██████████| 4/4 [00:29<00:00,  7.45s/it]
Some parameters are on the meta device because they were offloaded to the disk and cpu.


In [8]:
print(model.model.layers)

ModuleList(
  (0-31): 32 x LlamaDecoderLayer(
    (self_attn): LlamaAttention(
      (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
      (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
      (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
      (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
    )
    (mlp): LlamaMLP(
      (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
      (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
      (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
      (act_fn): SiLU()
    )
    (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
    (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
  )
)


In [18]:
for i in model.model.layers:
    print(next(i.parameters()).to("cuda:0"))
    # break
# next(model[0].parameters()).device

tensor([[ 0.0052, -0.0293, -0.0064,  ...,  0.0092, -0.0415, -0.0269],
        [-0.0150, -0.0679, -0.0059,  ..., -0.0149, -0.0498,  0.0197],
        [-0.0173, -0.0391, -0.0040,  ...,  0.0107, -0.0132,  0.0071],
        ...,
        [-0.0035, -0.0383,  0.0781,  ...,  0.0057, -0.0012,  0.0024],
        [-0.0033, -0.0093,  0.0437,  ...,  0.0047, -0.0011,  0.0012],
        [-0.0019, -0.0153,  0.0347,  ...,  0.0111,  0.0004,  0.0042]],
       device='cuda:0', grad_fn=<ToCopyBackward0>)
tensor([[-0.0352, -0.0178, -0.0166,  ..., -0.0297,  0.0003, -0.0120],
        [-0.0032,  0.0012, -0.0082,  ..., -0.0229,  0.0017,  0.0066],
        [ 0.0002,  0.0092, -0.0227,  ..., -0.0153, -0.0040, -0.0110],
        ...,
        [ 0.0557,  0.0065, -0.0040,  ...,  0.0128,  0.0164, -0.0405],
        [-0.0173, -0.0264, -0.0415,  ...,  0.0391,  0.0240,  0.0092],
        [ 0.0238,  0.0258, -0.0001,  ..., -0.0137, -0.0101, -0.0140]],
       device='cuda:0', grad_fn=<ToCopyBackward0>)
tensor([[-0.0291,  0.0149,  0.

NotImplementedError: Cannot copy out of meta tensor; no data!

In [None]:
import torch
try:
    from flash_attn import flash_attn_func
    print("FlashAttention library imported successfully.")
    # Try a minimal dummy call (ensure a CUDA device is available)
    if torch.cuda.is_available():
        q = torch.randn(1, 1024, 12, 64, dtype=torch.float16, device='cuda')
        k = torch.randn(1, 1024, 12, 64, dtype=torch.float16, device='cuda')
        v = torch.randn(1, 1024, 12, 64, dtype=torch.float16, device='cuda')
        _ = flash_attn_func(q, k, v)
        print("FlashAttention dummy run successful.")
    else:
        print("CUDA not available for dummy FlashAttention test.")
except ImportError:
    print("FlashAttention library NOT imported or functional.")
except Exception as e:
    print(f"Error during FlashAttention dummy run: {e}")

NotImplementedError: Cannot copy out of meta tensor; no data! Please use torch.nn.Module.to_empty() instead of torch.nn.Module.to() when moving module from meta to a different device.