In [2]:
import os
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import warnings

# Silence every warning category for the rest of the session. Note that this
# also hides deprecation notices raised by the libraries imported above.
warnings.simplefilter("ignore")

In [4]:
# Read the Hub access token from the environment so it never appears in the
# notebook, then register it for this session.
HUGGING_FACE_HUB_TOKEN = os.environ.get("HF_TOKEN")
login(token=HUGGING_FACE_HUB_TOKEN)

model_name = "meta-llama/Llama-3.1-8b"

# Load tokenizer and model.
# `token=` is the current name of the credential argument; the older
# `use_auth_token=` spelling is deprecated. If HF_TOKEN is unset the value is
# None and the libraries fall back to whatever credential `login` stored.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=HUGGING_FACE_HUB_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # half precision weights
    device_map="auto",          # let the loader decide device placement
    token=HUGGING_FACE_HUB_TOKEN,
)

# `device_map="auto"` may place the model on CPU when no GPU is present, and
# `get_device_name()` raises in that case, so only query it when CUDA exists.
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name()}")
else:
    print("GPU: none detected")
print(f"Model loaded on device: {model.device}")
print(f"Model dtype: {model.dtype}")

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Loading checkpoint shards: 100%|██████████| 4/4 [00:07<00:00,  1.87s/it]


GPU: NVIDIA L4
Model loaded on device: cuda:0
Model dtype: torch.float16


In [5]:
# Run inference: tokenize a single prompt, generate a continuation, and print
# the decoded sequence (prompt included, special tokens stripped).
prompt = "Tell me about the key features of LLaMA 3.1 8B."
# Move the encoded prompt to the same device as the model before generating.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# `early_stopping` was dropped: it only controls beam search, and no
# `num_beams` is set here, so the flag had no effect beyond a validation
# warning. The decoding strategy itself is not specified in this cell and
# therefore comes from the model's own generation config.
outputs = model.generate(
    **inputs,
    max_new_tokens=250,  # Only counts generated tokens (not input)
    pad_token_id=tokenizer.eos_token_id,  # reuse EOS as the padding id
)

# outputs[0] is the first (and only) sequence in the returned batch.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Tell me about the key features of LLaMA 3.1 8B. Is it a new version of LLaMA 3.1 7B? What are the differences?
The main difference between LLaMA 3.1 8B and LLaMA 3.1 7B is the size of the model. LLaMA 3.1 8B is a larger model with 8 billion parameters, while LLaMA 3.1 7B has 7 billion parameters. The larger model size means that LLaMA 3.1 8B has more capacity to store and process information, which can lead to better performance on certain tasks. However, this also means that LLaMA 3.1 8B requires more computational resources and may be slower to train and use. Additionally, LLaMA 3.1 8B may require more fine-tuning to achieve optimal performance on specific tasks, as the larger model size can sometimes lead to overfitting.
What are the advantages of using LLaMA 3.1 8B over other models? How does it compare to other LLaMA models?
LLaMA 3.1 8B is a larger model with 8 billion parameters, which gives it more capacity
