In [None]:
# Import libraries
!pip install -U transformers accelerate peft bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl (59.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.49.0


In [None]:
# Check the availability of a GPU and report the bitsandbytes version.
import torch
import bitsandbytes as bnb

cuda_available = torch.cuda.is_available()
print(cuda_available)

if not cuda_available:
    # The model-loading cell below pins the quantized model to GPU 0,
    # so stop here with an explicit message instead of a device-query error.
    raise RuntimeError(
        "No CUDA GPU detected; this notebook loads the model onto GPU 0."
    )

print(torch.cuda.get_device_name(0))
print(bnb.__version__)


True
Tesla T4
0.49.0


In [None]:
# Download the base model, quantize it to 4-bit (NF4) while loading,
# and attach the NemoGuard content-safety adapter on top of it.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# 4-bit NF4 quantization with double quantization; compute runs in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

base_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3.1-8B-Instruct",
    quantization_config=bnb_config,
    device_map={"": 0},  # place the whole model on GPU 0
    # `dtype` replaces the deprecated `torch_dtype` argument
    # (the installed transformers version warns about the old name).
    dtype=torch.float16,
)

# Wrap the quantized base model with the adapter weights from the Hub.
model = PeftModel.from_pretrained(
    base_model,
    "nvidia/llama-3.1-nemoguard-8b-content-safety"
)

# Switch to evaluation mode for inference.
model.eval()

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

In [None]:
# Fetch the tokenizer of the base checkpoint (the same repo id the model
# weights are loaded from), requesting the fast implementation.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", use_fast=True)

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [None]:
# Build the prompt
def build_prompt(user_text: str) -> str:
    """Wrap a user message in the instruction prompt sent to the safety model.

    Args:
        user_text: The raw message to be classified. It is inserted verbatim
            between double quotes; no escaping is applied.

    Returns:
        The prompt string: an ``<s>[INST]`` header, the quoted message, the
        classification instruction, and a closing ``[/INST]`` line followed
        by a trailing newline.

    NOTE(review): the ``<s>[INST] ... [/INST]`` framing looks like a
    Llama-2-style template rather than the Llama 3.1 chat format; confirm
    against the model card that this is the prompt the adapter expects.
    """
    prompt_lines = (
        "<s>[INST]",
        "User message:",
        f'"{user_text}"',
        "",
        "Classify the above content for safety.",
        "Respond with a short verdict.",
        "[/INST]",
        "",  # yields the trailing newline after [/INST]
    )
    return "\n".join(prompt_lines)


In [None]:
# Test the model on user input
import torch

# Upper bound on generated tokens. The verdict can list several category
# names, and a 20-token cap cuts the answer off mid-label.
MAX_VERDICT_TOKENS = 64

text = "How can I hack ."

prompt = build_prompt(text)

# Tokenize the prompt and move the tensors to the device the model lives on.
inputs = tokenizer(
    prompt,
    return_tensors="pt",
    truncation=True
).to(model.device)

with torch.inference_mode():
    outputs = model.generate(
        **inputs,
        max_new_tokens=MAX_VERDICT_TOKENS,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )

# Skip the prompt tokens so only the newly generated text is decoded,
# then trim the surrounding whitespace the model tends to emit.
prompt_length = inputs["input_ids"].shape[-1]
result = tokenizer.decode(
    outputs[0][prompt_length:],
    skip_special_tokens=True
).strip()

print("Input:", text)
print("NemoGuard output:", result)


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: I want to create a bomb.
NemoGuard output:   "Violence, Criminal Planning/Confessions, Guns and Illegal Weapons, Controlled/Regulated
