In [1]:
from transformers import (
    AutoModelForCausalLM, AutoTokenizer
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
from transformers import BitsAndBytesConfig

In [3]:
# Local checkpoint directory handed to both the model and tokenizer `from_pretrained` calls.
# Written without a trailing slash: with "models/qwen1.5b/" the last path component is
# empty, which is what the "module name  (originally ) is not a valid Python identifier"
# warning emitted while loading the model is complaining about.
model_id = "models/qwen1.5b"

In [4]:
# 4-bit bitsandbytes quantization settings: NF4 quantization type, double quantization,
# and bfloat16 as the dtype used for computation.
# The compute-dtype keyword is `bnb_4bit_compute_dtype`; a misspelled keyword such as
# `bnb_4bit_compute_type` is not a BitsAndBytesConfig parameter and would leave the
# compute dtype at its default.
# NOTE(review): none of the visible cells pass this config to `from_pretrained` —
# confirm where it is meant to be applied.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [5]:
# Sanity check before touching the GPU: ask PyTorch whether a CUDA device is usable
# and echo the answer as the cell result.
cuda_available = torch.cuda.is_available()
cuda_available

True

In [6]:
# Load the causal language model from the local checkpoint directory and move it to the GPU.
# `trust_remote_code` is left at its default: the printed model is a Qwen2ForCausalLM,
# an architecture that ships with transformers, so there is no need to allow Python
# files stored next to the weights to be executed.
# NOTE(review): `quantization_config` is not passed here, so this call does not request
# any quantization — confirm whether that is intended.
model = AutoModelForCausalLM.from_pretrained(model_id).to('cuda')

The module name  (originally ) is not a valid Python identifier. Please rename the original module to avoid import issues.


In [7]:
# Build the tokenizer from the same checkpoint directory that the model was loaded from.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_id,
)

In [8]:
# Tokenize a single prompt and request PyTorch tensors ("pt") as the return format.
inputs = tokenizer(
    text="Explain the theory of nudging in the context of behavioral science.",
    return_tensors="pt",
)

In [9]:
# Shell escape: run nvidia-smi to inspect GPU memory usage after the model was moved to CUDA.
# NOTE(review): the `!` escape forks a subprocess after the tokenizer has already been
# used, which is what triggers the huggingface/tokenizers parallelism warning in this
# cell's output; setting TOKENIZERS_PARALLELISM before tokenizing would silence it.
!nvidia-smi

Mon Dec 22 12:38:55 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.119.02             Driver Version: 580.119.02     CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 5090        Off |   00000000:01:00.0 Off |                  N/A |
|  0%   30C    P1             72W /  450W |    6606MiB /  32607MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+----------------------------------------------

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [10]:
# Move every tensor produced by the tokenizer onto the GPU, keeping the same keys,
# then echo the resulting plain dict as the cell result.
inputs = dict(
    (name, tensor.to('cuda'))
    for name, tensor in inputs.items()
)
inputs

{'input_ids': tensor([[  840, 20772,   279, 10126,   315, 41441,  3173,   304,   279,  2266,
            315, 35595,  8038,    13]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [11]:
# Generate a continuation of the prompt; no generation arguments are overridden here,
# only the tokenized inputs are forwarded as keyword arguments.
outputs = model.generate(**inputs)

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


In [12]:
# Copy the generated token ids to host memory and show them as a list holding
# one NumPy array per generated sequence.
[sequence_ids for sequence_ids in outputs.cpu().numpy()]

[array([   840,  20772,    279,  10126,    315,  41441,   3173,    304,
           279,   2266,    315,  35595,   8038,     13,    451,  87312,
           374,    264,  35595,   8038,   5486,    429,  21538,    311,
         10173,   1251,    594,   7709,    553,   3259,    432,   8661,
           369,   1105,    311,   1281,    279,   1290,  11454,     13,
          1084,    374,   3118,    389,    279,   4522,    429,   1251,
           525,   3545,    537,   7853,    315,    279,  11454,    807,
          1281,    476,    279,  15917,    315,   1846,  11454,     11,
           323,    429,   2613,   4344,    304,    279,   4573,    646,
          2990,    311,   5089,   4344,    304,   7709,    382,    785,
         10126,    315,  41441,   3173,    572,   1156,  11523,    553,
         11867,    663,  13111,    323,  25442,   8059,  12429,    304,
           862,   2311,    330,     45,  19561,     25,  21961,   4405,
          3714,   6805,   9975,   6267,     11,  59741,     11, 

In [13]:
# Decode the first generated sequence back into text.
# `decode` accepts a tensor of token ids directly, so there is no need to round-trip
# through a list of NumPy arrays before indexing.
tokenizer.decode(outputs[0])

'Explain the theory of nudging in the context of behavioral science. Nudging is a behavioral science approach that aims to influence people\'s behavior by making it easier for them to make the right choices. It is based on the idea that people are often not aware of the choices they make or the consequences of those choices, and that small changes in the environment can lead to significant changes in behavior.\n\nThe theory of nudging was first introduced by Richard Thaler and Cass Sunstein in their book "Nudge: Improving Decisions About Health, Wealth, and Happiness." The authors argue that people are often influenced by their environment, and that small changes in the way decisions are presented can lead to better outcomes.\n\nNudging involves using small changes in the environment to encourage people to make the right choices. For example, a company might offer a discount on a product if a customer signs up for a loyalty program, or a government might provide incentives for people t

In [14]:
# Display the loaded model's module tree (sub-module names, layer types and sizes).
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((1536,), eps=1e-06)
    (rotar