## All imports

In [2]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


## Load tokenizer and model weights

In [3]:
# Single checkpoint name used for both loads, so the tokenizer and the model
# weights come from the same checkpoint.
model_id = "Qwen/Qwen3-0.6B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

Loading weights: 100%|██████████| 311/311 [00:00<00:00, 608.50it/s, Materializing param=model.norm.weight]                              


## Basic inspection of the loaded model

In [4]:
# Print the module tree of the loaded model (sub-module names, layer types and sizes).
print(model)

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (up_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (down_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
        (post_attention_layer

## Write a basic generation loop

In [7]:
# Prompt used for the demo call at the end of this cell.
prompt = "what is capital of united states?"
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Keep the model on `device`; the generation loop creates its input tensor on the same device.
model = model.to(device)
def generate_basic_loop(prompts, max_tokens = 100, eos_token_id = None):
    '''
    Greedily generate a continuation for a prompt, one token at a time.

    At every step the full sequence so far (prompt plus tokens generated up to
    that point) is fed to the model, the highest-scoring token at the last
    position is selected, and that token is appended to the sequence.

    Uses the notebook-level `tokenizer`, `model` and `device`.

    Args:
        prompts: The prompt to continue; it is passed to `tokenizer.encode`
            and handled as a single sequence (batch size 1).
        max_tokens: Maximum number of new tokens to generate.
        eos_token_id: Optional token id that ends generation early. With the
            default `None`, exactly `max_tokens` tokens are generated.

    Returns:
        List of generated token ids, prompt tokens excluded. If generation
        stopped on `eos_token_id`, that id is the last element.
    '''
    # Encode the argument that was passed in, not the notebook-level `prompt`.
    input_list_ids = tokenizer.encode(prompts)
    # unsqueeze(0) adds the leading batch dimension: (seq_len,) -> (1, seq_len).
    input_tensor_ids = torch.tensor(input_list_ids, device = device).unsqueeze(0)
    response_list_ids = []

    # Inference only: disable autograd so no graph state is kept for the forward passes.
    with torch.no_grad():
        for _ in range(max_tokens):
            logits = model(input_tensor_ids).logits
            # Greedy choice from the logits at the last sequence position; shape (1, 1).
            next_token = logits[:, -1, :].argmax(dim = -1, keepdim = True)
            next_token_id = next_token.item()
            response_list_ids.append(next_token_id)

            if eos_token_id is not None and next_token_id == eos_token_id:
                break

            input_tensor_ids = torch.cat([input_tensor_ids, next_token], dim = 1)
    return response_list_ids
# Decode the token ids returned by the loop back into text and print them.
print(tokenizer.decode(generate_basic_loop(prompt)))






 a. washington b. washington dc c. washington dc d. washington
Answer:
The capital of the United States is **Washington, D.C.**. The answer is **B**.

**Explanation:**
The United States is a country located in the United States, and its capital is Washington, D.C. The capital is also known as the capital city of the country. The capital city is also known as the capital of the United States. The answer is **B**.

**Answer


In [None]:
# Show the token id(s) the tokenizer produces for the '<|endoftext|>' string.
tokenizer.encode('<|endoftext|>')

[151643]

In [None]:
prompt = "what is the capital of india?"

# Tokenize directly to PyTorch tensors and move them onto `device`.
inputs = tokenizer(prompt, return_tensors="pt").to(device)
# Reference run using the model's own generate() method.
out = model.generate(max_new_tokens=50, **inputs)
# Drop the prompt positions of the first sequence so only the newly generated tokens are decoded.
print(tokenizer.decode(out[0, inputs["input_ids"].size(1):]))

?
The answer is India's capital. Let me know if I can help.
The answer is India's capital.
The answer is India's capital.
The answer is India's capital.
The answer is India's capital.
The answer is India's capital


In [None]:
prompt = "How do i win a game?"

# Build both representations first: the plain list of token ids and its tensor form.
input_list_ids = tokenizer.encode(prompt)
input_tensor_ids = torch.tensor(input_list_ids)

# Show the ids, then the tensor's shape (1-D here, since no batch dimension was added).
print(input_list_ids)
print(input_tensor_ids.size())


[4340, 653, 600, 3164, 264, 1809, 30]
torch.Size([7])
