In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# What device will we run on?
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using device: {device}")

# We'll use a small model first to see mechanics clearly
model_name = "microsoft/phi-2"  # 2.7B params, ~6GB, fast on your hardware

tokenizer = AutoTokenizer.from_pretrained(model_name)
print(f"Vocabulary size: {tokenizer.vocab_size}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: mps
Vocabulary size: 50257


In [3]:
print(f"Loading {model_name}...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # half precision - why?
    trust_remote_code=True
)
model = model.to(device)
print(f"Model loaded on {device}")
print(f"Parameters: {sum(p.numel() for p in model.parameters()) / 1e9:.2f}B")

`torch_dtype` is deprecated! Use `dtype` instead!


Loading microsoft/phi-2...


Fetching 2 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:50<00:00, 25.48s/it]
Loading weights: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 453/453 [00:00<00:00, 2384.04it/s, Materializing param=model.layers.31.self_attn.v_proj.weight]


Model loaded on mps
Parameters: 2.78B


In [4]:
prompt = "The party enters a dark dungeon. The dungeon master describes:"

# Tokenize - this is what the model actually sees
inputs = tokenizer(prompt, return_tensors="pt").to(device)
print(f"Input token IDs: {inputs['input_ids']}")
print(f"Token count: {inputs['input_ids'].shape[1]}")
print(f"Tokens decoded individually:")
for token_id in inputs['input_ids'][0]:
    print(f"  {token_id.item()} → '{tokenizer.decode([token_id])}'")

Input token IDs: tensor([[  464,  2151, 14170,   257,  3223, 18974,    13,   383, 18974,  4958,
          8477,    25]], device='mps:0')
Token count: 12
Tokens decoded individually:
  464 → 'The'
  2151 → ' party'
  14170 → ' enters'
  257 → ' a'
  3223 → ' dark'
  18974 → ' dungeon'
  13 → '.'
  383 → ' The'
  18974 → ' dungeon'
  4958 → ' master'
  8477 → ' describes'
  25 → ':'


In [5]:
with torch.no_grad():
    output = model.generate(
        **inputs,
        max_new_tokens=100,
        temperature=0.8,
        do_sample=True,
        top_p=0.9,
    )

# Decode just the new tokens
new_tokens = output[0][inputs['input_ids'].shape[1]:]
print(f"New token IDs: {new_tokens}")
print(f"\nGenerated text:")
print(tokenizer.decode(new_tokens, skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


New token IDs: tensor([  921,  3802,   257,  3223,   290, 21151, 18974,    13,   383,  1633,
          318,  6546,   351,   262,  8508,   286, 22119,   290, 15936,    13,
          921,  3285,   262,  2128,   286, 37472,  1660,   290, 18107, 30346,
          286,  6941,   504,   290,  7128,   504,    13,   921,   766,   257,
        18107,  1657,   287,   262,  5253,    11,  2406,   422,   257,  1588,
         3420,   379,   262,   886,   286,   262, 20749,    13,  2141,   345,
          765,   284,    25,   317,     8,  6062,   262,  3420,   290,   766,
          644,   338,  2641,    13,   347,     8, 41401,   262, 18974,  2252,
           13,   327,     8,  6756,   736,   290,   804,   329,  1194,   835,
          503,    13,   198, 15037, 45532,    25,   317,     8,  6062,   262],
       device='mps:0')

Generated text:
 You enter a dark and damp dungeon. The air is thick with the smell of decay and mold. You hear the sound of dripping water and faint echoes of moans and groans. Yo

In [6]:
# Clear previous model from GPU memory first
del model
torch.mps.empty_cache()

model_name = "microsoft/phi-2"  # we'll switch to instruct variant
# Actually let's use a proper instruct model
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.float16,
    trust_remote_code=True
)
model = model.to(device)
print(f"Parameters: {sum(p.numel() for p in model.parameters()) / 1e9:.2f}B")

Fetching 3 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [01:47<00:00, 35.89s/it]
Loading weights: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 291/291 [00:00<00:00, 297.45it/s, Materializing param=model.norm.weight]


Parameters: 7.24B


In [7]:
prompt = """<s>[INST] You are a Dungeon Master for a D&D campaign. 
Describe the entrance to an ancient dwarven fortress in 3 sentences. 
Be atmospheric and specific. [/INST]"""

inputs = tokenizer(prompt, return_tensors="pt").to(device)
print(f"Input tokens: {inputs['input_ids'].shape[1]}")

with torch.no_grad():
    output = model.generate(
        **inputs,
        max_new_tokens=150,
        temperature=0.8,
        do_sample=True,
        top_p=0.9,
    )

new_tokens = output[0][inputs['input_ids'].shape[1]:]
print(tokenizer.decode(new_tokens, skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Input tokens: 49
Nestled deep within the heart of the Misty Mountains, the ancient dwarven fortress of Thundertop is shrouded in an aura of mystery and intrigue. The entrance, a massive, ornately carved stone door, is hidden behind a waterfall, its thunderous roar masking the sounds of the fortress within. The air is thick with the scent of damp moss and the faint, metallic tang of iron, while the dim, flickering light of moss-covered lanterns cast eerie, dancing shadows upon the damp, moss-covered walls. As adventurers approach, they cannot help but feel a sense of awe and trepidation, knowing that they are


In [8]:
prompt = """<s>[INST] You are a Dungeon Master for a D&D campaign. 
Describe the entrance to an ancient dwarven fortress in exactly 3 sentences.
End your response with </description> [/INST]"""

inputs = tokenizer(prompt, return_tensors="pt").to(device)

with torch.no_grad():
    output = model.generate(
        **inputs,
        max_new_tokens=200,
        temperature=0.8,
        do_sample=True,
        top_p=0.9,
        stop_strings=["</description>"],
        tokenizer=tokenizer,
    )

new_tokens = output[0][inputs['input_ids'].shape[1]:]
print(tokenizer.decode(new_tokens, skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Nestled deep within the heart of the Frostfell Mountains, the entrance to the ancient dwarven fortress of Thundertop is concealed by a cascading waterfall, its thundering roar masking the faint hum of magical wards that keep intruders at bay. Carved from living stone, the imposing iron-bound doors bear the emblem of the Stonehammer Clan, with runes that pulse with an inner light, guarding the threshold to the realm of the stout-hearted dwarves. </description>


In [9]:
del model
del tokenizer
torch.mps.empty_cache()
print("GPU memory freed")

GPU memory freed
