In [1]:
from huggingface_hub import whoami

print(whoami())  # confirms your saved token is being used

{'type': 'user', 'id': '6507ef88423b46492ee03543', 'name': 'rayhu', 'fullname': 'Ray Hu', 'email': 'rayhu007@gmail.com', 'emailVerified': True, 'canPay': True, 'periodEnd': 1756684799, 'isPro': False, 'avatarUrl': '/avatars/e88381835e647dbe02fbd97654539e8d.svg', 'orgs': [{'type': 'org', 'id': '636025b83605bd411c1889d9', 'name': 'Stanford', 'fullname': 'Stanford AI', 'email': None, 'canPay': False, 'periodEnd': None, 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/604d2f473050a33ebb17ef51/Z-vDTyG_6-yZhzfXklqAK.jpeg', 'roleInOrg': 'contributor', 'isEnterprise': False}, {'type': 'org', 'id': '66087bf0abd8bd25eeffdf83', 'name': 'surfski', 'fullname': 'Surfski', 'email': None, 'canPay': False, 'periodEnd': None, 'avatarUrl': 'https://www.gravatar.com/avatar/4fa1051e443bad5dc9df6937156a08bf?d=retro&size=100', 'roleInOrg': 'admin', 'isEnterprise': False}], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'jupyter-note', 'role': 'write', 'createdAt': '2025

In [2]:
# from huggingface_hub import login
# login()

In [3]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0))

CUDA available: True
GPU name: NVIDIA GeForce RTX 3090


In [4]:
device = "cuda"
model_id = "meta-llama/Llama-3.2-3B-Instruct"

In [5]:
# # Use a pipeline as a high-level helper
# from transformers import pipeline

# pipe = pipeline(
#     "text-generation", 
#     device=0, 
#     model=model_id
# )

# # torch_dtype=torch.bfloat16,      # bfloat16/float16 automatically if GPU supports it
# # device=0,               # 0 = first GPU, -1 = CPU
# # device_map="auto",       # puts model on GPU if available, or 

In [6]:

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,   # or torch.float16
    device_map="auto",            # puts model on GPU
    low_cpu_mem_usage=True,
    attn_implementation="sdpa"    # memory-efficient attention
).eval()

model = model.to(device)   # or .to("cuda") or .to("cuda:0")
print("Model device:", model.device)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model device: cuda:0


In [8]:
prompt="User: What is 2+2?"

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

outputs = model.generate(
    **inputs,
    max_new_tokens=300,
    temperature=0.7,
    pad_token_id=tokenizer.eos_token_id,
    top_p=0.9
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

User: What is 2+2? 
This is a simple math problem that can be answered with a straightforward calculation.
2 + 2 = 4
Therefore, the answer is 4.


In [9]:
text_input= "Explain the difference between supervised and unsupervised learning:"
inputs = tokenizer(text_input, return_tensors="pt").to(model.device)

outputs = model.generate(
    **inputs,
    max_new_tokens=500,
    temperature=0.7,
    pad_token_id=tokenizer.eos_token_id,
    top_p=0.9
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Explain the difference between supervised and unsupervised learning: supervised learning is a type of machine learning in which a model is trained on labeled data, whereas unsupervised learning is a type of machine learning in which a model is trained on unlabeled data. In supervised learning, the model is trained to predict a specific output based on the input data, whereas in unsupervised learning, the model is trained to identify patterns or relationships in the data without making any predictions. This distinction is important because it affects how the model is trained, evaluated, and used in practice.
In supervised learning, the model is trained on labeled data, meaning that the data is already annotated or labeled with the correct output. The goal of the model is to learn a mapping between the input data and the output labels, so that it can make accurate predictions on new, unseen data. Supervised learning is commonly used in applications such as image classification, speech re

In [10]:
# Get logits (raw next-token scores)
with torch.no_grad():
    logits = model(**inputs).logits
    print("Logits shape:", logits.shape)   # [batch, seq_len, vocab_size]

# Convert last token logits to probabilities
probs = torch.nn.functional.softmax(logits[0, -1], dim=-1)
topk = torch.topk(probs, k=5)
print("Top-5 predicted tokens:", [tokenizer.decode([i]) for i in topk.indices.tolist()])


Logits shape: torch.Size([1, 13, 128256])
Top-5 predicted tokens: [' Superv', ' \n', ' supervised', ' The', ' \n\n']


In [11]:
raise RuntimeError("⚠️ Manual only: uncomment this line to run cleanup")

import gc, torch

del model
gc.collect()
torch.cuda.empty_cache()

RuntimeError: ⚠️ Manual only: uncomment this line to run cleanup