In [1]:
from dataloaderlite import DataLoaderLite
from transformers import AutoTokenizer
import torch

# Choose the compute device: CUDA if present, else Apple MPS, else the CPU.
if torch.cuda.is_available():
    device = 'cuda'
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = "mps"
else:
    device = 'cpu'
print(f"using device: {device}")

# Seed the default generator, and the CUDA generator when CUDA was selected
# ('cuda' is chosen above exactly when torch.cuda.is_available() is true).
torch.manual_seed(1337)
if device == 'cuda':
    torch.cuda.manual_seed(1337)

# Sampling settings; not referenced by the cells shown in this section.
num_return_sequences = 5
max_length = 30

# Training batches; B * T is the per-step token count used for throughput later.
train_loader = DataLoaderLite(B=16, T=512)

# Fixed prompt, tokenized once and moved to the selected device so it can be
# reused for every sample generated later in the notebook.
fixed_text = "This is a fixed text used for prediction."
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer")
encoded_text = tokenizer(fixed_text, return_tensors="pt")
encoded_text = encoded_text.to(device)
print (f"Encoded Text {encoded_text}")

using device: cuda
loaded 341094 tokens
1 epoch = 41 batches
Encoded Text {'input_ids': tensor([[ 1348,   314,   253,  6450,  1694,   804,   327, 12435,    30]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}


In [2]:
# Imports needed from this cell onward: `time` for step timing, and the model
# classes from the local deepseek_smollm2 module.
import time

from deepseek_smollm2 import DeepSeekConfig, DeepSeekModel

# Instantiate the model from a default-constructed configuration.
model = DeepSeekModel(DeepSeekConfig())

# Multi-GPU wrapping is left disabled:
#if torch.cuda.device_count() > 1:
#    print(f"Using {torch.cuda.device_count()} GPUs!")
#    model = torch.nn.DataParallel(model)  # Enable Multi-GPU

# Move the model to the selected device, then replace it with its compiled wrapper.
model.to(device)
model = torch.compile(model)

# AdamW over every model parameter with a fixed learning rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

In [3]:
# Training loop
#
# Runs `total_steps` optimizer steps with the forward pass under bfloat16
# autocast, prints loss and throughput for every step, samples a continuation
# of the fixed prompt every `prediction_interval` steps, and writes a single
# checkpoint once the loop has finished.
total_steps = 10000
prediction_interval = 500
checkpoint_path = "llm_checkpoint.pt"

# Sampling settings for the periodic predictions; constant for the whole run,
# so they are set once here instead of on every prediction.
max_new_tokens = 10
temperature = 1.0
top_k = 40

for step in range(total_steps):
    t0 = time.time()
    x, y = train_loader.next_batch()
    x, y = x.to(device), y.to(device)
    optimizer.zero_grad()
    # Only the forward pass runs inside autocast; backward and the optimizer
    # step happen outside the context.
    with torch.autocast(device_type=device, dtype=torch.bfloat16):
        logits, loss = model(x, y)
    loss.backward()
    optimizer.step()
    # Wait for queued GPU work so the timing below covers the full step.
    # Guarded because `device` may be 'cpu' or 'mps', where calling
    # torch.cuda.synchronize() would fail instead of being a no-op.
    if device == 'cuda':
        torch.cuda.synchronize()
    t1 = time.time()
    dt = (t1 - t0) * 1000  # step duration in milliseconds
    tokens_per_sec = (train_loader.B * train_loader.T) / (t1 - t0)
    print(f'step{step} | loss: {loss.item()} | dt: {dt:.2f}ms | tok/sec: {tokens_per_sec: .2f}')

    # Sample from the model every `prediction_interval` steps.
    if (step + 1) % prediction_interval == 0:
        model.eval()
        try:
            with torch.no_grad():
                # NOTE(review): despite the name, the result is passed straight to
                # tokenizer.decode, so it is presumably token ids rather than logits.
                prediction_logits = model.generate(
                    encoded_text["input_ids"],
                    max_new_tokens=max_new_tokens,
                    temperature=temperature,
                    top_k=top_k,
                )
            prediction = tokenizer.decode(prediction_logits[0], skip_special_tokens=True)
            print(f"Prediction at step {step+1}: \n : {prediction} \n")
        finally:
            # Always return to training mode, even if generation or decoding raised,
            # so later steps never run silently in eval mode.
            model.train()

# Save one checkpoint (model + optimizer state) after all `total_steps` steps.
torch.save({"model_state_dict": model.state_dict(), "optimizer_state_dict": optimizer.state_dict()}, checkpoint_path)
print(f"Checkpoint saved to {checkpoint_path}")



W0226 22:12:58.266887 107630 site-packages/torch/_dynamo/variables/tensor.py:776] [0/0] Graph break from `Tensor.item()`, consider setting:
W0226 22:12:58.266887 107630 site-packages/torch/_dynamo/variables/tensor.py:776] [0/0]     torch._dynamo.config.capture_scalar_outputs = True
W0226 22:12:58.266887 107630 site-packages/torch/_dynamo/variables/tensor.py:776] [0/0] or:
W0226 22:12:58.266887 107630 site-packages/torch/_dynamo/variables/tensor.py:776] [0/0]     env TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS=1
W0226 22:12:58.266887 107630 site-packages/torch/_dynamo/variables/tensor.py:776] [0/0] to include these operations in the captured graph.
W0226 22:12:58.266887 107630 site-packages/torch/_dynamo/variables/tensor.py:776] [0/0] 
W0226 22:12:58.266887 107630 site-packages/torch/_dynamo/variables/tensor.py:776] [0/0] Graph break: from user code at:
W0226 22:12:58.266887 107630 site-packages/torch/_dynamo/variables/tensor.py:776] [0/0]   File "/home/xpz2/tsai/S13/deepseek_smollm2.py", line 3

step0 | loss: 15.23127269744873 | dt: 9193.81ms | tok/sec:  891.03
step1 | loss: 11.866670608520508 | dt: 429.33ms | tok/sec:  19080.94
step2 | loss: 10.49077320098877 | dt: 423.09ms | tok/sec:  19362.27
step3 | loss: 9.664892196655273 | dt: 451.48ms | tok/sec:  18144.58
step4 | loss: 9.38666820526123 | dt: 490.73ms | tok/sec:  16693.35
step5 | loss: 9.80578327178955 | dt: 516.71ms | tok/sec:  15854.22
step6 | loss: 10.225263595581055 | dt: 487.89ms | tok/sec:  16790.66
step7 | loss: 9.731541633605957 | dt: 507.34ms | tok/sec:  16146.93
step8 | loss: 9.041110038757324 | dt: 486.20ms | tok/sec:  16849.12
step9 | loss: 8.883980751037598 | dt: 511.15ms | tok/sec:  16026.47
step10 | loss: 8.641397476196289 | dt: 485.64ms | tok/sec:  16868.58
step11 | loss: 8.930558204650879 | dt: 500.43ms | tok/sec:  16369.82
step12 | loss: 9.2169189453125 | dt: 486.89ms | tok/sec:  16825.09
step13 | loss: 8.78663444519043 | dt: 511.43ms | tok/sec:  16017.80
step14 | loss: 8.667610168457031 | dt: 486.54ms 

In [5]:
# Generate five sampled continuations of the fixed prompt with the current model.
# The sampling settings do not change between samples, so they are set once.
max_new_tokens = 10
temperature = 1.0
top_k = 40

# Switch to eval mode once for the whole batch of samples instead of toggling
# eval/train on every iteration.
model.eval()
try:
    with torch.no_grad():
        for i in range(5):
            # NOTE(review): despite the name, the result is passed straight to
            # tokenizer.decode, so it is presumably token ids rather than logits.
            prediction_logits = model.generate(
                encoded_text["input_ids"],
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_k=top_k,
            )
            prediction = tokenizer.decode(prediction_logits[0], skip_special_tokens=True)
            print(f"Prediction output {i+1}: \n : {prediction} \n")
finally:
    # Restore training mode even if generation fails, so a later training cell
    # does not run with the model silently left in eval mode.
    model.train()

Prediction output 1: 
 : This is a fixed text used for prediction.
IET:
MENRY VI: 

Prediction output 2: 
 : This is a fixed text used for prediction.
BUCIO:
KING EDW 

Prediction output 3: 
 : This is a fixed text used for prediction. I have we are not my brother? Come, 

Prediction output 4: 
 : This is a fixed text used for prediction.
I fear,

And all, the 

Prediction output 5: 
 : This is a fixed text used for prediction.
Why, I
A:
A greater 



In [9]:
# Load checkpoint and train for another 50 steps
#
# map_location=device remaps the saved tensors onto the device selected for
# this session, so a checkpoint written on a GPU can be loaded on a machine
# without one. weights_only=True restricts unpickling to tensors and plain
# containers, which is all this checkpoint holds (two state dicts), and avoids
# the torch.load warning about the unsafe default.
checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=True)
model.load_state_dict(checkpoint["model_state_dict"])
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

print("Loaded checkpoint. Resuming training for another 50 steps.")

for step in range(50):
    t0 = time.time()

    # Training batch
    x, y = train_loader.next_batch()
    x, y = x.to(device), y.to(device)
    optimizer.zero_grad()
    # Only the forward pass runs under bfloat16 autocast.
    with torch.autocast(device_type=device, dtype=torch.bfloat16):
        logits, loss = model(x, y)
    loss.backward()
    optimizer.step()
    # Wait for queued GPU work so the timing covers the full step. Guarded
    # because `device` may be 'cpu' or 'mps', where calling
    # torch.cuda.synchronize() would fail instead of being a no-op.
    if device == 'cuda':
        torch.cuda.synchronize()

    t1 = time.time()
    dt = (t1 - t0) * 1000  # Time in milliseconds
    tokens_per_sec = (train_loader.B * train_loader.T) / (t1 - t0)

    print(f"Step {step+1} (After Resuming) | Loss: {loss.item():.4f} | dt: {dt:.2f}ms | tok/sec: {tokens_per_sec:.2f}")

# Show the loss tensor from the final resumed step.
print(loss)

# step13 | loss: 7.214234352111816 | dt: 2418.43ms | tok/sec:  3387.32
# step13 | loss: 7.214259624481201 | dt: 1337.68ms | tok/sec:  6124.05
# step13 | loss: 7.330005645751953 | dt: 978.19ms | tok/sec:  8374.65
# step13 | loss: 7.012979507446289 | dt: 1032.97ms | tok/sec:  15861.12
# step13 | loss: 6.9644927978515625 | dt: 1004.06ms | tok/sec:  16317.75

  checkpoint = torch.load(checkpoint_path)


Loaded checkpoint. Resuming training for another 50 steps.
Step 1 (After Resuming) | Loss: 3.2190 | dt: 334.46ms | tok/sec: 48986.25
Step 2 (After Resuming) | Loss: 2.9984 | dt: 326.77ms | tok/sec: 50139.09
Step 3 (After Resuming) | Loss: 3.1723 | dt: 355.79ms | tok/sec: 46049.37
Step 4 (After Resuming) | Loss: 3.0268 | dt: 330.31ms | tok/sec: 49601.80
Step 5 (After Resuming) | Loss: 2.9755 | dt: 326.42ms | tok/sec: 50193.76
Step 6 (After Resuming) | Loss: 2.9315 | dt: 350.14ms | tok/sec: 46792.09
Step 7 (After Resuming) | Loss: 3.0457 | dt: 331.34ms | tok/sec: 49447.40
Step 8 (After Resuming) | Loss: 2.9824 | dt: 330.66ms | tok/sec: 49550.08
Step 9 (After Resuming) | Loss: 3.1363 | dt: 352.96ms | tok/sec: 46419.38
Step 10 (After Resuming) | Loss: 3.0489 | dt: 329.30ms | tok/sec: 49754.61
Step 11 (After Resuming) | Loss: 3.0183 | dt: 328.53ms | tok/sec: 49870.55
Step 12 (After Resuming) | Loss: 3.0337 | dt: 354.19ms | tok/sec: 46257.36
Step 13 (After Resuming) | Loss: 2.9782 | dt: 329.