<a href="https://colab.research.google.com/github/sachinthadilshann/pytorch_course_by_DanielBourke/blob/main/CIFAR10_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torchvision

print(f"PyTorch version: {torch.__version__}")
print(f"TorchVision version: {torchvision.__version__}")

# Set the target device
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Using device: {device}")

PyTorch version: 2.9.0+cu126
TorchVision version: 0.24.0+cu126
Using device: cuda


In [2]:
# Create model weights and transforms
model_weights = torchvision.models.ResNet50_Weights.IMAGENET1K_V2 # <- use the latest weights (could also use .DEFAULT)
transforms = model_weights.transforms()

# Setup model
model = torchvision.models.resnet50(weights=model_weights)

# Count the number of parameters in the model
total_params = sum(
    param.numel() for param in model.parameters()
)

print(f"Total parameters of model: {total_params} (the more parameters, the more GPU memory the model will use, the more *relative* of a speedup you'll get)")
print(f"Model transforms:\n{transforms}")

Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth


100%|██████████| 97.8M/97.8M [00:00<00:00, 158MB/s]


Total parameters of model: 25557032 (the more parameters, the more GPU memory the model will use, the more *relative* of a speedup you'll get)
Model transforms:
ImageClassification(
    crop_size=[224]
    resize_size=[232]
    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]
    interpolation=InterpolationMode.BILINEAR
)


In [3]:
def create_model(num_classes=10):

  model_weights = torchvision.models.ResNet50_Weights.IMAGENET1K_V2
  transforms = model_weights.transforms()
  model = torchvision.models.resnet50(weights=model_weights)

  model.fc = torch.nn.Linear(in_features=2048,
                             out_features=num_classes)
  return model, transforms

model, transforms = create_model()

In [4]:
import torch

total_free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
print(f"Total free GPU memory: {round(total_free_gpu_memory * 1e-9, 3)} GB")
print(f"Total GPU memory: {round(total_gpu_memory * 1e-9, 3)} GB")

Total free GPU memory: 15.72 GB
Total GPU memory: 15.828 GB


In [5]:

total_free_gpu_memory_gb = round(total_free_gpu_memory * 1e-9, 3)
if total_free_gpu_memory_gb >= 16:
  BATCH_SIZE = 128
  IMAGE_SIZE = 224
  print(f"GPU memory available is {total_free_gpu_memory_gb} GB, using batch size of {BATCH_SIZE} and image size {IMAGE_SIZE}")
else:
  BATCH_SIZE = 32
  IMAGE_SIZE = 128
  print(f"GPU memory available is {total_free_gpu_memory_gb} GB, using batch size of {BATCH_SIZE} and image size {IMAGE_SIZE}")

GPU memory available is 15.72 GB, using batch size of 32 and image size 128


In [6]:
transforms.crop_size = IMAGE_SIZE
transforms.resize_size = IMAGE_SIZE
print(f"Updated data transforms:\n{transforms}")

Updated data transforms:
ImageClassification(
    crop_size=128
    resize_size=128
    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]
    interpolation=InterpolationMode.BILINEAR
)


In [8]:
import torch
import torchvision

train_dataset = torchvision.datasets.CIFAR10(root='.',
                                             train=True,
                                             download=True,
                                             transform=torchvision.transforms.ToTensor())

test_datset = torchvision.datasets.CIFAR10(root='.',
                                           train=False,
                                           download=True,
                                           transform=torchvision.transforms.ToTensor())



100%|██████████| 170M/170M [00:03<00:00, 48.1MB/s]


In [9]:
train_len = len(train_dataset)
test_len = len(test_datset)

print(f"Train dataset length: {train_len}")
print(f"Test dataset length: {test_len}")

Train dataset length: 50000
Test dataset length: 10000


In [10]:
train_dataset[0][1]

6

In [18]:
from torch.utils.data import DataLoader
import os

NUM_WOKERS= os.cpu_count()
NUM_WOKERS

2

In [20]:


train_dataloader = DataLoader(dataset=train_dataset,
                             batch_size=BATCH_SIZE,
                             shuffle=True,
                             num_workers=NUM_WOKERS)

test_dataloader = DataLoader(dataset=test_datset,
                             batch_size=BATCH_SIZE,
                             shuffle=True,
                             num_workers=NUM_WOKERS)

print(f"Train dataloader length: {len(train_dataloader)} batches of size {BATCH_SIZE}")
print(f"Test dataloader length: {len(test_dataloader)} batches of size {BATCH_SIZE}")
print(f"Using number of workers: {NUM_WORKERS} (generally more workers means faster dataloading from CPU to GPU)")

Train dataloader length: 1563 batches of size 32
Test dataloader length: 313 batches of size 32
Using number of workers: 2 (generally more workers means faster dataloading from CPU to GPU)


In [21]:
from prompt_toolkit.shortcuts import progress_bar
import time
from tqdm.auto import tqdm
from typing import Dict, List, Tuple
import torch

def train_step(epoch: int,
               model: torch.nn.Module,
               dataloader: torch.utils.data.DataLoader,
               loss_fn:torch.nn.Module,
               optimizer: torch.optim.Optimizer,
               device: torch.device,
               disable_progress_bar: bool = False) -> Tuple[float,float]:

  model.train()

  train_loss,train_acc = 0,0

  ProgressBar = tqdm(enumerate(dataloader),
                   desc=f"Training Epoch {epoch}",
                   total = len(dataloader),
                   disable = disable_progress_bar )


  for batch,(X,y) in ProgressBar:

    X,y = X.to(device), y.to(device)

    y_pred = model(X)

    loss = loss_fn(y_pred,y)
    train_loss += loss.item()

    optimizer.zero_grad()

    loss.backward()

    optimizer.step()

    y_pred_class = torch.argmax(torch.softmax(y_pred,dim=1),dim=1)
    train_acc += (y_pred_class == y).sum().item()/len(y_pred)

    ProgressBar.set_postfix(
            {
                "train_loss": train_loss / (batch + 1),
                "train_acc": train_acc / (batch + 1),
            }
        )


  train_loss  = train_loss / len(dataloader)
  train_acc = train_acc / len(dataloader)

  return train_loss, train_acc


def test_step(  epoch:int,
                model: torch.nn.Module,
                dataloader: torch.utils.data.DataLoader,
                loss_fn: torch.nn.Module,
                device: torch.device,
                disable_progress_bar: bool = False)  -> Tuple[float,float]:

    model.eval()

    test_loss, test_acc = 0,0

    progress_bar = tqdm(enumerate(dataloader),
                        desc=f"Testing Epoch {epoch}",
                        total=len(dataloader),
                        disable=disable_progress_bar)

    with torch.inference_mode():
      for batch,(X,y) in progress_bar:
        X,y = X.to(device), y.to(device)

        test_pred_logits = model(X)

        loss = loss_fn(test_pred_logits,y)
        test_loss += loss.item()

        test_pred_labels = test_pred_logits.argmax(dim=1)
        test_acc += ((test_pred_labels == y).sum().item()/len(test_pred_labels))

        progress_bar.set_postfix(
            {
                  "test_loss": test_loss / (batch + 1),
                  "test_acc": test_acc / (batch + 1),
              }
          )

      test_loss = test_loss/len(dataloader)
      test_acc = test_acc/len(dataloader)
      return test_loss, test_acc


def train(model: torch.nn.Module,
           train_dataloader: torch.utils.data.DataLoader,
           test_dataloader: torch.utils.data.DataLoader,
           optimizer: torch.optim.Optimizer,
           loss_fn: torch.nn.Module,
           epochs: int,
           device: torch.device,
           disable_progress_bar: bool = False) -> Dict[str, List]:


  results = {"train_loss": [],
               "train_acc": [],
               "test_loss": [],
               "test_acc": [],
               "training_epochs_time":[],
               "testing_epochs_time":[]
    }

  for epoch in tqdm(range(epochs),disable=disable_progress_bar):

      train_epochs_start_time = time.time()
      train_loss, train_acc = train_step(epoch=epoch,
                                         model = model,
                                         dataloader = train_dataloader,
                                         loss_fn = loss_fn,
                                         optimizer = optimizer,
                                         device = device,
                                         disable_progress_bar = disable_progress_bar)


      train_epochs_end_time = time.time()
      train_epoch_time = train_epochs_end_time - train_epochs_start_time

      test_epoch_start_time = time.time()
      test_loss, test_acc = test_step(epoch=epoch,
                                      model=model,
                                      dataloader=test_dataloader,
                                      loss_fn=loss_fn,
                                      device=device,
                                      disable_progress_bar=disable_progress_bar)

      test_epoch_end_time = time.time()
      test_epoch_time = test_epoch_end_time - test_epoch_start_time


      print(
          f"Epoch: {epoch+1} | "
          f"train_loss: {train_loss:.4f} | "
          f"train_acc: {train_acc:.4f} | "
          f"test_loss: {test_loss:.4f} | "
          f"test_acc: {test_acc:.4f} | "
          f"train_time: {train_epoch_time:.3f} | "
          f"test_time: {test_epoch_time:.3f}"
      )


      results["train_loss"].append(train_loss)
      results["train_acc"].append(train_acc)
      results["test_loss"].append(test_loss)
      results["test_acc"].append(test_acc)
      results["training_epochs_time"].append(train_epoch_time)
      results["testing_epochs_time"].append(test_epoch_time)


  return results

In [22]:
NUM_EPOCHS = 1
LEARNING_RATE = 0.003


model,transforms = create_model()
model.to(device)

loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),
                             lr=LEARNING_RATE)

compile_start_time = time.time()

compiled_model = torch.compile(model)

compile_end_time = time.time()
compile_time = compile_end_time - compile_start_time
print(f"Model compiled in {compile_time:.3f} seconds")

single_run_compile_results = train(model = compiled_model,
                                   train_dataloader = train_dataloader,
                                   test_dataloader = test_dataloader,
                                   loss_fn = loss_fn,
                                   optimizer = optimizer,
                                   epochs = NUM_EPOCHS,
                                   device = device)


Model compiled in 0.003 seconds


  0%|          | 0/1 [00:00<?, ?it/s]

Training Epoch 0:   0%|          | 0/1563 [00:00<?, ?it/s]

  return torch._C._get_cublas_allow_tf32()
W0202 05:19:27.315000 1747 torch/_inductor/utils.py:1558] [0/0] Not enough SMs to use max_autotune_gemm mode


Testing Epoch 0:   0%|          | 0/313 [00:00<?, ?it/s]

Epoch: 1 | train_loss: 1.8682 | train_acc: 0.3143 | test_loss: 1.4875 | test_acc: 0.4409 | train_time: 197.316 | test_time: 23.259


In [23]:
import pandas as pd

result_single_df = pd.DataFrame(single_run_compile_results)
result_single_df

Unnamed: 0,train_loss,train_acc,test_loss,test_acc,training_epochs_time,testing_epochs_time
0,1.868204,0.314279,1.487527,0.440895,197.315534,23.25905


In [36]:
def create_compiled_model():
  model,transforms = create_model()
  model.to(device)

  compiled_model = torch.compile(model)
  return compiled_model


def train_compiled_model(model=compiled_model,
                         epochs = NUM_EPOCHS,
                         lerninig_rate = LEARNING_RATE,
                         disable_progress_bar = False):




  loss_fn = torch.nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(),
                               lr=LEARNING_RATE)


  results = train(model = compiled_model,
                  train_dataloader = train_dataloader,
                  test_dataloader = test_dataloader,
                  loss_fn = loss_fn,
                  optimizer = optimizer,
                  epochs = NUM_EPOCHS,
                  device = device,
                  disable_progress_bar = disable_progress_bar)
  return results



In [33]:
NUM_RUNS = 5
NUM_EPOCHS = 1



In [37]:
model = create_compiled_model()


compiled_results_multiple_runs = []

# Run compiled model for multiple runs
for i in tqdm(range(NUM_RUNS)):
    print(f"[INFO] Run {i+1} of {NUM_RUNS} for compiled model")

    # Train the compiled model (note: the model will only be compiled once and then re-used for subsequent runs)
    results = train_compiled_model(model=model, epochs=NUM_EPOCHS, disable_progress_bar=True)
    compiled_results_multiple_runs.append(results)

  0%|          | 0/5 [00:00<?, ?it/s]

[INFO] Run 1 of 5 for compiled model
Epoch: 1 | train_loss: 1.4955 | train_acc: 0.4455 | test_loss: 1.4286 | test_acc: 0.4561 | train_time: 51.643 | test_time: 3.759
[INFO] Run 2 of 5 for compiled model
Epoch: 1 | train_loss: 1.4954 | train_acc: 0.4435 | test_loss: 1.4350 | test_acc: 0.4566 | train_time: 70.165 | test_time: 6.062
[INFO] Run 3 of 5 for compiled model
Epoch: 1 | train_loss: 1.4938 | train_acc: 0.4449 | test_loss: 1.4316 | test_acc: 0.4602 | train_time: 86.463 | test_time: 5.697
[INFO] Run 4 of 5 for compiled model
Epoch: 1 | train_loss: 1.4958 | train_acc: 0.4446 | test_loss: 1.4300 | test_acc: 0.4578 | train_time: 82.523 | test_time: 6.369
[INFO] Run 5 of 5 for compiled model
Epoch: 1 | train_loss: 1.4945 | train_acc: 0.4443 | test_loss: 1.4424 | test_acc: 0.4554 | train_time: 82.602 | test_time: 5.684


In [38]:
compile_result = []

for result in compiled_results_multiple_runs:
  result_df = pd.DataFrame(result)
  compile_result.append(result_df)
compile_result = pd.concat(compile_result)

compile_result

Unnamed: 0,train_loss,train_acc,test_loss,test_acc,training_epochs_time,testing_epochs_time
0,1.495548,0.445457,1.428556,0.45607,51.642557,3.758856
0,1.495445,0.443518,1.43501,0.456569,70.16506,6.062267
0,1.493838,0.444938,1.431582,0.460164,86.46332,5.69651
0,1.495776,0.444618,1.430012,0.457768,82.522761,6.368743
0,1.494525,0.444278,1.442443,0.455371,82.601627,5.683835
