>## Things that are new with Pytorch 2.0
### 1. `torch.compile` => operator fusion and graph capture to speed up training
### 2. `torch.set_default_device` or context manager `with torch.device(device)` => set device globally
### 3.  `TensorFloat32` => datatype that bridges float32 and float16

## possible improvements and extensions
- use more powerful CPUs and GPUs (improves data loading speeds)
- Use Automatic Mixed Precision training (AMP)
- transformer-based models may see larger speedups than convolutional ones (because of the optimized `scaled_dot_product_attention()`)
- train for longer 

(https://sebastianraschka.com/blog/2023/pytorch-faster.html)

In [2]:
import torch
import torchvision

# Report the installed library versions.
print(f"Pytorch version: {torch.__version__}")
print(f"Torchvision version: {torchvision.__version__}")

# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

Pytorch version: 2.0.1
Torchvision version: 0.15.2a0
Using device: cuda


In [20]:
# Make sure we're using a NVIDIA GPU.
# NOTE: the `!command` lines are IPython shell captures, so this cell only runs
# inside a notebook/IPython session, not as plain Python.
if torch.cuda.is_available():
  # Capture the nvidia-smi report and join its lines into a single string
  gpu_info = !nvidia-smi
  gpu_info = '\n'.join(gpu_info)
  if gpu_info.find("failed") >= 0:
    print("Not connected to a GPU, to leverage the best of PyTorch 2.0, you should connect to a GPU.")

  # Get GPU name
  gpu_name = !nvidia-smi --query-gpu=gpu_name --format=csv
  # Index 1 is the second captured line; presumably line 0 is the CSV header - TODO confirm
  gpu_name = gpu_name[1]
  GPU_NAME = gpu_name.replace(" ", "_") # replace spaces with underscores for easier saving
  print(f'GPU name: {GPU_NAME}')

  # Get GPU capability score (a tuple, compared element-wise against (8, 0) below)
  GPU_SCORE = torch.cuda.get_device_capability()
  print(f"GPU capability score: {GPU_SCORE}")
  if GPU_SCORE >= (8, 0):
    print(f"GPU score higher than or equal to (8, 0), PyTorch 2.x speedup features available.")
  else:
    print(f"GPU score lower than (8, 0), PyTorch 2.x speedup features will be limited (PyTorch 2.x speedups happen most on newer GPUs).")
  
  # Print GPU info
  print(f"GPU information:\n{gpu_info}")

else:
  # No CUDA device: gpu_info, gpu_name, GPU_NAME and GPU_SCORE are never defined on this path
  print("PyTorch couldn't find a GPU, to leverage the best of PyTorch 2.0, you should connect to a GPU.")

GPU name: NVIDIA_GeForce_GTX_1660_Ti_with_Max-Q_Design
GPU capability score: (7, 5)
GPU score lower than (8, 0), PyTorch 2.x speedup features will be limited (PyTorch 2.x speedups happen most on newer GPUs).
GPU information:
Thu Oct 19 17:00:12 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.113.01             Driver Version: 535.113.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce GTX 1660 ...    Off | 00000000:01:00.0 Off |                  N/A |
| N/A   58C    P8               7W /  60W |    198MiB /  6144MiB |      0%      Default |
|                      

## create model and transforms: ResNet 50

In [3]:
# Pin the ImageNet-1k V2 weights explicitly (DEFAULT is the best available set)
weights = torchvision.models.ResNet50_Weights.IMAGENET1K_V2
# Preprocessing preset that ships with these weights
transforms = weights.transforms()
transforms

ImageClassification(
    crop_size=[224]
    resize_size=[232]
    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]
    interpolation=InterpolationMode.BILINEAR
)

In [4]:
# Create a ResNet-50 initialised with the pretrained weights chosen in the previous cell
model = torchvision.models.resnet50(weights=weights)
model

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [5]:
# Count every parameter in the model, trainable or not.
# To count only the trainable ones, keep just the params whose `requires_grad` is set.
total_params = sum(p.numel() for p in model.parameters())
total_params

25557032

### Note: PyTorch 2.0 speedups will be most noticeable when a higher percentage of the GPU is being used. This means a larger model (more trainable params) may take longer to train overall but will be relatively faster.
### E.g. a model with 1M params may take 10 minutes to train, but a model with 25M params might take only 20 minutes because the GPU enables parallel computing.


In [6]:
def create_model(num_classes:int=10):
    """Create a pretrained ResNet-50 with a new classification head, plus its transforms.

    Loads ``ResNet50_Weights.DEFAULT``, builds the model from those weights and
    replaces the final fully connected layer (``model.fc``) with a fresh
    ``torch.nn.Linear`` that outputs ``num_classes`` values.

    Args:
        num_classes (int, optional): Number of outputs of the new head.
            Defaults to 10.

    Returns:
        A ``(model, transforms)`` tuple: the ResNet-50 with its replaced head
        and the preprocessing transforms obtained from the weights.
    """
    model_weights = torchvision.models.ResNet50_Weights.DEFAULT
    transforms = model_weights.transforms()
    model = torchvision.models.resnet50(weights=model_weights)

    # Size the new head from the layer it replaces rather than hard-coding 2048,
    # so the input width always matches the backbone.
    model.fc = torch.nn.Linear(in_features=model.fc.in_features,
                               out_features=num_classes)

    return model, transforms

model, transforms = create_model()

In [7]:
transforms

ImageClassification(
    crop_size=[224]
    resize_size=[232]
    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]
    interpolation=InterpolationMode.BILINEAR
)

## speedups are most noticeable when a large portion of the GPU is being used 
Since modern GPUs are so *fast* at performing operations, you will often notice the majority of *relative* speedups when as much data as possible is on the GPU.

In practice you generally want to use as much of your GPU memory as possible.

* increase the batch size - generally as large as possible; here we would ideally use 128
* increase the data size - for example, instead of using 32x32 images, you could use larger images or an increased embedding size for your data
* increase the model size - for example, instead of using a model with 1M params, use a model with 10M params
* decrease data transfer - since bandwidth costs (transferring data) will slow down a GPU (because it wants to compute on data)

As a result of doing the things above, your relative speedups should be better.
E.g. overall training time may be longer for smaller experiments, but larger experiments might take much less time because of parallelization.



### check available GPU memory and total GPU memory

In [8]:

# Query the current CUDA device for its free and total memory.
# The values are scaled by 1e-9 below so they can be reported in GB.
total_free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
print(f"Total free GPU memory: {round(total_free_gpu_memory * 1e-9,3)} GB")
print(f"Total  GPU memory: {round(total_gpu_memory * 1e-9,3)} GB")

Total free GPU memory: 6.14 GB
Total  GPU memory: 6.225 GB


### if the gpu has 16gb+ set the batch size to 128
### else set the batchsize to 32

In [9]:
# Express the free-memory reading in GB (3 decimal places) and size the workload from it:
# under 16 GB -> batch size 32 at 128x128, otherwise batch size 128 at 224x224.
total_free_gpu_memory_gb = round(total_free_gpu_memory * 1e-9, 3)
if total_free_gpu_memory_gb < 16:
    BATCH_SIZE, IMAGE_SIZE = 32, 128
    print(f"GPU memoery available is {total_free_gpu_memory_gb} GB, using batch size {BATCH_SIZE} and iamge size {IMAGE_SIZE}x{IMAGE_SIZE}")
else:
    BATCH_SIZE, IMAGE_SIZE = 128, 224
    print(f"GPU memory available is {total_free_gpu_memory_gb} GB, using batchsize of {BATCH_SIZE} and image size {IMAGE_SIZE}x{IMAGE_SIZE}")
    

GPU memoery available is 6.14 GB, using batch size 32 and iamge size 128x128


### Since we are gonna change the image size we need to update the transforms as well!

In [10]:
transforms

ImageClassification(
    crop_size=[224]
    resize_size=[232]
    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]
    interpolation=InterpolationMode.BILINEAR
)

In [11]:
# Override the preset's crop and resize sizes so transformed images come out
# at IMAGE_SIZE x IMAGE_SIZE instead of the preset's default crop.
transforms.crop_size = IMAGE_SIZE
transforms.resize_size = IMAGE_SIZE
transforms

ImageClassification(
    crop_size=128
    resize_size=128
    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]
    interpolation=InterpolationMode.BILINEAR
)

## more potential speedups with TF32

TF32 = TensorFloat32
TensorFloat32 = a datatype that bridges Float32 and Float16
Float32 = a number represented by 32 bits (e.g. 10101010101010101010101010101010; each '1' or '0' is one bit, 32 in total)
Float16 = a number represented by 16 bits (e.g. 0101001011010010; each '1' or '0' is one bit; 1 byte is 8 bits)

### what we want is :
1. Fast model training ( from float16)
2. Accurate model training (from float32)
TensorFloat32 = a datatype that combines float32 and float16

### prep the dataset CIFAR10

In [12]:
## create train and test datasets
import torchvision 
# Build the CIFAR10 train split first, then the test split. Both are stored in
# the current directory, downloaded if needed, and use the resized transforms.
train_dataset, test_dataset = (
    torchvision.datasets.CIFAR10(root=".",
                                 train=is_train_split,
                                 download=True,
                                 transform=transforms)
    for is_train_split in (True, False)
)

# Report how many samples each split holds
train_len, test_len = len(train_dataset), len(test_dataset)
print(f"[INFO] Train dataest length: {train_len}")
print(f"[INFO] Test dataest length: {test_len}")

Files already downloaded and verified
Files already downloaded and verified
[INFO] Train dataest length: 50000
[INFO] Test dataest length: 10000


### even though CIFAR10 images are 32x32, the transforms resize ours to IMAGE_SIZE x IMAGE_SIZE (128x128 here, or 224x224 on a GPU with 16GB+ of free memory)

In [13]:
transforms

ImageClassification(
    crop_size=128
    resize_size=128
    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]
    interpolation=InterpolationMode.BILINEAR
)

In [14]:
train_dataset

Dataset CIFAR10
    Number of datapoints: 50000
    Root location: .
    Split: Train
    StandardTransform
Transform: ImageClassification(
               crop_size=128
               resize_size=128
               mean=[0.485, 0.456, 0.406]
               std=[0.229, 0.224, 0.225]
               interpolation=InterpolationMode.BILINEAR
           )

In [15]:
train_dataset[0][0].shape

torch.Size([3, 128, 128])

In [16]:
from torch.utils.data import DataLoader
import os

# os.cpu_count() can return None when the CPU count cannot be determined.
# Fall back to 0 (load data in the main process) so DataLoader always gets an int.
NUM_WORKERS = os.cpu_count() or 0

# Training batches are reshuffled every epoch; test batches keep dataset order.
train_dataloader = DataLoader(dataset=train_dataset,
                              batch_size=BATCH_SIZE,
                              shuffle=True,
                              num_workers=NUM_WORKERS)
test_dataloader = DataLoader(dataset=test_dataset,
                             batch_size=BATCH_SIZE,
                             num_workers=NUM_WORKERS)

# Print details
print(f"Train dataloader: Num batches: {len(train_dataloader)} of batchsize: {BATCH_SIZE}")
print(f"Test dataloader: Num batches: {len(test_dataloader)} of batchsize: {BATCH_SIZE}")
print(f"Using num workers to load data (more is generally better): {NUM_WORKERS}")

Train dataloader: Num batches: 1563 of batchsize: 32
Test dataloader: Num batches: 313 of batchsize: 32
Using num workers to load data (more is generally better): 8


### creating training and test loops
where we can time each step

In [17]:
import time
from tqdm.auto import tqdm
from typing import Dict, List, Tuple

def train_step(epoch: int,
               model: torch.nn.Module,
               dataloader: torch.utils.data.DataLoader,
               loss_fn: torch.nn.Module,
               optimizer: torch.optim.Optimizer,
               device: torch.device,
               disable_progress_bar: bool = False) -> Tuple[float, float]:
  """Trains a PyTorch model for a single epoch.

  Turns a target PyTorch model to training mode and then
  runs through all of the required training steps (forward
  pass, loss calculation, optimizer step) for every batch,
  showing running averages on a tqdm progress bar.

  Args:
    epoch: Index of the current epoch; only used in the progress bar label.
    model: A PyTorch model to be trained.
    dataloader: A DataLoader instance for the model to be trained on.
    loss_fn: A PyTorch loss function to minimize.
    optimizer: A PyTorch optimizer to help minimize the loss function.
    device: A target device to compute on (e.g. "cuda" or "cpu").
    disable_progress_bar: If True, the tqdm progress bar is not displayed.

  Returns:
    A tuple of training loss and training accuracy metrics, each averaged
    over the number of batches.
    In the form (train_loss, train_accuracy). For example:

    (0.1112, 0.8743)
  """
  # Put model in train mode
  model.train()

  # Running sums of per-batch loss and per-batch accuracy
  train_loss, train_acc = 0, 0

  # Wrap the batch iterator so progress and running metrics are displayed
  progress_bar = tqdm(
        enumerate(dataloader),
        desc=f"Training Epoch {epoch}",
        total=len(dataloader),
        disable=disable_progress_bar
    )

  for batch, (X, y) in progress_bar:
      # Send data to target device
      X, y = X.to(device), y.to(device)

      # 1. Forward pass
      y_pred = model(X)

      # 2. Calculate and accumulate loss
      loss = loss_fn(y_pred, y)
      train_loss += loss.item()

      # 3. Optimizer zero grad
      optimizer.zero_grad()

      # 4. Loss backward
      loss.backward()

      # 5. Optimizer step
      optimizer.step()

      # Accumulate this batch's accuracy. Softmax is monotonic, so taking the
      # argmax of the raw logits picks the same class (as test_step does).
      y_pred_class = y_pred.argmax(dim=1)
      train_acc += (y_pred_class == y).sum().item()/len(y_pred)

      # Update progress bar with the averages over the batches seen so far
      progress_bar.set_postfix(
            {
                "train_loss": train_loss / (batch + 1),
                "train_acc": train_acc / (batch + 1),
            }
        )

  # Adjust metrics to get average loss and accuracy per batch
  train_loss = train_loss / len(dataloader)
  train_acc = train_acc / len(dataloader)
  return train_loss, train_acc

def test_step(epoch: int,
              model: torch.nn.Module,
              dataloader: torch.utils.data.DataLoader,
              loss_fn: torch.nn.Module,
              device: torch.device,
              disable_progress_bar: bool = False) -> Tuple[float, float]:
  """Tests a PyTorch model for a single epoch.

  Turns a target PyTorch model to "eval" mode and then performs
  a gradient-free forward pass over a testing dataset, showing
  running averages on a tqdm progress bar.

  Args:
    epoch: Index of the current epoch; only used in the progress bar label.
    model: A PyTorch model to be tested.
    dataloader: A DataLoader instance for the model to be tested on.
    loss_fn: A PyTorch loss function to calculate loss on the test data.
    device: A target device to compute on (e.g. "cuda" or "cpu").
    disable_progress_bar: If True, the tqdm progress bar is not displayed.

  Returns:
    A tuple of testing loss and testing accuracy metrics, each averaged
    over the number of batches.
    In the form (test_loss, test_accuracy). For example:

    (0.0223, 0.8985)
  """
  # Put model in eval mode
  model.eval()

  # Running sums of per-batch loss and per-batch accuracy
  test_loss, test_acc = 0, 0

  # Wrap the batch iterator so progress and running metrics are displayed
  progress_bar = tqdm(
      enumerate(dataloader),
      desc=f"Testing Epoch {epoch}",
      total=len(dataloader),
      disable=disable_progress_bar
  )

  # Disable gradient tracking for evaluation. torch.no_grad() is used rather
  # than torch.inference_mode() because the latter was found to raise errors
  # on PyTorch 2.0 (this function is also run on torch.compile'd models).
  with torch.no_grad():
      # Loop through DataLoader batches
      for batch, (X, y) in progress_bar:
          # Send data to target device
          X, y = X.to(device), y.to(device)

          # 1. Forward pass
          test_pred_logits = model(X)

          # 2. Calculate and accumulate loss
          loss = loss_fn(test_pred_logits, y)
          test_loss += loss.item()

          # Calculate and accumulate accuracy
          test_pred_labels = test_pred_logits.argmax(dim=1)
          test_acc += ((test_pred_labels == y).sum().item()/len(test_pred_labels))

          # Update progress bar with the averages over the batches seen so far
          progress_bar.set_postfix(
              {
                  "test_loss": test_loss / (batch + 1),
                  "test_acc": test_acc / (batch + 1),
              }
          )

  # Adjust metrics to get average loss and accuracy per batch
  test_loss = test_loss / len(dataloader)
  test_acc = test_acc / len(dataloader)
  return test_loss, test_acc

def train(model: torch.nn.Module,
          train_dataloader: torch.utils.data.DataLoader,
          test_dataloader: torch.utils.data.DataLoader,
          optimizer: torch.optim.Optimizer,
          loss_fn: torch.nn.Module,
          epochs: int,
          device: torch.device,
          disable_progress_bar: bool = False) -> Dict[str, List]:
  """Trains and tests a PyTorch model.

  Passes a target PyTorch models through train_step() and test_step()
  functions for a number of epochs, training and testing the model
  in the same epoch loop, and times each of the two steps.

  Calculates, prints and stores evaluation metrics throughout.

  Args:
    model: A PyTorch model to be trained and tested.
    train_dataloader: A DataLoader instance for the model to be trained on.
    test_dataloader: A DataLoader instance for the model to be tested on.
    optimizer: A PyTorch optimizer to help minimize the loss function.
    loss_fn: A PyTorch loss function to calculate loss on both datasets.
    epochs: An integer indicating how many epochs to train for.
    device: A target device to compute on (e.g. "cuda" or "cpu").
    disable_progress_bar: If True, no tqdm progress bars are displayed
      (the per-epoch summary line is still printed).

  Returns:
    A dictionary of training and testing loss, training and testing
    accuracy, and training and testing epoch durations in seconds.
    Each metric has a value in a list for each epoch.
    In the form: {train_loss: [...],
                  train_acc: [...],
                  test_loss: [...],
                  test_acc: [...],
                  train_epoch_time: [...],
                  test_epoch_time: [...]}
    For example if training for epochs=2:
                 {train_loss: [2.0616, 1.0537],
                  train_acc: [0.3945, 0.3945],
                  test_loss: [1.2641, 1.5706],
                  test_acc: [0.3400, 0.2973],
                  train_epoch_time: [249.28, 258.62],
                  test_epoch_time: [17.07, 16.39]}
  """
  # Create empty results dictionary
  results = {"train_loss": [],
      "train_acc": [],
      "test_loss": [],
      "test_acc": [],
      "train_epoch_time": [],
      "test_epoch_time": []
  }

  # Loop through training and testing steps for a number of epochs
  for epoch in tqdm(range(epochs), disable=disable_progress_bar):

      # Perform training step and time it. perf_counter() is a monotonic,
      # high-resolution clock, so system clock adjustments cannot skew durations.
      train_epoch_start_time = time.perf_counter()
      train_loss, train_acc = train_step(epoch=epoch,
                                        model=model,
                                        dataloader=train_dataloader,
                                        loss_fn=loss_fn,
                                        optimizer=optimizer,
                                        device=device,
                                        disable_progress_bar=disable_progress_bar)
      train_epoch_time = time.perf_counter() - train_epoch_start_time

      # Perform testing step and time it
      test_epoch_start_time = time.perf_counter()
      test_loss, test_acc = test_step(epoch=epoch,
                                      model=model,
                                      dataloader=test_dataloader,
                                      loss_fn=loss_fn,
                                      device=device,
                                      disable_progress_bar=disable_progress_bar)
      test_epoch_time = time.perf_counter() - test_epoch_start_time

      # Print out what's happening (epoch shown 1-based here)
      print(
          f"Epoch: {epoch+1} | "
          f"train_loss: {train_loss:.4f} | "
          f"train_acc: {train_acc:.4f} | "
          f"test_loss: {test_loss:.4f} | "
          f"test_acc: {test_acc:.4f} | "
          f"train_epoch_time: {train_epoch_time:.4f} | "
          f"test_epoch_time: {test_epoch_time:.4f}"
      )

      # Update results dictionary
      results["train_loss"].append(train_loss)
      results["train_acc"].append(train_acc)
      results["test_loss"].append(test_loss)
      results["test_acc"].append(test_acc)
      results["train_epoch_time"].append(train_epoch_time)
      results["test_epoch_time"].append(test_epoch_time)

  # Return the filled results at the end of the epochs
  return results

### time models across a single run

>## Experiment 1: single run without a `torch.compile()` for 5 epochs

**NOTE**: Depending on your GPU/machine, the following code may take a while to run; on an A100 it takes about 7 minutes.

In [22]:
# Experiment settings: number of epochs and optimizer learning rate
NUM_EPOCHS = 5
LEARNING_RATE = 0.003

# Create a fresh model (its transforms are not needed here) and move it to the target device
model, _ = create_model()
model.to(device)

# Loss function and optimizer
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(),
                             lr=LEARNING_RATE)
# Train the non-compiled model and keep the per-epoch metrics and timings
single_run_no_compile_results = train(model=model,
                                     train_dataloader=train_dataloader,
                                     test_dataloader=test_dataloader,
                                     loss_fn=loss_fn,
                                     optimizer=optimizer,
                                     epochs=NUM_EPOCHS,
                                     device=device)

  0%|          | 0/5 [00:00<?, ?it/s]

Training Epoch 0:   0%|          | 0/1563 [00:00<?, ?it/s]

Testing Epoch 0:   0%|          | 0/313 [00:00<?, ?it/s]

Epoch: 1 | train_loss: 1.1469 | train_acc: 0.5923 | test_loss: 0.9221 | test_acc: 0.6817 | train_epoch_time: 249.2805 | test_epoch_time: 17.0742


Training Epoch 1:   0%|          | 0/1563 [00:00<?, ?it/s]

Testing Epoch 1:   0%|          | 0/313 [00:00<?, ?it/s]

Epoch: 2 | train_loss: 0.6828 | train_acc: 0.7632 | test_loss: 0.6300 | test_acc: 0.7860 | train_epoch_time: 258.6229 | test_epoch_time: 16.3873


Training Epoch 2:   0%|          | 0/1563 [00:00<?, ?it/s]

Testing Epoch 2:   0%|          | 0/313 [00:00<?, ?it/s]

Epoch: 3 | train_loss: 0.5231 | train_acc: 0.8178 | test_loss: 0.5766 | test_acc: 0.8056 | train_epoch_time: 254.9817 | test_epoch_time: 17.5346


Training Epoch 3:   0%|          | 0/1563 [00:00<?, ?it/s]

Testing Epoch 3:   0%|          | 0/313 [00:00<?, ?it/s]

Epoch: 4 | train_loss: 0.4203 | train_acc: 0.8551 | test_loss: 0.5301 | test_acc: 0.8244 | train_epoch_time: 260.0587 | test_epoch_time: 17.8099


Training Epoch 4:   0%|          | 0/1563 [00:00<?, ?it/s]

Testing Epoch 4:   0%|          | 0/313 [00:00<?, ?it/s]

Epoch: 5 | train_loss: 0.3345 | train_acc: 0.8835 | test_loss: 0.4775 | test_acc: 0.8422 | train_epoch_time: 259.4417 | test_epoch_time: 17.6898


In [23]:
single_run_no_compile_results

{'train_loss': [1.1468667172119547,
  0.6827692999263185,
  0.5230546136573195,
  0.42029553274752157,
  0.33447302201129997],
 'train_acc': [0.5922904670505438,
  0.7631957773512476,
  0.8178182981445937,
  0.8550863723608445,
  0.8834572936660269],
 'test_loss': [0.9221364668192574,
  0.6299862993506197,
  0.5766392902444346,
  0.530137239244228,
  0.47754845021965026],
 'test_acc': [0.6817092651757188,
  0.7860423322683706,
  0.8056110223642172,
  0.8243809904153354,
  0.8421525559105432],
 'train_epoch_time': [249.28045415878296,
  258.6229043006897,
  254.98170471191406,
  260.0586841106415,
  259.44171261787415],
 'test_epoch_time': [17.074208736419678,
  16.38734269142151,
  17.53455877304077,
  17.809882640838623,
  17.689764499664307]}

>## experiment 2: single run with complied model

In [18]:
# Same settings as experiment 1 so the two runs are comparable
NUM_EPOCHS = 5
LEARNING_RATE = 0.003

# Create a fresh model and move it to the target device
model, _ = create_model()
model.to(device)

# Loss function and optimizer (built from the uncompiled model's parameters)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(),
                             lr=LEARNING_RATE)

# Time the torch.compile() call itself.
# NOTE: in the recorded run (PyTorch 2.0.1) this call raised
# "RuntimeError: Python 3.11+ not yet supported for torch.compile",
# so nothing below it executed and single_run_compile_results was never created.
import time
compile_start_time = time.time()
compiled_model = torch.compile(model=model)
compiled_model.to(device)
compile_end_time = time.time()
compile_time = compile_end_time - compile_start_time

print(f"[INFO] time to compile: {compile_time} | Note: first time you compile a model / train a compiled model the first epoch might take longer due to optimizations happening behind the scenes")
# Train the compiled model and keep the per-epoch metrics and timings
single_run_compile_results = train(model=compiled_model,
                                     train_dataloader=train_dataloader,
                                     test_dataloader=test_dataloader,
                                     loss_fn=loss_fn,
                                     optimizer=optimizer,
                                     epochs=NUM_EPOCHS,
                                     device=device)

RuntimeError: Python 3.11+ not yet supported for torch.compile

## compare the results of experiments 1 and 2

In [None]:
import pandas as pd
# Turn each results dictionary into a DataFrame with one row per epoch.
# The class is `pd.DataFrame`; `pd.Dataframe` does not exist and raises AttributeError.
single_run_no_compile_results_df = pd.DataFrame(single_run_no_compile_results)
single_run_compile_results_df = pd.DataFrame(single_run_compile_results)

In [None]:
single_run_compile_results_df

In [None]:
# Dataset and model names, used in plot titles and in the saved file names
DATASET_NAME = "CIFAR10"
MODEL_NAME = "ResNet50"

In [None]:


import matplotlib.pyplot as plt
import numpy as np

def plot_mean_epoch_times(non_compiled_results: pd.DataFrame,
                          compiled_results: pd.DataFrame,
                          multi_runs: bool=False,
                          num_runs: int=0,
                          save: bool=False,
                          save_path: str="",
                          dataset_name: str=DATASET_NAME,
                          model_name: str=MODEL_NAME,
                          num_epochs: int=NUM_EPOCHS,
                          image_size: int=IMAGE_SIZE,
                          batch_size: int=BATCH_SIZE,
                          gpu_name: str="") -> None:
    """Plot mean train/test epoch times for non-compiled vs compiled runs.

    Prints the percentage difference between the compiled and non-compiled mean
    epoch times, draws a grouped bar chart of the four means and optionally
    saves the figure.

    Args:
        non_compiled_results: DataFrame with `train_epoch_time` and
            `test_epoch_time` columns for the non-compiled model.
        compiled_results: DataFrame with the same columns for the compiled model.
        multi_runs: If True, the title says "Multiple run results" and
            includes `num_runs`; otherwise "Single run results".
        num_runs: Number of runs, shown in the title when `multi_runs` is True.
        save: If True, the figure is written to `save_path`.
        save_path: Destination file for the figure; must be non-empty when
            `save` is True.
        dataset_name: Dataset name shown in the title.
        model_name: Model name shown in the title.
        num_epochs: Number of epochs shown in the title.
        image_size: Image size shown in the title.
        batch_size: Batch size shown in the title.
        gpu_name: GPU name shown in the title. When empty, the module-level
            `gpu_name` is used if it exists, otherwise "unknown".

    Raises:
        AssertionError: If `save` is True and `save_path` is empty.
    """
    # The module-level `gpu_name` only exists when a CUDA GPU was detected, so
    # look it up lazily instead of failing with a NameError on CPU-only machines.
    if not gpu_name:
        gpu_name = globals().get("gpu_name", "unknown")

    # Get the mean epoch times from the non-compiled models
    mean_train_epoch_time = non_compiled_results.train_epoch_time.mean()
    mean_test_epoch_time = non_compiled_results.test_epoch_time.mean()
    mean_results = [mean_train_epoch_time, mean_test_epoch_time]

    # Get the mean epoch times from the compiled models
    mean_compile_train_epoch_time = compiled_results.train_epoch_time.mean()
    mean_compile_test_epoch_time = compiled_results.test_epoch_time.mean()
    mean_compile_results = [mean_compile_train_epoch_time, mean_compile_test_epoch_time]

    # Calculate the percentage difference between the mean compile and non-compile train epoch times
    train_epoch_time_diff = mean_compile_train_epoch_time - mean_train_epoch_time
    train_epoch_time_diff_percent = (train_epoch_time_diff / mean_train_epoch_time) * 100

    # Calculate the percentage difference between the mean compile and non-compile test epoch times
    test_epoch_time_diff = mean_compile_test_epoch_time - mean_test_epoch_time
    test_epoch_time_diff_percent = (test_epoch_time_diff / mean_test_epoch_time) * 100

    # Print the mean difference percentages
    print(f"Mean train epoch time difference: {round(train_epoch_time_diff_percent, 3)}% (negative means faster)")
    print(f"Mean test epoch time difference: {round(test_epoch_time_diff_percent, 3)}% (negative means faster)")

    # Create a bar plot of the mean train and test epoch time for both compiled and non-compiled models
    plt.figure(figsize=(10, 7))
    width = 0.3
    x_indicies = np.arange(len(mean_results))

    # Compiled bars are shifted right by one bar width so each pair sits side by side
    plt.bar(x=x_indicies, height=mean_results, width=width, label="non_compiled_results")
    plt.bar(x=x_indicies + width, height=mean_compile_results, width=width, label="compiled_results")
    plt.xticks(x_indicies + width / 2, ("Train Epoch", "Test Epoch"))
    plt.ylabel("Mean epoch time (seconds, lower is better)")

    # Create the title based on the parameters passed to the function
    if multi_runs:
        plt.suptitle("Multiple run results")
        plt.title(f"GPU: {gpu_name} | Epochs: {num_epochs} ({num_runs} runs) | Data: {dataset_name} | Model: {model_name} | Image size: {image_size} | Batch size: {batch_size}")
    else:
        plt.suptitle("Single run results")
        plt.title(f"GPU: {gpu_name} | Epochs: {num_epochs} | Data: {dataset_name} | Model: {model_name} | Image size: {image_size} | Batch size: {batch_size}")
    plt.legend()

    # Save the figure
    if save:
        assert save_path != "", "Please specify a save path to save the model figure to via the save_path parameter."
        plt.savefig(save_path)
        print(f"[INFO] Plot saved to {save_path}")

In [19]:
# Create the directory for saving figures in (no error if it already exists)
import os
dir_to_save_figures_in = "pytorch_2_results/figures/"
os.makedirs(dir_to_save_figures_in, exist_ok=True)


# Create a save path for the single run results.
# The file name encodes GPU, model, dataset and image size.
save_path_single_run = f"{dir_to_save_figures_in}single_run_{GPU_NAME}_{MODEL_NAME}_{DATASET_NAME}_{IMAGE_SIZE}_train_epoch_time.png"
print(f"[INFO] save path for single run results: {save_path_single_run}")

# Plot the results and save the figure
plot_mean_epoch_times(non_compiled_results=single_run_no_compile_results_df,
                      compiled_results=single_run_compile_results_df,
                      multi_runs=False,
                      save_path=save_path_single_run,
                      save=True
                      )


In [None]:
# Make a directory for single_run results (no error if it already exists)
import os
pytorch_2_results_dir = "pytorch_2_results"
pytorch_2_single_run_results_dir = f"{pytorch_2_results_dir}/single_run_results"
os.makedirs(pytorch_2_single_run_results_dir, exist_ok=True)

# Create filenames for each of the dataframes (dataset, model and GPU name are encoded in the name)
save_name_for_non_compiled_results = f"single_run_non_compiled_results_{DATASET_NAME}_{MODEL_NAME}_{GPU_NAME}.csv"
save_name_for_compiled_results = f"single_run_compiled_results_{DATASET_NAME}_{MODEL_NAME}_{GPU_NAME}.csv"

# Create filepaths to save the results to
single_run_no_compile_save_path = f"{pytorch_2_single_run_results_dir}/{save_name_for_non_compiled_results}"
single_run_compile_save_path = f"{pytorch_2_single_run_results_dir}/{save_name_for_compiled_results}"
print(f"[INFO] Saving non-compiled experiment 1 results to: {single_run_no_compile_save_path}")
print(f"[INFO] Saving compiled experiment 2 results to: {single_run_compile_save_path}")

# Write each results DataFrame to its CSV path
single_run_no_compile_results_df.to_csv(single_run_no_compile_save_path)
single_run_compile_results_df.to_csv(single_run_compile_save_path)

## 4. time models across multiple runs
Time for multi-run experiments:
- Experiment 3 - 3x5 epochs without torch.compile()
- Experiment 4 - 3x5 epochs with torch.compile()

Before running experiments 3 and 4, let's create 3 functions:
1. **Experiment 3:** `create_and_train_non_compiled_model()` - creates and trains a model (for a single run)
2. **Experiment 4:** `create_compiled_model()` - creates and compiles a model, returns the compiled model.
3. **Experiment 4:** `train_compiled_model()` - trains a compiled model for a single run (can be put in a loop to train for multiple runs)

In [None]:
def create_and_train_non_compiled_model(epochs=NUM_EPOCHS,
                                        learning_rate=LEARNING_RATE,
                                        disable_progress_bar=False):
    """Create a fresh, non-compiled model, train it and return the results.

    Args:
        epochs (int, optional): Number of epochs to train for.
            Defaults to NUM_EPOCHS.
        learning_rate (float, optional): Learning rate for the Adam optimizer.
            Defaults to LEARNING_RATE.
        disable_progress_bar (bool, optional): Passed through to train() to
            hide the progress bars. Defaults to False.

    Returns:
        The results dictionary produced by train().
    """
    # New model on the target device; the accompanying transforms are not needed here
    net, _ = create_model()
    net.to(device)

    criterion = torch.nn.CrossEntropyLoss()
    adam = torch.optim.Adam(params=net.parameters(), lr=learning_rate)

    # Train and test on the notebook-level dataloaders
    return train(model=net,
                 train_dataloader=train_dataloader,
                 test_dataloader=test_dataloader,
                 loss_fn=criterion,
                 optimizer=adam,
                 epochs=epochs,
                 device=device,
                 disable_progress_bar=disable_progress_bar)

def create_compiled_model():
    """Create a model, wrap it with `torch.compile()` and return the result.

    The model comes from `create_model()` and is moved to the notebook-level
    `device` before compiling. The elapsed time printed here covers only the
    `torch.compile()` call itself.

    Returns:
        The object returned by `torch.compile()` for the new model.
    """
    # Only the model is needed; the second value returned by create_model() is discarded.
    base_model, _ = create_model()
    base_model.to(device)

    started_at = time.time()
    compiled = torch.compile(base_model)
    compile_time = time.time() - started_at

    print(f"Time to compile: {compile_time} | Note: The first time you compile your model, the first few epochs will be slower than subsequent runs.")
    return compiled

def train_compiled_model(model=compiled_model,
                         epochs=NUM_EPOCHS,
                         learning_rate=LEARNING_RATE,
                         disable_progress_bar=False):
    """Train a compiled model for a single run and return the results.

    The model is trained with cross-entropy loss and the Adam optimizer on the
    notebook-level `train_dataloader` / `test_dataloader`, using the
    notebook-level `device`.

    Args:
        model: The (compiled) model to train. Defaults to the object the
            global `compiled_model` referred to when this function was
            defined; pass the model explicitly to train a different one.
        epochs: Number of epochs forwarded to `train()`. Defaults to the value
            NUM_EPOCHS had when this function was defined.
        learning_rate: Learning rate given to Adam. Defaults to the value
            LEARNING_RATE had when this function was defined.
        disable_progress_bar: Forwarded unchanged to `train()`. Defaults to False.

    Returns:
        Whatever `train()` returns for this run.

    Note:
        This function does not create or re-initialise the model, so calling
        it repeatedly with the same `model` keeps training the same object
        (unless `train()` resets it - not visible from here).
    """
    loss_fn = torch.nn.CrossEntropyLoss()
    # Build the optimizer from the `model` argument rather than the global
    # `compiled_model`, so the parameters being stepped are always those of
    # the model that is actually trained.
    optimizer = torch.optim.Adam(params=model.parameters(),
                                 lr=learning_rate)

    compile_results = train(model=model,
                            train_dataloader=train_dataloader,
                            test_dataloader=test_dataloader,
                            loss_fn=loss_fn,
                            optimizer=optimizer,
                            epochs=epochs,
                            device=device,
                            disable_progress_bar=disable_progress_bar)

    return compile_results


>## Experiment 3: multiple runs with no compile
**NOTE:** Because we are running multiple runs, the code below may take a while to run. If a single run takes ~7 minutes on an A100, the following could take about 20 minutes on an A100.

In [None]:
import time

# NOTE(review): this cell sits under the "experiment 3 (no compile)" heading but
# compiles the model and rebinds `single_run_compile_results` - confirm it is intended here.

# Run configuration
NUM_EPOCHS, NUM_RUNS = 5, 3
LEARNING_RATE = 0.003

# Fresh model on the target device (second value from create_model() is unused)
model, _ = create_model()
model.to(device)

# Loss function and optimizer
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Time the compile step; the interval covers torch.compile() and the device move
compile_start_time = time.time()
compiled_model = torch.compile(model=model)
compiled_model.to(device)
compile_end_time = time.time()
compile_time = compile_end_time - compile_start_time

print(f"[INFO] time to compile: {compile_time} | Note: first time you compile a model / train a compiled model the first epoch might take longer due to optimizations happening behind the scenes")

# Train the compiled model and keep what train() returns
single_run_compile_results = train(
    model=compiled_model,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    loss_fn=loss_fn,
    optimizer=optimizer,
    epochs=NUM_EPOCHS,
    device=device,
)

In [None]:
# Experiment 3 settings: number of independent runs and epochs per run
NUM_RUNS, NUM_EPOCHS = 3, 5

# One entry per run, in run order
non_compile_results_multiple_runs = []

# Each iteration builds and trains a brand-new non-compiled model
for i in tqdm(range(NUM_RUNS)):
    print(f"[INFO] Run {i+1} of {NUM_RUNS} for non-compiled model")
    results = create_and_train_non_compiled_model(
        epochs=NUM_EPOCHS,
        disable_progress_bar=False,
    )
    non_compile_results_multiple_runs += [results]

In [None]:
# Turn each run's results into its own DataFrame, then stack them into one frame
non_compile_results_df = [pd.DataFrame(run_results)
                          for run_results in non_compile_results_multiple_runs]
non_compile_results_multiple_runs_df = pd.concat(non_compile_results_df)

# Average across runs: rows that share the same index label are grouped together
non_compile_results_multiple_runs_df = (
    non_compile_results_multiple_runs_df
    .groupby(level=0)
    .mean()
)
non_compile_results_multiple_runs_df

>## Experiment 4: multiple runs with a compiled model


In [None]:
# Compile once up front; the same compiled model object is passed to every run below
compiled_model = create_compiled_model()

# One entry per run, in run order
compiled_results_multiple_runs = []

for i in tqdm(range(NUM_RUNS)):
    print(f"[INFO] Run {i+1} of {NUM_RUNS} for compiled model")
    # The model is not re-created or re-compiled between runs
    results = train_compiled_model(
        model=compiled_model,
        epochs=NUM_EPOCHS,
        disable_progress_bar=True,
    )
    compiled_results_multiple_runs += [results]


In [None]:
# Turn each compiled run's results into its own DataFrame, then stack them into one frame
compile_results_dfs = [pd.DataFrame(run_results)
                       for run_results in compiled_results_multiple_runs]
compile_results_multiple_runs_df = pd.concat(compile_results_dfs)

# Average across runs: rows that share the same index label are grouped together
# (presumably the index is the epoch number - confirm against train()'s output)
compile_results_multiple_runs_df = (
    compile_results_multiple_runs_df
    .groupby(level=0)
    .mean()
)
compile_results_multiple_runs_df

## Compare results of experiments 3 and 4

In [None]:
# Make sure the output directory for the multi-run figure exists
os.makedirs("pytorch_2_results/figures", exist_ok=True)

# File name encodes GPU, model, dataset and image size for this multi-run comparison
save_path_multi_run = f"pytorch_2_results/figures/multi_run_{GPU_NAME}_{MODEL_NAME}_{DATASET_NAME}_{IMAGE_SIZE}_train_epoch_time.png"

# Plot the averaged results of experiment 3 (non-compiled) against experiment 4 (compiled) and save the figure
plot_kwargs = {
    "non_compiled_results": non_compile_results_multiple_runs_df,
    "compiled_results": compile_results_multiple_runs_df,
    "multi_runs": True,
    "num_runs": NUM_RUNS,
    "save_path": save_path_multi_run,
    "save": True,
}
plot_mean_epoch_times(**plot_kwargs)

## possible improvements and extensions
- Use more powerful CPUs and GPUs (improves data loading speeds)
- Use Automatic Mixed Precision (AMP) training
- Transformer-based models may see larger baseline speedups than convolutional models (because of the optimized `scaled_dot_product_attention()`)
- train for longer 

(https://sebastianraschka.com/blog/2023/pytorch-faster.html)