>## Things that are new with Pytorch 2.0
### 1. `torch.compile` => operator fusion and graph capture to speed up training
### 2. `torch.set_default_device` or context manager `with torch.device(device)` => set device globally
### 3.  `TensorFloat32` => datatype that bridges float32 and float16

In [2]:
import torch
import torchvision

# Record the library versions this notebook was run with.
print(f"Pytorch version: {torch.__version__}")
print(f"Torchvision version: {torchvision.__version__}")

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

Pytorch version: 2.0.1
Torchvision version: 0.15.2a0
Using device: cuda


## create model and transforms: ResNet 50

In [3]:
# Use the IMAGENET1K_V2 checkpoint explicitly (ResNet50_Weights.DEFAULT selects
# the best available weights).
weights = torchvision.models.ResNet50_Weights.IMAGENET1K_V2

# Each weights entry bundles the preprocessing preset it was trained with.
transforms = weights.transforms()

# Bare last expression so the notebook displays the preset's repr.
transforms

ImageClassification(
    crop_size=[224]
    resize_size=[232]
    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]
    interpolation=InterpolationMode.BILINEAR
)

In [4]:
# Build a ResNet-50 initialised from the `weights` entry chosen earlier.
model = torchvision.models.resnet50(weights=weights)
# Bare last expression: the notebook displays the module's repr.
model

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [5]:
# Count every parameter element in the model.
# To count only trainable parameters, append `if p.requires_grad` to the
# generator expression below.
total_params = sum(p.numel() for p in model.parameters())
total_params

25557032

### Note: PyTorch 2.0 speedups will be most noticeable when a higher percentage of the GPU is being used. This means a larger model (more trainable params) may take longer to train overall, but will be relatively faster.
### E.g. a model with 1M params may take 10 minutes to train, but a model with 25M params might take only 20 minutes, because the GPU enables parallel computing.


In [6]:
def create_model(num_classes:int=10):
    """Create a pretrained ResNet-50 with a new classification head.

    Loads ``ResNet50_Weights.DEFAULT``, builds ``resnet50`` from those weights
    and replaces the final ``fc`` layer with a new ``torch.nn.Linear`` that
    outputs ``num_classes`` values.

    Args:
        num_classes (int, optional): Number of outputs of the new ``fc``
            layer. Defaults to 10.

    Returns:
        tuple: ``(model, transforms)`` where ``model`` is the modified
        ResNet-50 and ``transforms`` is the preprocessing preset obtained
        from the weights via ``model_weights.transforms()``.
    """
    model_weights = torchvision.models.ResNet50_Weights.DEFAULT
    transforms = model_weights.transforms()
    model = torchvision.models.resnet50(weights=model_weights)

    # Take the head's input width from the existing layer instead of
    # hard-coding 2048, so the new head always matches the backbone.
    head_in_features = model.fc.in_features
    model.fc = torch.nn.Linear(in_features=head_in_features, out_features=num_classes)

    return model, transforms

# Rebinds the notebook-level `model` and `transforms` (previously set from the
# IMAGENET1K_V2 weights) to the versions returned by create_model().
model, transforms = create_model()

In [7]:
# Display the preprocessing preset currently bound to `transforms`.
transforms

ImageClassification(
    crop_size=[224]
    resize_size=[232]
    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]
    interpolation=InterpolationMode.BILINEAR
)

## Speedups are most noticeable when a large portion of the GPU is being used
Since modern GPUs are so *fast* at performing operations, you will often notice the majority of *relative* speedups when as much data as possible is on the GPU.

In practice you generally want to use as much of your GPU memory as possible.

* Increase the batch size - generally as large as possible; here we would ideally use 128
* Increase the data size - for example, instead of using 32x32 inputs, you could use an increased image/embedding size for your data
* Increase the model size - for example, instead of using a model with 1M params, use a model with 10M params
* Decrease data transfer - bandwidth costs (transferring data) will slow down a GPU (because it wants to compute on data)

As a result of doing the things above, your relative speedups should be better.
E.g. overall training time may be longer for smaller experiments, but larger experiments might take much less time because of parallelization.



### check available GPU memory and total GPU memory

In [8]:

# torch.cuda.mem_get_info() reports (free_bytes, total_bytes) for the current
# CUDA device. It cannot be called without CUDA, so guard it: `device` may
# have fallen back to the CPU.
if torch.cuda.is_available():
    total_free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
else:
    # No GPU: report zero so later memory-based choices pick the small settings.
    total_free_gpu_memory, total_gpu_memory = 0, 0

# Bytes -> decimal gigabytes (1e-9), rounded for display.
print(f"Total free GPU memory: {round(total_free_gpu_memory * 1e-9,3)} GB")
print(f"Total  GPU memory: {round(total_gpu_memory * 1e-9,3)} GB")

Total free GPU memory: 6.14 GB
Total  GPU memory: 6.225 GB


### If the GPU has 16 GB+ of free memory, set the batch size to 128 (and the image size to 224)
### Otherwise set the batch size to 32 (and the image size to 128)

In [19]:
# Free GPU memory in decimal gigabytes, rounded to 3 places for the check and message.
total_free_gpu_memory_gb = round(total_free_gpu_memory * 1e-9,3)

# Below 16 GB free: use the small settings; otherwise use the large ones.
if total_free_gpu_memory_gb < 16:
    BATCH_SIZE, IMAGE_SIZE = 32, 128
    print(f"GPU memoery available is {total_free_gpu_memory_gb} GB, using batch size {BATCH_SIZE} and iamge size {IMAGE_SIZE}x{IMAGE_SIZE}")
else:
    BATCH_SIZE, IMAGE_SIZE = 128, 224
    print(f"GPU memory available is {total_free_gpu_memory_gb} GB, using batchsize of {BATCH_SIZE} and image size {IMAGE_SIZE}x{IMAGE_SIZE}")
    

GPU memoery available is 6.14 GB, using batch size 32 and iamge size 128x128


### Since we are gonna change the image size we need to update the transforms as well!

In [20]:
# Inspect the current preprocessing settings (crop/resize sizes, mean, std).
transforms

ImageClassification(
    crop_size=224
    resize_size=224
    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]
    interpolation=InterpolationMode.BILINEAR
)

In [21]:
# Set the preset's crop and resize sizes to the image size chosen for this GPU.
# Note: this mutates the existing `transforms` object in place.
transforms.crop_size = transforms.resize_size = IMAGE_SIZE
transforms

ImageClassification(
    crop_size=128
    resize_size=128
    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]
    interpolation=InterpolationMode.BILINEAR
)

## more potential speedups with TF32

* TF32 = TensorFloat32
* TensorFloat32 = a datatype that bridges Float32 and Float16
* Float32 = a number is represented by 32 bits (e.g. `10101010101010101010101010101010`; each `1` or `0` is one bit)
* Float16 = a number is represented by 16 bits (e.g. `0101001010100101`; each `1` or `0` is one bit; 1 byte is 8 bits)

### What we want is:
1. Fast model training (from float16)
2. Accurate model training (from float32)

TensorFloat32 = a datatype that combines float32 and float16

### prep the dataset CIFAR10

In [23]:
# Build the CIFAR10 train and test splits. Both are stored under the current
# directory, downloaded on demand, and share the same `transforms`.
import torchvision

train_dataset = torchvision.datasets.CIFAR10(
    root=".", train=True, download=True, transform=transforms
)
test_dataset = torchvision.datasets.CIFAR10(
    root=".", train=False, download=True, transform=transforms
)

# Report how many samples each split contains.
train_len, test_len = len(train_dataset), len(test_dataset)
print(f"[INFO] Train dataest length: {train_len}")
print(f"[INFO] Test dataest length: {test_len}")

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [03:40<00:00, 773865.73it/s] 


Extracting ./cifar-10-python.tar.gz to .
Files already downloaded and verified
[INFO] Train dataest length: 50000
[INFO] Test dataest length: 10000


### Even though CIFAR10 images are 32x32, our transformed images will be `IMAGE_SIZE` x `IMAGE_SIZE` (224x224 or 128x128, depending on the free GPU memory)

In [14]:
# NOTE(review): the output recorded for this cell (crop_size=224, In [14])
# appears to come from an earlier run; re-run it to refresh the display.
transforms

ImageClassification(
    crop_size=224
    resize_size=224
    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]
    interpolation=InterpolationMode.BILINEAR
)

In [24]:
# Display the dataset summary (number of datapoints, root, split, transform).
train_dataset

Dataset CIFAR10
    Number of datapoints: 50000
    Root location: .
    Split: Train
    StandardTransform
Transform: ImageClassification(
               crop_size=128
               resize_size=128
               mean=[0.485, 0.456, 0.406]
               std=[0.229, 0.224, 0.225]
               interpolation=InterpolationMode.BILINEAR
           )

In [29]:
# Shape of the first element of the first sample, i.e. the transformed image
# tensor (presumably the sample is an (image, label) pair - confirm).
train_dataset[0][0].shape

torch.Size([3, 128, 128])

In [30]:
import os

from torch.utils.data import DataLoader

# os.cpu_count() returns None when the CPU count cannot be determined, and
# DataLoader requires an integer, so fall back to 0 (load in the main process).
NUM_WORKERS = os.cpu_count() or 0

# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(dataset=train_dataset,
                              batch_size=BATCH_SIZE,
                              shuffle=True,
                              num_workers=NUM_WORKERS)
# Test batches keep the dataset order (DataLoader does not shuffle by default).
test_dataloader = DataLoader(dataset=test_dataset,
                             batch_size=BATCH_SIZE,
                             num_workers=NUM_WORKERS)

# Print details
print(f"Train dataloader: Num batches: {len(train_dataloader)} of batchsize: {BATCH_SIZE}")
print(f"Test dataloader: Num batches: {len(test_dataloader)} of batchsize: {BATCH_SIZE}")
print(f"Using num workers to load data (more is generally better): {NUM_WORKERS}")

Train dataloader: Num batches: 1563 of batchsize: 32
Test dataloader: Num batches: 313 of batchsize: 32
Using num workers to load data (more is generally better): 8


### creating training and test loops

In [None]:
###