## Notebook 4: find ideal LR for Resnet model, and add dropout (find optimal dropout probability)

## Imports and Grab Data

In [1]:
from utils.notebook_2_utils import * 
import utils.notebook_2_utils as utils

training_df = pd.read_csv("data/training.csv")
lookup_df = pd.read_csv("data/IdLookupTable.csv")
training_df.fillna(method = 'ffill',inplace = True)

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [2]:
len(training_df)

7049

In [3]:
7049*30

211470

In [4]:
train_X, train_Y, val_X, val_Y = create_train_test_sets_nchw(training_df, normalize=True)

## ResNet Model
* Input shape:  64, 1, 96, 96 (nchw)
* Output shape: 64, 30 (nc)


In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
# from sklearn.model_selection import train_test_split
import torch.nn.functional as F

### Model Definition

In [6]:
resnet = models.resnet18(pretrained=True)




In [7]:
resnet.conv1

Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)

## Control Panel

In [8]:
EPOCHS = 60
LEARNING_RATE = .002
BATCH_SIZE = 32


loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.SGD(resnet.parameters(), lr=LEARNING_RATE)





## Replace first and Last layers
* my input shape [32, 1, 96, 96] (NCHW) channels first

In [9]:
# freeze all layers
# for param in resnet.parameters():
#     param.requires_grad = False

# replace last layer
resnet.fc = nn.Linear(512, 30) # 30 is the number of keypoints we want to predict

# relpace the first layer
resnet.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)

# move to GPU
resnet = resnet.to(device)


In [10]:
resnet.conv1

Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)

### DataLoaders

In [11]:
torch.manual_seed(42)
from torch.utils.data import TensorDataset, DataLoader


# Put data to target device
train_X, train_Y = train_X.to(device), train_Y.to(device)
val_X, val_Y = val_X.to(device), val_Y.to(device)

train_dataset = TensorDataset(train_X, train_Y)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

test_dataset = TensorDataset(val_X, val_Y)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

## Find Optimal LR

In [12]:
final_lr = 10
init_lr = 1e-10
num_iters = 100

In [13]:
import matplotlib.pyplot as plt

def find_lr(model, train_loader, criterion, optimizer, init_lr=1e-8, final_lr=10, num_iters=100):
    """
    Finds the optimal learning rate for the model by gradually increasing the learning rate and plotting the loss.
    Args:
    - model (torch.nn.Module): the PyTorch model to train
    - train_loader (torch.utils.data.DataLoader): the training data loader
    - criterion (torch.nn.Module): the loss function
    - optimizer (torch.optim.Optimizer): the optimizer
    - init_lr (float): the initial learning rate
    - final_lr (float): the final learning rate
    - num_iters (int): the number of iterations to run
    """
    model.train()
    optimizer.param_groups[0]['lr'] = init_lr
    
    init, final = math.log(init_lr, 10), math.log(final_lr, 10)

    
    lr_steps = np.logspace(init, final, num=num_iters+1)
    lr = init_lr
    losses = []
    lrs = []
    lrs_dx = []
    
    for i, (inputs, targets) in enumerate(train_loader):
        if i >= num_iters:
            break
        
        optimizer.param_groups[0]['lr'] = lr
        inputs, targets = inputs.cuda(), targets.cuda()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        losses.append(loss.item())
        lrs.append(lr)
        if i != 0: 
            lrs_dx.append(loss.item() - losses[i-1]) 
        lr = lr_steps[i]
    
    # plot the learning rate vs. loss
    plt.plot(lrs, losses)
    plt.xscale('log')
    plt.xlabel('Learning Rate')
    plt.ylabel('Loss')
    
    max = np.argmin(lrs_dx)
    min = np.argmin(losses)
    
    text = "highest delta: " + str(lrs[max])[:7] + " lowest loss: " + str(lrs[min])[:7]
    plt.title(text)
    
    plt.scatter(lrs[min], losses[min], c="r")
    plt.scatter(lrs[max], losses[max], c="g")
    
    plt.show()
    return losses, lrs, lrs_dx

In [14]:
# losses, lrs, lrs_dx = find_lr(resnet, train_dataloader, loss_fn, optimizer, final_lr=100)

In [15]:
for X_batch, y_batch in train_dataloader:
    print(X_batch.shape, y_batch.shape)
    break

torch.Size([32, 1, 96, 96]) torch.Size([32, 30])


In [16]:
resnet = resnet.to(device)

preds = resnet(X_batch)

In [17]:
preds.shape

torch.Size([32, 30])

In [18]:
loss = loss_fn(preds, y_batch)
loss

tensor(0.8526, device='cuda:0', grad_fn=<MseLossBackward0>)

In [19]:
# Checking to make sure the right layers are frozen

print("batch norm 1: ")
for param in resnet.bn1.parameters():
    print(param.requires_grad)
    
print("CONV1: ")
for param in resnet.conv1.parameters():
    print(param.requires_grad)

print("fc: ")
for param in resnet.fc.parameters():
    print(param.requires_grad)


batch norm 1: 
True
True
CONV1: 
True
fc: 
True
True


In [20]:
y_batch.shape

torch.Size([32, 30])

In [21]:
loss_fn(preds, y_batch)

tensor(0.8526, device='cuda:0', grad_fn=<MseLossBackward0>)

## Training Loop 

In [22]:
for epoch in range(EPOCHS):
    ### Training mode 
    resnet.train()

    
    for X_batch, y_batch in train_dataloader:
        # 1. Forward pass (model outputs raw logits)
        y_logits = resnet(X_batch)
        
        # 2. Calculate loss/accuracy
        loss = loss_fn(y_logits, y_batch)
        
        # 3. Optimizer zero grad
        optimizer.zero_grad()
        
        # 4. Loss backwards
        loss.backward()
        
        # 5. Optimizer step
        optimizer.step()
    
    
    resnet.eval()
    with torch.inference_mode():
        for X_batch, y_batch in test_dataloader:
            test_logits = resnet(X_batch)
            test_loss = loss_fn(test_logits, y_batch)

    # Print out what's happening
    if epoch % 10 == 0:
        print(f"Epoch: {epoch} | Loss: {loss:.5f} | Test Loss: {test_loss:.5f}")

Epoch: 0 | Loss: 0.70826 | Test Loss: 0.58285
Epoch: 10 | Loss: 0.13220 | Test Loss: 0.09554
Epoch: 20 | Loss: 0.19193 | Test Loss: 0.06232
Epoch: 30 | Loss: 0.02310 | Test Loss: 0.01310
Epoch: 40 | Loss: 0.24047 | Test Loss: 0.00743
Epoch: 50 | Loss: 0.01440 | Test Loss: 0.00302


In [35]:
preds = resnet(val_X)

RuntimeError: CUDA out of memory. Tried to allocate 794.00 MiB (GPU 0; 15.90 GiB total capacity; 5.66 GiB already allocated; 4.50 MiB free; 5.73 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
preds.shape

In [None]:
val_Y.shape

In [None]:
loss_fn(preds.to(device), val_Y.to(device))

In [None]:
val_X.shape, preds.shape, val_Y.shape

In [None]:
preds[0][1]

In [None]:
val_X[1].shape, val_X[1][0].shape

In [None]:
def show_pred(X, preds, actual, index, point):
    plt.imshow(X[index][0],cmap='gray')

    plt.scatter(96* preds[index][point],96* preds[index][point + 1] ,c='r', marker='s', s=60, alpha=.5)
    plt.scatter(96*actual[index][point],96* actual[index][point+1],c='g', marker='s', s=60, alpha=.5)

    plt.legend(['predicted','actual'])
    


In [None]:
show_pred(val_X.to("cpu"), preds.to("cpu").detach().numpy(), val_Y.to("cpu").detach().numpy(), 0, 0)

In [None]:
preds[0][0], preds[0][1]

In [33]:
val_Y[0][0], val_Y[0][1]

(tensor(0.6736, device='cuda:0'), tensor(0.3915, device='cuda:0'))

In [34]:
val_X[0][0]

tensor([[0.1843, 0.1529, 0.1255,  ..., 0.2902, 0.2980, 0.3137],
        [0.1725, 0.1529, 0.1333,  ..., 0.2863, 0.2941, 0.3059],
        [0.1686, 0.1608, 0.1451,  ..., 0.2863, 0.2941, 0.2980],
        ...,
        [0.2902, 0.2902, 0.2902,  ..., 0.2314, 0.2353, 0.2392],
        [0.2980, 0.2902, 0.2863,  ..., 0.2353, 0.2392, 0.2471],
        [0.3059, 0.2941, 0.2824,  ..., 0.2431, 0.2431, 0.2471]],
       device='cuda:0')