In [1]:
from utils.notebook_2_utils import * 
import utils.notebook_2_utils as utils

# Load the training table and the ID lookup table from CSV.
training_df = pd.read_csv("data/training.csv")#.iloc[:1000]
lookup_df = pd.read_csv("data/IdLookupTable.csv")
# Forward-fill missing values from the previous row. `fillna(method=...)` is
# deprecated in pandas 2.x, and reassigning (rather than inplace=True) keeps
# the data lineage explicit. NaNs in the very first row, if any, remain.
training_df = training_df.ffill()

import math


# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [2]:
# Split the dataframe into training and validation inputs/targets using the project helper.
# NOTE(review): the `_nchw` suffix suggests image tensors laid out as (N, C, H, W) — confirm in utils.
train_X, train_Y, val_X, val_Y = create_train_test_sets_nchw(training_df, normalize=True)

## Very Simple CNN Model
* Input shape:  N, 1, 96, 96 (NCHW), where N is the batch size
* Output shape: N, 30 (NC)


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
# from sklearn.model_selection import train_test_split
import torch.nn.functional as F

### Model Definition

In [None]:
class CNN(nn.Module):
    """Small two-convolution CNN that maps an image to `output_dim` values.

    Architecture: (3x3 conv -> ReLU -> 2x2 max-pool) twice, flatten,
    fully connected -> ReLU -> fully connected. No activation is applied to
    the final layer.

    Args:
        input_shape: number of input channels (passed to ``conv1`` as
            ``in_channels``), not a full tensor shape.
        hidden_dim: number of channels produced by both convolutions and the
            width of the hidden fully connected layer.
        output_dim: number of values predicted per image.
        image_size: height/width of the (square) input images. Defaults to 96,
            which reproduces the original hard-coded 24 * 24 flatten size.

    Raises:
        ValueError: if ``image_size`` is too small to survive two 2x2 poolings.
    """

    def __init__(self, input_shape: int, hidden_dim: int, output_dim: int, image_size: int = 96):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=input_shape, out_channels=hidden_dim, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(in_channels=hidden_dim, out_channels=hidden_dim, kernel_size=3, stride=1, padding=1)

        # The 3x3 convolutions (stride 1, padding 1) keep the spatial size; each
        # 2x2 max-pool in forward() halves it (rounding down): 96 -> 48 -> 24.
        pooled_size = image_size // 2 // 2
        if pooled_size < 1:
            raise ValueError(f"image_size={image_size} is too small for two 2x2 poolings")

        self.fc1 = nn.Linear(hidden_dim * pooled_size * pooled_size, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return predictions of shape (N, output_dim) for input (N, C, H, W)."""
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, kernel_size=2, stride=2)
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, kernel_size=2, stride=2)
        # Flatten everything except the batch dimension for the linear layers.
        x = x.reshape(x.shape[0], -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

## Control Panel

In [None]:
# Hyperparameters for the training run.
EPOCHS = 40
LEARNING_RATE = .02
BATCH_SIZE = 32

# Seed BEFORE building the model: the layer weights are drawn from the global
# RNG at construction time, so seeding later leaves them non-reproducible.
torch.manual_seed(42)
cnn = CNN(input_shape=1, hidden_dim=64, output_dim=30)

# Mean-squared-error loss, optimised with plain SGD.
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.SGD(cnn.parameters(), lr=LEARNING_RATE)





### DataLoaders

In [None]:
# Seed the global RNG so the shuffling order of the training loader is repeatable.
torch.manual_seed(42)
from torch.utils.data import TensorDataset, DataLoader


# Put data to target device
train_X, train_Y = train_X.to(device), train_Y.to(device)
val_X, val_Y = val_X.to(device), val_Y.to(device)
cnn = cnn.to(device)

# Training batches are reshuffled on every pass over the loader.
train_dataset = TensorDataset(train_X, train_Y)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Validation data is only evaluated, never trained on, so a fixed order is
# used: shuffling adds nothing and makes per-batch numbers vary between runs.
test_dataset = TensorDataset(val_X, val_Y)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Sanity check: print the shapes of the first training batch, then stop.
# X_batch / y_batch stay in scope afterwards and are reused by later cells.
for X_batch, y_batch in train_dataloader:
    print(X_batch.shape, y_batch.shape)
    break

In [None]:
# Sanity check: print the shapes of the first validation batch, then stop.
# This overwrites X_batch / y_batch with a validation batch.
for X_batch, y_batch in test_dataloader:
    print(X_batch.shape, y_batch.shape)
    break

In [None]:
# With enumerate(), `idx` is the batch counter and `batch` is the
# (inputs, targets) pair; print the first one and stop.
for idx,batch in enumerate(train_dataloader):
    print("batch input: ", batch[0].size())
    print("batch idx: ",  idx)
    print("batch label: " , batch[1].shape)
    break

print("---------- test dataloader -------")
# Same check for the validation loader, unpacking the pair directly.
for idx, (data,target) in enumerate(test_dataloader):
    print(idx, data.shape)
    print(idx, target.shape)
    break

## Learning Rate Finder
* Sweep the learning rate exponentially from `init_lr` up to `final_lr`, recording the loss at each step.

In [None]:
# End points of the learning-rate sweep and the number of steps in it.
init_lr, final_lr, num_iters = 1e-10, 10, 100

In [None]:
# Base-10 exponents of the sweep end points (inputs for np.logspace).
# math.log10 is more accurate than math.log(x, 10): it gives exactly -10.0
# for 1e-10 instead of -9.999999999999998.
init, final = math.log10(init_lr), math.log10(final_lr)
init,final

In [None]:
# Learning rates spaced evenly on a log scale between the two exponents.
# Use num_iters rather than a repeated literal so the preview follows the config.
x = np.logspace(init, final, num=num_iters)
# Show the first and last five values of the schedule.
x[:5],x[-5:]

In [None]:
# Iterating the loader directly yields (inputs, targets) pairs — there is no
# index here, so the variables are named for what they actually hold.
for inputs, targets in train_dataloader:
    print(inputs.shape, targets.shape)
    break

In [None]:
# Confirm that enumerate() supplies the batch counter: prints the first index and stops.
for i, (inputs, targets) in enumerate(train_dataloader):
    print(i)
    break

In [None]:
# Pull one batch from a fresh iterator: n holds the inputs, m the targets.
n,m = next(iter(train_dataloader))

In [None]:
# Shapes of the batch fetched above (inputs, targets).
n.shape, m.shape

In [None]:
# NOTE(review): the `break` ends this loop after its first pass, so only one
# batch is printed. Each `iter(train_dataloader)` also builds a fresh iterator,
# so even without the break this would print the first batch of ten separate
# passes rather than ten consecutive batches — confirm the intent.
for i in range (10):
    n,m = next(iter(train_dataloader))
    print(n.shape, m.shape)
    break

In [None]:
# Print the first enumerated batch and remember its index in `curr_batch`.
for idx,batch in enumerate(train_dataloader):
        print("batch input: ", batch[0].size())
        print("batch idx: ",  idx)
        print("batch label: " , batch[1].shape)
        # Kept at notebook scope after the loop exits.
        curr_batch = idx
        break
        

In [None]:
# Number of batches the training loader yields per pass.
len(train_dataloader)

In [None]:
# Materialise one full pass over the loader. The previous form called
# `next(iter(train_dataloader))` repeatedly, which rebuilds (and reshuffles)
# the iterator each time and only ever returns a *first* batch, so it never
# walked through the dataset.
batches = list(train_dataloader)


In [None]:
# NOTE(review): this is not the last expression in the cell, so the length is not displayed.
len(batches)

# Scratch check of the modulo operator: prints 0, 1, 2, 0, 1.
for i in range(5):
    print(i % 3)



In [None]:
import matplotlib.pyplot as plt

def find_lr(model, train_loader, criterion, optimizer, init_lr=1e-8, final_lr=10, num_iters=100):
    """
    Run a learning-rate range test and plot the loss against the learning rate.

    The learning rate is swept exponentially from ``init_lr`` to ``final_lr``
    over ``num_iters`` optimizer steps, one mini-batch per step, and the loss of
    every step is recorded. If ``train_loader`` yields fewer than ``num_iters``
    batches the sweep stops early.

    Side effects: the model's weights ARE updated by the sweep. The optimizer's
    learning rates are restored to their previous values before returning, so a
    later training run does not silently inherit the last probed rate.

    Args:
    - model (torch.nn.Module): the PyTorch model to train
    - train_loader (iterable): yields ``(inputs, targets)`` batches
    - criterion (callable): the loss function, called as ``criterion(outputs, targets)``
    - optimizer (torch.optim.Optimizer): the optimizer
    - init_lr (float): the initial learning rate (must be > 0)
    - final_lr (float): the final learning rate (must be > 0)
    - num_iters (int): the number of iterations to run

    Returns:
    - (losses, lrs): two lists of equal length holding the loss and the
      learning rate used at each step.

    Raises:
    - ValueError: if ``init_lr`` or ``final_lr`` is not positive.
    """
    if init_lr <= 0 or final_lr <= 0:
        raise ValueError("init_lr and final_lr must be positive")

    model.train()

    # Run on whatever device the model lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    # Exactly num_iters rates: the first is init_lr and the last is final_lr.
    lr_steps = np.logspace(math.log10(init_lr), math.log10(final_lr), num=num_iters)

    original_lrs = [group['lr'] for group in optimizer.param_groups]
    losses = []
    lrs = []

    try:
        # zip() stops at the shorter of the schedule and the loader.
        for lr, (inputs, targets) in zip(lr_steps, train_loader):
            lr = float(lr)
            for group in optimizer.param_groups:
                group['lr'] = lr

            if target_device is not None:
                inputs, targets = inputs.to(target_device), targets.to(target_device)

            outputs = model(inputs)
            loss = criterion(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            losses.append(loss.item())
            lrs.append(lr)
    finally:
        # Put the optimizer back the way the caller configured it.
        for group, original_lr in zip(optimizer.param_groups, original_lrs):
            group['lr'] = original_lr

    # plot the learning rate vs. loss
    plt.plot(lrs, losses)
    plt.xscale('log')
    plt.xlabel('Learning Rate')
    plt.ylabel('Loss')
    plt.show()
    return losses, lrs


In [None]:
# find_lr returns exactly two lists (losses, lrs); unpacking three values raised a ValueError.
losses, lrs = find_lr(cnn, train_dataloader, loss_fn, optimizer, final_lr=1)
# Slope of the loss curve per sweep step. The steps are evenly spaced in
# log(lr), so the most negative entry marks the steepest descent of the curve.
lrs_dx = np.gradient(np.asarray(losses))

In [None]:
# Index of the steepest loss decrease (most negative slope) and of the lowest
# recorded loss. Named explicitly instead of `max` / `min`, which shadowed the
# Python builtins for the rest of the kernel session.
steepest_idx = np.argmin(lrs_dx)
lowest_loss_idx = np.argmin(losses)
# Learning rates at those two points: (steepest descent, lowest loss).
lrs[steepest_idx], lrs[lowest_loss_idx]

In [None]:
# Learning rate at which the recorded loss was lowest.
lrs[np.argmin(losses)]

In [None]:
# Inspect the derivative values of the loss curve.
lrs_dx

In [None]:
# To find the steepest point: calculate the derivative of the loss curve and find its min/max.

In [None]:
# Forward pass on whichever batch an earlier cell left in X_batch.
preds = cnn(X_batch)

In [None]:
# Shape of the targets that belong to that batch.
y_batch.shape

In [None]:
# Loss of the batch predictions against their targets.
loss_fn(preds, y_batch)

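# Gradient Accumulation

## The same training loop but with gradient accumulation added
## __NOTE__: Only run one option
Why: some GPUs may run out of memory with large batches, but shrinking the batch size would also require re-tuning the learning rate and confuse comparisons. Accumulating gradients over several small batches emulates a larger batch without the extra memory.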

In [None]:
# Number of mini-batches whose gradients are summed before each optimizer step.
accumulation_steps = 4

# An earlier learning-rate sweep may have left the optimizer at a different
# rate; pin it to the configured value before training.
for param_group in optimizer.param_groups:
    param_group["lr"] = LEARNING_RATE

num_train_batches = len(train_dataloader)
# Batches at or after `last_full_group_end` form the final, shorter group.
leftover = num_train_batches % accumulation_steps
last_full_group_end = num_train_batches - leftover

for epoch in range(EPOCHS):
    ### Training mode
    cnn.train()
    optimizer.zero_grad()
    train_loss_sum = 0.0

    for batch_idx, (X_batch, y_batch) in enumerate(train_dataloader):

        # 1. Forward pass
        y_pred = cnn(X_batch)

        # 2. Calculate loss
        batch_loss = loss_fn(y_pred, y_batch)
        train_loss_sum += batch_loss.item()

        # 3. Loss backwards. backward() already ADDS into param.grad, so no
        #    manual gradient buffer is needed; dividing by the group size makes
        #    the accumulated gradient the mean over the group.
        group_size = accumulation_steps if batch_idx < last_full_group_end else leftover
        (batch_loss / group_size).backward()

        # 4. Optimizer step once per group, and once more for a trailing
        #    partial group so its gradients are not dropped.
        is_group_end = (batch_idx + 1) % accumulation_steps == 0
        is_last_batch = batch_idx + 1 == num_train_batches
        if is_group_end or is_last_batch:
            optimizer.step()
            optimizer.zero_grad()

    # Mean training loss over the epoch (not just the last batch).
    loss = train_loss_sum / max(num_train_batches, 1)

    ### Evaluation mode
    cnn.eval()
    test_loss_sum = 0.0
    num_test_samples = 0
    with torch.inference_mode():
        for X_batch, y_batch in test_dataloader:
            test_logits = cnn(X_batch)
            # Weight by batch size so a short final batch does not skew the mean.
            test_loss_sum += loss_fn(test_logits, y_batch).item() * X_batch.shape[0]
            num_test_samples += X_batch.shape[0]
    test_loss = test_loss_sum / max(num_test_samples, 1)

    # Print out what's happening
    if epoch % 10 == 0:
        print(f"Epoch: {epoch} | Loss: {loss:.5f} | Test Loss: {test_loss:.5f}")

## Predictions


In [None]:
# Inference only: switch to eval mode and disable autograd so that no
# computation graph is kept for the whole validation set.
cnn.eval()
with torch.no_grad():
    preds = cnn(val_X.to(device))

In [None]:
# Shape of the validation predictions.
preds.shape

In [None]:
# Shape of the validation targets, for comparison with the predictions.
val_Y.shape

In [None]:
# Loss of the predictions over the whole validation set.
loss_fn(preds, val_Y)

In [None]:
# Shapes of the validation inputs, predictions and targets side by side.
val_X.shape, preds.shape, val_Y.shape

In [None]:
# Second predicted value for the first validation sample.
preds[0][1]

In [None]:
# Shape of one validation sample and of its first channel.
val_X[1].shape, val_X[1][0].shape

In [None]:
def show_pred(X, preds, actual, index, point):
    """Overlay one predicted and one actual point on a grayscale image.

    Draws channel 0 of ``X[index]`` and then two square markers: red for
    ``preds[index]`` and green for ``actual[index]``. The x coordinate is read
    at position ``point`` and the y coordinate at ``point + 1``; both are
    multiplied by 96 before plotting.

    NOTE(review): the factor 96 presumably maps normalized coordinates back to
    pixels on a 96x96 image — confirm against the normalization helper.

    Args:
        X: indexable collection of images; ``X[index][0]`` is passed to imshow.
        preds: per-sample predicted values, indexed as ``preds[index][point]``.
        actual: per-sample target values, indexed the same way.
        index: which sample to draw.
        point: position of the x coordinate within the sample's values.
    """
    marker_style = dict(marker='s', s=60, alpha=.5)

    plt.imshow(X[index][0], cmap='gray')

    pred_row = preds[index]
    plt.scatter(96 * pred_row[point], 96 * pred_row[point + 1], c='r', **marker_style)

    true_row = actual[index]
    plt.scatter(96 * true_row[point], 96 * true_row[point + 1], c='g', **marker_style)

    plt.legend(['predicted', 'actual'])
    


In [None]:
# Plot the first point (positions 0 and 1) of validation sample 0; everything
# is moved to the CPU, and predictions/targets are converted to NumPy for plotting.
show_pred(val_X.to("cpu"), preds.to("cpu").detach().numpy(), val_Y.to("cpu").detach().numpy(), 0, 0)

In [None]:
# Predicted values at positions 0 and 1 for the first validation sample.
preds[0][0], preds[0][1]

In [None]:
# Target values at the same positions, for comparison with the prediction above.
val_Y[0][0], val_Y[0][1]

In [None]:
# Raw values of channel 0 of the first validation image.
val_X[0][0]