# CIS 545 Recitation 10 - PyTorch and Neural Networks

## Dependency Loading

In [None]:
import numpy as np

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import matplotlib.pyplot as plt

print(torch.__version__)

2.1.0+cu118


## A Brief Tutorial on PyTorch
**Acknowledgement**: We appreciate the brief tutorial on PyTorch given by Prof. Jacob Gardner

### Making sure a GPU is available.

Part of this tutorial covers using GPUs for computation in PyTorch. To enable a GPU in Colab, first go to `Runtime -> Change runtime type` in the menu system above. Then, under hardware acceleration, choose GPU.

Once you've done that, the next cell should run without errors.

In [None]:
# Sanity check: this raises an AssertionError if PyTorch cannot see a CUDA GPU,
# i.e. if the runtime type was not switched to GPU as described above.
assert torch.cuda.is_available()

### Understanding Tensors

Tensors in PyTorch are direct equivalents of `ndarray` (or just arrays) in NumPy in many ways. In fact, many of the operations you are now familiar with in numpy translate directly over to PyTorch! Below are some examples.

In [None]:
# Build the same four objects in NumPy and in PyTorch: a "proper" (5,) vector, a (1, 5) row
# vector, a (5, 1) column vector, and a (5, 5) matrix whose rows all repeat the row vector.
np_vec = np.arange(1, 6)
np_rvec = np_vec.reshape(1, -1)
np_cvec = np_vec.reshape(-1, 1)
np_mat = np.tile(np_rvec, (5, 1))
print(*(arr.shape for arr in (np_vec, np_rvec, np_cvec, np_mat)))

th_vec = torch.arange(1, 6)
th_rvec = th_vec.unsqueeze(0)  # Or: th_vec[None, :]
th_cvec = th_vec.unsqueeze(1)  # Or: th_vec[:, None]
th_mat = torch.tile(th_rvec, (5, 1))
print(*(t.shape for t in (th_vec, th_rvec, th_cvec, th_mat)))

# Show both matrices so they can be compared side by side.
print('NumPy matrix...')
print(np_mat)
print('Torch matrix...')
print(th_mat)

(5,) (1, 5) (5, 1) (5, 5)
torch.Size([5]) torch.Size([1, 5]) torch.Size([5, 1]) torch.Size([5, 5])
NumPy matrix...
[[1 2 3 4 5]
 [1 2 3 4 5]
 [1 2 3 4 5]
 [1 2 3 4 5]
 [1 2 3 4 5]]
Torch matrix...
tensor([[1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5]])


In [None]:
# NumPy refresher: four common matrix/vector operations on the arrays built above.

# Matrix-vector product: (5, 5) @ (5, 1) -> (5, 1)
np_mvm = np.matmul(np_mat, np_cvec)  # or the @ operator

# Broadcasting adds the (1, 5) row vector to every row of np_mat
np_add1 = np.add(np_mat, np_rvec)

# Broadcasting subtracts the (5, 1) column vector from every column of np_mat
np_add2 = np.subtract(np_mat, np_cvec)

# Element-wise square root of np_mat
np_sqrt = np.sqrt(np_mat)

In [None]:
# Display the NumPy matrix-vector product computed in the previous cell.
np_mvm

array([[55],
       [55],
       [55],
       [55],
       [55]])

In [None]:
# The same four operations as the NumPy cell, now on the PyTorch tensors.

# Matrix-vector product: (5, 5) @ (5, 1) -> (5, 1)
th_mvm = torch.matmul(th_mat, th_cvec)  # or the @ operator

# Broadcasting adds the (1, 5) row vector th_rvec to every row of th_mat
th_add1 = torch.add(th_mat, th_rvec)

# Broadcasting subtracts the (5, 1) column vector th_cvec from every column of th_mat
th_add2 = torch.sub(th_mat, th_cvec)

# Element-wise square root of th_mat
th_sqrt = th_mat.sqrt()

In [None]:
# Display the PyTorch matrix-vector product computed in the previous cell.
th_mvm

tensor([[55],
        [55],
        [55],
        [55],
        [55]])

### Difference 1: GPU Computing

The first major difference between `NumPy` and `PyTorch` is that PyTorch supports easy use of GPUs for computation. The way this works is as follows:

1. First, move any tensors you want to compute with onto the GPU. This can be accomplished by calling either `gpu_tensor = tensor.cuda()` or `gpu_tensor = tensor.to('cuda')`. If you have more than one GPU on your computer, you can also specify which GPU to use, e.g. `tensor.to('cuda:1')`.

**Note**: this will fail if you don't have a GPU available.

2. There is no step 2! Do computation with the tensors as normal and it all happens on the GPU.

In [None]:
# Make a random 5000x5000 matrix and a 5000x5000 matrix of all ones; later cells multiply them.
# Hint: use torch.rand(n, m) and torch.ones(n, m)

th_mat1 = torch.rand(5000, 5000)
th_mat2 = torch.ones(5000, 5000)

In [None]:
# Time a single matrix multiply on the CPU; %time reports CPU and wall-clock time for the statement.

%time res = th_mat1 @ th_mat2

CPU times: user 2.88 s, sys: 69.8 ms, total: 2.95 s
Wall time: 6.49 s


In [None]:
# Make GPU copies of th_mat1 and th_mat2; the results are stored under new names.
th_mat1_gpu = th_mat1.cuda()
th_mat2_gpu = th_mat2.cuda()

In [None]:
%%time

# Same matrix multiply as before, this time on the GPU copies of the operands.

res = th_mat1_gpu @ th_mat2_gpu

# Printing a single entry also shows which device the result lives on.
print(res[0, 0])

tensor(2462.1714, device='cuda:0')
CPU times: user 465 ms, sys: 247 ms, total: 711 ms
Wall time: 2.71 s


### Difference 2: Autograd

The second major difference between `NumPy` and `PyTorch` is that PyTorch supports automatic differentiation. What this means is that PyTorch allows you to do computation and get derivatives for free! Here's the basic work flow:

1. Define a Parameter with `some_param = torch.nn.Parameter(some_tensor)`.
1. Use the parameter in some computation.
1. Call `.backward()` on any scalar result of the computation to get derivatives for some_param in `some_param.grad`

In [None]:
# Draw a random one-element tensor and wrap it in a Parameter (made out of some_tensor)
# so that autograd tracks computations involving it.
some_tensor = torch.rand(1)
some_param = torch.nn.Parameter(some_tensor)

# Forward pass: res = sin(exp(some_param)).
# Hint: torch.sin and torch.exp are also available as tensor methods.
res = some_param.exp().sin()

# Backward pass: fills some_param.grad with the derivative of res w.r.t. some_param.
res.backward()

print(res, some_param.grad)

tensor([0.4899], grad_fn=<SinBackward0>) tensor([-2.2926])


Of course, autograd works for multivariate calculus, too. Let's compute the partial derivatives of the following 5-dimensional function: $$f(\mathbf{x}) = \sum_{i=1}^{5} \sin ( \exp (x_i) )$$ at a random input. That is, we're going to compute $\frac{\partial f}{\partial x_i}$ for all $i$.

In [None]:
# Random length-5 parameter; f(param) = sum_i sin(exp(param_i)).
param = torch.nn.Parameter(torch.rand(5))
res = param.exp().sin().sum()

# After backward(), param.grad[i] holds the partial derivative of f w.r.t. param[i].
res.backward()

print(res, param.grad)

tensor(4.4817, grad_fn=<SumBackward0>) tensor([ 0.1638, -1.0823, -0.2724,  0.3176, -1.7836])


## Building Block of ML #1: torch.nn.Module

In PyTorch, we can define Modules. Modules do two things for me:

1. They make it easy to collect a set of parameters together. By calling `module.parameters()` or `module.named_parameters()` I get a generator over the parameters not only of the module but all of its submodules.
2. They let me define a method called `forward` that gets called when I do `module(x)`.

Let's first code up a "from scratch" implementation of a module that might be useful for linear regression. To do this, we need to create a class that extends `Module`, defines `weight` and `bias` parameters in the constructor, and then defines a `forward` method that applies the familiar linear regression prediction equation, $w^{\top}x + b$

In [None]:
class LinearRegression(torch.nn.Module):
    """From-scratch linear regression: predicts ``x @ weight + bias``."""

    def __init__(self, input_dim):
        """
        Args:
            input_dim: The number of features we expect in the dataset.
        """
        super().__init__()
        # Random initial values; wrapping them in Parameter registers them with the
        # module so they show up in .parameters() and receive gradients.
        initial_weight = torch.rand(input_dim, 1)
        initial_bias = torch.rand(1, 1)
        self.weight = torch.nn.Parameter(initial_weight)
        self.bias = torch.nn.Parameter(initial_bias)

    def forward(self, x):
        """Return one prediction per row of ``x``.

        Args:
            x: Input of shape (n, d), where d equals ``input_dim``.

        Returns:
            Predictions of shape (n, 1); the (1, 1) bias is broadcast over rows.
        """
        return torch.matmul(x, self.weight) + self.bias

In [None]:
from sklearn.datasets import load_diabetes
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load the diabetes regression dataset as NumPy arrays.
X, y = load_diabetes(return_X_y=True)

# Hold out 33% of the rows as a test set.
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.33)

# Convert every NumPy array into a float torch tensor.
train_x, train_y, test_x, test_y = (
    torch.from_numpy(array).float()
    for array in (train_x, train_y, test_x, test_y)
)

print(train_x.shape, train_y.shape, test_x.shape, test_y.shape)

torch.Size([296, 10]) torch.Size([296]) torch.Size([146, 10]) torch.Size([146])


In [None]:
# Make a new LinearRegression object with input_dim matching the 10 features of train_x.
# Then make predictions on train_x and compute the derivative of the MSE with respect to
# the weight and bias.

linear_reg = LinearRegression(input_dim=10)
pred = linear_reg(train_x)
# pred is (n, 1) but train_y is (n,). Without the squeeze the subtraction broadcasts to an
# (n, n) matrix and the mean runs over every (prediction, target) pair instead of being the MSE.
mse = torch.mean((pred.squeeze() - train_y) ** 2)
mse.backward()
print(linear_reg.weight.grad, linear_reg.bias.grad)

tensor([[-0.8927],
        [-0.3269],
        [-0.7421],
        [-0.5722],
        [-0.2978],
        [-0.2365],
        [ 0.3175],
        [-0.4559],
        [-0.5916],
        [-0.5155]]) tensor([[-305.3282]])


## Building Block of ML #2: torch.optim

We're clearly making progress! So far, what we've been able to do is define a linear regression module with appropriate parameters, compute a loss function (mean squared error), and compute the derivative of the loss with respect to the weight and bias. All we need now is a way to apply these gradients to the parameters to update them and learn.

To help accomplish this, PyTorch provides a `torch.optim` library with a variety of `Optimizers`. In PyTorch, an `Optimizer` has the following properties:

1. You create an optimizer with a list (or generator) of parameters to optimize, and typically a learning rate / step size.
1. Optimizers expose a `zero_grad()` method that resets the gradients of all parameters to zero.
1. Optimizers expose a `step()` method that, if all parameters have had gradients filled in, applies a step of optimization with those gradients.

Below, we'll be using one optimizer called Adam, which is provided via `torch.optim.Adam`.

In [None]:
# Use a torch.optim.Adam object to take a single step of learning on the MSE of our linear regressor above.
# Hint: Creating an Adam object can be done with torch.optim.Adam(some_parameters, lr=0.01)
# An optimization loop should: (1) zero the gradients, (2) compute the loss, (3) call backward, (4) call step.

opt = torch.optim.Adam(linear_reg.parameters(), lr=0.01)
opt.zero_grad()
pred = linear_reg(train_x)
# Squeeze the (n, 1) predictions to (n,) so they line up element-wise with train_y;
# otherwise broadcasting produces an (n, n) difference matrix and the loss is not the MSE.
loss = torch.mean((pred.squeeze() - train_y) ** 2)
loss.backward()
opt.step()

In [None]:
# Train a fresh LinearRegression model with Adam for 2500 iterations,
# printing the training loss every 100 iterations.

linear_reg = LinearRegression(input_dim=10)
opt = torch.optim.Adam(linear_reg.parameters(), lr=0.1)

for i in range(2500):
    # (1) clear old gradients, (2) compute the loss, (3) backpropagate, (4) update.
    opt.zero_grad()
    pred = linear_reg(train_x)
    loss = (pred.squeeze() - train_y).pow(2).mean()
    if not i % 100:
        print(f'Iteration {i} - Loss = {loss:.2f}')
    loss.backward()
    opt.step()

Iteration 0 - Loss = 29331.09
Iteration 100 - Loss = 26116.49
Iteration 200 - Loss = 23216.23
Iteration 300 - Loss = 20611.75
Iteration 400 - Loss = 18281.99
Iteration 500 - Loss = 16206.61
Iteration 600 - Loss = 14365.91
Iteration 700 - Loss = 12740.88
Iteration 800 - Loss = 11313.17
Iteration 900 - Loss = 10065.11
Iteration 1000 - Loss = 8979.85
Iteration 1100 - Loss = 8041.38
Iteration 1200 - Loss = 7234.57
Iteration 1300 - Loss = 6545.26
Iteration 1400 - Loss = 5960.17
Iteration 1500 - Loss = 5466.92
Iteration 1600 - Loss = 5053.97
Iteration 1700 - Loss = 4710.60
Iteration 1800 - Loss = 4426.96
Iteration 1900 - Loss = 4194.00
Iteration 2000 - Loss = 4003.58
Iteration 2100 - Loss = 3848.39
Iteration 2200 - Loss = 3722.03
Iteration 2300 - Loss = 3618.95
Iteration 2400 - Loss = 3534.44


In [None]:
# Evaluate the trained from-scratch model: predictions and MSE on the held-out test set.
pred = linear_reg(test_x)
test_error = (pred.squeeze() - test_y).pow(2).mean()
print(test_error)

tensor(3599.8232, grad_fn=<MeanBackward0>)


## torch.nn: A convenient library of pre baked modules.

torch.nn contains much more than just `Parameter` and `Module`: it contains a variety of pre baked modules that are useful for machine learning, including:
- `torch.nn.Linear`: Has weight and bias parameters, and applies an affine transformation $XW + b$ to an input.
- `torch.nn.ReLU`: Applies the rectified linear unit (ReLU) to the input.
- `torch.nn.Conv2d`: Applies 2D convolutions.
- `torch.nn.MaxPool2d`: Applies max pooling.

And many more! Let's use these to build a simple 1 hidden layer neural network on the diabetes data above.

In [None]:
# Same linear regression as before, but with the built-in torch.nn.Linear module
# in place of our "from scratch" version.

linear_reg = torch.nn.Linear(10, 1)  # in_features=10, out_features=1
opt = torch.optim.Adam(linear_reg.parameters(), lr=0.1)

for i in range(2500):
    # (1) clear old gradients, (2) compute the loss, (3) backpropagate, (4) update.
    opt.zero_grad()
    pred = linear_reg(train_x)
    loss = (pred.squeeze() - train_y).pow(2).mean()
    if not i % 100:
        print(f'Iteration {i} - Loss = {loss:.2f}')
    loss.backward()
    opt.step()

Iteration 0 - Loss = 29484.99
Iteration 100 - Loss = 26260.17
Iteration 200 - Loss = 23349.74
Iteration 300 - Loss = 20735.19
Iteration 400 - Loss = 18395.54
Iteration 500 - Loss = 16310.48
Iteration 600 - Loss = 14460.41
Iteration 700 - Loss = 12826.33
Iteration 800 - Loss = 11389.96
Iteration 900 - Loss = 10133.68
Iteration 1000 - Loss = 9040.67
Iteration 1100 - Loss = 8094.94
Iteration 1200 - Loss = 7281.39
Iteration 1300 - Loss = 6585.86
Iteration 1400 - Loss = 5995.10
Iteration 1500 - Loss = 5496.70
Iteration 1600 - Loss = 5079.14
Iteration 1700 - Loss = 4731.69
Iteration 1800 - Loss = 4444.46
Iteration 1900 - Loss = 4208.42
Iteration 2000 - Loss = 4015.36
Iteration 2100 - Loss = 3857.96
Iteration 2200 - Loss = 3729.78
Iteration 2300 - Loss = 3625.23
Iteration 2400 - Loss = 3539.53


In [None]:
# Evaluate the torch.nn.Linear model: predictions and MSE on the held-out test set.
pred = linear_reg(test_x)
test_error = (pred.squeeze() - test_y).pow(2).mean()
print(test_error)

tensor(3604.5852, grad_fn=<MeanBackward0>)


In [None]:
# A feed-forward neural network Module with one hidden layer.

class NeuralNet(torch.nn.Module):
    """Linear -> ReLU -> Linear network producing one output per input row."""

    def __init__(self, input_dim, hidden_dim):
        """
        Args:
            input_dim: Number of input features.
            hidden_dim: Number of hidden features.
        """
        super().__init__()
        # Input features -> hidden features.
        self.linear1 = torch.nn.Linear(in_features=input_dim, out_features=hidden_dim)
        # Hidden features -> a single prediction.
        self.linear2 = torch.nn.Linear(in_features=hidden_dim, out_features=1)

    def forward(self, x):
        """Apply the first layer, a ReLU, then the second layer."""
        activated = self.linear1(x).relu()
        return self.linear2(activated)

In [None]:
# Build a neural network with 64 hidden units and train it with Adam
# (learning rate 0.01) for 5000 iterations, printing the loss every 100 iterations.

neural_net = NeuralNet(input_dim=10, hidden_dim=64)
opt = torch.optim.Adam(neural_net.parameters(), lr=0.01)

for i in range(5000):
    # (1) clear old gradients, (2) compute the loss, (3) backpropagate, (4) update.
    opt.zero_grad()
    pred = neural_net(train_x)
    loss = (pred.squeeze() - train_y).pow(2).mean()
    if not i % 100:
        print(f'Iteration {i} - Loss = {loss:.2f}')
    loss.backward()
    opt.step()

Iteration 0 - Loss = 29460.39
Iteration 100 - Loss = 13482.79
Iteration 200 - Loss = 3622.69
Iteration 300 - Loss = 3224.86
Iteration 400 - Loss = 3044.63
Iteration 500 - Loss = 2965.34
Iteration 600 - Loss = 2929.07
Iteration 700 - Loss = 2910.57
Iteration 800 - Loss = 2900.31
Iteration 900 - Loss = 2894.39
Iteration 1000 - Loss = 2890.85
Iteration 1100 - Loss = 2888.62
Iteration 1200 - Loss = 2887.12
Iteration 1300 - Loss = 2886.01
Iteration 1400 - Loss = 2885.12
Iteration 1500 - Loss = 2884.34
Iteration 1600 - Loss = 2883.62
Iteration 1700 - Loss = 2882.92
Iteration 1800 - Loss = 2882.24
Iteration 1900 - Loss = 2881.56
Iteration 2000 - Loss = 2880.88
Iteration 2100 - Loss = 2880.21
Iteration 2200 - Loss = 2879.54
Iteration 2300 - Loss = 2878.87
Iteration 2400 - Loss = 2878.20
Iteration 2500 - Loss = 2877.55
Iteration 2600 - Loss = 2876.90
Iteration 2700 - Loss = 2876.26
Iteration 2800 - Loss = 2875.64
Iteration 2900 - Loss = 2875.03
Iteration 3000 - Loss = 2874.44
Iteration 3100 - L

In [None]:
# Evaluate the trained neural network: predictions and MSE on the held-out test set.
pred = neural_net(test_x)
# Squeeze the (n, 1) predictions to (n,) so they line up element-wise with test_y.
test_error = torch.mean((pred.squeeze() - test_y) ** 2)
print(test_error)

SyntaxError: ignored




## Convolutional Neural Networks (CNN)

#### Here we are using the PyTorch library to get the dataset and make a dataloader.


More details on the CIFAR-10 dataset as well as training a CNN model can be found [here](https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html).

<img src='https://pytorch.org/tutorials/_images/cifar10.png'>

### Transform

#### Here we need to define a series of transformations we want to perform on our dataset before we feed it to the Neural Network.

In [None]:
# Preprocessing pipeline applied to every image; currently just a conversion to a tensor.
transformations = transforms.Compose(
    [
        transforms.ToTensor(),
    ]
)

In [None]:
# CIFAR-10 train and test splits, stored under ./data (download=True) with
# `transformations` applied to each image.
train_dataset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transformations,
)
test_dataset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transformations,
)

### Creating dataloaders for train / test

In [None]:
# Training loader: shuffled mini-batches of 16 images.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=16,
    shuffle=True,
)
# Test loader: the whole test set as one unshuffled batch.
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=len(test_dataset),
    shuffle=False,
)

### Training Image Visualization

In [None]:
def imshow(img):
    """Display an image tensor with matplotlib.

    Args:
        img: Image tensor whose first axis is moved to the end before plotting
            (i.e. a channels-first image becomes channels-last for ``plt.imshow``).
    """
    # Detach and copy to host memory before converting to NumPy so that tensors on the
    # GPU or tracked by autograd can be shown too; then reorder the axes for matplotlib.
    plt.imshow(img.detach().cpu().permute(1, 2, 0).numpy())
    plt.show()

# Pull one mini-batch of training images (and their labels) from the loader.
data_iter = iter(train_loader)
images, labels = next(data_iter)

# Tile the batch into a single grid image and display it.
image_grid = torchvision.utils.make_grid(images)
imshow(image_grid)


### A Basic CNN Model

Check out [here](https://madebyollin.github.io/convnet-calculator/) for a helper calculator you can use for the layer dimensions!

### Some basic checks...

In [None]:
# Look at the first batch only (if there is one) to see the tensor shapes the model will receive.
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    inputs, labels = first_batch
    print("The shape of inputs is:", inputs.shape)
    print("The shape of labels is:", labels.shape)

print("Number of classes:", len(train_dataset.classes))

### Defining CNN Architecture

In [None]:
# A small CNN: conv -> ReLU -> max-pool -> flatten -> fully connected.
class CNN(nn.Module):
    """Single-convolution classifier for 3-channel 32x32 images with 10 output classes."""

    def __init__(self):
        super().__init__()
        # 32x32 input, kernel 4, stride 2 -> (32 - 4) // 2 + 1 = 15, so 20 maps of 15x15.
        self.conv = nn.Conv2d(in_channels=3, out_channels=20, kernel_size=4, stride=2)
        # Kernel 3 (stride defaults to the kernel size) -> (15 - 3) // 3 + 1 = 5, so 5x5 maps.
        self.mp = nn.MaxPool2d(kernel_size=3)
        self.relu = nn.ReLU()
        self.flatten = nn.Flatten(start_dim=1)
        # The flattened feature size is 20 * 5 * 5 = 500; it must match the pooled output
        # or the forward pass fails with a shape mismatch.
        self.fc = nn.Linear(in_features=20*5*5, out_features=10)

    def forward(self, x):
        """Map a (batch, 3, 32, 32) input to (batch, 10) class scores."""
        outputs = self.conv(x)  # (batch, 20, 15, 15)
        outputs = self.relu(outputs)
        outputs = self.mp(outputs)  # (batch, 20, 5, 5)
        outputs = self.flatten(outputs)  # (batch, 20*5*5)
        outputs = self.fc(outputs)  # (batch, 10)
        return outputs

### Train the CNN

Check whether a GPU is available and set up the `device` variable accordingly.

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [None]:
# Put the model on the chosen device (CPU or GPU) and set up the loss and optimizer.
cnn = CNN().to(device)
# CrossEntropyLoss is equivalent to applying LogSoftmax() to the output and then NLLLoss().
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(cnn.parameters(), lr=1e-4)  # lr is the learning step

# Average training loss of each epoch.
loss_LIST = []

# Train for 3 epochs.
for epoch in range(3):
    running_loss = 0.0
    for inputs, labels in train_loader:
        # Move the mini-batch to the same device as the model.
        inputs = inputs.to(device)
        labels = labels.to(device)

        outputs = cnn(inputs)  # forward pass on the mini-batch
        optimizer.zero_grad()  # gradients must be reset for every mini-batch
        loss = criterion(outputs, labels)  # average loss over this mini-batch
        loss.backward()  # back propagation
        optimizer.step()  # update the weights using the gradients

        running_loss += loss.item()  # accumulate the loss over the epoch

    # Average loss per mini-batch for this epoch.
    epoch_loss = running_loss / len(train_loader)
    loss_LIST.append(epoch_loss)

    # print statistics
    print(f'The loss for Epoch {epoch} is: {running_loss/len(train_loader)}')

The loss for Epoch 0 is: 1.9798614526367186
The loss for Epoch 1 is: 1.6965392213058472
The loss for Epoch 2 is: 1.6003207152175902


### Calculate the Accuracy

In [None]:
# Count correct predictions over the test set; no gradients are needed for evaluation.
total = 0
correct = 0
with torch.no_grad():
    for images, labels in test_loader:
        # Move the batch to the same device as the model.
        images = images.to(device)
        labels = labels.to(device)

        outputs = cnn(images)
        # The predicted class is the index of the largest score in each row.
        predicted = outputs.max(dim=1).indices
        total += labels.shape[0]
        correct += torch.eq(predicted, labels).sum().item()

print(f'Test Accuracy: {100 * correct / total}')

Test Accuracy: 46.25
