## Goals of the project:
- Understanding tensors
- Autograd
- Backpropagation
- Loss functions
- Optimizers
- Data Loading
- CPU vs GPU computation
- Debugging

In [None]:
# Tensors: the basic PyTorch data structure
# A tensor is an n-dimensional array, much like a numpy ndarray, but it can
# also live on a GPU and it can record the operations applied to it so that
# gradients can be computed automatically (autograd, covered further below).
# Model inputs, outputs and parameters are all represented as tensors.

import torch


def _describe(tensor):
    """Print a tensor, then its shape, then its dtype (one per line)."""
    print(tensor)
    print(tensor.shape)
    print(tensor.dtype)


# 0-dimensional tensor holding a single float
t1 = torch.tensor(4.0)
_describe(t1)

# 3-dimensional tensor made of 2 matrices of shape 2x3.
# A single float literal (19.0) makes the whole tensor floating point.
t4 = torch.tensor([
    [[11, 12, 13], [13, 14, 15]],
    [[15, 16, 17], [17, 18, 19.0]],
])
_describe(t4)

# Tensors can hold integers too: all-int input gives an integer dtype
t5 = torch.tensor([5, 6, 7, 8])
_describe(t5)


# Tensor operations and gradients
# requires_grad=True asks autograd to record every operation involving the
# tensor so that .backward() can later compute the gradient of the result
# with respect to it. Check a tensor's .requires_grad attribute to see
# whether it is being tracked.

# Multiply-and-add: y = w * x + b
# x is plain input data (not tracked); w and b play the role of parameters.
x = torch.tensor(3.0)
w = torch.tensor(4.0, requires_grad=True)
b = torch.tensor(5.0, requires_grad=True)
y = w * x + b


# .backward() ADDS the freshly computed gradients to each tracked tensor's
# .grad attribute instead of overwriting it. When backward() is called more
# than once (e.g. in a training loop) the .grad values must be zeroed between
# calls; here it is called a single time, so no reset is needed.
# See https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch

# Backpropagate from y to the tracked tensors
y.backward()

# Show the gradients: dy/dw equals x, dy/db equals 1, and x.grad stays None
# because x was created without requires_grad=True.
for label, leaf in (("dy/dx:", x), ("dy/dw:", w), ("dy/db:", b)):
    print(label, leaf.grad)



tensor(4.)
torch.Size([])
torch.float32
tensor([[[11., 12., 13.],
         [13., 14., 15.]],

        [[15., 16., 17.],
         [17., 18., 19.]]])
torch.Size([2, 2, 3])
torch.float32
tensor([5, 6, 7, 8])
torch.Size([4])
torch.int64
dy/dx: None
dy/dw: tensor(3.)
dy/db: tensor(1.)


In [None]:
# Autograd is an automatic differentiation tool in PyTorch
# It calculates the gradients of a loss function with respect to your parameters
# The autograd package provides automatic differentiation for all operations on Tensors
# It is a define-by-run framework, which means that your backprop is defined by how your
# code is run, and that every single iteration can be different
# If you set the attribute .requires_grad as True, it starts to track all operations on it
# When you finish your computation you can call .backward() and have all the gradients computed automatically
# The gradient for this tensor will be accumulated into .grad attribute

In [None]:
# Indexing and slicing

# Build a 4-D tensor of shape (2, 3, 4, 5), i.e. C=2, H=3, W=4, D=5
import torch

# The integers 1..120 laid out in row-major order: the last axis varies
# fastest, so tensor_data[0, 0, 0] is [1, 2, 3, 4, 5] and the value at
# tensor_data[1, 2, 3, 4] is 120.
tensor_data = torch.arange(1, 121).reshape(2, 3, 4, 5)

# Slicing examples: an integer index removes that axis, ':' keeps it whole
# all four axes fixed -> a single element
print("tensor_data[0, 0, 0, 0]:", tensor_data[0, 0, 0, 0])
# last axis kept -> 5 values
print("tensor_data[0, 0, 0, :]:", tensor_data[0, 0, 0, :])
# third axis kept -> 4 values (the first entry of each row)
print("tensor_data[0, 0, :, 0]:", tensor_data[0, 0, :, 0])
# last two axes kept -> a 4x5 matrix
print("tensor_data[0, 0, :, :]:", tensor_data[0, 0, :, :])
# last three axes kept -> a 3x4x5 block
print("tensor_data[0, :, :, :]:", tensor_data[0, :, :, :])
# every axis kept -> the whole tensor
print("tensor_data[:, :, :, :]:", tensor_data[:, :, :, :])


tensor_data[0, 0, 0, 0]: tensor(1)
tensor_data[0, 0, 0, :]: tensor([1, 2, 3, 4, 5])
tensor_data[0, 0, :, 0]: tensor([ 1,  6, 11, 16])
tensor_data[0, 0, :, :]: tensor([[ 1,  2,  3,  4,  5],
        [ 6,  7,  8,  9, 10],
        [11, 12, 13, 14, 15],
        [16, 17, 18, 19, 20]])
tensor_data[0, :, :, :]: tensor([[[ 1,  2,  3,  4,  5],
         [ 6,  7,  8,  9, 10],
         [11, 12, 13, 14, 15],
         [16, 17, 18, 19, 20]],

        [[21, 22, 23, 24, 25],
         [26, 27, 28, 29, 30],
         [31, 32, 33, 34, 35],
         [36, 37, 38, 39, 40]],

        [[41, 42, 43, 44, 45],
         [46, 47, 48, 49, 50],
         [51, 52, 53, 54, 55],
         [56, 57, 58, 59, 60]]])
tensor_data[:, :, :, :]: tensor([[[[  1,   2,   3,   4,   5],
          [  6,   7,   8,   9,  10],
          [ 11,  12,  13,  14,  15],
          [ 16,  17,  18,  19,  20]],

         [[ 21,  22,  23,  24,  25],
          [ 26,  27,  28,  29,  30],
          [ 31,  32,  33,  34,  35],
          [ 36,  37,  38,  39, 

In [None]:
# Joining tensors
# torch.cat joins tensors along an axis that already exists (that axis grows)
# torch.stack joins tensors along a brand-new axis (the rank grows by one)

# two 2x2 tensors used for every example below
t1 = torch.tensor([[1, 2], [3, 4]])
t2 = torch.tensor([[5, 6], [7, 8]])
pair = [t1, t2]
divider = "========================================="

# Concatenate: dim=0 appends rows -> (4, 2); dim=1 appends columns -> (2, 4)
for axis in (0, 1):
    joined = torch.cat(pair, dim=axis)
    print(f"torch.cat([t1, t2], dim={axis}):\n", joined)
    print(f"torch.cat([t1, t2], dim={axis}).shape:\n", joined.shape)
    print(divider)

# Stack: the new axis can be inserted at position 0, 1 or 2; the result is
# always (2, 2, 2) here, but the elements are arranged differently each time
for axis in (0, 1, 2):
    stacked = torch.stack(pair, dim=axis)
    print(f"torch.stack([t1, t2], dim={axis}):\n", stacked)
    print(f"torch.stack([t1, t2], dim={axis}).shape:\n", stacked.shape)
    print(divider)

torch.cat([t1, t2], dim=0):
 tensor([[1, 2],
        [3, 4],
        [5, 6],
        [7, 8]])
torch.cat([t1, t2], dim=0).shape:
 torch.Size([4, 2])
torch.cat([t1, t2], dim=1):
 tensor([[1, 2, 5, 6],
        [3, 4, 7, 8]])
torch.cat([t1, t2], dim=1).shape:
 torch.Size([2, 4])
torch.stack([t1, t2], dim=0):
 tensor([[[1, 2],
         [3, 4]],

        [[5, 6],
         [7, 8]]])
torch.stack([t1, t2], dim=0).shape:
 torch.Size([2, 2, 2])
torch.stack([t1, t2], dim=1):
 tensor([[[1, 2],
         [5, 6]],

        [[3, 4],
         [7, 8]]])
torch.stack([t1, t2], dim=1).shape:
 torch.Size([2, 2, 2])
torch.stack([t1, t2], dim=2):
 tensor([[[1, 5],
         [2, 6]],

        [[3, 7],
         [4, 8]]])
torch.stack([t1, t2], dim=2).shape:
 torch.Size([2, 2, 2])


In [None]:
# Reduction operations
# A reduction collapses one axis of a tensor into a single value per
# remaining position. The same pattern applies to min, max, mean, argmax,
# argmin, sum and prod; four of them are shown here.

# 3-D tensor of shape (2, 2, 3): two 2x3 matrices
t = torch.tensor([
    [[1, 3, 5], [10, 15, 3]],
    [[2, 1, 4], [3, 7, 6]],
])

# Reducing over dim=0 compares/combines the two matrices element by element,
# so every result below has shape (2, 3).

# min with a dim returns both the values and their indices; print the values
smallest = t.min(dim=0)
print("torch.min(t, dim=0):", smallest.values)

# argmin gives, per position, the index along dim 0 that holds the minimum
print("torch.argmin(t, dim=0):", t.argmin(dim=0))

# element-wise sum of the two matrices
print("torch.sum(t, dim=0):", t.sum(dim=0))

# element-wise product of the two matrices
print("torch.prod(t, dim=0):", t.prod(dim=0))

torch.min(t, dim=0): tensor([[1, 1, 4],
        [3, 7, 3]])
torch.argmin(t, dim=0): tensor([[0, 1, 1],
        [1, 1, 0]])
torch.sum(t, dim=0): tensor([[ 3,  4,  9],
        [13, 22,  9]])
torch.prod(t, dim=0): tensor([[  2,   3,  20],
        [ 30, 105,  18]])


In [None]:
# Reshaping a tensor

# Two methods change a tensor's shape without changing its elements:
#   1. .reshape()
#   2. .view()

# .view() never copies: it needs the requested shape to be expressible over
# the tensor's existing memory layout (always the case for a contiguous
# tensor) and raises an error otherwise.
# .reshape() returns a view when that is possible and silently falls back to
# a copy when it is not, so it works for any memory layout.

# Rule of thumb: reshape() is the flexible choice; use view() when you want
# a guarantee that no copy is made.

# 3-D tensor of shape (2, 2, 2) holding 1..8
t = torch.tensor([
    [[1, 2], [3, 4]],
    [[5, 6], [7, 8]],
])

# Target shapes: a 2x4 matrix, a 4x2 matrix and a flat vector of 8 elements
target_shapes = [(2, 4), (4, 2), (8,)]

# reshape into each target shape
for shape in target_shapes:
    dims = ", ".join(str(size) for size in shape)
    print(f"t.reshape({dims}):\n", t.reshape(*shape))

# view as each target shape (t was just created, so it is contiguous)
for shape in target_shapes:
    dims = ", ".join(str(size) for size in shape)
    print(f"t.view({dims}):\n", t.view(*shape))


t.reshape(2, 4):
 tensor([[1, 2, 3, 4],
        [5, 6, 7, 8]])
t.reshape(4, 2):
 tensor([[1, 2],
        [3, 4],
        [5, 6],
        [7, 8]])
t.reshape(8):
 tensor([1, 2, 3, 4, 5, 6, 7, 8])
t.view(2, 4):
 tensor([[1, 2, 3, 4],
        [5, 6, 7, 8]])
t.view(4, 2):
 tensor([[1, 2],
        [3, 4],
        [5, 6],
        [7, 8]])
t.view(8):
 tensor([1, 2, 3, 4, 5, 6, 7, 8])


In [None]:
# CPU vs GPU

# A tensor lives on exactly one device: the CPU or a (CUDA) GPU.
# .to() moves a tensor between devices. It does not modify the tensor in
# place, so the result must be assigned: t_gpu = t_cpu.to('cuda')
# Operands of an operation must be on the same device; mixing a CPU tensor
# with a GPU tensor raises an error.
# Typical workflow: do the heavy computation on the GPU, then bring results
# back to the CPU (e.g. to convert to numpy or to save to disk).
# Note: scikit-learn is built on numpy, which runs on the CPU only, so its
# estimators generally do not use the GPU.

import time

# List every visible CUDA device with its name and total memory in bytes
print("Available GPUs with name and memory:")
for i in range(torch.cuda.device_count()):
    print(torch.cuda.get_device_name(i), torch.cuda.get_device_properties(i).total_memory)

# Time one batched matrix multiplication on the CPU.
# perf_counter() is a monotonic high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted.
t_cpu = torch.ones(100, 100, 100)
t_cpu @ t_cpu  # warm-up so one-time initialisation is not included in the timing
start_time_cpu = time.perf_counter()
t_cpu @ t_cpu
end_time_cpu = time.perf_counter()
cpu_time = end_time_cpu - start_time_cpu
print(f'CPU time: {cpu_time}')

if torch.cuda.is_available():
    # Move the data to the GPU (the transfer itself is NOT part of the timing)
    t_gpu = t_cpu.to('cuda')

    # Warm-up: the first CUDA call pays one-time initialisation costs
    t_gpu @ t_gpu
    # CUDA kernels are launched asynchronously: the Python call returns before
    # the GPU has finished. Synchronise before starting and before stopping
    # the clock, otherwise only the launch overhead is measured.
    torch.cuda.synchronize()

    start_time_gpu = time.perf_counter()
    t_gpu @ t_gpu
    torch.cuda.synchronize()
    end_time_gpu = time.perf_counter()
    gpu_time = end_time_gpu - start_time_gpu
    print(f'GPU time: {gpu_time}')

    # How many times faster the GPU multiplication was than the CPU one
    speedup_factor = cpu_time / gpu_time
    print(f'Speedup factor: {speedup_factor}')
else:
    # No CUDA device: keep the names defined so later cells can test for None
    t_gpu = None
    gpu_time = None
    speedup_factor = None
    print('CUDA is not available; skipping the GPU benchmark')



Available GPUs with name and memory:
Tesla T4 15835660288
Speedup factor: 9615.26481979559
