# PyTorch Basics
- tensors like numpy
- tensors on the gpu
- tensors and automatic derivatives
- tensors as neural network abstractions: `torch.nn`
- optimizers: `torch.optim`

## Init, helpers, utils, ...

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision

In [2]:
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
from IPython.core.debugger import set_trace

# Tensors
tensors - the atoms of machine learning

## Tensors in numpy and pytorch

In [3]:
import numpy as np
from numpy.linalg import inv
from numpy.linalg import multi_dot as mdot

In [4]:
# numpy
np.eye(3)

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [5]:
# torch
torch.eye(3)

tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]])

In [6]:
# numpy
X = np.random.random((5, 3))
X

array([[0.01761407, 0.22226662, 0.10141616],
       [0.29994794, 0.52538242, 0.63938118],
       [0.67686391, 0.6255289 , 0.12404051],
       [0.97030623, 0.90453374, 0.0239299 ],
       [0.19845467, 0.00667336, 0.77244887]])

In [7]:
# pytorch
Y = torch.rand((5, 3))
Y

tensor([[0.2812, 0.8975, 0.3753],
        [0.1442, 0.3561, 0.7479],
        [0.8167, 0.1642, 0.0646],
        [0.8707, 0.1257, 0.0439],
        [0.5695, 0.9224, 0.2349]])

In [8]:
X.shape

(5, 3)

In [9]:
Y.shape

torch.Size([5, 3])

In [10]:
# numpy
X.T @ X

array([[1.52930221, 1.46389941, 0.45404138],
       [1.46389941, 1.53494135, 0.46285221],
       [0.45404138, 0.46285221, 1.03172948]])

In [11]:
# torch
Y.t() @ Y

tensor([[1.8492, 1.0725, 0.4381],
        [1.0725, 1.8258, 0.8359],
        [0.4381, 0.8359, 0.7614]])

In [12]:
# numpy
inv(X.T @ X)

array([[ 7.52001688, -7.139912  , -0.1063019 ],
       [-7.139912  ,  7.53243033, -0.23706458],
       [-0.1063019 , -0.23706458,  1.12237883]])

In [13]:
# torch
torch.inverse(Y.t() @ Y)

tensor([[ 0.8252, -0.5376,  0.1153],
        [-0.5376,  1.4514, -1.2841],
        [ 0.1153, -1.2841,  2.6567]])

## More on PyTorch Tensors

Operations are also available as methods.

In [14]:
A = torch.eye(3)
A.add(1)

tensor([[2., 1., 1.],
        [1., 2., 1.],
        [1., 1., 2.]])

In [15]:
A

tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]])

Any operation that mutates a tensor in-place has a `_` suffix.

In [16]:
A.add_(1)
A

tensor([[2., 1., 1.],
        [1., 2., 1.],
        [1., 1., 2.]])

## Indexing and broadcasting
It works as expected/like numpy:

In [17]:
A[0, 0]

tensor(2.)

In [18]:
A[0]

tensor([2., 1., 1.])

In [19]:
A[0:2]

tensor([[2., 1., 1.],
        [1., 2., 1.]])

In [20]:
A[:, 1:3]

tensor([[1., 1.],
        [2., 1.],
        [1., 2.]])

## Converting

In [21]:
A = torch.eye(3)
A

tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]])

In [22]:
# torch --> numpy
B = A.numpy()
B

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]], dtype=float32)

Note: torch and numpy can share the same memory / zero-copy

In [23]:
A.add_(.5)
A

tensor([[1.5000, 0.5000, 0.5000],
        [0.5000, 1.5000, 0.5000],
        [0.5000, 0.5000, 1.5000]])

In [24]:
B

array([[1.5, 0.5, 0.5],
       [0.5, 1.5, 0.5],
       [0.5, 0.5, 1.5]], dtype=float32)

In [25]:
# numpy --> torch
# np.eye produces float64 and from_numpy keeps that dtype, so the result is a
# float64 tensor (torch.eye above produced float32).
torch.from_numpy(np.eye(3))

tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]], dtype=torch.float64)

## Much more

In [26]:
[o for o in dir(torch) if not o.startswith("_")]

['AVG',
 'AggregationType',
 'AnyType',
 'Argument',
 'ArgumentSpec',
 'Assert',
 'BFloat16Storage',
 'BFloat16Tensor',
 'BenchmarkConfig',
 'BenchmarkExecutionStats',
 'Block',
 'BoolStorage',
 'BoolTensor',
 'BoolType',
 'BufferDict',
 'ByteStorage',
 'ByteTensor',
 'CONV_BN_FUSION',
 'CallStack',
 'Capsule',
 'CharStorage',
 'CharTensor',
 'ClassType',
 'Code',
 'CompilationUnit',
 'CompleteArgumentSpec',
 'ComplexDoubleStorage',
 'ComplexFloatStorage',
 'ConcreteModuleType',
 'ConcreteModuleTypeBuilder',
 'CudaBFloat16StorageBase',
 'CudaBoolStorageBase',
 'CudaByteStorageBase',
 'CudaCharStorageBase',
 'CudaComplexDoubleStorageBase',
 'CudaComplexFloatStorageBase',
 'CudaDoubleStorageBase',
 'CudaFloatStorageBase',
 'CudaHalfStorageBase',
 'CudaIntStorageBase',
 'CudaLongStorageBase',
 'CudaShortStorageBase',
 'DeepCopyMemoTable',
 'DeviceObjType',
 'DictType',
 'DisableTorchFunction',
 'DoubleStorage',
 'DoubleTensor',
 'EnumType',
 'ErrorReport',
 'ExecutionPlan',
 'FUSE_ADD_REL

In [27]:
[o for o in dir(A) if not o.startswith("_")]

['T',
 'abs',
 'abs_',
 'absolute',
 'absolute_',
 'acos',
 'acos_',
 'acosh',
 'acosh_',
 'add',
 'add_',
 'addbmm',
 'addbmm_',
 'addcdiv',
 'addcdiv_',
 'addcmul',
 'addcmul_',
 'addmm',
 'addmm_',
 'addmv',
 'addmv_',
 'addr',
 'addr_',
 'align_as',
 'align_to',
 'all',
 'allclose',
 'amax',
 'amin',
 'angle',
 'any',
 'apply_',
 'arccos',
 'arccos_',
 'arccosh',
 'arccosh_',
 'arcsin',
 'arcsin_',
 'arcsinh',
 'arcsinh_',
 'arctan',
 'arctan_',
 'arctanh',
 'arctanh_',
 'argmax',
 'argmin',
 'argsort',
 'as_strided',
 'as_strided_',
 'as_subclass',
 'asin',
 'asin_',
 'asinh',
 'asinh_',
 'atan',
 'atan2',
 'atan2_',
 'atan_',
 'atanh',
 'atanh_',
 'backward',
 'baddbmm',
 'baddbmm_',
 'bernoulli',
 'bernoulli_',
 'bfloat16',
 'bincount',
 'bitwise_and',
 'bitwise_and_',
 'bitwise_not',
 'bitwise_not_',
 'bitwise_or',
 'bitwise_or_',
 'bitwise_xor',
 'bitwise_xor_',
 'bmm',
 'bool',
 'byte',
 'cauchy_',
 'ceil',
 'ceil_',
 'char',
 'cholesky',
 'cholesky_inverse',
 'cholesky_solve

# But what about the GPU?
How do I use the GPU?

If you have a GPU make sure that the right pytorch is installed
(check https://pytorch.org/ for details).

In [28]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

If you have a GPU you should get something like: 
`device(type='cuda', index=0)`

You can move data to the GPU by doing `.to(device)`.

In [29]:
data = torch.eye(3)
data = data.to(device)
data

tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]], device='cuda:0')

Now the computation happens on the GPU.

In [30]:
res = data + data
res

tensor([[2., 0., 0.],
        [0., 2., 0.],
        [0., 0., 2.]], device='cuda:0')

In [31]:
res.device

device(type='cuda', index=0)

Note: before `v0.4` one had to use `.cuda()` and `.cpu()` to move stuff to and from the GPU.
This littered the code with many:
```python
if CUDA:
    model = model.cuda()
```

# Automatic differentiation with `autograd`
Prior to `v0.4` PyTorch used the class `Variable` to record gradients. You had to wrap `Tensor`s in `Variable`s.
`Variable`s behaved exactly like `Tensor`s.

With `v0.4` `Tensor` can record gradients directly if you tell it to do so, e.g. `torch.ones(3, requires_grad=True)`.
There is no need for `Variable` anymore.
Many tutorials still use `Variable`, be aware!

Ref:
- https://pytorch.org/docs/stable/autograd.html
- https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html

You rarely use `torch.autograd` directly.
Pretty much everything is part of `torch.Tensor` now.
Simply add `requires_grad=True` to the tensors you want to calculate the gradients for.
`nn.Module`s track gradients automatically.

In [32]:
from torch import autograd

In [33]:
x = torch.tensor(2.)
x

tensor(2.)

In [34]:
x = torch.tensor(2., requires_grad=True)
x

tensor(2., requires_grad=True)

In [35]:
print(x.requires_grad)

True


In [36]:
print(x.grad)

None


In [37]:
y = x ** 2

# No backward pass has been run yet, so x.grad is still None at this point.
print("Grad of x:", x.grad)

Grad of x: None


In [38]:
y = x ** 2
# Backpropagate from y: this fills in x.grad with dy/dx = 2 * x, i.e. 4 at x = 2.
y.backward()

print("Grad of x:", x.grad)

Grad of x: tensor(4.)


In [39]:
# What is going to happen here?
# x = torch.tensor(2.)
# x.backward()

In [40]:
# Don't record the gradient
# Useful for inference

params = torch.tensor(2., requires_grad=True)

with torch.no_grad():
    # Inside no_grad no autograd graph is built, so the result of an operation
    # on `params` has no grad_fn even though `params` requires gradients.
    y = params * params
    print(y.grad_fn)

None


`nn.Module` and `nn.Parameter` keep track of gradients for you.

In [41]:
lin = nn.Linear(2, 1, bias=True)
lin.weight

Parameter containing:
tensor([[-0.3140, -0.0996]], requires_grad=True)

In [42]:
type(lin.weight)

torch.nn.parameter.Parameter

In [43]:
isinstance(lin.weight, torch.FloatTensor)

True

## `torch.nn`
The neural network module contains many different layers.

In [44]:
from torch import nn

In [45]:
lin_reg = nn.Linear(1, 1, bias=True)
lin_reg

Linear(in_features=1, out_features=1, bias=True)

In [46]:
nn.Conv2d

torch.nn.modules.conv.Conv2d

In [47]:
nn.Conv3d

torch.nn.modules.conv.Conv3d

In [48]:
nn.BatchNorm2d

torch.nn.modules.batchnorm.BatchNorm2d

### Activations

In [49]:
nn.ReLU

torch.nn.modules.activation.ReLU

In [50]:
nn.Sigmoid

torch.nn.modules.activation.Sigmoid

### Losses

In [51]:
# NOTE: Softmax is an activation rather than a loss (it lives in
# torch.nn.modules.activation).
nn.Softmax

torch.nn.modules.activation.Softmax

In [52]:
nn.CrossEntropyLoss

torch.nn.modules.loss.CrossEntropyLoss

In [53]:
nn.BCELoss

torch.nn.modules.loss.BCELoss

In [54]:
nn.MSELoss

torch.nn.modules.loss.MSELoss

### Functional (stateless) alternatives

In [55]:
from torch.nn import functional as F

In [56]:
F.mse_loss

<function torch.nn.functional.mse_loss(input, target, size_average=None, reduce=None, reduction='mean')>

In [57]:
F.relu

<function torch.nn.functional.relu(input: torch.Tensor, inplace: bool = False) -> torch.Tensor>

In [58]:
F.relu6

<function torch.nn.functional.relu6(input, inplace=False)>

## `torch.optim`

In [59]:
from torch import optim

In [60]:
optim.SGD

torch.optim.sgd.SGD

In [61]:
optim.Adam

torch.optim.adam.Adam

In [62]:
optim.AdamW

torch.optim.adamw.AdamW

# Exercise
- Do you remember the analytical solution to solve for the parameters of linear regression? Implement it.