## Tensor
A tensor is a d-dimensional array and serves as the input and output of every layer of a deep network. Tensors have many mathematical operations associated with them (extensions of matrix multiplication). However, those operations are not used in most deep learning frameworks, and a Tensor is simply an array of numbers.

1. https://en.wikipedia.org/wiki/Tensor
2. https://pytorch.org/docs/stable/tensors.html

## PyTorch

In [56]:
from __future__ import print_function
import torch
import torch.nn as nn
import numpy as np

In [186]:
x = torch.tensor([[1, 3, 0],[2, 4, 6]])
x.t()

tensor([[1, 2],
        [3, 4],
        [0, 6]])

In [187]:
x.unsqueeze(0)

tensor([[[1, 3, 0],
         [2, 4, 6]]])

In [168]:
x.view(-1)

tensor([1, 3, 0, 2, 4, 6])

In [171]:
x.view(3, -1)

tensor([[1, 3],
        [0, 2],
        [4, 6]])

In [173]:
x

tensor([[1, 3, 0],
        [2, 4, 6]])

In [176]:
x.view(1, 2, 3).expand(3, 2, 3)

tensor([[[1, 3, 0],
         [2, 4, 6]],

        [[1, 3, 0],
         [2, 4, 6]],

        [[1, 3, 0],
         [2, 4, 6]]])

In [184]:
x = torch.rand(3, 3, 3)
x

tensor([[[0.4042, 0.2023, 0.5143],
         [0.8907, 0.9357, 0.0373],
         [0.9496, 0.1768, 0.5923]],

        [[0.2776, 0.2183, 0.4221],
         [0.5166, 0.0942, 0.3265],
         [0.8599, 0.4793, 0.2411]],

        [[0.9283, 0.0325, 0.3695],
         [0.9139, 0.5492, 0.3188],
         [0.2296, 0.3747, 0.2233]]])

In [185]:
x.transpose(0, 1)

tensor([[[0.4042, 0.2023, 0.5143],
         [0.2776, 0.2183, 0.4221],
         [0.9283, 0.0325, 0.3695]],

        [[0.8907, 0.9357, 0.0373],
         [0.5166, 0.0942, 0.3265],
         [0.9139, 0.5492, 0.3188]],

        [[0.9496, 0.1768, 0.5923],
         [0.8599, 0.4793, 0.2411],
         [0.2296, 0.3747, 0.2233]]])

In [183]:
z = x.permute(2, 1, 0)
z

tensor([[[0.1626, 0.2973, 0.1778],
         [0.9956, 0.1095, 0.9746],
         [0.3998, 0.8508, 0.2644]],

        [[0.9032, 0.4908, 0.7925],
         [0.0098, 0.5688, 0.8994],
         [0.0382, 0.9372, 0.8715]],

        [[0.9011, 0.7393, 0.1442],
         [0.3166, 0.2809, 0.7144],
         [0.7246, 0.3827, 0.8788]]])

### Operations

In [112]:
torch.is_tensor(z2)

True

#### In-place operation

> All operations end with "_" is in place operations:


In [113]:
x = torch.tensor([2.0, 3.0])
y = torch.tensor([1.0, 4.0])
x.add_(y)
print(x)

tensor([3., 7.])


#### Out
> We can assign the operation result to a variable. Alternatively, all operation methods have an out parameter to store the result.

In [165]:
r1 = torch.Tensor(2, 3)
torch.add(x, y, out=r1)   

RuntimeError: The size of tensor a (3) must match the size of tensor b (6) at non-singleton dimension 0

#### Indexing

In [None]:
x1 = torch.empty(3, 3).random_(10)
x1

tensor([[3., 5., 5.],
        [9., 9., 2.],
        [0., 0., 8.]])

In [None]:
x1[:, 1]

tensor([5., 9., 0.])

In [None]:
x1[:, 1] = 0

In [None]:
x1

tensor([[3., 0., 5.],
        [9., 0., 2.],
        [0., 0., 8.]])

#### Conversion between NumPy ndarray and Tensor
> During the conversion, both ndarray and Tensor share the same memory storage. Change value from either side will affect the other.

In [None]:
# Conversion
a = np.array([1, 2, 3])
v = torch.from_numpy(a)         # Convert a numpy array to a Tensor

b = v.numpy()                   # Tensor to numpy
b[1] = -1                       # Numpy and Tensor share the same memory

In [None]:
if a[1] == b[1]:
    print('True') # Change Numpy will also change the Tensor

True


#### Tensor meta-data

> Size of the Tensor and number of elements in Tensor:


In [None]:
### Basic Tensor operation
x = torch.empty(3, 3).fill_(4)
x.size()                        # torch.Size([2, 3])

torch.Size([3, 3])

In [None]:
torch.numel(x)                  # 6: number of elements in x

9

#### Reshape Tensor

> Reshape a Tensor to different size:


In [None]:
### Tensor resizing
x = torch.randn(2, 3)            # Size 2x3
y = x.view(6)                    # Resize x to size 6
z = x.view(-1, 2)                # Size 3x2

In [None]:
z

tensor([[ 0.8635,  0.8067],
        [-1.7315,  0.2933],
        [ 0.9307,  1.2782]])

#### Create a random Tensor

>To increase the reproducibility of result, we often set the random seed to a specific value first.


In [None]:
torch.manual_seed(1)

<torch._C.Generator at 0x7f1f142c6350>

In [None]:
v = torch.rand(2, 3)            # Initialize with random number (uniform distribution)
v = torch.randn(2, 3)           # With normal distribution (SD=1, mean=0)
v = torch.randperm(4)           # Size 4. Random permutation of integers from 0 to 3

In [None]:
v

tensor([2, 0, 1, 3])

#### Identity matrices, Fill Tensor with 0, 1 or values

In [None]:
x = torch.empty(3, 3)
print(x)

tensor([[0.0000e+00, 0.0000e+00, 7.7052e+31],
        [7.2148e+22, 2.5226e-18, 2.5930e-09],
        [1.0413e-11, 3.0883e-09, 4.2731e-05]])


In [None]:
x = torch.zeros(3,3)
print(x)

tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]])


In [None]:
x = torch.full((2,2), 3)
print(x)

tensor([[3, 3],
        [3, 3]])


In [None]:
x = torch.eye(3)
print(x)

tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]])


In [None]:
v = torch.ones(3, 2)
print(v)

tensor([[1., 1.],
        [1., 1.],
        [1., 1.]])


In [None]:
v[1].fill_(2)
v[2].fill_(3)

tensor([3., 3.])

In [None]:
v

tensor([[1., 1.],
        [2., 2.],
        [3., 3.]])

In [None]:
eye = torch.eye(3)              # Create an identity 3x3 tensor

u = torch.ones_like(eye)        # A tensor with same shape as eye. Fill it with 1.
u

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])

In [None]:
x = torch.empty(3, 3).normal_()
print(x)

tensor([[-0.6919, -0.4043,  0.2222],
        [ 0.5773, -1.7637,  0.2264],
        [-0.2355,  0.3019, -0.2770]])


In [None]:
x = torch.empty(3, 3).bernoulli_(.7)
print(x)

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])


In [None]:
x = torch.empty(3, 3).fill_(5)
print(x)

tensor([[5., 5., 5.],
        [5., 5., 5.],
        [5., 5., 5.]])


#### Initialize Tensor with a range of value

In [None]:
v = torch.arange(5)             # similar to range(5) but creating a Tensor
v

tensor([0, 1, 2, 3, 4])

In [None]:
v = torch.arange(0, 6, step=1)  # Size 5. Similar to range(0, 5, 1)
v

tensor([0, 1, 2, 3, 4, 5])

In [None]:
v = v.view(-1, 3)
v

tensor([[0, 1, 2],
        [3, 4, 5]])

In [None]:
v = torch.linspace(1, 10, steps=10) # Create a Tensor with 10 linear points for (1, 10) inclusively
v

tensor([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.])

In [None]:
v = torch.logspace(start=-10, end=10, steps=5) # Size 5: 1.0e-10 1.0e-05 1.0e+00, 1.0e+05, 1.0e+10
v

tensor([1.0000e-10, 1.0000e-05, 1.0000e+00, 1.0000e+05, 1.0000e+10])

#### Concatenate, stack

In [None]:
x = torch.empty(3, 3).random_(10)
x

tensor([[8., 6., 9.],
        [9., 8., 3.],
        [3., 1., 7.]])

In [None]:
torch.cat((x, x), 0)

tensor([[8., 6., 9.],
        [9., 8., 3.],
        [3., 1., 7.],
        [8., 6., 9.],
        [9., 8., 3.],
        [3., 1., 7.]])

In [None]:
torch.cat((x, x), 1)

tensor([[8., 6., 9., 8., 6., 9.],
        [9., 8., 3., 9., 8., 3.],
        [3., 1., 7., 3., 1., 7.]])

In [None]:
torch.stack((x, x))

tensor([[[8., 6., 9.],
         [9., 8., 3.],
         [3., 1., 7.]],

        [[8., 6., 9.],
         [9., 8., 3.],
         [3., 1., 7.]]])

#### Reorganize data element

In [None]:
x = torch.empty(2, 2).random_(10)
x

tensor([[2., 8.],
        [9., 3.]])

In [None]:
# Gather element
# torch.gather(input, dim, index, out=None)

torch.gather(x, 0, torch.LongTensor([[0,0],[1,0]]))

tensor([[2., 8.],
        [9., 8.]])

#### Split a Tensor

In [None]:
r = torch.randn(3, 3)
s = torch.chunk(v, 3)
s

(tensor([1.0000e-10, 1.0000e-05]),
 tensor([1.0000e+00, 1.0000e+05]),
 tensor([1.0000e+10]))

In [None]:
torch.split(r, 2)

(tensor([[ 0.4771,  0.8310, -0.2477],
         [-0.8029,  0.2366,  0.2857]]),
 tensor([[ 0.6898, -0.6331,  0.8795]]))

#### Index select, mask select

In [None]:
v = torch.arange(9)
v = v.view(3, 3)
v

tensor([[0, 1, 2],
        [3, 4, 5],
        [6, 7, 8]])

In [None]:
indices = torch.LongTensor([0, 2])
r = torch.index_select(v, 0, indices) # Select element 0 and 2 for each dimension 1.
r

tensor([[0, 1, 2],
        [6, 7, 8]])

In [None]:
mask = v.ge(3) ## greater then
mask

tensor([[False, False, False],
        [ True,  True,  True],
        [ True,  True,  True]])

In [None]:
r = torch.masked_select(v, mask)
r

tensor([3, 4, 5, 6, 7, 8])

#### Squeeze and unsqueeze

In [None]:
t = torch.ones(2,1,2,1) # Size 2x1x2x1
t

tensor([[[[1.],
          [1.]]],


        [[[1.],
          [1.]]]])

In [None]:
r = torch.squeeze(t)     # Size 2x2
r

tensor([[1., 1.],
        [1., 1.]])

In [None]:
r = torch.squeeze(t, 1)  # Squeeze dimension 1: Size 2x2x1
r

tensor([[[1.],
         [1.]],

        [[1.],
         [1.]]])

In [None]:
# Un-squeeze a dimension
x = torch.Tensor([1, 2, 3])
r = torch.unsqueeze(x, 0)       # Size: 1x3
r = torch.unsqueeze(x, 1)       # Size: 3x1
r

tensor([[1.],
        [2.],
        [3.]])

#### Non-zero elements

In [None]:
torch.nonzero(v)

tensor([[0, 1],
        [0, 2],
        [1, 0],
        [1, 1],
        [1, 2],
        [2, 0],
        [2, 1],
        [2, 2]])

#### Distribution

In [None]:
# 2x2: A uniform distributed random matrix with range [0, 1]
r = torch.Tensor(2, 2).uniform_(0, 1)
r

tensor([[0.4259, 0.7812],
        [0.6607, 0.1251]])

In [None]:
# bernoulli
r = torch.bernoulli(r)   # Size: 2x2. Bernoulli with probability p stored in elements of r
r

tensor([[0., 1.],
        [1., 0.]])

In [None]:
# Multinomial
w = torch.Tensor([0, 4, 8, 2]) # Create a tensor of weights
r = torch.multinomial(w, 4, replacement=True) # Size 4: 3, 2, 1, 2
r

tensor([2, 2, 2, 1])

In [None]:
# Normal distribution
# From 10 means and SD
r = torch.normal(1, 0) # Size 10
r

TypeError: normal() received an invalid combination of arguments - got (int, int), but expected one of:
 * (Tensor mean, Tensor std, *, torch.Generator generator, Tensor out)
 * (Tensor mean, float std, *, torch.Generator generator, Tensor out)
 * (float mean, Tensor std, *, torch.Generator generator, Tensor out)
 * (float mean, float std, tuple of ints size, *, torch.Generator generator, Tensor out, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)


Random sampling
----------------------------------
- autofunction:: manual_seed    - Set a manual seed
- autofunction:: initial_seed   - Randomize a seed by the system
- autofunction:: get_rng_state
- autofunction:: set_rng_state
- autodata:: default_generator
- autofunction:: bernoulli
- autofunction:: multinomial
- autofunction:: normal
- autofunction:: rand
- autofunction:: randn
- autofunction:: randperm

In-place random sampling
-------------------------------------

There are a few more in-place random sampling functions defined on Tensors as well. Click through to refer to their documentation:

- :func:`torch.Tensor.bernoulli_` - in-place version of :func:`torch.bernoulli`
- :func:`torch.Tensor.cauchy_` - numbers drawn from the Cauchy distribution
- :func:`torch.Tensor.exponential_` - numbers drawn from the exponential distribution
- :func:`torch.Tensor.geometric_` - elements drawn from the geometric distribution
- :func:`torch.Tensor.log_normal_` - samples from the log-normal distribution
- :func:`torch.Tensor.normal_` - in-place version of :func:`torch.normal`
- :func:`torch.Tensor.random_` - numbers sampled from the discrete uniform distribution
- :func:`torch.Tensor.uniform_` - numbers sampled from the continuous uniform distribution

#### Point-wise operations

In [None]:
### Math operations
f= torch.FloatTensor([-1, -2, 3])
r = torch.abs(f)      # 1 2 3
r

tensor([1., 2., 3.])

In [None]:
# Add x, y and scalar 10 to all elements
r = torch.add(x, 10)
r = torch.add(x, 10, y)
r

RuntimeError: The size of tensor a (3) must match the size of tensor b (6) at non-singleton dimension 0

In [None]:
# Clamp the value of a Tensor
r = torch.clamp(v, min=-0.5, max=0.5)
r

tensor([[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]])

In [None]:
# Element-wise divide
r = torch.div(v, v+0.03)
r

In [None]:
# Element-wise multiple
r = torch.mul(v, v)

In [None]:
### Math operations
f= torch.FloatTensor([-1, -2, 3])
r = torch.abs(f)      # 1 2 3
r

tensor([1., 2., 3.])

Pointwise Ops
----------------------------------------

- autofunction:: abs
- autofunction:: acos           - arc cosine
- autofunction:: add
- autofunction:: addcdiv        - element wise: t1 + s * t2/t3
- autofunction:: addcmul        - element wise: t1 + s * t2 * t3
- autofunction:: asin           - arc sin
- autofunction:: atan
- autofunction:: atan2
- autofunction:: ceil           - ceiling
- autofunction:: clamp          - clamp elements into a range
- autofunction:: cos
- autofunction:: cosh
- autofunction:: div            - divide
- autofunction:: erf            - Gaussian error functiom
- autofunction:: erfinv         - Inverse
- autofunction:: exp
- autofunction:: expm1          - exponential of each element minus 1
- autofunction:: floor          
- autofunction:: fmod           - element wise remainder of division
- autofunction:: frac           - fraction part 3.4 -> 0.4
- autofunction:: lerp           - linear interpolation
- autofunction:: log            - natural log
- autofunction:: log1p          - y = log(1 + x)
- autofunction:: mul            - multiple
- autofunction:: neg 
- autofunction:: pow
- autofunction:: reciprocal     - 1/x
- autofunction:: remainder      - remainder of division
- autofunction:: round
- autofunction:: rsqrt          - the reciprocal of the square-root 
- autofunction:: sigmoid        - sigmode(x)
- autofunction:: sign
- autofunction:: sin
- autofunction:: sinh
- autofunction:: sqrt
- autofunction:: tan
- autofunction:: tanh
- autofunction:: trunc          - truncated integer

#### Reduction operations

In [None]:
v = torch.arange(9, dtype=torch.float64)
v = v.view(3, 3)
v

tensor([[0., 1., 2.],
        [3., 4., 5.],
        [6., 7., 8.]], dtype=torch.float64)

In [None]:
r = torch.cumsum(v, dim=0)
r

tensor([[ 0.,  1.,  2.],
        [ 3.,  5.,  7.],
        [ 9., 12., 15.]], dtype=torch.float64)

In [None]:
torch.mean(v, dim=0)

tensor([3., 4., 5.], dtype=torch.float64)

In [None]:
v.sum(dim=1)

tensor([ 3, 12, 21])

In [None]:
v.sum(dim=0)

tensor([ 9., 12., 15.], dtype=torch.float64)

In [None]:
x = torch.tensor([1.0, 2.0, 3.0])
x.norm(p=1)

tensor(6.)

Reduction Ops
--------------------------

- autofunction:: cumprod        - accumulate product of elements x1*x2*x3...
- autofunction:: cumsum
- autofunction:: dist           - L-p norm
- autofunction:: mean
- autofunction:: median
- autofunction:: mode
- autofunction:: norm           - L-p norm
- autofunction:: prod           - accumulate product
- autofunction:: std            - compute standard deviation
- autofunction:: sum
- autofunction:: var            - variance of all elements

#### Comparison operation

In [None]:
### Comparison
# Size 3x3: Element-wise comparison
r = torch.eq(v, v)
r

tensor([[1, 1, 1],
        [1, 1, 1],
        [1, 1, 1]], dtype=torch.uint8)

In [None]:
# Max element with corresponding index
r = torch.max(v, 1)
r

(tensor([2, 5, 8]), tensor([2, 2, 2]))

Comparison Ops
-----------------------

- autofunction:: eq             - Compare elements
- autofunction:: equal          - True of 2 tensors are the same 
- autofunction:: ge             - Element-wise greater or equal comparison
- autofunction:: gt
- autofunction:: kthvalue       - k-th element
- autofunction:: le
- autofunction:: lt
- autofunction:: max
- autofunction:: min
- autofunction:: ne
- autofunction:: sort
- autofunction:: topk           - top k

#### Matrix, vector multiplication

In [None]:
r = torch.dot(torch.tensor([4, 2]), torch.tensor([3, 1]))
r

tensor(14)

In [None]:
mat = torch.randn(2, 4)
vec = torch.randn(4)
r = torch.mv(mat, vec)
r

tensor([-1.8317,  0.8934])

In [None]:
# Matrix + Matrix X vector
# Size 2
M = torch.randn(2)
mat = torch.randn(2, 3)
vec = torch.randn(3)
r = torch.addmv(M, mat, vec)
r

tensor([-1.5831, -4.9747])

In [None]:
# Matrix x Matrix
# Size 2x4
mat1 = torch.randn(2, 3)
mat2 = torch.randn(3, 4)
r = torch.mm(mat1, mat2)
r

tensor([[-0.4561,  0.2306,  0.4803,  0.1904],
        [-0.5957, -0.9246, -1.1840, -0.2337]])

In [None]:
# Matrix + Matrix X Matrix
# Size 3x4
M = torch.randn(3, 4)
mat1 = torch.randn(3, 2)
mat2 = torch.randn(2, 4)
r = torch.addmm(M, mat1, mat2)
r

tensor([[-1.2168,  2.2809,  0.7730, -1.0231],
        [-0.3334,  0.1540,  0.0586, -0.6885],
        [ 1.4199, -1.0344, -1.5947,  1.2797]])

In [None]:
# Cross product
m1 = torch.ones(3, 5)
m2 = torch.ones(3, 5)

# Cross product
# Size 3x5
r = torch.cross(m1, m2)
r

tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]])

#### Histogram

In [None]:
torch.histc(torch.FloatTensor([1, 2, 1]), bins=4, min=0, max=3)

tensor([0., 2., 1., 0.])

In [None]:
# Renormalization
v = torch.randn(3,3)
r = torch.renorm(v, 1, 0, 1)
r

tensor([[-0.0797,  0.3801,  0.2044],
        [-0.2176, -0.2718, -0.5107],
        [ 0.1909,  0.0479,  0.7612]])

In [None]:
r = torch.diag(v)
r

tensor([-0.0797, -0.8637,  1.6980])

In [None]:
torch.trace(v)

tensor(0.7546)

In [None]:
torch.tril(v)

tensor([[-0.0797,  0.0000,  0.0000],
        [-0.6914, -0.8637,  0.0000],
        [ 0.4259,  0.1068,  1.6980]])

In [None]:
torch.triu(v)

tensor([[-0.0797,  0.3801,  0.2044],
        [ 0.0000, -0.8637, -1.6228],
        [ 0.0000,  0.0000,  1.6980]])

Tensors
----------------------------------
- autofunction:: is_tensor
- autofunction:: is_storage
- autofunction:: set_default_tensor_type
- autofunction:: numel
- autofunction:: set_printoptions

Serialization
----------------------------------
- autofunction:: save          - Saves an object to a disk file
- autofunction:: load          - Loads an object saved with torch.save() from a file

Parallelism
----------------------------------
- autofunction:: get_num_threads - Gets the number of OpenMP threads used for parallelizing CPU operations
- autofunction:: set_num_threads

Spectral Ops
~~~~~~~~~~~~~~~~~~~~~~
- autofunction:: stft          - Short-time Fourier transform 
- autofunction:: hann_window   - Hann window function
- autofunction:: hamming_window  - Hamming window function
- autofunction:: bartlett_window - Bartlett window function


BLAS and LAPACK Operations
~~~~~~~~~~~~~~~~~~~~~~~~~~~
- autofunction:: addbmm          - Batch add and mulitply matrices nxp + b×n×m X b×m×p -> bxnxp
- autofunction:: addmm           - Add and mulitply matrices nxp + n×m X m×p -> nxp
- autofunction:: addmv           - Add and matrix, vector multipy n + nxm X m -> n
- autofunction:: addr            - Outer product of vectors
- autofunction:: baddbmm         - Batch add and mulitply matrices
- autofunction:: bmm             - Batch mulitply matrices b×n×m X b×m×p -> b×n×p
- autofunction:: btrifact        - LU factorization
- autofunction:: btrifact_with_info
- autofunction:: btrisolve
- autofunction:: btriunpack
- autofunction:: dot             - Dot product of 2 tensors
- autofunction:: eig             - Eigenvalues and eigenvectors ofsquare matrix
- autofunction:: gels            - Solution for least square or p-norm(AX - B)
- autofunction:: geqrf
- autofunction:: ger             - Outer product of 2 vectors
- autofunction:: gesv            - Solve linear equations
- autofunction:: inverse         - Inverse of square matrix
- autofunction:: det             - Determinant of a 2D square Variable
- autofunction:: matmul          - Matrix product of tensors
- autofunction:: mm				- Matrix multiplication
- autofunction:: mv              - Matrix vector product
- autofunction:: orgqr           - Orthogal matrix Q 
- autofunction:: ormqr           - Multiplies matrix by the orthogonal Q matrix
- autofunction:: potrf           - Cholesky decomposition
- autofunction:: potri           - Inverse of a positive semidefinite matrix with Cholesky
- autofunction:: potrs           - Solve linear equation with positive semidefinite
- autofunction:: pstrf           - Cholesky decomposition of a positive semidefinite matrix
- autofunction:: qr              - QR decomposition
- autofunction:: svd             - SVD decomposition
- autofunction:: symeig          - Eigenvalues and eigenvectors
- autofunction:: trtrs           - Solves a system of equations with a triangular coefficient

#### Broadcasting

In [None]:
A = torch.tensor([[1.], [2.], [3.], [4.]])
B = torch.tensor([[5., -5., 5., -5., 5.]])
C = A + B
print(C)

tensor([[ 6., -4.,  6., -4.,  6.],
        [ 7., -3.,  7., -3.,  7.],
        [ 8., -2.,  8., -2.,  8.],
        [ 9., -1.,  9., -1.,  9.]])


#### Storage

In [None]:
 q = torch.arange(0, 20, dtype=torch.float32).storage()

In [None]:
print(q)

 0.0
 1.0
 2.0
 3.0
 4.0
 5.0
 6.0
 7.0
 8.0
 9.0
 10.0
 11.0
 12.0
 13.0
 14.0
 15.0
 16.0
 17.0
 18.0
 19.0
[torch.FloatStorage of size 20]


In [None]:
n = torch.linspace(1, 4, 4)
print(n)

tensor([1., 2., 3., 4.])


In [None]:
x = torch.empty(3, 3).random_(10)
x

tensor([[5., 4., 8.],
        [6., 8., 9.],
        [4., 5., 0.]])

In [None]:
x.stride()

(3, 1)

### Variables

A **Variable** wraps a Tensor. It supports nearly all the API defined by a Tensor. 
Variable also provides a _backward_ method to perform backpropagation. For example, 
to backpropagate a loss function to train model parameter $x$, we use a variable $loss$ 
to store the value computed by a loss function. Then, we call _loss.backward_ which computes 
the gradients $$\frac{\partial loss}{\partial x}$$ for all trainable parameters. 
PyTorch will store the gradient results back in the corresponding variable $x$.

> Create a 2x2 Variable to store input data: 

In [None]:
from torch.autograd import Variable


x = Variable(torch.ones(2, 2), requires_grad=True)
x

tensor([[1., 1.],
        [1., 1.]], requires_grad=True)

$requires\_grad$ indicates whether a variable is trainable. By default, $requires\_grad$ is False in creating a Variable. If one of the input to an operation requires gradient, its output and its subgraphs will also require gradient. To fine tune just part of a pre-trained model, we can set $requires\_grad$ to False at the base but then turn it on at the entrance of the subgraphs that we want to retrain.


### Compute gradient
Autograd is a PyTorch package for the differentiation for all operations on Tensors. It performs the backpropagation starting from a variable. In deep learning, this variable often holds the value of the cost function. _backward_ executes the backward pass and computes all the backpropagation gradients automatically. We access indvidual gradient through the attributes _grad_ of a variable.  _x.grad_ below returns a 2x2 gradient tensor for $$\frac{\partial out}{\partial x}​$$.

In [None]:
# Create tensors.
x = torch.tensor(1., requires_grad=True)
w = torch.tensor(2., requires_grad=True)
b = torch.tensor(3., requires_grad=True)

# Build a computational graph.
y = w * x + b    # y = 2 * x + 3

In [None]:
y.backward()
print(x.grad)    # x.grad = 2 
print(w.grad)    # w.grad = 1 
print(b.grad)    # b.grad = 1 

tensor(2.)
tensor(1.)
tensor(1.)


> To check the resule, we compute the gradient manually:

$$
\begin{split}
\frac{\partial out}{\partial x_i} & = \frac{1}{4} \sum_j \frac{\partial z_j}{\partial x_i} \\
& = \frac{1}{4} \sum_j \frac{\partial 2 y_j^2}{\partial x_i} \\
& = \frac{1}{4} \sum_j 4 y_j \frac{\partial y_j }{\partial x_i} \\
& = \sum_j  (x_j + 2) \frac{\partial (x_j + 2) }{\partial x_i} \\
& = x_i + 2 \quad \quad  & \frac{\partial x_j }{\partial x_i} = 0 \text{ if } i \neq j \\
& = 3 \quad \quad  & \text{ for } x_i=1\\
\end{split}
$$

### Dynamic vs Static computation graph (PyTorch vs TensorFlow)

The TensorFlow computation graph is static. Operation executions are delayed until the graph is completed. TensorFlow defines a graph first with placeholders. Once all operations are added, we execute the graph in a session by feeding data into the placeholders. The computation graph is static because it cannot be changed afterwards. We can repeat this process with different batch of data but the graph remains the same.

By design, PyTorch uses a dynamic computation graph. Whenever we create a variable or operations, it is executed immediately. We can add and execute operations anytime before _backward_ is called. _backwards_ follows the graph backward to compute the gradients. Then the graph will be disposed. (the retain_graph flag can override this behavior but rarely suggested.) For the training data in the next training iteration, a new graph is created. We can use the same code to create the same structure, or create a graph with different operations. In NLP, we deal with variable length sentences. Instead of padding the sentence to a fixed length, we create graphs with different number of LSTM cells based on the sentence's length. 

We call this a define-by-run framework. which the backpropagation is based on what has been running in the graph. Since we start a new graph for every iteration, the backpropagation path can be different for each iteration.

### Example

PyTorch provides functions to make training easier without processing the raw data of the gradients directly. This is demonstrated [here.](https://jhui.github.io/2018/02/09/PyTorch-neural-networks/)

But to tie all the APIs together, here is an example in doing backpropagation manually. 

In [None]:
import torch
from torch.autograd import Variable

dtype = torch.FloatTensor
N, D_in, H, D_out = 64, 1000, 100, 10

x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    loss = (y_pred - y).pow(2).sum()
    print(t, loss.data[0])

    loss.backward()

    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data

    w1.grad.data.zero_()
    w2.grad.data.zero_()



0 tensor(27975546.)
1 tensor(22730596.)
2 tensor(21506468.)
3 tensor(20818208.)
4 tensor(18978276.)
5 tensor(15495289.)
6 tensor(11307624.)
7 tensor(7492934.)
8 tensor(4719343.)
9 tensor(2940541.2500)
10 tensor(1884401.8750)
11 tensor(1268892.5000)
12 tensor(906192.7500)
13 tensor(683955.4375)
14 tensor(540437.8750)
15 tensor(442071.3438)
16 tensor(370744.7500)
17 tensor(316360.4375)
18 tensor(273224.0938)
19 tensor(238040.6250)
20 tensor(208771.6250)
21 tensor(184032.7969)
22 tensor(162902.6094)
23 tensor(144708.4844)
24 tensor(128949.0625)
25 tensor(115219.4922)
26 tensor(103188.2109)
27 tensor(92631.8047)
28 tensor(83351.3594)
29 tensor(75142.6875)
30 tensor(67857.2891)
31 tensor(61379.0078)
32 tensor(55603.4688)
33 tensor(50451.3242)
34 tensor(45845.5820)
35 tensor(41714.2422)
36 tensor(38000.4180)
37 tensor(34660.5273)
38 tensor(31650.2188)
39 tensor(28933.5039)
40 tensor(26476.6816)
41 tensor(24253.8672)
42 tensor(22240.4160)
43 tensor(20412.1230)
44 tensor(18749.7207)
45 tensor(

### Access data

We can access the raw data of a variable with _data_.

In [None]:
x = torch.randn(3)
x = Variable(x, requires_grad=True)

y = x * 2
while y.data.norm() < 100:
    y = y * 2

print(y)

tensor([-105.4056,   61.2772, -125.2646], grad_fn=<MulBackward>)


### Backward (non-scalar output)

_out_ below is a scalar and we do not need to specify any parameters for _backward_. By default, we backpropagate a gradient of 1.0 back.

In [None]:
out = x.mean()
out.backward()    # Same as out.backward(torch.FloatTensor([1.0]))
print(out)

tensor(-0.8823, grad_fn=<MeanBackward1>)


$y$ below is a Tensor of size 3. _backward_ requires a Tensor to specify each backpropagation gradient if the variable is not a scalar. To match each element of $y$, $gradients$ needs to match the size of _y_. In some situtation, the gradient values are computed from the model predictions and the true labels.

In [None]:
gradients = torch.FloatTensor([0.1, 1.0, 0.0001])
y.backward(gradients)

print(x.grad)

tensor([ 6.4000, 64.0000,  0.0064])


In [None]:
# Create tensors of shape (10, 3) and (10, 2).
x = torch.randn(10, 3)
y = torch.randn(10, 2)

# Build a fully connected layer.
linear = nn.Linear(3, 2)
print ('w: ', linear.weight)
print ('b: ', linear.bias)

w:  Parameter containing:
tensor([[-0.0260,  0.4203,  0.5215],
        [-0.0015,  0.3561,  0.1838]], requires_grad=True)
b:  Parameter containing:
tensor([ 0.0210, -0.0012], requires_grad=True)


In [None]:
# Build loss function and optimizer.
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(linear.parameters(), lr=0.01)

# Forward pass.
pred = linear(x)

# Compute loss.
loss = criterion(pred, y)
print('loss: ', loss.item())

loss:  1.267463207244873


In [None]:
# Backward pass.
loss.backward()

# Print out the gradients.
print ('dL/dw: ', linear.weight.grad) 
print ('dL/db: ', linear.bias.grad)

dL/dw:  tensor([[ 0.1509,  0.5630, -0.0100],
        [ 0.0341,  0.1880, -0.3291]])
dL/db:  tensor([ 0.1463, -0.2725])


In [None]:
# 1-step gradient descent.
optimizer.step()

In [None]:
# Print out the loss after 1-step gradient descent.
pred = linear(x)
loss = criterion(pred, y)
print('loss after 1 step optimization: ', loss.item())

loss after 1 step optimization:  1.2616931200027466


### Dataset and DataLoader

#### Transforms

> We compose a sequence of transformation to pre-process the image:
 _Compose_ creates a series of transformation to prepare the dataset. Torchvision reads datasets into PILImage (Python imaging format). _ToTensor_ converts the PIL Image from range \[0, 255\] to a FloatTensor of shape (C x H x W) with range \[0.0, 1.0\]. We then renormalize the input to \[-1, 1\] based on the following formula with $\mu=\text{standard deviation}=0.5$.

$$
input = \frac{input - \mu}{\text{standard deviation}} \\
input = \frac{input - 0.5}{0.5}
$$


In [None]:
import torchvision
import torchvision.transforms as transforms

transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

In [None]:
train_dataset = torchvision.datasets.CIFAR10(root='./data', train=True, transform=transform, download=True)

Files already downloaded and verified


In [None]:
image, label = train_dataset[0]
print(image.size())
print(label)

torch.Size([3, 32, 32])
6


In [None]:
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)

# Actual usage of the data loader is as below.
for images, labels in train_loader:
    pass
    # Training code should be written here.
    # print(images.size(), labels.size())

#### Image grid

We often want to display a grid of images to show samples for the training or testing images. torchvision.utils.make_grid a grid to be displayed.


In [None]:
def imshow(inp, title=None):
    """Imshow for Tensor."""
    inp = inp.numpy().transpose((1, 2, 0))
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    inp = std * inp + mean
    inp = np.clip(inp, 0, 1)
    plt.imshow(inp)
    if title is not None:
        plt.title(title)
    plt.pause(0.001)  # pause a bit so that plots are updated

In [None]:
# Get a batch of training data
# inputs contains 4 images because batch_size=4 for the dataloaders
inputs, classes = next(iter(dataloaders['train']))

# Make a grid from batch
out = torchvision.utils.make_grid(inputs)

imshow(out, title=[class_names[x] for x in classes])

### Transforms images

Here is another example in applying cropping, image flipping and scaling to pre-process image:

In [None]:
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomSizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Scale(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

data_dir = 'hymenoptera_data'
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                          data_transforms[x])
                  for x in ['train', 'val']}
dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=4,
                                             shuffle=True, num_workers=4)
                  for x in ['train', 'val']}


dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
class_names = image_datasets['train'].classes

### Advance Pytorch

In [None]:
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable


class LeNet(nn.Module):
    def __init__(self):
        super(LeNet, self).__init__()
        self.conv1 = nn.Conv2d(1, 6, 5, padding = 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.pool = nn.MaxPool2d(2, stride = 2)
        self.fc1 = nn.Linear(400,120)
        self.fc2 = nn.Linear(120,84)
        self.fc3 = nn.Linear(84,10)
        self.softmax = nn.Softmax()
        
    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        x = x.view(-1,400)   # Fix bug here x.view(1,-1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.softmax(self.fc3(x))
        return x

In [None]:
Net = LeNet()
tot_params = 0
for k in Net.parameters():
    print(k.size())

torch.Size([6, 1, 5, 5])
torch.Size([6])
torch.Size([16, 6, 5, 5])
torch.Size([16])
torch.Size([120, 400])
torch.Size([120])
torch.Size([84, 120])
torch.Size([84])
torch.Size([10, 84])
torch.Size([10])


In [None]:
for child in Net.children():
    print(child)

Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
Linear(in_features=400, out_features=120, bias=True)
Linear(in_features=120, out_features=84, bias=True)
Linear(in_features=84, out_features=10, bias=True)
Softmax()


In [None]:
for child in Net.children():
    for param in child.parameters():
        print("This is what a parameter looks like - \n",param)
        break
    break

This is what a parameter looks like - 
 Parameter containing:
tensor([[[[ 0.0771, -0.0229, -0.0646, -0.1097,  0.0852],
          [-0.1471,  0.1741,  0.1820, -0.0065,  0.1308],
          [-0.1051,  0.0019,  0.1518,  0.0463, -0.1765],
          [-0.0229,  0.0545, -0.1920,  0.0671, -0.0256],
          [-0.1228,  0.0484, -0.0432,  0.0382, -0.0107]]],


        [[[ 0.1101, -0.0561,  0.0649, -0.0617, -0.0302],
          [ 0.1456, -0.1711,  0.0800,  0.0824,  0.1458],
          [-0.0453, -0.0483,  0.1292,  0.1148, -0.1253],
          [-0.0077, -0.0183,  0.1940,  0.0082, -0.1202],
          [ 0.1916,  0.0205,  0.0627,  0.0722, -0.0667]]],


        [[[-0.1457, -0.0188, -0.1652, -0.1701, -0.1931],
          [ 0.0139,  0.0080, -0.0236,  0.1443,  0.0006],
          [-0.1246, -0.1696, -0.1484, -0.1861,  0.1070],
          [ 0.0832,  0.1782, -0.1268,  0.0601, -0.1732],
          [ 0.0988, -0.1908, -0.1523, -0.0203, -0.0877]]],


        [[[ 0.0064,  0.0919, -0.1561, -0.0387, -0.1095],
          [ 0.

In [None]:
for name,parameters in Net.named_parameters():
    print(name,':',parameters.size())

conv1.weight : torch.Size([6, 1, 5, 5])
conv1.bias : torch.Size([6])
conv2.weight : torch.Size([16, 6, 5, 5])
conv2.bias : torch.Size([16])
fc1.weight : torch.Size([120, 400])
fc1.bias : torch.Size([120])
fc2.weight : torch.Size([84, 120])
fc2.bias : torch.Size([84])
fc3.weight : torch.Size([10, 84])
fc3.bias : torch.Size([10])


### Custom weight init

PyTorch layers are initialized by default in their respective reset_parameters() method. For example:

- nn.Linear
weight and bias: uniform distribution [-limit, +limit] where limit is 1. / sqrt(fan_in) and fan_in is the number of input units in the weight tensor.
- nn.Conv2D
weight and bias: uniform distribution [-limit, +limit] where limit is 1. / sqrt(fan_in) and fan_in is the number of input units in the weight tensor.
With this implementation, the variance of the layer outputs is equal to Var(W) = 1 / 3 * sqrt(fan_in) which isn't the best initialization strategy out there.

Note that PyTorch provides convenience functions for some of the initializations. The input and output shapes are computed using the method _calculate_fan_in_and_fan_out() and a gain() method scales the standard deviation to suit a particular activation.


In [None]:
import torch
from torch import nn
import torch.nn.init as init
import numpy as np

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(5, 10, (3, 3))
        init.xavier_uniform_(self.conv1.weight, gain=np.sqrt(2))
        init.constant_(self.conv1.bias, 0.1)

network = Net()

### Weight Regularization

#### L2 Regularization
Heavily penalizes peaky weight vectors and encourages diffuse weight vectors. Has the appealing property of encouraging the network to use all of its inputs a little rather that some of its inputs a lot.

In [None]:
reg = 1e-6
l2_loss = Variable(torch.FloatTensor(1), requires_grad=True)
for name, param in model.named_parameters():
    if 'bias' not in name:
        l2_loss = l2_loss + (0.5 * reg * torch.sum(torch.pow(W, 2)))

#### L1 Regularization
Encourages sparsity, meaning we encourage the network to select the most useful inputs/features rather than use all.

In [None]:
reg = 1e-6
l1_loss = Variable(torch.FloatTensor(1), requires_grad=True)
for name, param in model.named_parameters():
    if 'bias' not in name:
        l1_loss = l1_loss + (reg * torch.sum(torch.abs(W)))

NameError: name 'Variable' is not defined

## Convolution

In [None]:
x = torch.tensor([[[0., 0., 1., 0., 0., 0., 0.]]])
k = torch.tensor([[[1., 2., 3.]]])

In [None]:
nn.functional.conv1d(x, k)

tensor([[[3., 2., 1., 0., 0.]]])

In [None]:
nn.functional.conv_transpose1d(x, k)

tensor([[[0., 0., 1., 2., 3., 0., 0., 0., 0.]]])