# Pytorch Basics

In [1]:
import torch
import torch.nn
import numpy as np

In [2]:
x = np.array([1, 2, 3], dtype=float)
y = np.array([[5, 2, 3], [1, 3, 1]], dtype=float)

In [3]:
# broadcasts x from (1x3) -> (2x3) to add elementwise
y + x

array([[6., 4., 6.],
       [2., 5., 4.]])

In [4]:
# broadcasts x from (1x3) -> (2x3) to multiply elementwise
y * x

array([[5., 4., 9.],
       [1., 6., 3.]])

In [5]:
# calculate the dot product
y @ x

array([18., 10.])

In [6]:
# change to tensors
x = torch.tensor(x)
y = torch.tensor(y)
# change to numpy
x = x.numpy()
y = y.numpy()
# change to tensors
x = torch.tensor(x)
y = torch.tensor(y)
print(f'{x = }')
print(f'{y = }')

x = tensor([1., 2., 3.], dtype=torch.float64)
y = tensor([[5., 2., 3.],
        [1., 3., 1.]], dtype=torch.float64)


## Network from Scratch with Optimizer

In [7]:
# define functions for layer 1 and 2
#  hidden layer = f
#  output layer = g
def f(x):
    return x[0]**2 + 2 * (x[1]**2)
def g(x):
    return (3*(x**2) - 2*x + 10).mean()

In [8]:
# the flow of input to output is: 
# x = input, v = output
#  x -> f -> u
#  u -> g -> v
x = torch.tensor([[1, 2, 3, 4], [-1, -2, -3, -4]], 
                 dtype=float, 
                 requires_grad=True)
u = f(x)
v = g(u)
v.backward() # backpropagate: update gradients
v

/opt/amdgpu/share/libdrm/amdgpu.ids: No such file or directory


tensor(2354.5000, dtype=torch.float64, grad_fn=<MeanBackward0>)

In [9]:
# use this to perform operation without affecting stored gradients
with torch.no_grad():
    pass

In [10]:
# use .detach() to:
#   remove the gradient (grad_fn=.. not shown)
#   remove the node from the network
v.detach()

tensor(2354.5000, dtype=torch.float64)

In [11]:
# reset tensor x
x = torch.tensor([[1, 2, 3, 4], [-1, -2, -3, -4]], 
                 dtype=float, 
                 requires_grad=True)

In [12]:
# stochastic gradient descent
optimizer = torch.optim.SGD([x], lr=0.01)

In [13]:
# try to train for 10 epochs
# the loss/gradient will explode before 10 epochs
for i in range(10):
    optimizer.zero_grad() # clears previous gradients
    loss = g(f(x)) # computes the loss
    loss.backward() # computes the gradients
    optimizer.step() # steps to greatest descent
    print(f'epoch{i}_loss: {loss.detach()}') # detaches to save memory

epoch0_loss: 2354.5
epoch1_loss: 9680.58271084
epoch2_loss: 10495288.069958264
epoch3_loss: 2.598978266316877e+16
epoch4_loss: 4.03786934268059e+44
epoch5_loss: 1.51619361030149e+129
epoch6_loss: inf
epoch7_loss: nan
epoch8_loss: nan
epoch9_loss: nan


## Gradient Descent with Optimizer

In [14]:
# define new a new function that wont explode f(x)
def f(x):
    return x[0]**2 + 2 * x[1]**2

In [15]:
# use 1 equation and x = [-5, -2] for better convergence example
x = torch.tensor([-5, -2], dtype=float, requires_grad=True)

In [16]:
optimizer = torch.optim.SGD([x], lr=0.01)

In [17]:
for i in range(20):
    optimizer.zero_grad() 
    loss = f(x)
    loss.backward()
    optimizer.step()
    print(f'epoch{i}_loss: {loss.detach()}')

epoch0_loss: 33.0
epoch1_loss: 31.382800000000003
epoch2_loss: 29.853976480000004
epoch3_loss: 28.408121839168
epoch4_loss: 27.040192196415312
epoch5_loss: 25.745481260120684
epoch6_loss: 24.51959615200815
epoch7_loss: 23.358435035713462
epoch8_loss: 22.258166412144174
epoch9_loss: 21.215209954406795
epoch10_loss: 20.226218764912623
epoch11_loss: 19.288062946345917
epoch12_loss: 18.397814386543782
epoch13_loss: 17.552732665052556
epoch14_loss: 16.750251996240255
epoch15_loss: 15.987969130406578
epoch16_loss: 15.263632140383667
epoch17_loss: 14.575130026702434
epoch18_loss: 13.920483079547264
epoch19_loss: 13.297833940470303


## Gradient Descent without Optimizer

In [18]:
x = torch.tensor([-5, -2], dtype=float, requires_grad=True)

In [19]:
lr = 0.01
for i in range(20):
    loss = f(x)
    loss.backward()
    with torch.no_grad():
        x -= lr * x.grad # update parameters
        x.grad.zero_() # manually zero gradients after
    print(f'epoch{i}_loss: {loss.detach()}')

epoch0_loss: 33.0
epoch1_loss: 31.382800000000003
epoch2_loss: 29.853976480000004
epoch3_loss: 28.408121839168
epoch4_loss: 27.040192196415312
epoch5_loss: 25.745481260120684
epoch6_loss: 24.51959615200815
epoch7_loss: 23.358435035713462
epoch8_loss: 22.258166412144174
epoch9_loss: 21.215209954406795
epoch10_loss: 20.226218764912623
epoch11_loss: 19.288062946345917
epoch12_loss: 18.397814386543782
epoch13_loss: 17.552732665052556
epoch14_loss: 16.750251996240255
epoch15_loss: 15.987969130406578
epoch16_loss: 15.263632140383667
epoch17_loss: 14.575130026702434
epoch18_loss: 13.920483079547264
epoch19_loss: 13.297833940470303


## Gradient Descent from Scratch

In [20]:
# g(x) is the gradient of f(x)
def f(x):
    return x[0]**2 + 2 * x[1]**2
def g(x):
    return np.array([2*x[0], 4*x[1]])

In [21]:
# implement gradient descent from scratch without the optimizer
def gradient_descent(x0, step, num_steps):
    x = np.array(x0)
    history = [[x[0], x[1], f(x)]]
    for i in range(num_steps):
        gradient = g(x)
        dx = step * gradient
        x = x - dx
        history.append([x[0], x[1], f(x)])
    return np.array(history)

In [22]:
history = gradient_descent((-5, -2), step=0.01, num_steps=19)

In [23]:
epoch = 0
for x0, x1, y in history:
    print(f'epoch{epoch}_loss: {y}')
    epoch += 1

epoch0_loss: 33.0
epoch1_loss: 31.382800000000003
epoch2_loss: 29.853976480000004
epoch3_loss: 28.408121839168
epoch4_loss: 27.040192196415312
epoch5_loss: 25.745481260120684
epoch6_loss: 24.51959615200815
epoch7_loss: 23.358435035713462
epoch8_loss: 22.258166412144174
epoch9_loss: 21.215209954406795
epoch10_loss: 20.226218764912623
epoch11_loss: 19.288062946345917
epoch12_loss: 18.397814386543782
epoch13_loss: 17.552732665052556
epoch14_loss: 16.750251996240255
epoch15_loss: 15.987969130406578
epoch16_loss: 15.263632140383667
epoch17_loss: 14.575130026702434
epoch18_loss: 13.920483079547264
epoch19_loss: 13.297833940470303


## Stochastic Gradient Descent from Scratch

In [24]:
# implement stochastic gradient descent from scratch without the optimizer
# this just simulates how it works. actual SGD performs better
def stochastic_gradient_descent(x0, step, num_steps):
    x = np.array(x0)
    history = [[x[0], x[1], f(x)]]
    for i in range(num_steps):
        true_gradient = g(x)
        # simulates the natural noise of data 
        noise = np.random.normal(0, 1.5, size=x.shape) # mean=0, std=1.5
        stochastic_gradient = true_gradient + noise
        dx = step * stochastic_gradient
        x = x - dx
        history.append([x[0], x[1], f(x)])
    return np.array(history)

In [25]:
history = stochastic_gradient_descent((-5, -2), step=0.01, num_steps=19)

In [26]:
epoch = 0
for x0, x1, y in history:
    epoch += 1
    print(f'epoch{epoch}_loss: {y}')

epoch1_loss: 33.0
epoch2_loss: 31.3492616527654
epoch3_loss: 30.042182826756978
epoch4_loss: 28.568651324520665
epoch5_loss: 27.099011892757616
epoch6_loss: 25.84270129967055
epoch7_loss: 24.31611137409537
epoch8_loss: 23.198138186529228
epoch9_loss: 22.148425752499683
epoch10_loss: 21.323934753288103
epoch11_loss: 20.32407157563845
epoch12_loss: 19.730218268569427
epoch13_loss: 18.619518178393186
epoch14_loss: 17.510458908358988
epoch15_loss: 16.827931920949144
epoch16_loss: 16.20585525000218
epoch17_loss: 15.404857327690376
epoch18_loss: 14.709496834174645
epoch19_loss: 14.100490724399016
epoch20_loss: 13.510629715747092


## Momentum Gradient Descent from Scratch

In [27]:
# implement momentum gradient descent from scratch without the optimizer
# beta is the percentage of velocity to retain from prev step
# x_new = x_old - velocity
def momentum_gradient_descent(x0, step, num_steps, beta=0.8):
    x = np.array(x0)
    dx = np.zeros_like(x)
    history = [[x[0], x[1], f(x)]]
    for i in range(num_steps):
        gradient = g(x)
        dx = step * gradient + beta * dx
        x = x - dx
        history.append([x[0], x[1], f(x)])
    return np.array(history)

In [28]:
history = momentum_gradient_descent((-5, -2), step=0.01, num_steps=19)

In [29]:
epoch = 0
for x0, x1, y in history:
    print(f'epoch{epoch}_loss: {y}')
    epoch += 1

epoch0_loss: 33.0
epoch1_loss: 31.382800000000003
epoch2_loss: 28.628389280000004
epoch3_loss: 25.207211492928003
epoch4_loss: 21.524154336581454
epoch5_loss: 17.891827195148906
epoch6_loss: 14.525311361163372
epoch7_loss: 11.55092878825017
epoch8_loss: 9.022658706056212
epoch9_loss: 6.9412746602224225
epoch10_loss: 5.27278611021891
epoch11_loss: 3.9641439910793164
epoch12_loss: 2.9552893679009293
epoch13_loss: 2.1874417854391126
epoch14_loss: 1.6080465783485112
epoch15_loss: 1.173070351379103
epoch16_loss: 0.8474103338387285
epoch17_loss: 0.6041290709607978
epoch18_loss: 0.4230976757475776
epoch19_loss: 0.28947408430327937


## Adam Gradient Descent from Scrath

In [30]:
# implement adam gradient descent from scratch without the optimizer
# epsilon is a small values thats added to prevent /0 for friction
# x_new = x_old - velocity / friction
def adam_gradient_descent(x0, step, num_steps, beta1=0.8, beta2=0.999, epsilon=1e-8):
    x = np.array(x0)
    m = np.zeros_like(x) # momentum part
    v = np.zeros_like(x) # RMSProp
    history = [[x[0], x[1], f(x)]]
    for t in range(1, num_steps + 1):
        gradient = g(x)
        m = beta1 * m + (1 - beta1) * gradient # update momentum: mean of gradients
        v = beta2 * v + (1 - beta2) * (gradient**2) # update velocity: mean of gradients^2
        m_hat = m / (1 - beta1**t)# boost small values m and v, on startup
        v_hat = v / (1 - beta2**t)
        x = x - step * m_hat / (np.sqrt(v_hat) + epsilon) # update parameters
        history.append([x[0], x[1], f(x)])
    return np.array(history)

In [31]:
history = adam_gradient_descent((-5, -2), step=0.01, num_steps=19)

In [32]:
# notice it looks worse. Adam takes consistent small steps regardless of slope
# this is more stable for exploding gradients
# in actual training itll perform better
epoch = 0
for x0, x1, y in history:
    print(f'epoch{epoch}_loss: {y}')
    epoch += 1

epoch0_loss: 33.0
epoch1_loss: 32.8203000001993
epoch2_loss: 32.641233282723995
epoch3_loss: 32.46282135062469
epoch4_loss: 32.28508494115806
epoch5_loss: 32.10804383941745
epoch6_loss: 31.931716718604264
epoch7_loss: 31.756121010764296
epoch8_loss: 31.58127281008595
epoch9_loss: 31.40718680915527
epoch10_loss: 31.233876267018715
epoch11_loss: 31.061353006619456
epoch12_loss: 30.88962743820752
epoch13_loss: 30.7187086046996
epoch14_loss: 30.548604244666247
epoch15_loss: 30.37932086861347
epoch16_loss: 30.21086384444574
epoch17_loss: 30.04323748838567
epoch18_loss: 29.876445158120006
epoch19_loss: 29.71048934548749
