In [1]:
#pytorch version 1.6.0
import torch
#device handle passed as device=... when tensors are created in later cells; set to the CPU here
device =  torch.device("cpu")

In [2]:
#random data and weights for a two-layer network
N = 64        #rows in x and y (samples)
D_in = 1000   #columns of x (input features)
H = 100       #columns of w1 (hidden units)
D_out = 10    #columns of y (outputs)

x = torch.randn((N, D_in), device=device)
y = torch.randn((N, D_out), device=device)
w1 = torch.randn((D_in, H), device=device)
w2 = torch.randn((H, D_out), device=device)

In [3]:
learning_rate = 1e-6
for t in range(500):
    #forward pass: linear -> clamp at zero (ReLU) -> linear, then summed squared error
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)
    residual = y_pred - y
    loss = residual.pow(2).sum()

    #backward pass: chain rule written out by hand, from the loss back to the weights
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    #units that the clamp zeroed (h < 0) pass no gradient
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    #gradient descent step, updating the weights in place
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

In [4]:
#creating tensors with requires_grad=True enables autograd for them
N = 64
D_in = 1000
H = 100
D_out = 10

#data: no gradient of the loss with respect to these is wanted
x = torch.randn((N, D_in), device=device)
y = torch.randn((N, D_out), device=device)

#weights: gradients with respect to these are wanted
w1 = torch.randn((D_in, H), device=device, requires_grad=True)
w2 = torch.randn((H, D_out), device=device, requires_grad=True)

In [5]:
learning_rate = 1e-6
for t in range(500):
    #forward pass: same math as the manual version, but the intermediate values do not
    #have to be kept by hand because pytorch records them in the computation graph
    hidden = torch.mm(x, w1).clamp(min=0)
    y_pred = torch.mm(hidden, w2)
    loss = (y_pred - y).pow(2).sum()

    #every operation on a tensor with requires_grad=True is added to the graph and its
    #result also requires grad; backward() backpropagates to all such inputs, accumulates
    #the results into w1.grad and w2.grad, and the graph is then destroyed
    loss.backward()

    #no_grad tells pytorch not to build a graph for the update itself
    with torch.no_grad():
        for weight in (w1, w2):
            #gradient descent step on this weight
            weight -= learning_rate * weight.grad
            #backward() adds to any gradient already stored instead of overwriting it,
            #so reset it explicitly to get freshly computed gradients next iteration
            weight.grad.zero_()

In [6]:
#new operations can be defined as ordinary python functions; fresh data and weights for that demo
N = 64
D_in = 1000
H = 100
D_out = 10

x = torch.randn((N, D_in), device=device)
y = torch.randn((N, D_out), device=device)
w1 = torch.randn((D_in, H), device=device, requires_grad=True)
w2 = torch.randn((H, D_out), device=device, requires_grad=True)


In [7]:
def sigmoid(x):
    """Elementwise logistic function 1 / (1 + exp(-x)) built from primitive tensor ops.

    Takes a pytorch tensor and returns a pytorch tensor. Because it is composed of
    primitives (negate, exp, add, divide), autograd differentiates through each of them.
    """
    denominator = 1.0 + x.neg().exp()
    return 1.0 / denominator

In [8]:
learning_rate = 1e-6
for t in range(500):
    #the graph knows nothing about the python function: each primitive pytorch operation
    #executed inside sigmoid() is added to the overall graph, so the result is one large,
    #flattened graph of every operation performed across all the functions called
    hidden = sigmoid(torch.mm(x, w1))
    y_pred = torch.mm(hidden, w2)
    loss = (y_pred - y).pow(2).sum()

    loss.backward()
    if t % 50 == 0:
        #backpropagating through the primitive ops of sigmoid (divide, add, exp, negate)
        #very frequently produces NaNs, as the printed losses show
        print(t, loss.item())

    with torch.no_grad():
        for weight in (w1, w2):
            weight -= learning_rate * weight.grad
            weight.grad.zero_()

0 31829.169921875
50 nan
100 nan
150 nan
200 nan
250 nan
300 nan
350 nan
400 nan
450 nan


In [9]:
#define new autograd operators by subclassing Function and defining forward and backward
class Sigmoid(torch.autograd.Function):
    """Sigmoid as a single autograd operation with a hand-written gradient.

    Use it through ``Sigmoid.apply(x)``. The gradient is computed from the saved output
    as ``y * (1 - y)``, so it does not backpropagate through the exp/divide primitives.
    """

    @staticmethod
    def forward(ctx, x):
        """Compute y = 1 / (1 + exp(-x)) and save y for the backward pass."""
        y = 1.0/(1.0 + (-x).exp())
        ctx.save_for_backward(y)
        return y

    #the method must be named exactly `backward`: autograd looks it up by that name
    @staticmethod
    def backward(ctx, grad_y):
        """Receive the upstream gradient and return the downstream gradient.

        grad_y is the gradient of the loss with respect to the output y; the local
        gradient of the sigmoid is y * (1 - y).
        """
        y,= ctx.saved_tensors
        grad_x = grad_y * y *(1.0-y)
        return grad_x
    
#calling this function adds a single Sigmoid node to the graph; backpropagation through
#that node uses the gradient rule defined on the Sigmoid class instead of its primitive ops
def sigmoid(x):
    """Apply the custom Sigmoid autograd Function to tensor x and return the result."""
    apply_sigmoid = Sigmoid.apply
    return apply_sigmoid(x)
#in practice it is rare to need a hand-written autograd Function like this

In [10]:
#pytorch nn modules
#higher level wrapper for working with neural nets
N, D_in, H, D_out = 64,1000, 100, 10
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

#the model is a sequence of layer objects, each of which holds its own weight tensors;
#move it to the same device as x and y so the cell keeps working if `device` is changed
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
).to(device)
learning_rate = 1e-2
for t in range(500):
    #feed the data to the model and compute the loss;
    #torch.nn.functional has useful helpers such as loss functions
    y_pred = model(x)
    loss = torch.nn.functional.mse_loss(y_pred, y)

    #compute gradients with respect to all model weights (they have requires_grad=True)
    loss.backward()

    with torch.no_grad():
        #gradient descent step on each model parameter
        for param in model.parameters():
            param -= learning_rate * param.grad
    #clear the gradients so the next backward() starts fresh
    model.zero_grad()

In [11]:
#pytorch nn modules with optimizer
N, D_in, H, D_out = 64,1000, 100, 10
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

#move the model to the same device as x and y before handing its parameters to the
#optimizer, so the cell keeps working if `device` is changed
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
).to(device)
learning_rate = 1e-4

#an optimizer object implements the update rule (Adam here)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    y_pred = model(x)
    loss = torch.nn.functional.mse_loss(y_pred, y)

    loss.backward()

    #after computing gradients, use the optimizer to update the weights and zero the gradients
    optimizer.step()
    optimizer.zero_grad()

In [12]:
#defining custom nn modules in pytorch
class TwoLayerNet(torch.nn.Module):
    """Two linear layers with a clamp-at-zero (ReLU) in between, packaged as one module."""

    def __init__(self, D_in, H,D_out):
        """Create the child modules: linear1 maps D_in -> H, linear2 maps H -> D_out."""
        super().__init__()
        #a module can contain other modules; these two are registered as children
        self.linear1 = torch.nn.Linear(in_features=D_in, out_features=H)
        self.linear2 = torch.nn.Linear(in_features=H, out_features=D_out)

    def forward(self,x):
        """Forward pass built from the child modules and tensor ops.

        No backward is defined here; autograd handles it.
        """
        hidden = self.linear1(x)
        activated = torch.clamp(hidden, min=0)
        return self.linear2(activated)

N = 64
D_in = 1000
H = 100
D_out = 10
x = torch.randn((N, D_in))
y = torch.randn((N, D_out))

learning_rate = 1e-4

#train the custom module with plain SGD
model = TwoLayerNet(D_in, H, D_out)
optimizer = torch.optim.SGD(params=model.parameters(), lr=learning_rate)
for t in range(500):
    y_pred = model(x)
    loss = torch.nn.functional.mse_loss(input=y_pred, target=y)

    #gradients -> parameter update -> clear gradients for the next iteration
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

In [13]:
#it is very common to mix and match custom module subclasses and sequential containers
class ParallelBlock(torch.nn.Module):
    """Two linear layers applied to the same input, multiplied elementwise, then clamped at zero."""

    def __init__(self, D_in,D_out):
        """Create the two parallel D_in -> D_out linear layers."""
        super().__init__()
        self.linear1 = torch.nn.Linear(in_features=D_in, out_features=D_out)
        self.linear2 = torch.nn.Linear(in_features=D_in, out_features=D_out)

    def forward(self,x):
        """Return clamp(linear1(x) * linear2(x), min=0)."""
        branch_a = self.linear1(x)
        branch_b = self.linear2(x)
        #elementwise product of the two parallel linear outputs, followed by the ReLU-style clamp
        product = torch.mul(branch_a, branch_b)
        return torch.clamp(product, min=0)

N = 64
D_in = 1000
H = 100
D_out = 10
x = torch.randn((N, D_in))
y = torch.randn((N, D_out))

learning_rate = 1e-4

#two custom blocks followed by a plain linear layer, chained in a Sequential container
layers = [
    ParallelBlock(D_in, H),
    ParallelBlock(H, H),
    torch.nn.Linear(H, D_out),
]
model = torch.nn.Sequential(*layers)
optimizer = torch.optim.SGD(params=model.parameters(), lr=learning_rate)
for t in range(500):
    y_pred = model(x)
    loss = torch.nn.functional.mse_loss(input=y_pred, target=y)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

In [14]:
#pytorch dataloaders
from torch.utils.data import TensorDataset, DataLoader

N = 64
D_in = 1000
H = 100
D_out = 10
x = torch.randn((N, D_in))
y = torch.randn((N, D_out))

#a DataLoader wraps a dataset and can provide minibatching, shuffling and parallel loading;
#only the batch size (8) is set here
dataset = TensorDataset(x, y)
loader = DataLoader(dataset, batch_size=8)
model = TwoLayerNet(D_in, H, D_out)

learning_rate = 1e-2

optimizer = torch.optim.SGD(params=model.parameters(), lr=learning_rate)
for epoch in range(20):
    #each pass over the loader yields the data as (x, y) minibatches
    for x_batch, y_batch in loader:
        y_pred = model(x_batch)
        loss = torch.nn.functional.mse_loss(input=y_pred, target=y_batch)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

In [15]:
#pytorch dynamic computation graphs
#dynamic graphs let you use regular python control flow during the forward pass
N, D_in, H, D_out = 64,1000, 100, 10
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2a = torch.randn(H, D_out, device=device, requires_grad=True)
w2b = torch.randn(H, D_out, device=device, requires_grad=True)

learning_rate=1e-6
prev_loss=5.0
for t in range(500):
    #choose the second-layer weights based on the loss of the previous iteration;
    #the rule itself is arbitrary, it is just a simple example of dynamic control flow
    w2=w2a if prev_loss < 5.0 else w2b
    y_pred=x.mm(w1).clamp(min=0).mm(w2)
    loss=(y_pred-y).pow(2).sum()

    loss.backward()
    prev_loss = loss.item()

    #apply and clear the gradients; without this step the weights never change, the loss
    #stays constant and .grad keeps accumulating across iterations
    with torch.no_grad():
        for weight in (w1, w2a, w2b):
            #a weight whose branch has never been taken has no gradient yet
            if weight.grad is None:
                continue
            weight -= learning_rate * weight.grad
            weight.grad.zero_()


In [16]:
#pytorch static computation graphs
#1. build a graph describing our computation
#2. reuse the same graph on every iteration

#the model is written as a plain python function
def model(x,y,w1,w2a,w2b,prev_loss:float):
    """Return the summed squared error of a two-layer net whose second layer depends on prev_loss.

    w2a is used when prev_loss < 5.0, otherwise w2b.
    """
    if prev_loss < 5.0:
        w2 = w2a
    else:
        w2 = w2b
    hidden = torch.mm(x, w1).clamp(min=0)
    residual = torch.mm(hidden, w2) - y
    return residual.pow(2).sum()

N, D_in, H, D_out = 64,1000, 100, 10
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2a = torch.randn(H, D_out, device=device, requires_grad=True)
w2b = torch.randn(H, D_out, device=device, requires_grad=True)

#just-in-time compilation introspects the source code of the function and compiles it into
#a graph object; that graph includes a conditional node covering both choices of w2
graph=torch.jit.script(model)

prev_loss=5.0
learning_rate=1e-6
for t in range(500):
    #use the compiled graph object for each forward pass
    loss=graph(x,y,w1,w2a,w2b,prev_loss)

    loss.backward()
    prev_loss=loss.item()

    #apply and clear the gradients; without this step the weights never change, the loss
    #stays constant and .grad keeps accumulating across iterations
    with torch.no_grad():
        for weight in (w1, w2a, w2b):
            #a weight whose branch has never been taken has no gradient yet
            if weight.grad is None:
                continue
            weight -= learning_rate * weight.grad
            weight.grad.zero_()

In [17]:
#pytorch static computation graphs
#1. build a graph describing our computation
#2. reuse the same graph on every iteration

#even easier: decorate the python function so it is compiled to a graph when it is defined
@torch.jit.script
def model(x,y,w1,w2a,w2b,prev_loss:float):
    """Return the summed squared error of a two-layer net whose second layer depends on prev_loss.

    w2a is used when prev_loss < 5.0, otherwise w2b.
    """
    if prev_loss < 5.0:
        w2 = w2a
    else:
        w2 = w2b
    hidden = torch.mm(x, w1).clamp(min=0)
    residual = torch.mm(hidden, w2) - y
    return residual.pow(2).sum()

N, D_in, H, D_out = 64,1000, 100, 10
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2a = torch.randn(H, D_out, device=device, requires_grad=True)
w2b = torch.randn(H, D_out, device=device, requires_grad=True)

prev_loss=5.0
learning_rate=1e-6
for t in range(500):
    #calling the decorated function runs the compiled graph
    loss=model(x,y,w1,w2a,w2b,prev_loss)

    loss.backward()
    prev_loss=loss.item()

    #apply and clear the gradients; without this step the weights never change, the loss
    #stays constant and .grad keeps accumulating across iterations
    with torch.no_grad():
        for weight in (w1, w2a, w2b):
            #a weight whose branch has never been taken has no gradient yet
            if weight.grad is None:
                continue
            weight -= learning_rate * weight.grad
            weight.grad.zero_()