In [75]:
import torch
from torch.autograd import Variable
from torch.nn.modules import Module
from torch import nn

# EX

In [102]:
class Exp(torch.autograd.Function):
    @staticmethod
    def forward(ctx, z):
        y = torch.exp(z)
        ctx.save_for_backward(y)    # let's save y as it's useful in backward()
        return y
    @staticmethod
    def backward(ctx, grad_output):
        # automatically, z' (derv of z) is computed and given in 'grad_output'
        y, = ctx.saved_tensors
        # so y' = z'*exp(z), where y=exp(z)--> grad_output * y 
        return grad_output * y 

In [104]:
a = torch.rand(4)
output = Exp.apply(a)

In [105]:
output

tensor([2.4388, 1.7422, 1.1160, 2.6243])

In [None]:
# 'save_for_backward' for saving 
# 'ctx.saved_tensors' for retrieving the save value

# EX

In [106]:
class Square(torch.autograd.Function):
    @staticmethod
    def forward(ctx, z):
        # Because we are saving one of the inputs use `save_for_backward`
        # Save non-tensors and non-inputs/non-outputs directly on ctx Example: ctx.weight= inp_weight (not in 
        # this class example but at other places you can)
        ctx.save_for_backward(z) # because we need z in df/dz = z'*2*z, otherwise, we can save y or anyting else
        return z**2              # y = z**2
 
    @staticmethod
    def backward(ctx, grad_out):
        # A function support double backward automatically if autograd
        # is able to record the computations performed in backward
        z, = ctx.saved_tensors
        return grad_out * 2 * z    # df/dz = z'*2*z


In [107]:
# Use double precision because finite differencing method magnifies errors
x = torch.rand(3, 3, requires_grad=True, dtype=torch.double)
torch.autograd.gradcheck(Square.apply, x)
# Use gradcheck to verify second-order derivatives
torch.autograd.gradgradcheck(Square.apply, x)

True

In [110]:
# Look at the two examples above. Once we needed to save z and once we needed to save y
# to be used in backward() function.

In [111]:
# Some other times you need to save more that 1 value:
# if y=f(z)=sinh(z) then it might be useful to reuse exp(x) and exp(-x). In these cases you 
# do as followings:

# EX

In [112]:
class Sinh(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        expx = torch.exp(x)
        expnegx = torch.exp(-x)
        ctx.save_for_backward(expx, expnegx)
        # In order to be able to save the intermediate results, a trick is to
        # include them as our outputs, so that the backward graph is constructed
        return (expx - expnegx) / 2, expx, expnegx

    @staticmethod
    def backward(ctx, grad_out, _grad_out_exp, _grad_out_negexp):
        expx, expnegx = ctx.saved_tensors
        grad_input = grad_out * (expx + expnegx) / 2
        # We cannot skip accumulating these even though we won't use the outputs
        # directly. They will be used later in the second backward.
        grad_input += _grad_out_exp * expx
        grad_input -= _grad_out_negexp * expnegx
        return grad_input



In [121]:

def sinh(x):
    # Create a wrapper that only returns the first output
    return Sinh.apply(x)[0]

x = torch.rand(3, 3, requires_grad=True, dtype=torch.double)
torch.autograd.gradcheck(sinh, x)
torch.autograd.gradgradcheck(sinh, x)

True

In [66]:
# is saves two values in ctx as showed above.
# In addition to this, look at the header of function:
# def backward(ctx, grad_out, _grad_out_exp, _grad_out_negexp):
# it simply says that there are 3 outputs given by forward() function
# and their derivatives are these 3 argumetns.

In [67]:
# now, you can all you want with these

# EX

In [115]:
# implementing equation 2 in https://ieeexplore.ieee.org/document/8403889 
# : y = Sign(z) but because its derivative at 0 is not defined we definetly need
# to define customized backward(), otherwise using such activation function in neural
# network is not applicable. We manually modify/define the derivative as: 
# dy/dz = {1,  -1<=z<=1
#          0,  o.w}
# This is implemented as follows:

In [117]:
class BinaryActication(torch.autograd.Function):
    @staticmethod
    def forward(ctx, z):
        
        ctx.save_for_backward(z) # saves z in y=f(z). z is an array for all output neurons.
        y = torch.sign(z)        # y shows y in y=f(z). Here just I know the size is like z'size.              
        return y
    
    @staticmethod
    def backward(ctx, grad_output):
        z, = ctx.saved_tensors
        dy_dz = z.clone()
        dy_dz[(dy_dz>=-1) * (dy_dz<=1)] = 1
        dy_dz[(dy_dz!=1)] = 0
        # print('dy/dz=', dy_dz)
        return  dy_dz

In [123]:
a = torch.tensor([1,0.2,0.1,-0.2,-0.5])
a.requires_grad=True
output = BinaryActication.apply(a)
output
# it's impossible to check backward() from this simple test. It needs to be checked in a nn or with 
# checking mentioned above in previous class example.

tensor([ 1.,  1.,  1., -1., -1.], grad_fn=<BinaryActicationBackward>)

# EX

In [124]:
class L1Penalty(torch.autograd.Function):
    @staticmethod
    
    # defines activation function
    def forward(ctx, input, l1weight = 0.1):
        ctx.save_for_backward(input)
        ctx.l1weight = l1weight
        return input
    
    #  defines the gradient formula of the activation funciton.
    # It should return as many tensors as there were inputs
    @staticmethod
    def backward(ctx, grad_output):
        input, = ctx.saved_variables
        grad_input = input.clone().sign().mul(ctx.l1weight)
        grad_input+=grad_output
        return grad_input

In [125]:

a = torch.tensor([1,0.2,0.1,-0.2,-0.5])
a.requires_grad=True
L1Penalty().apply(a)

tensor([ 1.0000,  0.2000,  0.1000, -0.2000, -0.5000],
       grad_fn=<L1PenaltyBackward>)

In [126]:
# in some examples above, and also in the  following class backward is unneccessay 
# as it's differentiable and autograd engine
# takes care of it (it already knows what to do with that). But just for showing how it works
# look at that...

In [127]:
class SinActivation(torch.autograd.Function):
    @staticmethod
    def forward(ctx, z):
        # y = sin(z)
        y = torch.sin(z)
        ctx.save_for_backward(y)
        return y
    @staticmethod
    def backward(ctx, grad_output):
        # automatically, z' (derv of z) is stored in grad_output
        y, = ctx.saved_tensors
        # so y' = z'*cos(z), where y=sin(z)
        return grad_output * torch.cos(y) 

In [56]:
# implementing equation 2 in https://ieeexplore.ieee.org/document/8403889 
# :

In [57]:
# class BinaryActication(torch.autograd.Function):
#     @staticmethod
#     def forward(ctx, z):
        
#         ctx.save_for_backward(z) # saves z in y=f(z). z is an array for all output neurons.
#         y = z.clone()            # y shows y in y=f(z). Here just I know the size is like z'size.
#         print('z=', z)
#         for i, element in enumerate(z): 
#             if (element>=0):
#                 y[i] = 1
#             else:
#                 y[i] = -1
#         print('y=sign(z)', y)        
#         return y
    
#     @staticmethod
#     def backward(ctx, grad_output):
#         z, = ctx.saved_tensors
#         dy_div_dz = z.clone()   # showing gradient by z
#         for i, element in enumerate(z): 
#             if (element>=-1 and element<=1):
#                 dy_div_dz[i] = 1
#             else:
#                 dy_div_dz[i] = 0
#         print('dy/dz=', dy_div_dz)
#         return  dy_div_dz

In [128]:
# a = torch.tensor([1,0.2,0.1,-0.2,-0.5])
# a.requires_grad=True
# output = BinaryActication.apply(a)
# output
# # output.retain_grad()
# # output.backward()

In [129]:
# a.mul()

# Junks

In [101]:
a = torch.tensor([1.5,3.2,-50.1,-0.2,-0.5])
a[(a>=-1) * (a<=1)] = 1
a[(a!=1)] = 0
a

tensor([0., 0., 0., 1., 1.])

In [143]:
#https://towardsdatascience.com/extending-pytorch-with-custom-activation-functions-2d8b065ef2fa
# https://pytorch.org/tutorials/intermediate/custom_function_double_backward_tutorial.html

# EX1

In [26]:
class SinActivation(torch.nn.Module):
    def __init__(self):
        super(SinActivation, self).__init__()
        return
    def forward(self, x):
        return torch.sin(x)


In [29]:
model = nn.Sequential(
        nn.Linear(4, 10, bias=False),
        SinActivation(),
        nn.Linear(10, 10, bias=True),
        SinActivation(),
        nn.Linear(10, 3, bias=False)        
)

In [30]:
inp = torch.tensor([-0.0617, -0.0059,  0.0449, 0.104])
model(inp)

tensor([-0.2053, -0.1000,  0.2663], grad_fn=<SqueezeBackward3>)

# EX2

In [131]:
class LearnedSiLU(nn.Module):
    def __init__(self, slope = 1):
        super().__init__()
        self.slope = slope * torch.nn.Parameter(torch.ones(1))

    def forward(self, x):
        return self.slope * x * torch.sigmoid(x)

In [132]:
import torch
import torch.nn as nn
from torch.nn.modules import Module

## Other examples

In [136]:
# In the following case the function is treated as if it's differntible, otherwise I
# should implement the backward() function as well.

In [137]:
# custom activation 
class Act(Module):
    def forward(self, z):
        temp = torch.zeros(z.shape)
#         print(z)
        for i, elm in enumerate(z):
            if(elm > 0.0):
                temp[i] = torch.tanh(elm)
            else:
                temp[i] = torch.tanh(elm)+1
        
        return temp

In [138]:
model = nn.Sequential(
        nn.Linear(10, 10, bias=False),
        Act(),
        nn.Linear(10, 10, bias=True),
        Act(),
        nn.Linear(10, 10, bias=False)        
)

In [139]:
inp = torch.rand((10))
model(inp)

tensor([-0.3563, -0.0428,  0.3658,  0.1742,  0.8193,  0.2885, -0.5353,  0.2849,
         0.1380, -0.5452], grad_fn=<SqueezeBackward3>)

In [140]:
# beware that I'm using Module class ( not Function as at the eariler section 
# in this notebook) in here that means I can use if in nn.Sequential like regualr 
# activation functions. But, when I implement an activation function and have a
# subclass of torch.autograd.Function, I need to use '<class-name>.apply()' to use
# it in nerual network layers. See bellow:

In [42]:
# now let's imagine that the custom function is not differentible: 
# if you worry about backpropagation you need to learn/use the following: 

In [43]:
class L1Penalty(torch.autograd.Function):
    @staticmethod
    
    # defines activation function
    def forward(ctx, input, l1weight = 0.1):
        ctx.save_for_backward(input)
        ctx.l1weight = l1weight
        return input
    
    #  defines the gradient formula of the activation funciton.
#     It should return as many tensors as there were inputs
    # It should return as many tensors as there were inputs
    @staticmethod
    def backward(ctx, grad_output):
        input, = ctx.saved_variables
        grad_input = input.clone().sign().mul(ctx.l1weight)
        grad_input+=grad_output
        return grad_input

In [44]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10,10)
        self.fc2 = nn.Linear(10,6)
        self.fc3 = nn.Linear(6,10)
        self.fc4 = nn.Linear(10,10)
        self.relu = nn.ReLU(inplace=True)
        self.penalty = L1Penalty()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.penalty.apply(x)
        x = self.fc3(x)
        x = self.relu(x)
        x = self.fc4(x)
        x = self.relu(x)
        return x


In [45]:


model = Model()
a = torch.rand(50,10)
b = model(a)
print(b.shape)

torch.Size([50, 10])


In [46]:
# Inherit from Function
class LinearFunction(torch.autograd.Function):

    # Note that forward, setup_context, and backward are @staticmethods
    @staticmethod
    def forward(input, weight, bias):
        output = input.mm(weight.t())
        if bias is not None:
            output += bias.unsqueeze(0).expand_as(output)
        return output

    @staticmethod
    # inputs is a Tuple of all of the inputs passed to forward.
    # output is the output of the forward().
    def setup_context(ctx, inputs, output):
        input, weight, bias = inputs
        ctx.save_for_backward(input, weight, bias)

    # This function has only a single output, so it gets only one gradient
    @staticmethod
    def backward(ctx, grad_output):
        # This is a pattern that is very convenient - at the top of backward
        # unpack saved_tensors and initialize all gradients w.r.t. inputs to
        # None. Thanks to the fact that additional trailing Nones are
        # ignored, the return statement is simple even when the function has
        # optional inputs.
        input, weight, bias = ctx.saved_tensors
        grad_input = grad_weight = grad_bias = None

        # These needs_input_grad checks are optional and there only to
        # improve efficiency. If you want to make your code simpler, you can
        # skip them. Returning gradients for inputs that don't require it is
        # not an error.
        if ctx.needs_input_grad[0]:
            grad_input = grad_output.mm(weight)
        if ctx.needs_input_grad[1]:
            grad_weight = grad_output.t().mm(input)
        if bias is not None and ctx.needs_input_grad[2]:
            grad_bias = grad_output.sum(0)

        return grad_input, grad_weight, grad_bias


In [47]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10,10)
        self.fc2 = nn.Linear(10,6)
        self.fc3 = nn.Linear(6,10)
        self.fc4 = nn.Linear(10,10)
        self.relu = nn.ReLU(inplace=True)
        self.custom_func = LinearFunction()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.custom_func.apply(x, 1)
        x = self.fc3(x)
        x = self.relu(x)
        x = self.fc4(x)
        x = self.relu(x)
        return x

In [48]:

model = Model()
a = torch.rand(50,10)
b = model(a)
print(b.shape)

AttributeError: 'LinearFunctionBackward' object has no attribute 'mm'

# Resources