In [4]:
# move to project's root directory to make
# numpy_nn and pytorch_nn packages accessable 
# %cd ../..


# # Another possible solution is appending to the sys.path

import sys
import  os
project_root = os.path.dirname(os.path.dirname(sys.path[0]))
if project_root not in sys.path:
    sys.path.append(project_root)

In [5]:
from typing import List, Tuple, Callable

import torch
torch.use_deterministic_algorithms(True)

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from test_layer import test_module

from numpy_nn.modules.np_nn import (
    FullyConnectedLayer,
    ReLULayer,
    SigmoidLayer,
    ReLULayer,
    AdamOptimizer,
    CrossEntropyLoss,
    LinearActivation,
    Sequential,
    Optimizer,
    SoftMaxLayer,
    GradientDescentOptimizer,
    CrossEntropyLossWithSoftMax,
    Conv2d,
    Conv2dWithLoops,
    Flatten,
    MaxPool2d,
    AdamOptimizer,
    BatchNormalization2d,
    ActivationLayer,
    TrainableLayer,
)

from pytorch_nn.models.resnet import (
    Bottleneck as Bottleneck_torch,
    resnet101 as resnet101_torch
)

from pytorch_nn.models.resnet_without_batchnorm import (
    Bottleneck as Bottleneck_torch_without_batchnorm,
    resnet101 as resnet101_torch_without_batchnorm
)

from numpy_nn.models.resnet import Bottleneck, resnet101

from numpy_nn.models.resnet_without_batchnorm import (
    Bottleneck as Bottleneck_np_without_batchnorm,
    resnet101 as resnet101_np_without_batchnorm
)

IndentationError: expected an indented block after function definition on line 448 (np_nn.py, line 455)

In [3]:
import sys
import  os
project_root = os.path.dirname(os.path.dirname(sys.path[0]))
if project_root not in sys.path:
    sys.path.append(project_root)



from typing import Callable, Tuple, List

import numpy as np
import torch

from numpy_nn.modules.np_nn import (
    FullyConnectedLayer,
    BatchNormalization2d,
    TrainableLayer,
    Module,
)


def copy_trainable_parameters(my_module: Module, torch_module: torch.nn.Module) -> None:
    if isinstance(my_module, FullyConnectedLayer):
        my_module.weights = torch_module.weight.detach().numpy().T
        my_module.bias = torch_module.bias.detach().numpy().reshape(-1, 1).T
    elif isinstance(my_module, BatchNormalization2d):
        n_channels = my_module.n_channels
        my_module.gamma = torch_module.weight.detach().numpy().reshape(1, n_channels, 1, 1)
        my_module.beta = torch_module.bias.detach().numpy().reshape(1, n_channels, 1, 1)
        my_module.running_mean = torch_module.running_mean.detach().numpy().reshape(1, n_channels, 1, 1)
        my_module.running_var = torch_module.running_var.detach().numpy().reshape(1, n_channels, 1, 1)
    else:
        my_module.weights = torch_module.weight.detach().numpy()
        if my_module.bias is not None:
            my_module.bias = torch_module.bias.detach().numpy()


def test_module(my_module: Module,
                torch_module: torch.nn.Module,
                input_shape: Tuple[int, ...],
                output_shape: Tuple[int, ...],
                atol: float = 1e-5,
                random_sampler: Callable = np.random.rand,
                skip_parameter_copying: bool = False,
                print_tensors: bool = False,
                print_results: bool = False) -> None:
    """
    Compares the output and gradients of a numpy layer and a torch layer

    Args:
        my_module: neural network layer implemented in numpy.
        torch_module: neural network layer implemented in torch.
        input_shape: shape of the input tensor.
        output_shape: shape of the output tensor. It's used to generate
            a random tensor representing partial derivative of the loss
            function with respect to the output of the layer.
        atol: absolute tolerance for comparing
            numpy and torch tensors (used in np.allclose).
        random_sampler: function that generates random tensors of the given shape.
        skip_parameter_copying: if True, the weights and biases will be held intact.
            By default, weights and biases are copied from torch_module to my_module.
    """    
    # copy weights from torch_module to my_module
    # if the numpy layer is trainable
    if not skip_parameter_copying and isinstance(my_module, TrainableLayer):
        copy_trainable_parameters(my_module, torch_module)

    input_np = random_sampler(*input_shape).astype(np.float32)
    input_torch = torch.from_numpy(input_np)
    input_torch.requires_grad = True

    output_np = my_module.forward(input_np)
    output_torch = torch_module(input_torch)


    if print_tensors:
        print("my and torch outputs:")
        print(output_np.flatten(), output_torch.detach().numpy().flatten())
    assert np.allclose(output_np, output_torch.detach().numpy(), atol=atol), "Outputs are not equal"
    if print_results:
        print("Outputs are equal")
    
    # print(output_np.dtype)
    # print(output_torch.dtype)
    # print(output_torch.detach().numpy().dtype)

    output_grad_np = random_sampler(*output_shape)
    output_grad_torch = torch.from_numpy(output_grad_np)

    input_grad_np = my_module.backward(output_grad_np)
    output_torch.backward(output_grad_torch)
    input_grad_torch = input_torch.grad.detach().numpy()

    if print_tensors:
        print("my and torch input gradients:")
        print(input_grad_np.flatten(), input_grad_torch.flatten())
    assert np.allclose(input_grad_np, input_grad_torch, atol=atol), "Input gradients are not equal"
    if print_results:
        print("Input gradients are equal")


    if not isinstance(my_module, TrainableLayer):
        return
    

    # compare weight and bias gradients
    if isinstance(my_module, FullyConnectedLayer):
        weight_grad_np = my_module.weights_gradient
        weight_grad_torch = torch_module.weight.grad.detach().numpy().T
        bias_grad_np = my_module.bias_gradient
        bias_grad_torch = torch_module.bias.grad.detach().numpy().reshape(-1, 1).T
    elif isinstance(my_module, BatchNormalization2d):
        weight_grad_np = my_module.gamma_gradient.flatten()
        weight_grad_torch = torch_module.weight.grad.detach().numpy()
        bias_grad_np = my_module.beta_gradient.flatten()
        bias_grad_torch = torch_module.bias.grad.detach().numpy()

        if print_tensors:
            print("my and torch running means:")
            print(my_module.running_mean.flatten(), torch_module.running_mean.detach().numpy().flatten())
        running_mean_close = np.allclose(
            my_module.running_mean.flatten(), torch_module.running_mean.detach().numpy().flatten(), atol=atol)
        assert running_mean_close, "Running mean is not equal"
        if print_results:
            print("Running means are equal")

        if print_tensors:
            print("my and torch running vars:")
            print(my_module.running_var.flatten(), torch_module.running_var.detach().numpy().flatten())
        running_var_close = np.allclose(
            my_module.running_var.flatten(), torch_module.running_var.detach().numpy().flatten(), atol=atol)
        assert running_var_close, "Running var is not equal"
        if print_results:
            print("Running vars are equal")
            
    else:
        weight_grad_np = my_module.weights_gradient
        weight_grad_torch = torch_module.weight.grad.detach().numpy()
        if my_module.bias is not None:
            bias_grad_np = my_module.bias_gradient
            bias_grad_torch = torch_module.bias.grad.detach().numpy()
    
    weight_grads_close = np.allclose(weight_grad_np, weight_grad_torch, atol=atol)

    if print_tensors:
        print("my and torch weight gradients:")
        print(weight_grad_np.flatten(), weight_grad_torch.flatten())
        
    assert weight_grads_close, "Weight gradients are not equal"
    if print_results:
        print("Weight gradients are equal")

    if isinstance(my_module, BatchNormalization2d) or my_module.bias is not None:
        if print_tensors:
            print("my and torch bias gradients:")
            print(bias_grad_np.flatten(), bias_grad_torch.flatten())

        assert np.allclose(bias_grad_np, bias_grad_torch, atol=atol), "Bias gradients are not equal"
        if print_results:
            print("Bias gradients are equal")

In [4]:
# # reload a user's module test_layer
# %load_ext autoreload
# %autoreload 2

In [20]:
def conv2d_test(my_conv2d_constructor: Callable, batch_size: int,
                input_height: int, input_width: int, n_input_channels,
                n_output_channels, kernel_size: int, stride: int, padding: int,
                bias: bool, atol: float = 1e-5, random_sampler: Callable = np.random.rand,
                print_tensors: bool = False, print_results: bool = False) -> None:
    
    my_conv2d_kwargs = torch_conv2d_kwargs = {
        "in_channels": n_input_channels,
        "out_channels": n_output_channels,
        "kernel_size": kernel_size,
        "stride": stride,
        "padding": padding,
        "bias": bias
    }

    output_height = (input_height + 2 * padding - kernel_size) // stride + 1
    output_width = (input_width + 2 * padding - kernel_size) // stride + 1

    input_shape = (batch_size, n_input_channels, input_height, input_width)
    output_shape = (batch_size, n_output_channels, output_height, output_width)

    my_conv2d = my_conv2d_constructor(**my_conv2d_kwargs)
    torch_conv2d = torch.nn.Conv2d(**torch_conv2d_kwargs)

    test_module(my_conv2d, torch_conv2d, input_shape,
                output_shape, atol=atol, random_sampler=random_sampler,
                print_tensors=print_tensors, print_results=print_results)
    


def max_pool_2d_test(batch_size: int, height: int, width: int, n_channels: int,
                     kernel_size: int, stride: int, padding: int, atol: float = 1e-5,
                     random_sampler: Callable = np.random.rand, print_tensors: bool = False,
                     print_results: bool = False):
    
    output_width = (width + 2 * padding - kernel_size) // stride + 1
    output_height = (height + 2 * padding - kernel_size) // stride + 1

    my_pool_args = torch_pool_args = [kernel_size, stride, padding]

    my_pool = MaxPool2d(*my_pool_args)
    torch_pool = torch.nn.MaxPool2d(*torch_pool_args)

    test_module(my_pool,
                torch_pool,
                input_shape=[batch_size, n_channels, height, width],
                output_shape=[batch_size, n_channels, output_height, output_width],
                atol=atol,
                random_sampler = random_sampler,
                print_tensors=print_tensors,
                print_results=print_results)



def activation_test(my_activation: Callable,
                    torch_activation: Callable,
                    input_dim: List[int],
                    atol: float = 1e-5,
                    random_sampler: Callable = np.random.rand,
                    print_tensors: bool = False,
                    print_results: bool = False):
    """
    Samples input data and output gradient from a uniform
    distribution and tests if the output and input gradients
    are close to the ones computed by pytorch
    """
    test_module(my_activation(), torch_activation(),
                input_shape=input_dim, output_shape=input_dim,
                atol=atol, random_sampler=random_sampler,
                print_tensors=print_tensors, print_results=print_results)

def relu_test(input_dim: List[int], atol: float = 1e-5,
              random_sampler: Callable = np.random.rand,
              print_tensors: bool = False, print_results: bool = False):
    activation_test(ReLULayer, torch.nn.ReLU, input_dim,
                    atol=atol, random_sampler=random_sampler,
                    print_tensors=print_tensors, print_results=print_results)

def sigmoid_test(input_dim: List[int], atol: float = 1e-5,
                 random_sampler: Callable = np.random.rand,
                 print_tensors: bool = False, print_results: bool = False):
    activation_test(SigmoidLayer, torch.nn.Sigmoid, input_dim,
                    atol=atol, random_sampler=random_sampler,
                    print_tensors=print_tensors, print_results=print_results)



def flatten_test(input_shape: List[int], atol: float = 1e-5,
                 random_sampler: Callable = np.random.rand,
                 print_tensors: bool = False, print_results: bool = False):
    
    batch_size, *rest_input_dim = input_shape
    output_shape = [batch_size, np.prod(rest_input_dim)]

    test_module(Flatten(), torch.nn.Flatten(), input_shape=input_shape,
               output_shape=output_shape, atol=atol, random_sampler=random_sampler,
               print_tensors=print_tensors, print_results=print_results)



def batchnorm2d_iterative_test(n_channels: int, batch_size: int, height: int,
                               width: int, n_iter: int, phase: str = "train",
                               momentum: float = 0.1, atol: float = 1e-5,
                               random_sampler: Callable = np.random.rand,
                               print_tensors = False, print_results = False) -> None:
            
    # since each iteration changes the running mean, running variance, mean and variance
    # this test runs several iterations and checks that the results are the same

    # maybe there should be a custom test that calls
    # forward multiple times and calls backward only once
    
    input_shape = output_shape = [batch_size, n_channels, height, width]

    my_bn = BatchNormalization2d(n_channels, momentum=momentum)
    torch_bn = torch.nn.BatchNorm2d(n_channels, momentum=momentum)
    if phase == "eval":
        my_bn.eval()
        torch_bn.eval()
    else:
        my_bn.train()
        torch_bn.train()

    for i in range(n_iter):
        print("Iteration:", i)
        # print(my_bn.training)
        test_module(my_bn, torch_bn, input_shape, output_shape,
                    atol=atol, random_sampler=random_sampler,
                    skip_parameter_copying=bool(i),
                    print_tensors=print_tensors,
                    print_results=print_results)
        # reset torch gradients
        torch_bn.weight.grad = None
        torch_bn.bias.grad = None
        print()

In [21]:
atol = 1e-6
print(f"atol = {atol}, np.random.randn")
batchnorm2d_iterative_test(n_channels=5, batch_size=4, height=8, width=8,
                           n_iter=10, phase="train", momentum=0.1,
                           atol=atol, random_sampler=np.random.randn, print_results=False)

atol = 1e-06, np.random.randn
Iteration: 0

Iteration: 1

Iteration: 2

Iteration: 3

Iteration: 4

Iteration: 5

Iteration: 6

Iteration: 7

Iteration: 8

Iteration: 9



In [22]:
atol = 1e-4
print(f"atol = {atol}, np.random.rand")
batchnorm2d_iterative_test(n_channels=5, batch_size=4, height=8, width=8,
                           n_iter=10, phase="train", momentum=0.8,
                           atol=atol, random_sampler=np.random.rand, print_results=False)

atol = 0.0001, np.random.rand
Iteration: 0

Iteration: 1

Iteration: 2

Iteration: 3

Iteration: 4

Iteration: 5

Iteration: 6

Iteration: 7

Iteration: 8

Iteration: 9



In [24]:
atol = 1e-4
print(f"atol = {atol}, np.random.rand")
batchnorm2d_iterative_test(n_channels=5, batch_size=4, height=8, width=8,
                           n_iter=10, phase="train", momentum=0.8,
                           atol=atol, random_sampler=np.random.rand, print_results=False, print_tensors=True)

atol = 0.0001, np.random.rand
Iteration: 0
my and torch outputs:
[-1.4301041  -1.082526   -0.19767104 ...  0.9150699   0.8313913
 -1.6011153 ] [-1.4301041  -1.082526   -0.19767106 ...  0.9150702   0.83139145
 -1.6011153 ]
my and torch input gradients:
[ 0.136774040017978  1.214768857994491  0.724841884986875 ...
 -0.610479597399505 -0.378000684800799 -1.016681103117026] [ 0.1367743   1.2147692   0.724842   ... -0.6104797  -0.37800083
 -1.0166808 ]
my and torch running means:
[0.3952297  0.38302088 0.39440167 0.3867925  0.3916015 ] [0.39522967 0.3830209  0.39440167 0.38679248 0.39160147]
my and torch running vars:
[0.2785678  0.2648916  0.26677704 0.26196226 0.27252778] [0.2785678  0.26489156 0.26677704 0.26196226 0.27252775]
my and torch weight gradients:
[-2.483329650028675  2.095407849137224 -2.46034050029717
  2.691619001228386 -3.71991454658423 ] [-2.4833174  2.095394  -2.4603398  2.6916323 -3.7199028]
my and torch bias gradients:
[128.74741043319176 125.57231519081566 114.56492972

In [25]:
atol = 1e-5
print(f"atol = {atol}, np.random.rand")
batchnorm2d_iterative_test(n_channels=5, batch_size=4, height=8, width=8,
                           n_iter=10, phase="train", momentum=0.8,
                           atol=atol, random_sampler=np.random.rand, print_results=True)

atol = 1e-05, np.random.rand
Iteration: 0
Outputs are equal
Input gradients are equal
Running means are equal
Running vars are equal


AssertionError: Weight gradients are not equal

In [None]:
"""
Conv2dWithLoops and Conv2d tests
"""

batch_size = 5
n_input_channels = 4
n_output_channels = 2
width = 3
height = 5

kernel_size = 3
stride = 1
padding = 1

print("Conv2dWithLoops test")
conv2d_test(Conv2dWithLoops, batch_size, height, width,
            n_input_channels, n_output_channels, kernel_size,
            stride, padding, bias=True, atol=1e-6, random_sampler=np.random.randn)

conv2d_test(Conv2dWithLoops, batch_size, height, width,
            n_input_channels, n_output_channels, kernel_size,
            stride, padding, bias=True, atol=1e-6, random_sampler=np.random.rand)
print("passed")

print()

print("Conv2d test")
conv2d_test(Conv2d, batch_size, height, width, n_input_channels, n_output_channels,
            kernel_size, stride, padding, bias=True, atol=1e-6, random_sampler=np.random.randn)
conv2d_test(Conv2d, batch_size, height, width, n_input_channels, n_output_channels,
            kernel_size, stride, padding, bias=True, atol=1e-6, random_sampler=np.random.rand)
print("passed")

print()

print("Conv2d test without bias")
conv2d_test(Conv2d, batch_size, height, width, n_input_channels, n_output_channels,
            kernel_size, stride, padding, bias=False, atol=1e-6, random_sampler=np.random.randn)
conv2d_test(Conv2d, batch_size, height, width, n_input_channels, n_output_channels,
            kernel_size, stride, padding, bias=False, atol=1e-6, random_sampler=np.random.rand)
print("passed")



Conv2dWithLoops test
passed

Conv2d test
passed

Conv2d test without bias
passed


In [None]:
"""
FullyConnectedLayer test
"""

n_input_neurons = 6
n_output_neurons = 3
n_samples = 5

my_fc_params = torch_fc_params = [n_input_neurons, n_output_neurons]

test_module(FullyConnectedLayer(*my_fc_params),
            torch.nn.Linear(*torch_fc_params),
            input_shape=[n_samples, n_input_neurons],
            output_shape=[n_samples, n_output_neurons],
            atol=1e-6,
            random_sampler=np.random.randn)

test_module(FullyConnectedLayer(*my_fc_params),
            torch.nn.Linear(*torch_fc_params),
            input_shape=[n_samples, n_input_neurons],
            output_shape=[n_samples, n_output_neurons],
            atol=1e-6,
            random_sampler=np.random.rand)

print("passed")

passed


In [None]:
"""
CrossEntropyLoss test random.rand
"""

def one_hot(y: np.ndarray, n_classes: int):
    encoded = np.zeros((y.size, n_classes))
    encoded[np.arange(y.size), y] = 1
    return encoded


batch_size = 5
n_classes = 3
pred = np.random.rand(batch_size, n_classes).astype(np.float32)
true = one_hot(np.random.randint(0, n_classes, batch_size), n_classes)
pred_torch = torch.from_numpy(pred).float()
true_torch = torch.from_numpy(true).float()
pred_torch.requires_grad = True

torch_loss  = torch.nn.CrossEntropyLoss()
torch_loss_val = torch_loss(pred_torch, true_torch)
torch_loss_val.backward()

my_loss = CrossEntropyLossWithSoftMax()
my_loss_val = my_loss.forward(pred, true)
my_loss.backward()

print("loss_val all close:", np.allclose(my_loss_val, torch_loss_val.detach().numpy()))
print("loss gradients all close:", np.allclose(my_loss.backward(), pred_torch.grad))

loss_val all close: True
loss gradients all close: True


In [None]:
"""
CrossEntropyLoss test random.randn
"""

def one_hot(y: np.ndarray, n_classes: int):
    encoded = np.zeros((y.size, n_classes))
    encoded[np.arange(y.size), y] = 1
    return encoded


batch_size = 5
n_classes = 3
pred = np.random.randn(batch_size, n_classes).astype(np.float32)
true = one_hot(np.random.randint(0, n_classes, batch_size), n_classes)
pred_torch = torch.from_numpy(pred).float()
true_torch = torch.from_numpy(true).float()
pred_torch.requires_grad = True

torch_loss  = torch.nn.CrossEntropyLoss()
torch_loss_val = torch_loss(pred_torch, true_torch)
torch_loss_val.backward()

my_loss = CrossEntropyLossWithSoftMax()
my_loss_val = my_loss.forward(pred, true)
my_loss.backward()

print("loss_val all close:", np.allclose(my_loss_val, torch_loss_val.detach().numpy()))
print("loss gradients all close:", np.allclose(my_loss.backward(), pred_torch.grad))

loss_val all close: True
loss gradients all close: True


In [None]:
"""ReLU test"""
n_input_features = 6
n_samples = 5

relu_test([n_samples, n_input_features], random_sampler=np.random.randn)
relu_test([n_samples, n_input_features], random_sampler=np.random.rand)
print("passed")

passed


If we sample random numbers from normanl distribution instead of uniform distribution (randn instead of rand) sigmoid layer won't pass tests!

In [8]:
"""SigmoidLayer test"""

n_input_features = 6
n_samples = 5
height = 4
width = 4

atol = 1e-10
random_sampler = np.random.rand
sigmoid_test([n_samples, n_input_features],  atol=atol, random_sampler=random_sampler)
sigmoid_test([n_samples, n_input_features, height, width], atol=atol, random_sampler=random_sampler)
print(f"passed with atol={atol} and random_sampler={random_sampler.__name__}")


atol = 1e-10
random_sampler = np.random.randn
sigmoid_test([n_samples, n_input_features], print_results=False, atol=atol, random_sampler=random_sampler)
sigmoid_test([n_samples, n_input_features, height, width],  print_results=False, atol=atol, random_sampler=random_sampler)
print(f"passed with atol={atol} and random_sampler={random_sampler.__name__}")

passed with atol=1e-10 and random_sampler=rand
passed with atol=1e-10 and random_sampler=randn


In [14]:
"""Flatten test"""

flatten_test(input_shape = [2, 3, 5, 5], print_results=True, random_sampler=np.random.rand)
flatten_test(input_shape = [2, 3, 5, 5], print_results=True, random_sampler=np.random.randn)

Outputs are equal
Input gradients are equal
Outputs are equal
Input gradients are equal


In [22]:
"""MaxPool2d tests"""

batch_size = 10
n_channels = 3
height = 16
width = 16

kernel_size = 3
stride = 2
padding = 1


max_pool_2d_test(batch_size, height, width, n_channels, kernel_size, stride, padding, atol=1e-6, print_results=False, random_sampler=np.random.rand)
max_pool_2d_test(batch_size, height, width, n_channels, kernel_size, stride, padding, atol=1e-6, print_results=False, random_sampler=np.random.randn)
print("passed")


print()

max_pool_2d_test(batch_size = 2, height = 6, width = 4, n_channels = 3,
                 kernel_size = 2, stride = 1, padding = 0, atol=1e-6, print_results=False, random_sampler=np.random.rand)

max_pool_2d_test(batch_size = 2, height = 6, width = 4, n_channels = 3,
                 kernel_size = 2, stride = 1, padding = 0, atol=1e-6, print_results=False, random_sampler=np.random.randn)
print("passed")

passed

passed


In [30]:
"""
BottleNeckLayer test
"""

batch_size = 5
in_channels = 8
bottleneck_depth = 2
width = 6
height = 6

expansion_factor = 4
n_output_channels = bottleneck_depth * expansion_factor

momentum = 0.1

print("\n" + "\033[1m" + "afaik the tests are passed if atol = 1e-4 "\
      "or smaller regardless the sampler. If atol = 1e-6 "
      "unifrom distribution leads to problems" + "\033[0m" + "\n")

for sampler in (np.random.rand, np.random.randn):
    print(f"sampler = {sampler.__name__}")
    for stride_for_downsampling in (1, 2):  # Checking both cases: no downsampling and downsampling
        print(f"stride = {stride_for_downsampling}")
        input_data = sampler(batch_size, in_channels, width, height).astype(np.float32)
        input_data_torch = torch.from_numpy(input_data).float()
        input_data_torch.requires_grad = True

        output_width = width // stride_for_downsampling
        output_height = height // stride_for_downsampling
        output_gradient = sampler(batch_size, n_output_channels, output_width, output_height).astype(np.float32)

        torch_bottleneck = Bottleneck_torch(in_channels, bottleneck_depth, stride_for_downsampling)
        my_bottleneck = Bottleneck(in_channels, bottleneck_depth, stride_for_downsampling)

        conv_layer_pairs = [
            (my_bottleneck.conv1, torch_bottleneck.conv1),
            (my_bottleneck.conv2, torch_bottleneck.conv2),
            (my_bottleneck.conv3, torch_bottleneck.conv3)]

        for my_conv, torch_conv in conv_layer_pairs:
            my_conv.weights = torch_conv.weight.detach().numpy() #.reshape(my_conv.weights.shape)
        
        bn_pairs = [
            (my_bottleneck.bn1, torch_bottleneck.bn1),
            (my_bottleneck.bn2, torch_bottleneck.bn2),
            (my_bottleneck.bn3, torch_bottleneck.bn3)]
        
        for my_bn, torch_bn in bn_pairs:
            my_bn.gamma = torch_bn.weight.detach().numpy().reshape(my_bn.gamma.shape)
            my_bn.beta = torch_bn.bias.detach().numpy().reshape(my_bn.beta.shape)
            my_bn.running_mean = torch_bn.running_mean.detach().numpy().reshape(my_bn.running_mean.shape)
            my_bn.running_var = torch_bn.running_var.detach().numpy().reshape(my_bn.running_var.shape)
            my_bn.momentum = torch_bn.momentum = momentum
        

        if my_bottleneck.conv_to_match_dimensions:
            my_bottleneck.conv_to_match_dimensions.weights = torch_bottleneck.conv_to_match_dimensions.weight.detach().numpy()
            my_bottleneck.bn_for_residual.gamma = torch_bottleneck.bn_for_residual.weight.detach().numpy().reshape(my_bottleneck.bn_for_residual.gamma.shape)
            my_bottleneck.bn_for_residual.beta = torch_bottleneck.bn_for_residual.bias.detach().numpy().reshape(my_bottleneck.bn_for_residual.beta.shape)
            my_bottleneck.bn_for_residual.running_mean = torch_bottleneck.bn_for_residual.running_mean.detach().numpy().reshape(my_bottleneck.bn_for_residual.running_mean.shape)
            my_bottleneck.bn_for_residual.running_var = torch_bottleneck.bn_for_residual.running_var.detach().numpy().reshape(my_bottleneck.bn_for_residual.running_var.shape)
            torch_bottleneck.bn_for_residual.momentum = my_bottleneck.bn_for_residual.momentum = momentum
        
        my_bottleneck.train()
        torch_bottleneck.train()

        my_out = my_bottleneck.forward(input_data)
        torch_out = torch_bottleneck(input_data_torch)

        torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
        torch_input_g = input_data_torch.grad.detach().numpy()

        my_input_g = my_bottleneck.backward(output_gradient)

        atol = 1e-6
        print("output all close:", np.allclose(my_out, torch_out.detach().numpy(), atol=atol))
        print("input gradients all close:", np.allclose(my_input_g, torch_input_g, atol=atol))
        print("conv1 weights gradients all close:", np.allclose(my_bottleneck.conv1.weights_gradient, torch_bottleneck.conv1.weight.grad.detach().numpy(), atol=atol))
        print("conv2 weights gradients all close:", np.allclose(my_bottleneck.conv2.weights_gradient, torch_bottleneck.conv2.weight.grad.detach().numpy(), atol=atol))
        print("conv3 weights gradients all close:", np.allclose(my_bottleneck.conv3.weights_gradient, torch_bottleneck.conv3.weight.grad.detach().numpy(), atol=atol))
        if my_bottleneck.conv_to_match_dimensions:
            print("conv_to_match_dimensions weights gradients all close:", np.allclose(my_bottleneck.conv_to_match_dimensions.weights_gradient, torch_bottleneck.conv_to_match_dimensions.weight.grad.detach().numpy(), atol=atol))
            print("bn_for_residual gamma gradients all close:", np.allclose(my_bottleneck.bn_for_residual.gamma_gradient.flatten(), torch_bottleneck.bn_for_residual.weight.grad.detach().numpy(), atol=atol))
            print("bn_for_residual beta gradients all close:", np.allclose(my_bottleneck.bn_for_residual.beta_gradient.flatten(), torch_bottleneck.bn_for_residual.bias.grad.detach().numpy(), atol=atol))
        

        print("bn1 gamma gradients all close:", np.allclose(my_bottleneck.bn1.gamma_gradient.flatten(), torch_bottleneck.bn1.weight.grad.detach().numpy(), atol=atol))
        print("bn1 beta gradients all close:", np.allclose(my_bottleneck.bn1.beta_gradient.flatten(), torch_bottleneck.bn1.bias.grad.detach().numpy(), atol=atol))
        print("bn2 gamma gradients all close:", np.allclose(my_bottleneck.bn2.gamma_gradient.flatten(), torch_bottleneck.bn2.weight.grad.detach().numpy(), atol=atol))
        print("bn2 beta gradients all close:", np.allclose(my_bottleneck.bn2.beta_gradient.flatten(), torch_bottleneck.bn2.bias.grad.detach().numpy(), atol=atol))
        print("bn3 gamma gradients all close:", np.allclose(my_bottleneck.bn3.gamma_gradient.flatten(), torch_bottleneck.bn3.weight.grad.detach().numpy(), atol=atol))  
        print("bn3 beta gradients all close:", np.allclose(my_bottleneck.bn3.beta_gradient.flatten(), torch_bottleneck.bn3.bias.grad.detach().numpy(), atol=atol))  
        print()


[1mafaik the tests are passed if atol = 1e-4 or smaller regardless the sampler. If atol = 1e-6 unifrom distribution leads to problems[0m

sampler = rand
stride = 1
output all close: True
input gradients all close: False
conv1 weights gradients all close: True
conv2 weights gradients all close: False
conv3 weights gradients all close: False
bn1 gamma gradients all close: False
bn1 beta gradients all close: True
bn2 gamma gradients all close: True
bn2 beta gradients all close: True
bn3 gamma gradients all close: True
bn3 beta gradients all close: True

stride = 2
output all close: True
input gradients all close: False
conv1 weights gradients all close: True
conv2 weights gradients all close: False
conv3 weights gradients all close: False
conv_to_match_dimensions weights gradients all close: False
bn_for_residual gamma gradients all close: True
bn_for_residual beta gradients all close: True
bn1 gamma gradients all close: True
bn1 beta gradients all close: True
bn2 gamma gradients all c

In [31]:
"""
resnet 101 test
"""

batch_size = 10
height = width = 32
n_channels = 1

input_data = np.random.rand(batch_size, n_channels, height, width).astype(np.float32)
input_data_torch = torch.from_numpy(input_data).float()
input_data_torch.requires_grad = True
output_gradient = np.random.rand(batch_size, 10).astype(np.float32)

torch_resnet = resnet101_torch(10, 1)
my_resnet = resnet101(10, 1)
torch_resnet.train()
my_resnet.train()

my_resnet.clone_weights_from_torch(torch_resnet)

my_out = my_resnet.forward(input_data)

torch_out = torch_resnet(input_data_torch)
torch_out_np = torch_out.detach().numpy()

torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
torch_input_g = input_data_torch.grad.detach().numpy()

my_input_g = my_resnet.backward(output_gradient)

atol=1e-4

print("output all close:", np.allclose(my_out, torch_out_np, atol=atol))
print("input gradients all close:", np.allclose(my_input_g.flatten(), torch_input_g.flatten(), atol=atol))

print("fc weights gradients all close:", np.allclose(my_resnet.fc.weights_gradient, torch_resnet.fc.weight.grad.detach().numpy().T, atol=atol))
print("fc bias gradients all close:", np.allclose(my_resnet.fc.bias_gradient, torch_resnet.fc.bias.grad.detach().numpy(), atol=atol))

print("It's ok that gradients don't match in eval mode. "\
      "Batch norm's backward is different in train and eval mode.\n"\
      "I didn't implement eval backward since it won't ever be used in training the network")

output all close: False
input gradients all close: False
fc weights gradients all close: False
fc bias gradients all close: True
It's ok that gradients don't match in eval mode. Batch norm's backward is different in train and eval mode.
I didn't implement eval backward since it won't ever be used in training the network


In [32]:
my_input_g.flatten(), torch_input_g.flatten()

(array([-49.13575793,  75.4237405 ,  28.05470836, ...,  73.2363413 ,
        -36.06450964,  57.12665886]),
 array([-44.48268 ,  64.007614,  27.99699 , ...,  74.460556, -35.49483 ,
         52.25823 ], dtype=float32))

In [33]:
atol=1e-2

print("output all close:", np.allclose(my_out, torch_out_np, atol=atol))
print("input gradients all close:", np.allclose(my_input_g, torch_input_g, atol=atol))

print("fc weights gradients all close:", np.allclose(my_resnet.fc.weights_gradient, torch_resnet.fc.weight.grad.detach().numpy().T, atol=atol))
print("fc bias gradients all close:", np.allclose(my_resnet.fc.bias_gradient, torch_resnet.fc.bias.grad.detach().numpy(), atol=atol))

output all close: True
input gradients all close: False
fc weights gradients all close: True
fc bias gradients all close: True


In [34]:
atol=1e+2

print("output all close:", np.allclose(my_out, torch_out_np, atol=atol))
print("input gradients all close:", np.allclose(my_input_g, torch_input_g, atol=atol))

print("fc weights gradients all close:", np.allclose(my_resnet.fc.weights_gradient, torch_resnet.fc.weight.grad.detach().numpy().T, atol=atol))
print("fc bias gradients all close:", np.allclose(my_resnet.fc.bias_gradient, torch_resnet.fc.bias.grad.detach().numpy(), atol=atol))

output all close: True
input gradients all close: True
fc weights gradients all close: True
fc bias gradients all close: True


In [63]:
"""
resnet 101 without batchnormtest
"""

batch_size = 10
height = width = 32
n_channels = 1

input_data = np.random.rand(batch_size, n_channels, height, width).astype(np.float32)
input_data_torch = torch.from_numpy(input_data).float()
input_data_torch.requires_grad = True
output_gradient = np.random.rand(batch_size, 10).astype(np.float32)

torch_resnet = resnet101_torch_without_batchnorm(10, 1)
torch_resnet.train()
my_resnet = resnet101_np_without_batchnorm(10, 1)
my_resnet.train()

my_resnet.clone_weights_from_torch(torch_resnet)

my_out = my_resnet.forward(input_data)

torch_out = torch_resnet(input_data_torch)
torch_out_np = torch_out.detach().numpy()

torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
torch_input_g = input_data_torch.grad.detach().numpy()

my_input_g = my_resnet.backward(output_gradient)

atol=1e-6

print("output all close:", np.allclose(my_out, torch_out_np, atol=atol))
print("input gradients all close:", np.allclose(my_input_g, torch_input_g, atol=atol))

print("fc weights gradients all close:", np.allclose(my_resnet.fc.weights_gradient, torch_resnet.fc.weight.grad.detach().numpy().T, atol=atol))
print("fc bias gradients all close:", np.allclose(my_resnet.fc.bias_gradient, torch_resnet.fc.bias.grad.detach().numpy(), atol=atol))

output all close: True
input gradients all close: True
fc weights gradients all close: True
fc bias gradients all close: True


In [35]:
"""
conv1 test
"""

batch_size = 10
height = width = 32
n_channels = 1

input_data = np.random.rand(batch_size, n_channels, height, width).astype(np.float32)
input_data_torch = torch.from_numpy(input_data).float()
input_data_torch.requires_grad = True
output_gradient = np.random.rand(batch_size, 64, 16, 16).astype(np.float32)

torch_resnet = resnet101_torch(10, 1)
torch_resnet.eval()
my_resnet = resnet101(10, 1)
my_resnet.eval()

my_resnet.clone_weights_from_torch(torch_resnet)

my_out = my_resnet.conv1.forward(input_data)

torch_out = torch_resnet.conv1(input_data_torch)
torch_out_np = torch_out.detach().numpy()

torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
torch_input_g = input_data_torch.grad.detach().numpy()

my_input_g = my_resnet.conv1.backward(output_gradient)

atol=1e-6

print("output all close:", np.allclose(my_out, torch_out_np, atol=atol))
print("input gradients all close:", np.allclose(my_input_g, torch_input_g, atol=atol))

output all close: True
input gradients all close: True


In [36]:
"""
bn1 test
"""

batch_size = 10
height = width = 16
n_channels = 64

n_classes = 10

for phase in ['train', 'eval']:
    print(f"phase: {phase}")

    # ! Has been noticed that moving 4 lines below outside the loop leads to not passing tests
    input_data = np.random.rand(batch_size, n_channels, height, width).astype(np.float32)
    input_data_torch = torch.from_numpy(input_data).float()
    input_data_torch.requires_grad = True
    output_gradient = np.random.rand(batch_size, n_channels, height, width).astype(np.float32)

    torch_resnet = resnet101_torch(n_classes, n_channels)
    my_resnet = resnet101(n_classes, n_channels)
    if phase == 'train':
        my_resnet.train()
        torch_resnet.train()
    elif phase == 'eval':
        my_resnet.eval()
        torch_resnet.eval()
    else:
        raise Exception("unknown phase") 

    my_resnet.clone_weights_from_torch(torch_resnet)

    my_out = my_resnet.bn1.forward(input_data)

    torch_out = torch_resnet.bn1(input_data_torch)
    torch_out_np = torch_out.detach().numpy()

    torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
    torch_input_g = input_data_torch.grad.detach().numpy()

    my_input_g = my_resnet.bn1.backward(output_gradient)

    atol=1e-6

    print("output all close:", np.allclose(my_out, torch_out_np, atol=atol))
    print("input gradients all close:", np.allclose(my_input_g.flatten(), torch_input_g.flatten(), atol=atol))
    print()

print("It's ok that gradients don't match in eval mode. "\
      "Batch norm's backward is different in train and eval mode.\n"\
      "I didn't implement eval backward since it won't ever be used in training the network")

phase: train
output all close: True
input gradients all close: True

phase: eval
output all close: True
input gradients all close: False

It's ok that gradients don't match in eval mode. Batch norm's backward is different in train and eval mode.
I didn't implement eval backward since it won't ever be used in training the network


In [37]:
"""
conv2_x test
"""


batch_size = 10
height = width = 8
n_channels = 64

for phase in ['train', 'eval']:
    print(f"phase: {phase}")

    input_data = np.random.rand(batch_size, n_channels, height, width).astype(np.float32)
    input_data_torch = torch.from_numpy(input_data).float()
    input_data_torch.requires_grad = True
    output_gradient = np.random.rand(batch_size, 256, 8, 8).astype(np.float32)

    torch_resnet = resnet101_torch(10, 1)
    my_resnet = resnet101(10, 1)

    if phase == 'train':
        my_resnet.train()
        torch_resnet.train()
    elif phase == 'eval':
        my_resnet.eval()
        torch_resnet.eval()
    else:
        raise Exception("unknown phase") 


    my_resnet.clone_weights_from_torch(torch_resnet)

    my_out = my_resnet.conv2_x.forward(input_data)

    torch_out = torch_resnet.conv2_x(input_data_torch)
    torch_out_np = torch_out.detach().numpy()

    torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
    torch_input_g = input_data_torch.grad.detach().numpy()

    my_input_g = my_resnet.conv2_x.backward(output_gradient)

    atol=1e-3

    print("output all close:", np.allclose(my_out, torch_out_np, atol=atol))
    print("input gradients all close:", np.allclose(my_input_g, torch_input_g, atol=atol))
    print()

print("It's ok that gradients don't match in eval mode. "\
      "Batch norm's backward is different in train and eval mode.\n"\
      "I didn't implement eval backward since it won't ever be used in training the network")

phase: train
output all close: True
input gradients all close: False

phase: eval
output all close: True
input gradients all close: False

It's ok that gradients don't match in eval mode. Batch norm's backward is different in train and eval mode.
I didn't implement eval backward since it won't ever be used in training the network


In [38]:
"""
conv3_x test
"""

batch_size = 10
height = width = 8
n_channels = 256

input_data = np.random.rand(batch_size, n_channels, height, width).astype(np.float32)
input_data_torch = torch.from_numpy(input_data).float()
input_data_torch.requires_grad = True
output_gradient = np.random.rand(batch_size, 512, 4, 4).astype(np.float32)

torch_resnet = resnet101_torch(10, 1)
my_resnet = resnet101(10, 1)

my_resnet.clone_weights_from_torch(torch_resnet)

my_out = my_resnet.conv3_x.forward(input_data)

torch_out = torch_resnet.conv3_x(input_data_torch)
torch_out_np = torch_out.detach().numpy()

torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
torch_input_g = input_data_torch.grad.detach().numpy()

my_input_g = my_resnet.conv3_x.backward(output_gradient)

atol=1e-3

print("output all close:", np.allclose(my_out, torch_out_np, atol=atol))
print("input gradients all close:", np.allclose(my_input_g, torch_input_g, atol=atol))

output all close: True
input gradients all close: False


In [39]:
"""
conv4_x test
"""

batch_size = 10
height = width = 4
n_channels = 512

input_data = np.random.rand(batch_size, n_channels, height, width).astype(np.float32)
input_data_torch = torch.from_numpy(input_data).float()
input_data_torch.requires_grad = True
output_gradient = np.random.rand(batch_size, 1024, height//2, width//2).astype(np.float32)

torch_resnet = resnet101_torch(10, 1)
my_resnet = resnet101(10, 1)

my_resnet.train()
torch_resnet.train()

my_resnet.clone_weights_from_torch(torch_resnet)

my_out = my_resnet.conv4_x.forward(input_data)

torch_out = torch_resnet.conv4_x(input_data_torch)

torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
torch_input_g = input_data_torch.grad.detach().numpy()
torch_out_np = torch_out.detach().numpy()


my_input_g = my_resnet.conv4_x.backward(output_gradient)

atol=1e-3

print("output all close:", np.allclose(my_out, torch_out_np, atol=atol))
print("input gradients all close:", np.allclose(my_input_g, torch_input_g, atol=atol))

output all close: True
input gradients all close: False


In [40]:
my_input_g.flatten(), torch_input_g.flatten()

(array([-92.64273227,   0.        ,  81.38443665, ...,   0.        ,
          0.        ,   0.        ]),
 array([-92.29285,   0.     ,  81.22645, ...,   0.     ,   0.     ,
          0.     ], dtype=float32))

In [41]:
"""
conv5_x test
"""

batch_size = 10
height = width = 2
n_channels = 1024

input_data = np.random.rand(batch_size, n_channels, height, width).astype(np.float32)
input_data_torch = torch.from_numpy(input_data).float()
input_data_torch.requires_grad = True
output_gradient = np.random.rand(batch_size, 2048, 1, 1).astype(np.float32)

torch_resnet = resnet101_torch(10, 1)
my_resnet = resnet101(10, 1)

my_resnet.clone_weights_from_torch(torch_resnet)

my_out = my_resnet.conv5_x.forward(input_data)

torch_out = torch_resnet.conv5_x(input_data_torch)
torch_out_np = torch_out.detach().numpy()

torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
torch_input_g = input_data_torch.grad.detach().numpy()

my_input_g = my_resnet.conv5_x.backward(output_gradient)

atol=1e-3

print("output all close:", np.allclose(my_out, torch_out_np, atol=atol))
print("input gradients all close:", np.allclose(my_input_g, torch_input_g, atol=atol))

output all close: True
input gradients all close: True


In [43]:
"""
AdamOptimizer test
"""

from torch.optim import Adam as Adam_torch

n_input_features = 6
n_output_features = 3
batch_size = 5
input_data = np.random.rand(batch_size, n_input_features).astype(np.float32)
input_data_torch = torch.from_numpy(input_data).float()
input_data_torch.requires_grad = True
output_gradient = np.random.rand(batch_size, n_output_features).astype(np.float32)

torch_fc = torch.nn.Linear(n_input_features, n_output_features)
torch_out = torch_fc(input_data_torch)
torch_out_np = torch_out.detach().numpy()
torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
torch_wg = torch_fc.weight.grad.detach().numpy().T
torch_bg = torch_fc.bias.grad.detach().numpy().reshape(-1, 1).T
torch_input_g = input_data_torch.grad.detach().numpy()


my_fc = FullyConnectedLayer(n_input_features, n_output_features)
my_fc.weights = torch_fc.weight.detach().numpy().T
my_fc.bias = torch_fc.bias.detach().numpy().reshape(-1, 1).T
my_out = my_fc.forward(input_data)
my_input_g = my_fc.backward(output_gradient)
my_wg = my_fc.weights_gradient
my_bg = my_fc.bias_gradient

atol=1e-3

print("output all close:", np.allclose(my_out, torch_out_np, atol=atol))
print("input gradients all close:", np.allclose(my_input_g, torch_input_g, atol=atol))
print("before adam weights gradients all close:", np.allclose(my_wg, torch_wg, atol=atol))
print("before adam bias gradients all close:", np.allclose(my_bg, torch_bg, atol=atol))
print(my_wg, "\n", torch_wg)

my_adam = AdamOptimizer(my_fc.get_trainable_layers(), 0.001, 0.9, 0.999, 1e-8)
my_adam.step()

torch_adam = Adam_torch(torch_fc.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-8)
torch_adam.step()

print("after adam weights gradients all close:", np.allclose(my_fc.weights_gradient, torch_fc.weight.grad.detach().numpy().T, atol=atol))
print("after adam bias gradients all close:", np.allclose(my_fc.bias_gradient, torch_fc.bias.grad.detach().numpy().reshape(-1, 1).T, atol=atol))
print(my_fc.weights_gradient, "\n", torch_fc.weight.grad.detach().numpy().T)

output all close: True
input gradients all close: True
before adam weights gradients all close: True
before adam bias gradients all close: True
[[1.3891779  1.7336051  1.2286544 ]
 [1.3048313  0.99781084 1.0630406 ]
 [1.6351756  1.4931997  1.3154104 ]
 [1.5244397  1.4477336  1.157729  ]
 [0.9432854  0.58816004 0.951926  ]
 [1.6254252  2.1075394  1.3502117 ]] 
 [[1.3891779  1.733605   1.2286544 ]
 [1.3048313  0.99781084 1.0630406 ]
 [1.6351756  1.4931998  1.3154104 ]
 [1.5244397  1.4477336  1.1577291 ]
 [0.9432854  0.58816004 0.95192605]
 [1.6254252  2.1075392  1.3502117 ]]
after adam weights gradients all close: True
after adam bias gradients all close: True
[[1.3891779  1.7336051  1.2286544 ]
 [1.3048313  0.99781084 1.0630406 ]
 [1.6351756  1.4931997  1.3154104 ]
 [1.5244397  1.4477336  1.157729  ]
 [0.9432854  0.58816004 0.951926  ]
 [1.6254252  2.1075394  1.3502117 ]] 
 [[1.3891779  1.733605   1.2286544 ]
 [1.3048313  0.99781084 1.0630406 ]
 [1.6351756  1.4931998  1.3154104 ]
 [1.52

In [44]:
"""
Conv2d vs Conv2dWithLoops vs torch.nn.Conv2d time comparison forward and backward
"""

import time

n_input_channels = 4
n_output_channels = 2
width = 3
height = 5

kernel_size = 3
stride = 1
padding = 3

output_width = (width + 2 * padding - kernel_size) // stride + 1
output_height = (height + 2 * padding - kernel_size) // stride + 1

for batch_size in [1, 2, 4, 8, 16]:
    print("batch_size:", batch_size)

    input_data = np.random.rand(batch_size, n_input_channels, height, width).astype(np.float32)
    input_data_torch = torch.from_numpy(input_data).float()
    input_data_torch.requires_grad = True
    output_gradient = np.random.rand(batch_size, n_output_channels, output_height, output_width).astype(np.float32)

    torch_conv = torch.nn.Conv2d(n_input_channels, n_output_channels, kernel_size, stride, padding)

    my_conv_with_loops = Conv2dWithLoops(n_input_channels, n_output_channels, kernel_size, stride, padding)
    my_conv_with_loops.weights = torch_conv.weight.detach().numpy()

    my_conv = Conv2d(n_input_channels, n_output_channels, kernel_size, stride, padding)
    my_conv.weights = torch_conv.weight.detach().numpy()

    n_iterations = 1000

    start = time.time()
    for i in range(n_iterations):
        my_out = my_conv.forward(input_data)
    end = time.time()
    print(f"my_conv forward time: {end - start}")

    start = time.time()
    for i in range(n_iterations):
        my_out_with_loops = my_conv_with_loops.forward(input_data)
    end = time.time()
    print(f"my_conv_with_loops forward time: {end - start}")

    start = time.time()
    for i in range(n_iterations):
        torch_out = torch_conv(input_data_torch)
    end = time.time()
    print(f"torch_conv forward time: {end - start}")

    start = time.time()
    for i in range(n_iterations):
        my_input_g = my_conv.backward_as_matrix_multiplication(output_gradient)
    end = time.time()
    print(f"my_conv backward time: {end - start}")

    start = time.time()
    for i in range(n_iterations):
        my_input_g_with_loops = my_conv_with_loops.backward(output_gradient)
    end = time.time()
    print(f"my_conv_with_loops backward time: {end - start}")

    start = time.time()
    for i in range(n_iterations):
        torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
    end = time.time()
    print(f"torch_conv backward time: {end - start}")

    print()

batch_size: 1
my_conv forward time: 0.5448141098022461
my_conv_with_loops forward time: 3.00417423248291
torch_conv forward time: 0.07560348510742188
my_conv backward time: 0.684333324432373
my_conv_with_loops backward time: 2.6092746257781982
torch_conv backward time: 0.17493200302124023

batch_size: 2
my_conv forward time: 0.5803859233856201
my_conv_with_loops forward time: 2.1322405338287354
torch_conv forward time: 0.15914392471313477
my_conv backward time: 1.183729648590088
my_conv_with_loops backward time: 4.955396413803101
torch_conv backward time: 0.365201473236084

batch_size: 4
my_conv forward time: 1.063096284866333
my_conv_with_loops forward time: 2.2665672302246094
torch_conv forward time: 0.16541314125061035
my_conv backward time: 2.5409464836120605
my_conv_with_loops backward time: 9.793752670288086
torch_conv backward time: 0.414461612701416

batch_size: 8
my_conv forward time: 1.992255687713623
my_conv_with_loops forward time: 2.455413341522217
torch_conv forward time:

In [54]:
class GlobalAveragePooling2D:
    def __init__(self):
        # self.input_shape = None
        pass

    def forward(self, input_):
        self.input_ = input_
        self.input_shape = input_.shape
        return np.mean(input_, axis=(2, 3))
    
    def backward(self, d_J_d_out):
        d_out_d_in = np.ones(self.input_shape) / np.prod(self.input_shape[2:])
        d_J_d_out = d_J_d_out[:, :, np.newaxis, np.newaxis]
        return d_out_d_in * d_J_d_out

In [76]:
gap = GlobalAveragePooling2D()

torch_gap = torch.nn.Sequential(
    torch.nn.AdaptiveAvgPool2d((1, 1)),
    torch.nn.Flatten(start_dim=1))

In [81]:
# make torch_gap require grad
torch_gap[0].requires_grad = True
torch_gap[1].requires_grad = True

In [83]:
torch_gap.train()

Sequential(
  (0): AdaptiveAvgPool2d(output_size=(1, 1))
  (1): Flatten(start_dim=1, end_dim=-1)
)

In [86]:
input_ = np.random.rand(2, 3, 4, 5)

input_torch = torch.from_numpy(input_).float()

input_torch.requires_grad = True

d_J_d_out = np.random.rand(2, 3)

out = gap.forward(input_)

torch_out = torch_gap(input_torch)

d_J_d_in = gap.backward(d_J_d_out)

torch_out.backward(torch.tensor(d_J_d_out), retain_graph=True)

print("out all close:", np.allclose(out, torch_out.detach().numpy()))
print("d_J_d_in all close:", np.allclose(d_J_d_in, input_torch.grad.detach().numpy()))

out all close: True
d_J_d_in all close: True
