In [1]:
# Make the project's root directory importable so that the
# numpy_nn and pytorch_nn packages are accessible.
# Option 1 (disabled): change the working directory to the project root.
# %cd ../..


# Option 2 (used here): append the project root to sys.path.

import sys
import  os
# NOTE(review): assumes sys.path[0] is this notebook's directory and that the
# project root sits two levels above it — confirm for your kernel setup.
project_root = os.path.dirname(os.path.dirname(sys.path[0]))
# Skip the append when the entry is already present (e.g. the cell is re-run).
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
from typing import List, Tuple, Callable

import torch
torch.use_deterministic_algorithms(True)

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from test_layer import test_module

from numpy_nn.modules.np_nn import (
    FullyConnectedLayer,
    ReLULayer,
    SigmoidLayer,
    ReLULayer,
    AdamOptimizer,
    CrossEntropyLoss,
    LinearActivation,
    Sequential,
    Optimizer,
    SoftMaxLayer,
    GradientDescentOptimizer,
    CrossEntropyLossWithSoftMax,
    Conv2d,
    Conv2dWithLoops,
    Flatten,
    MaxPool2d,
    AdamOptimizer,
    BatchNormalization2d,
    ActivationLayer,
    TrainableLayer,
)

from pytorch_nn.models.resnet import (
    Bottleneck as Bottleneck_torch,
    resnet101 as resnet101_torch
)

from pytorch_nn.models.resnet_without_batchnorm import (
    Bottleneck as Bottleneck_torch_without_batchnorm,
    resnet101 as resnet101_torch_without_batchnorm
)

from numpy_nn.models.resnet import Bottleneck, resnet101

from numpy_nn.models.resnet_without_batchnorm import (
    Bottleneck as Bottleneck_np_without_batchnorm,
    resnet101 as resnet101_np_without_batchnorm
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# # reload a user's module test_layer
# %load_ext autoreload
# %autoreload 2

In [4]:
def conv2d_test(my_conv2d_constructor: Callable, batch_size: int,
                input_height: int, input_width: int, n_input_channels,
                n_output_channels, kernel_size: int, stride: int, padding: int,
                bias: bool, atol: float = 1e-5, random_sampler: Callable = np.random.rand) -> None:
    """
    Compare a custom 2D convolution against torch.nn.Conv2d.

    Both layers are constructed from the same keyword arguments
    (in_channels, out_channels, kernel_size, stride, padding, bias) and are
    handed to test_module together with the expected input/output shapes.

    Args:
        my_conv2d_constructor: callable that builds the layer under test from
            the keyword arguments listed above.
        batch_size, input_height, input_width: input batch and spatial sizes.
        n_input_channels, n_output_channels: channel counts.
        kernel_size, stride, padding: convolution hyperparameters.
        bias: whether both layers are built with a bias term.
        atol: tolerance forwarded to test_module.
        random_sampler: sampler forwarded to test_module.
    """

    def spatial_output_size(input_size: int) -> int:
        # Output-size formula for one spatial axis of a strided, padded conv.
        return (input_size + 2 * padding - kernel_size) // stride + 1

    shared_kwargs = dict(
        in_channels=n_input_channels,
        out_channels=n_output_channels,
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        bias=bias,
    )

    # Layer under test is built first, then the PyTorch reference.
    candidate_conv = my_conv2d_constructor(**shared_kwargs)
    reference_conv = torch.nn.Conv2d(**shared_kwargs)

    expected_input_shape = (batch_size, n_input_channels, input_height, input_width)
    expected_output_shape = (
        batch_size,
        n_output_channels,
        spatial_output_size(input_height),
        spatial_output_size(input_width),
    )

    test_module(candidate_conv, reference_conv, expected_input_shape,
                expected_output_shape, atol=atol, random_sampler=random_sampler)
    


def max_pool_2d_test(batch_size: int, height: int, width: int, n_channels: int, kernel_size: int,
                     stride: int, padding: int, atol: float = 1e-5, random_sampler: Callable = np.random.rand):
    """
    Compare the custom MaxPool2d against torch.nn.MaxPool2d.

    Both pooling layers receive (kernel_size, stride, padding) positionally and
    are passed to test_module with an input of shape
    [batch_size, n_channels, height, width] and the matching pooled shape.

    Args:
        batch_size, height, width, n_channels: dimensions of the input batch.
        kernel_size, stride, padding: pooling hyperparameters.
        atol: tolerance forwarded to test_module.
        random_sampler: sampler forwarded to test_module.
    """
    # Same output-size formula for both spatial axes.
    pooled_height, pooled_width = (
        (extent + 2 * padding - kernel_size) // stride + 1
        for extent in (height, width)
    )

    candidate_pool = MaxPool2d(kernel_size, stride, padding)
    reference_pool = torch.nn.MaxPool2d(kernel_size, stride, padding)

    test_module(
        candidate_pool,
        reference_pool,
        input_shape=[batch_size, n_channels, height, width],
        output_shape=[batch_size, n_channels, pooled_height, pooled_width],
        atol=atol,
        random_sampler=random_sampler,
    )



def activation_test(my_activation: Callable,
                    torch_activation: Callable,
                    input_dim: List[int],
                    atol: float = 1e-5):
    """
    Compare a custom activation layer with its PyTorch counterpart.

    Both arguments are zero-argument constructors; each is instantiated once
    and the pair is handed to test_module. The same shape is used for input
    and output. Sampling of inputs and output gradients is left to
    test_module's default sampler (described by the original author as
    uniform — confirm in test_module).

    Args:
        my_activation: constructor of the layer under test.
        torch_activation: constructor of the reference torch module.
        input_dim: shape used for both input and output.
        atol: tolerance forwarded to test_module.
    """
    candidate_layer = my_activation()
    reference_layer = torch_activation()
    test_module(
        candidate_layer,
        reference_layer,
        input_shape=input_dim,
        output_shape=input_dim,
        atol=atol,
    )

def relu_test(input_dim: List[int], atol: float = 1e-5):
    """Run activation_test for ReLULayer against torch.nn.ReLU on inputs of shape input_dim."""
    activation_test(my_activation=ReLULayer, torch_activation=torch.nn.ReLU,
                    input_dim=input_dim, atol=atol)

def sigmoid_test(input_dim: List[int], atol: float = 1e-5):
    """Run activation_test for SigmoidLayer against torch.nn.Sigmoid on inputs of shape input_dim."""
    activation_test(my_activation=SigmoidLayer, torch_activation=torch.nn.Sigmoid,
                    input_dim=input_dim, atol=atol)



def flatten_test(input_shape: List[int], atol: float = 1e-5,
                 random_sampler: Callable = np.random.rand):
    """
    Compare the custom Flatten layer against torch.nn.Flatten.

    The expected output shape keeps the first (batch) dimension and collapses
    all remaining dimensions into their product.

    Args:
        input_shape: full input shape; the first entry is the batch size.
        atol: tolerance forwarded to test_module.
        random_sampler: sampler forwarded to test_module.
    """
    n_samples, *trailing_dims = input_shape
    features_per_sample = np.prod(trailing_dims)
    flattened_shape = [n_samples, features_per_sample]

    test_module(
        Flatten(),
        torch.nn.Flatten(),
        input_shape=input_shape,
        output_shape=flattened_shape,
        atol=atol,
        random_sampler=random_sampler,
    )



def batchnorm2d_iterative_test(n_channels: int, batch_size: int, height: int,
                     width: int, n_iter: int, phase: str = "train",
                     momentum: float = 0.1, atol: float = 1e-5,
                     random_sampler: Callable = np.random.rand, print_tensors = False) -> None:
    """
    Compare BatchNormalization2d with torch.nn.BatchNorm2d over several iterations.

    The same pair of layers is passed to test_module n_iter times. Parameters
    are copied only on the first iteration (skip_parameter_copying is False
    only when i == 0), so later iterations start from whatever state the
    previous ones left behind.

    Args:
        n_channels: number of channels of both batch-norm layers.
        batch_size, height, width: remaining dimensions of the
            [batch_size, n_channels, height, width] input/output shape.
        n_iter: how many times test_module is invoked.
        phase: "train" or "eval"; selects the mode of both layers.
        momentum: momentum passed to both layers.
        atol: tolerance forwarded to test_module.
        random_sampler: sampler forwarded to test_module.
        print_tensors: forwarded to test_module.

    Raises:
        ValueError: if phase is neither "train" nor "eval".
    """
    # Fail fast on a misspelled phase instead of silently testing train mode.
    if phase not in ("train", "eval"):
        raise ValueError(f'unknown phase: {phase!r} (expected "train" or "eval")')

    # maybe there should be a custom test that calls
    # forward multiple times and calls backward only once

    input_shape = output_shape = [batch_size, n_channels, height, width]

    my_bn = BatchNormalization2d(n_channels, momentum=momentum)
    torch_bn = torch.nn.BatchNorm2d(n_channels, momentum=momentum)

    if phase == "eval":
        my_bn.eval()
        torch_bn.eval()
    else:
        my_bn.train()
        torch_bn.train()

    for i in range(n_iter):
        print("Iteration:", i)
        print(my_bn.training)
        test_module(my_bn, torch_bn, input_shape, output_shape,
                    atol=atol, random_sampler=random_sampler,
                    skip_parameter_copying=bool(i), print_tensors=print_tensors)
        # Clear the torch parameter gradients so the next iteration does not
        # add to the ones computed in this iteration.
        torch_bn.weight.grad = None
        torch_bn.bias.grad = None
        print()

In [5]:
# Train-mode batch-norm comparison with momentum=0.1, repeated for 3
# iterations on the same pair of layers.
batchnorm2d_iterative_test(n_channels=5, batch_size=4, height=8, width=8,
                           n_iter=3, phase="train", momentum=0.1,
                           atol=1e-4, random_sampler=np.random.rand)

Iteration: 0
True
Outputs are equal
Input gradients are equal
Running means are equal
Running vars are equal
Weight gradients are equal
Bias gradients are equal

Iteration: 1
True
Outputs are equal
Input gradients are equal
Running means are equal
Running vars are equal
Weight gradients are equal
Bias gradients are equal

Iteration: 2
True
Outputs are equal
Input gradients are equal
Running means are equal
Running vars are equal
Weight gradients are equal
Bias gradients are equal



In [6]:
# Same train-mode comparison with a larger momentum (0.8), again repeated
# for 3 iterations on the same pair of layers.
batchnorm2d_iterative_test(n_channels=5, batch_size=4, height=8, width=8,
                           n_iter=3, phase="train", momentum=0.8,
                           atol=1e-4, random_sampler=np.random.rand)

Iteration: 0
True
Outputs are equal
Input gradients are equal
Running means are equal
Running vars are equal
Weight gradients are equal
Bias gradients are equal

Iteration: 1
True
Outputs are equal
Input gradients are equal
Running means are equal
Running vars are equal
Weight gradients are equal
Bias gradients are equal

Iteration: 2
True
Outputs are equal
Input gradients are equal
Running means are equal
Running vars are equal
Weight gradients are equal
Bias gradients are equal



In [5]:
"""
Conv2dWithLoops and Conv2d tests
"""

batch_size = 5
n_input_channels, n_output_channels = 4, 2
height, width = 5, 3

kernel_size, stride, padding = 3, 1, 1

# (printed title, layer constructor, bias flag) for each comparison.
conv_cases = [
    ("Conv2dWithLoops test", Conv2dWithLoops, True),
    ("Conv2d test", Conv2d, True),
    ("Conv2d test without bias", Conv2d, False),
]

for case_index, (case_title, conv_constructor, use_bias) in enumerate(conv_cases):
    if case_index:
        print()  # blank line between consecutive cases
    print(case_title)
    conv2d_test(conv_constructor, batch_size, height, width,
                n_input_channels, n_output_channels,
                kernel_size, stride, padding, bias=use_bias)

Conv2dWithLoops test
Outputs are equal
Input gradients are equal
Weight gradients are equal
Bias gradients are equal

Conv2d test
Outputs are equal
Input gradients are equal
Weight gradients are equal
Bias gradients are equal

Conv2d test without bias
Outputs are equal
Input gradients are equal
Weight gradients are equal


In [8]:
"""
FullyConnectedLayer test
"""

n_samples, n_input_neurons, n_output_neurons = 5, 6, 3

# Both layers take (n_inputs, n_outputs) positionally.
my_fc_params = torch_fc_params = [n_input_neurons, n_output_neurons]

fc_under_test = FullyConnectedLayer(*my_fc_params)
fc_reference = torch.nn.Linear(*torch_fc_params)

test_module(
    fc_under_test,
    fc_reference,
    input_shape=[n_samples, n_input_neurons],
    output_shape=[n_samples, n_output_neurons],
)

Outputs are equal
Input gradients are equal
Weight gradients are equal
Bias gradients are equal


In [9]:
"""
CrossEntropyLoss test
"""

def one_hot(y: np.ndarray, n_classes: int):
    """
    One-hot encode integer class labels.

    Args:
        y: array of integer class indices.
        n_classes: number of columns of the encoding.

    Returns:
        A (y.size, n_classes) array of zeros with a 1 at
        column y[i] of row i.
    """
    n_labels = y.size
    row_indices = np.arange(n_labels)
    encoded = np.zeros((n_labels, n_classes))
    encoded[row_indices, y] = 1
    return encoded


batch_size = 5
n_classes = 3

# Random scores and random one-hot targets shared by both implementations.
pred = np.random.rand(batch_size, n_classes).astype(np.float32)
true = one_hot(np.random.randint(0, n_classes, batch_size), n_classes)
pred_torch = torch.from_numpy(pred).float()
true_torch = torch.from_numpy(true).float()
pred_torch.requires_grad = True

# Reference: PyTorch loss value and gradient w.r.t. the scores.
torch_loss = torch.nn.CrossEntropyLoss()
torch_loss_val = torch_loss(pred_torch, true_torch)
torch_loss_val.backward()
torch_pred_grad = pred_torch.grad.detach().numpy()

# Implementation under test: run backward exactly once and keep its result.
my_loss = CrossEntropyLossWithSoftMax()
my_loss_val = my_loss.forward(pred, true)
my_pred_grad = my_loss.backward()

print("loss_val all close:", np.allclose(my_loss_val, torch_loss_val.detach().numpy()))
print("loss gradients all close:", np.allclose(my_pred_grad, torch_pred_grad))

loss_val all close: True
loss gradients all close: True


In [13]:
# Display the gradient PyTorch accumulated on pred_torch in the
# CrossEntropyLoss cell (that cell must have been run first).
pred_torch.grad

tensor([[ 0.0581,  0.0729, -0.1309],
        [ 0.0631,  0.0617, -0.1248],
        [ 0.0558,  0.0446, -0.1004],
        [ 0.0668,  0.0803, -0.1471],
        [-0.1166,  0.0491,  0.0675]])

In [10]:
"""ReLU test"""
n_samples, n_input_features = 5, 6

# 2D input: one row of features per sample.
relu_test(input_dim=[n_samples, n_input_features])

Outputs are equal
Input gradients are equal


If we sample random numbers from a normal distribution instead of a uniform distribution (`randn` instead of `rand`), the sigmoid layer won't pass the tests!

In [11]:
"""SigmoidLayer test"""

n_samples, n_input_features = 5, 6
height = width = 4

# Check the sigmoid on a 2D input first, then on a 4D input.
dense_input_dim = [n_samples, n_input_features]
image_input_dim = [n_samples, n_input_features, height, width]

sigmoid_test(dense_input_dim)
print()
sigmoid_test(image_input_dim)

Outputs are equal
Input gradients are equal

Outputs are equal
Input gradients are equal


In [12]:
"""Flatten test"""

# 4D input whose last three dimensions get collapsed into one.
flatten_input_shape = [2, 3, 5, 5]
flatten_test(input_shape=flatten_input_shape)

Outputs are equal
Input gradients are equal


In [15]:
"""MaxPool2d tests"""

batch_size = 10
n_channels = 3
height = width = 16

kernel_size, stride, padding = 3, 2, 1

# First case: overlapping, padded, strided pooling on a square input.
# Second case: unpadded stride-1 pooling on a non-square input.
pool_cases = [
    dict(batch_size=batch_size, height=height, width=width, n_channels=n_channels,
         kernel_size=kernel_size, stride=stride, padding=padding),
    dict(batch_size=2, height=6, width=4, n_channels=3,
         kernel_size=2, stride=1, padding=0),
]

for case_index, pool_case in enumerate(pool_cases):
    if case_index:
        print()  # blank line between consecutive cases
    max_pool_2d_test(**pool_case, atol=1e-5)

Outputs are equal
Input gradients are equal

Outputs are equal
Input gradients are equal


In [17]:
"""
BottleNeckLayer test
"""

batch_size = 5
in_channels = 8
bottleneck_depth = 2
width = 6
height = 6

expansion_factor = 4
n_output_channels = bottleneck_depth * expansion_factor

momentum = 0.1
atol = 1e-4

CONV_NAMES = ("conv1", "conv2", "conv3")
BN_NAMES = ("bn1", "bn2", "bn3")


def _copy_batchnorm_state(my_bn, torch_bn, momentum):
    """Copy torch_bn's affine parameters and running statistics into my_bn and set momentum on both."""
    my_bn.gamma = torch_bn.weight.detach().numpy().reshape(my_bn.gamma.shape)
    my_bn.beta = torch_bn.bias.detach().numpy().reshape(my_bn.beta.shape)
    my_bn.running_mean = torch_bn.running_mean.detach().numpy().reshape(my_bn.running_mean.shape)
    my_bn.running_var = torch_bn.running_var.detach().numpy().reshape(my_bn.running_var.shape)
    my_bn.momentum = torch_bn.momentum = momentum


def _report_conv_gradient(name, my_conv, torch_conv, atol):
    """Print whether my_conv's weight gradient matches torch_conv's."""
    print(f"{name} weights gradients all close:",
          np.allclose(my_conv.weights_gradient, torch_conv.weight.grad.detach().numpy(), atol=atol))


def _report_batchnorm_gradients(name, my_bn, torch_bn, atol):
    """Print whether my_bn's gamma/beta gradients match torch_bn's weight/bias gradients."""
    print(f"{name} gamma gradients all close:",
          np.allclose(my_bn.gamma_gradient.flatten(), torch_bn.weight.grad.detach().numpy(), atol=atol))
    print(f"{name} beta gradients all close:",
          np.allclose(my_bn.beta_gradient.flatten(), torch_bn.bias.grad.detach().numpy(), atol=atol))


for stride_for_downsampling in (1, 2):  # Checking both cases: no downsampling and downsampling
    print(f"stride = {stride_for_downsampling}")

    # Tensors are laid out as (batch, channels, height, width).
    input_data = np.random.rand(batch_size, in_channels, height, width).astype(np.float32)
    input_data_torch = torch.from_numpy(input_data).float()
    input_data_torch.requires_grad = True

    output_height = height // stride_for_downsampling
    output_width = width // stride_for_downsampling
    output_gradient = np.random.rand(batch_size, n_output_channels, output_height, output_width).astype(np.float32)

    torch_bottleneck = Bottleneck_torch(in_channels, bottleneck_depth, stride_for_downsampling)
    my_bottleneck = Bottleneck(in_channels, bottleneck_depth, stride_for_downsampling)

    # Start both blocks from the same weights and batch-norm state.
    for conv_name in CONV_NAMES:
        getattr(my_bottleneck, conv_name).weights = getattr(torch_bottleneck, conv_name).weight.detach().numpy()

    for bn_name in BN_NAMES:
        _copy_batchnorm_state(getattr(my_bottleneck, bn_name), getattr(torch_bottleneck, bn_name), momentum)

    # The projection on the residual path is synced only when my_bottleneck has one.
    if my_bottleneck.conv_to_match_dimensions:
        my_bottleneck.conv_to_match_dimensions.weights = torch_bottleneck.conv_to_match_dimensions.weight.detach().numpy()
        _copy_batchnorm_state(my_bottleneck.bn_for_residual, torch_bottleneck.bn_for_residual, momentum)

    my_bottleneck.train()
    torch_bottleneck.train()

    my_out = my_bottleneck.forward(input_data)
    torch_out = torch_bottleneck(input_data_torch)

    torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
    torch_input_g = input_data_torch.grad.detach().numpy()

    my_input_g = my_bottleneck.backward(output_gradient)

    print("output all close:", np.allclose(my_out, torch_out.detach().numpy(), atol=atol))
    print("input gradients all close:", np.allclose(my_input_g, torch_input_g, atol=atol))

    for conv_name in CONV_NAMES:
        _report_conv_gradient(conv_name, getattr(my_bottleneck, conv_name),
                              getattr(torch_bottleneck, conv_name), atol)

    if my_bottleneck.conv_to_match_dimensions:
        _report_conv_gradient("conv_to_match_dimensions", my_bottleneck.conv_to_match_dimensions,
                              torch_bottleneck.conv_to_match_dimensions, atol)
        _report_batchnorm_gradients("bn_for_residual", my_bottleneck.bn_for_residual,
                                    torch_bottleneck.bn_for_residual, atol)

    for bn_name in BN_NAMES:
        _report_batchnorm_gradients(bn_name, getattr(my_bottleneck, bn_name),
                                    getattr(torch_bottleneck, bn_name), atol)
    print()

stride = 1
output all close: True
input gradients all close: True
conv1 weights gradients all close: True
conv2 weights gradients all close: True
conv3 weights gradients all close: True
bn1 gamma gradients all close: True
bn1 beta gradients all close: True
bn2 gamma gradients all close: True
bn2 beta gradients all close: True
bn3 gamma gradients all close: True
bn3 beta gradients all close: True

stride = 2
output all close: True
input gradients all close: True
conv1 weights gradients all close: True
conv2 weights gradients all close: True
conv3 weights gradients all close: True
conv_to_match_dimensions weights gradients all close: True
bn_for_residual gamma gradients all close: True
bn_for_residual beta gradients all close: True
bn1 gamma gradients all close: True
bn1 beta gradients all close: True
bn2 gamma gradients all close: True
bn2 beta gradients all close: True
bn3 gamma gradients all close: True
bn3 beta gradients all close: True



In [18]:
"""
resnet 101 test
"""

batch_size = 10
height = width = 32
n_channels = 1

# Random input batch plus a random gradient to feed into backward.
input_data = np.random.rand(batch_size, n_channels, height, width).astype(np.float32)
input_data_torch = torch.from_numpy(input_data).float()
input_data_torch.requires_grad = True
output_gradient = np.random.rand(batch_size, 10).astype(np.float32)

# Build both networks, switch them to train mode, then share the weights.
torch_resnet = resnet101_torch(10, 1)
my_resnet = resnet101(10, 1)
torch_resnet.train()
my_resnet.train()

my_resnet.clone_weights_from_torch(torch_resnet)

# Forward passes.
my_out = my_resnet.forward(input_data)

torch_out = torch_resnet(input_data_torch)
torch_out_np = torch_out.detach().numpy()

# Backward passes with the same upstream gradient.
torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
torch_input_g = input_data_torch.grad.detach().numpy()

my_input_g = my_resnet.backward(output_gradient)

atol=1e-3

# (label, numpy-side value, torch-side value); the torch fc weight gradient
# is transposed before it is compared.
resnet_checks = (
    ("output all close:", my_out, torch_out_np),
    ("input gradients all close:", my_input_g, torch_input_g),
    ("fc weights gradients all close:", my_resnet.fc.weights_gradient,
     torch_resnet.fc.weight.grad.detach().numpy().T),
    ("fc bias gradients all close:", my_resnet.fc.bias_gradient,
     torch_resnet.fc.bias.grad.detach().numpy()),
)
for check_label, my_value, torch_value in resnet_checks:
    print(check_label, np.allclose(my_value, torch_value, atol=atol))

output all close: False
input gradients all close: False
fc weights gradients all close: False
fc bias gradients all close: True


In [19]:
# Diagnostic re-check of the resnet 101 results with a much looser tolerance.
# It reuses my_out, torch_out_np, my_input_g, torch_input_g, my_resnet and
# torch_resnet from the resnet 101 test cell, so that cell must be run first.
atol=1e-0

print("output all close:", np.allclose(my_out, torch_out_np, atol=atol))
print("input gradients all close:", np.allclose(my_input_g, torch_input_g, atol=atol))

print("fc weights gradients all close:", np.allclose(my_resnet.fc.weights_gradient, torch_resnet.fc.weight.grad.detach().numpy().T, atol=atol))
print("fc bias gradients all close:", np.allclose(my_resnet.fc.bias_gradient, torch_resnet.fc.bias.grad.detach().numpy(), atol=atol))

output all close: True
input gradients all close: False
fc weights gradients all close: True
fc bias gradients all close: True


In [27]:
"""
resnet 101 without batchnorm test
"""

batch_size = 10
height = width = 32
n_channels = 1

# Random input batch plus a random gradient to feed into backward.
input_data = np.random.rand(batch_size, n_channels, height, width).astype(np.float32)
input_data_torch = torch.from_numpy(input_data).float()
input_data_torch.requires_grad = True
output_gradient = np.random.rand(batch_size, 10).astype(np.float32)

# Both batch-norm-free networks run in train mode with shared weights.
torch_resnet = resnet101_torch_without_batchnorm(10, 1)
torch_resnet.train()
my_resnet = resnet101_np_without_batchnorm(10, 1)
my_resnet.train()

my_resnet.clone_weights_from_torch(torch_resnet)

my_out = my_resnet.forward(input_data)

torch_out = torch_resnet(input_data_torch)
torch_out_np = torch_out.detach().numpy()

torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
torch_input_g = input_data_torch.grad.detach().numpy()

my_input_g = my_resnet.backward(output_gradient)

atol=1e-4

print("output all close:", np.allclose(my_out, torch_out_np, atol=atol))
print("input gradients all close:", np.allclose(my_input_g, torch_input_g, atol=atol))

# The torch fc weight gradient is transposed before it is compared.
print("fc weights gradients all close:", np.allclose(my_resnet.fc.weights_gradient, torch_resnet.fc.weight.grad.detach().numpy().T, atol=atol))
print("fc bias gradients all close:", np.allclose(my_resnet.fc.bias_gradient, torch_resnet.fc.bias.grad.detach().numpy(), atol=atol))

# No eval-mode/batch-norm disclaimer is printed here: this cell runs in train
# mode on the variants imported from the resnet_without_batchnorm modules.

output all close: True
input gradients all close: True
fc weights gradients all close: True
fc bias gradients all close: True


In [23]:
"""
conv1 test
"""

batch_size = 10
height = width = 32
n_channels = 1

# Random input batch and a random upstream gradient of shape (batch, 64, 16, 16).
input_data = np.random.rand(batch_size, n_channels, height, width).astype(np.float32)
input_data_torch = torch.from_numpy(input_data).float()
input_data_torch.requires_grad = True
output_gradient = np.random.rand(batch_size, 64, 16, 16).astype(np.float32)

# Both networks are built in eval mode and share weights; only conv1 is exercised.
torch_resnet = resnet101_torch(10, 1)
torch_resnet.eval()
my_resnet = resnet101(10, 1)
my_resnet.eval()

my_resnet.clone_weights_from_torch(torch_resnet)

my_out = my_resnet.conv1.forward(input_data)

torch_out = torch_resnet.conv1(input_data_torch)
torch_out_np = torch_out.detach().numpy()

torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
torch_input_g = input_data_torch.grad.detach().numpy()

my_input_g = my_resnet.conv1.backward(output_gradient)

atol=1e-3

conv1_checks = (
    ("output all close:", my_out, torch_out_np),
    ("input gradients all close:", my_input_g, torch_input_g),
)
for check_label, my_value, torch_value in conv1_checks:
    print(check_label, np.allclose(my_value, torch_value, atol=atol))

output all close: True
input gradients all close: True


In [25]:
"""
bn1 test
"""

batch_size = 10
height = width = 16
n_channels = 64

n_classes = 10

# Compare only the bn1 layer of both networks, once in train and once in eval mode.
for phase in ['train', 'eval']:
    print(f"phase: {phase}")

    # NOTE: keep the four lines below inside the loop. Moving them outside was
    # observed to make the tests fail — presumably because a reused leaf tensor
    # would accumulate .grad across the two backward calls (TODO confirm).
    input_data = np.random.rand(batch_size, n_channels, height, width).astype(np.float32)
    input_data_torch = torch.from_numpy(input_data).float()
    input_data_torch.requires_grad = True
    output_gradient = np.random.rand(batch_size, n_channels, height, width).astype(np.float32)

    # NOTE(review): the second constructor argument is n_channels (64) here but 1
    # in the other resnet cells — confirm this is intended; only bn1 is exercised.
    torch_resnet = resnet101_torch(n_classes, n_channels)
    my_resnet = resnet101(n_classes, n_channels)
    if phase == 'train':
        my_resnet.train()
        torch_resnet.train()
    elif phase == 'eval':
        my_resnet.eval()
        torch_resnet.eval()
    else:
        raise Exception("unknown phase") 

    my_resnet.clone_weights_from_torch(torch_resnet)

    my_out = my_resnet.bn1.forward(input_data)

    torch_out = torch_resnet.bn1(input_data_torch)
    torch_out_np = torch_out.detach().numpy()

    torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
    torch_input_g = input_data_torch.grad.detach().numpy()

    my_input_g = my_resnet.bn1.backward(output_gradient)

    atol=1e-3

    print("output all close:", np.allclose(my_out, torch_out_np, atol=atol))
    # Gradients are flattened on both sides before they are compared.
    print("input gradients all close:", np.allclose(my_input_g.flatten(), torch_input_g.flatten(), atol=atol))
    print()

# Explains the expected "False" for the eval-phase input gradients.
print("It's ok that gradients don't match in eval mode. "\
      "Batch norm's backward is different in train and eval mode.\n"\
      "I didn't implement eval backward since it won't ever be used in training the network")

phase: train
output all close: True
input gradients all close: True

phase: eval
output all close: True
input gradients all close: False



In [31]:
"""
conv2_x test
"""


batch_size = 10
height = width = 8
n_channels = 64

atol = 1e-3

# Compare only the conv2_x stage of both networks, in train and in eval mode.
for phase in ['train', 'eval']:
    print(f"phase: {phase}")

    # Fresh tensors on every pass so nothing carries over between phases.
    input_data = np.random.rand(batch_size, n_channels, height, width).astype(np.float32)
    input_data_torch = torch.from_numpy(input_data).float()
    input_data_torch.requires_grad = True
    output_gradient = np.random.rand(batch_size, 256, 8, 8).astype(np.float32)

    torch_resnet = resnet101_torch(10, 1)
    my_resnet = resnet101(10, 1)

    if phase not in ('train', 'eval'):
        raise Exception("unknown phase")
    # phase names the mode-switching method: .train() or .eval().
    for network in (my_resnet, torch_resnet):
        getattr(network, phase)()

    my_resnet.clone_weights_from_torch(torch_resnet)

    my_out = my_resnet.conv2_x.forward(input_data)

    torch_out = torch_resnet.conv2_x(input_data_torch)
    torch_out_np = torch_out.detach().numpy()

    torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
    torch_input_g = input_data_torch.grad.detach().numpy()

    my_input_g = my_resnet.conv2_x.backward(output_gradient)

    conv2_x_checks = (
        ("output all close:", my_out, torch_out_np),
        ("input gradients all close:", my_input_g, torch_input_g),
    )
    for check_label, my_value, torch_value in conv2_x_checks:
        print(check_label, np.allclose(my_value, torch_value, atol=atol))
    print()

# Explains the expected "False" for the eval-phase input gradients.
print(
    "It's ok that gradients don't match in eval mode. "
    "Batch norm's backward is different in train and eval mode.\n"
    "I didn't implement eval backward since it won't ever be used in training the network"
)

phase: train
output all close: True
input gradients all close: True

phase: eval
output all close: True
input gradients all close: False

it's ok that gradients don't match in eval mode. Batch norm's backward is different in train and eval mode.
I didn't implement eval backward since it won't ever be used in training the network


In [None]:
# """
# conv3_x test
# """

# batch_size = 10
# height = width = 8
# n_channels = 256

# input_data = np.random.rand(batch_size, n_channels, height, width).astype(np.float32)
# input_data_torch = torch.from_numpy(input_data).float()
# input_data_torch.requires_grad = True
# output_gradient = np.random.rand(batch_size, 512, 4, 4).astype(np.float32)

# torch_resnet = resnet101_torch(10, 1)
# my_resnet = resnet101(10, 1)

# my_resnet.clone_weights_from_torch(torch_resnet)

# my_out = my_resnet.conv3_x.forward(input_data)

# torch_out = torch_resnet.conv3_x(input_data_torch)
# torch_out_np = torch_out.detach().numpy()

# torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
# torch_input_g = input_data_torch.grad.detach().numpy()

# my_input_g = my_resnet.conv3_x.backward(output_gradient)

# atol=1e-3

# print("output all close:", np.allclose(my_out, torch_out_np, atol=atol))
# print("input gradients all close:", np.allclose(my_input_g, torch_input_g, atol=atol))

In [None]:
# """
# conv4_x test
# """

# batch_size = 10
# height = width = 4
# n_channels = 512

# input_data = np.random.rand(batch_size, n_channels, height, width).astype(np.float32)
# input_data_torch = torch.from_numpy(input_data).float()
# input_data_torch.requires_grad = True
# output_gradient = np.random.rand(batch_size, 1024, 2, 2).astype(np.float32)

# torch_resnet = resnet101_torch(10, 1)
# my_resnet = resnet101(10, 1)

# my_resnet.clone_weights_from_torch(torch_resnet)

# my_out = my_resnet.conv4_x.forward(input_data)

# torch_out = torch_resnet.conv4_x(input_data_torch)
# torch_out_np = torch_out.detach().numpy()

# torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
# torch_input_g = input_data_torch.grad.detach().numpy()

# my_input_g = my_resnet.conv4_x.backward(output_gradient)

# atol=1e-3

# print("output all close:", np.allclose(my_out, torch_out_np, atol=atol))
# print("input gradients all close:", np.allclose(my_input_g, torch_input_g, atol=atol))

In [None]:
# """
# conv5_x test
# """

# batch_size = 10
# height = width = 2
# n_channels = 1024

# input_data = np.random.rand(batch_size, n_channels, height, width).astype(np.float32)
# input_data_torch = torch.from_numpy(input_data).float()
# input_data_torch.requires_grad = True
# output_gradient = np.random.rand(batch_size, 2048, 1, 1).astype(np.float32)

# torch_resnet = resnet101_torch(10, 1)
# my_resnet = resnet101(10, 1)

# my_resnet.clone_weights_from_torch(torch_resnet)

# my_out = my_resnet.conv5_x.forward(input_data)

# torch_out = torch_resnet.conv5_x(input_data_torch)
# torch_out_np = torch_out.detach().numpy()

# torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
# torch_input_g = input_data_torch.grad.detach().numpy()

# my_input_g = my_resnet.conv5_x.backward(output_gradient)

# atol=1e-3

# print("output all close:", np.allclose(my_out, torch_out_np, atol=atol))
# print("input gradients all close:", np.allclose(my_input_g, torch_input_g, atol=atol))

In [None]:
# """
# AdamOptimizer test
# """

# from torch.optim import Adam as Adam_torch

# n_input_features = 6
# n_output_features = 3
# batch_size = 5
# input_data = np.random.rand(batch_size, n_input_features).astype(np.float32)
# input_data_torch = torch.from_numpy(input_data).float()
# input_data_torch.requires_grad = True
# output_gradient = np.random.rand(batch_size, n_output_features).astype(np.float32)

# torch_fc = torch.nn.Linear(n_input_features, n_output_features)
# torch_out = torch_fc(input_data_torch)
# torch_out_np = torch_out.detach().numpy()
# torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
# torch_wg = torch_fc.weight.grad.detach().numpy().T
# torch_bg = torch_fc.bias.grad.detach().numpy().reshape(-1, 1).T
# torch_input_g = input_data_torch.grad.detach().numpy()


# my_fc = FullyConnectedLayer(n_input_features, n_output_features)
# my_fc.weights = torch_fc.weight.detach().numpy().T
# my_fc.bias = torch_fc.bias.detach().numpy().reshape(-1, 1).T
# my_out = my_fc.forward(input_data)
# my_input_g = my_fc.backward(output_gradient)
# my_wg = my_fc.weights_gradient
# my_bg = my_fc.bias_gradient

# atol=1e-3

# print("output all close:", np.allclose(my_out, torch_out_np, atol=atol))
# print("input gradients all close:", np.allclose(my_input_g, torch_input_g, atol=atol))
# print("before adam weights gradients all close:", np.allclose(my_wg, torch_wg, atol=atol))
# print("before adam bias gradients all close:", np.allclose(my_bg, torch_bg, atol=atol))
# print(my_wg, "\n", torch_wg)

# my_adam = AdamOptimizer(my_fc.get_trainable_layers(), 0.001, 0.9, 0.999, 1e-8)
# my_adam.step()

# torch_adam = Adam_torch(torch_fc.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-8)
# torch_adam.step()

# print("after adam weights gradients all close:", np.allclose(my_fc.weights_gradient, torch_fc.weight.grad.detach().numpy().T, atol=atol))
# print("after adam bias gradients all close:", np.allclose(my_fc.bias_gradient, torch_fc.bias.grad.detach().numpy().reshape(-1, 1).T, atol=atol))
# print(my_fc.weights_gradient, "\n", torch_fc.weight.grad.detach().numpy().T)

In [None]:
"""
Conv2d vs Conv2dWithLoops vs torch.nn.Conv2d time comparison forward and backward
"""

import time

n_input_channels = 4
n_output_channels = 2
width = 3
height = 5

kernel_size = 3
stride = 1
padding = 3

output_width = (width + 2 * padding - kernel_size) // stride + 1
output_height = (height + 2 * padding - kernel_size) // stride + 1

n_iterations = 1000


def _elapsed_seconds(fn, n_calls):
    """Return the seconds taken by n_calls back-to-back calls of fn()."""
    # perf_counter is the clock intended for measuring short durations.
    start = time.perf_counter()
    for _ in range(n_calls):
        fn()
    return time.perf_counter() - start


for batch_size in [1, 2, 4, 8, 16]:
    print("batch_size:", batch_size)

    input_data = np.random.rand(batch_size, n_input_channels, height, width).astype(np.float32)
    input_data_torch = torch.from_numpy(input_data).float()
    input_data_torch.requires_grad = True
    output_gradient = np.random.rand(batch_size, n_output_channels, output_height, output_width).astype(np.float32)
    # Convert once, outside the timed loop, so the torch backward timing does
    # not include a numpy -> tensor conversion on every iteration.
    output_gradient_torch = torch.from_numpy(output_gradient)

    torch_conv = torch.nn.Conv2d(n_input_channels, n_output_channels, kernel_size, stride, padding)

    # All three layers use the weights of torch_conv.
    my_conv_with_loops = Conv2dWithLoops(n_input_channels, n_output_channels, kernel_size, stride, padding)
    my_conv_with_loops.weights = torch_conv.weight.detach().numpy()

    my_conv = Conv2d(n_input_channels, n_output_channels, kernel_size, stride, padding)
    my_conv.weights = torch_conv.weight.detach().numpy()

    # Forward passes (these also leave each numpy layer ready for its backward).
    forward_time = _elapsed_seconds(lambda: my_conv.forward(input_data), n_iterations)
    print(f"my_conv forward time: {forward_time}")

    forward_time = _elapsed_seconds(lambda: my_conv_with_loops.forward(input_data), n_iterations)
    print(f"my_conv_with_loops forward time: {forward_time}")

    forward_time = _elapsed_seconds(lambda: torch_conv(input_data_torch), n_iterations)
    print(f"torch_conv forward time: {forward_time}")

    # Backward passes.
    backward_time = _elapsed_seconds(
        lambda: my_conv.backward_as_matrix_multiplication(output_gradient), n_iterations)
    print(f"my_conv backward time: {backward_time}")

    backward_time = _elapsed_seconds(
        lambda: my_conv_with_loops.backward(output_gradient), n_iterations)
    print(f"my_conv_with_loops backward time: {backward_time}")

    # One untimed forward pass provides the graph that backward is timed on;
    # retain_graph=True lets it be reused for every call.
    torch_out = torch_conv(input_data_torch)
    backward_time = _elapsed_seconds(
        lambda: torch_out.backward(output_gradient_torch, retain_graph=True), n_iterations)
    print(f"torch_conv backward time: {backward_time}")

    print()

batch_size: 1
my_conv forward time: 0.5312726497650146
my_conv_with_loops forward time: 2.215890645980835
torch_conv forward time: 0.06669497489929199
my_conv backward time: 0.7423074245452881
my_conv_with_loops backward time: 2.638596773147583
torch_conv backward time: 0.1759324073791504

batch_size: 2
my_conv forward time: 0.5586578845977783
my_conv_with_loops forward time: 2.2504141330718994
torch_conv forward time: 0.17436599731445312
my_conv backward time: 1.2079191207885742
my_conv_with_loops backward time: 5.122586965560913
torch_conv backward time: 0.3848881721496582

batch_size: 4
my_conv forward time: 1.064781904220581
my_conv_with_loops forward time: 2.2000110149383545
torch_conv forward time: 0.1567244529724121
my_conv backward time: 2.2330682277679443
my_conv_with_loops backward time: 9.913957118988037
torch_conv backward time: 0.3933844566345215

batch_size: 8
my_conv forward time: 1.9885218143463135
my_conv_with_loops forward time: 2.341416597366333
torch_conv forward ti