In [1]:
import torch
torch.use_deterministic_algorithms(True)

import numpy as np

from keras.datasets import mnist
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

from NumpyNN.NN_np import (
    FullyConnectedLayer,
    ReLULayer,
    SigmoidLayer,
    ReLULayer,
    AdamOptimizer,
    CrossEntropyLoss,
    LinearActivation,
    Sequential,
    Optimizer,
    SoftMaxLayer,
    GradientDescentOptimizer,
    CrossEntropyLossWithSoftMax,
    Conv2d,
    Conv2dWithLoops,
    Flatten,
    MaxPool2d,
    AdamOptimizer,
    BatchNormalization2d,
)

from numpy_resnet import Bottleneck, resnet101
from numpy_resnet_without_batchnorm import Bottleneck as Bottleneck_np_without_batchnorm
from numpy_resnet_without_batchnorm import resnet101 as resnet101_np_without_batchnorm

plt.gray()


  from .autonotebook import tqdm as notebook_tqdm


<Figure size 432x288 with 0 Axes>

In [3]:
import sys
import os 
sys.path.append(
    sys.path[0].removesuffix("numpy_CNN") + "pytorch_implementations"
)

from resnet import Bottleneck as Bottleneck_torch
from resnet import resnet101 as resnet101_torch
from resnet_without_batchnorm import Bottleneck as Bottleneck_torch_without_batchnorm
from resnet_without_batchnorm import resnet101 as resnet101_torch_without_batchnorm


In [4]:
"""
FullyConnectedLayer test
"""

n_input_features = 6
n_output_features = 3
n_samples = 5
input_data = np.random.rand(n_samples, n_input_features).astype(np.float32)
input_data_torch = torch.from_numpy(input_data).float()
input_data_torch.requires_grad = True
output_gradient = np.random.rand(n_samples, n_output_features).astype(np.float32)

torch_fc = torch.nn.Linear(n_input_features, n_output_features)
torch_out = torch_fc(input_data_torch)
torch_out_np = torch_out.detach().numpy()
torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
torch_wg = torch_fc.weight.grad.detach().numpy().T
torch_bg = torch_fc.bias.grad.detach().numpy().reshape(-1, 1).T
# torch_input_g = input_data_torch.grad.detach().numpy()


my_fc = FullyConnectedLayer(n_input_features, n_output_features)
my_fc.weights = torch_fc.weight.detach().numpy().T
my_fc.bias = torch_fc.bias.detach().numpy().reshape(-1, 1).T
my_out = my_fc.forward(input_data)
my_input_g = my_fc.backward(output_gradient)
my_wg = my_fc.weights_gradient
my_bg = my_fc.bias_gradient


print("output all close:", np.allclose(my_out, torch_out_np))
print("w gradients all close:", np.allclose(my_wg, torch_wg))
print("b gradients all close:", np.allclose(my_bg, torch_bg))
print("input gradients all close:", np.allclose(my_input_g, input_data_torch.grad))
# print("input gradients all close:", np.allclose(my_input_g, torch_bg))

# print("all parameters shape same: ", my_fc.weights.shape == torch_fc.weight.T.shape and my_fc.bias.shape == torch.unsqueeze(torch_fc.bias, 1).shape)
# print("output sum of square dif:", np.square(my_out - torch_out_np).sum())
# print(torch_wg.sum(), my_wg.sum())
# print("w gradient sum of square dif:", np.square(my_wg - torch_wg).sum())

#print()
#print(my_wg)
#print()
#print(torch_wg)

output all close: True
w gradients all close: True
b gradients all close: True
input gradients all close: True


In [None]:
"""
CrossEntropyLoss test
"""
def one_hot(y: np.ndarray, n_classes: int):
    encoded = np.zeros((y.size, n_classes))
    encoded[np.arange(y.size), y] = 1
    return encoded


batch_size = 5
n_classes = 3
pred = np.random.rand(batch_size, n_classes).astype(np.float32)
true = one_hot(np.random.randint(0, n_classes, batch_size), n_classes)
pred_torch = torch.from_numpy(pred).float()
true_torch = torch.from_numpy(true).float()
pred_torch.requires_grad = True

torch_loss  = torch.nn.CrossEntropyLoss()
torch_loss_val = torch_loss(pred_torch, true_torch)
torch_loss_val.backward()

my_loss = CrossEntropyLossWithSoftMax()
my_loss_val = my_loss.forward(pred, true)
my_loss.backward()

print("loss_val all close:", np.allclose(my_loss_val, torch_loss_val.detach().numpy()))
print("loss gradients all close:", np.allclose(my_loss.backward(), pred_torch.grad))

loss_val all close: True
loss gradients all close: True


In [5]:
"""
ReLULayer test
"""

n_input_features = 6
n_output_features = 3
n_samples = 5
input_data = np.random.rand(n_samples, n_input_features).astype(np.float32)
input_data_torch = torch.from_numpy(input_data).float()
input_data_torch.requires_grad = True
output_gradient = np.random.rand(n_samples, n_input_features).astype(np.float32)


torch_relu = torch.nn.ReLU()
torch_out = torch_relu(input_data_torch)
torch_out_np = torch_out.detach().numpy()
torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
# torch_input_g = input_data_torch.grad.detach().numpy()


my_relu = ReLULayer()
my_out = my_relu.forward(input_data)


print("output all close:", np.allclose(my_out, torch_out_np))
print("input gradients all close:", np.allclose(my_relu.backward(output_gradient), input_data_torch.grad))

output all close: True
input gradients all close: True


In [6]:
"""
SigmoidLayer test
"""

n_input_features = 6
n_output_features = 3
n_samples = 5
input_data = np.random.rand(n_samples, n_input_features).astype(np.float32)
input_data_torch = torch.from_numpy(input_data).float()
input_data_torch.requires_grad = True
output_gradient = np.random.rand(n_samples, n_input_features).astype(np.float32)


torch_sigmoid = torch.nn.Sigmoid()
torch_out = torch_sigmoid(input_data_torch)
torch_out_np = torch_out.detach().numpy()
torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
# torch_input_g = input_data_torch.grad.detach().numpy()


my_sigmoid = SigmoidLayer()
my_out = my_sigmoid.forward(input_data)


print("output all close:", np.allclose(my_out, torch_out_np))
print("input gradients all close:", np.allclose(my_sigmoid.backward(output_gradient), input_data_torch.grad))


output all close: True
input gradients all close: True


In [7]:
"""
SigmoidLayer test on a 4D tensor
"""

n_input_channels = 3
n_samples = 2
height = 5
width = 5
input_data = np.random.rand(n_samples, n_input_channels, height, width).astype(np.float32)
input_data_torch = torch.from_numpy(input_data).float()
input_data_torch.requires_grad = True
output_gradient = np.random.rand(n_samples, n_input_channels, height, width).astype(np.float32)


torch_sigmoid = torch.nn.Sigmoid()
torch_out = torch_sigmoid(input_data_torch)
torch_out_np = torch_out.detach().numpy()
torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
# torch_input_g = input_data_torch.grad.detach().numpy()


my_sigmoid = SigmoidLayer()
my_out = my_sigmoid.forward(input_data)


print("output all close:", np.allclose(my_out, torch_out_np))
print("input gradients all close:", np.allclose(my_sigmoid.backward(output_gradient), input_data_torch.grad))
# print("input gradients all close:", np.allclose(my_input_g, torch_bg))

# print("all parameters shape same: ", my_fc.weights.shape == torch_fc.weight.T.shape and my_fc.bias.shape == torch.unsqueeze(torch_fc.bias, 1).shape)
# print("output sum of square dif:", np.square(my_out - torch_out_np).sum())
# print(torch_wg.sum(), my_wg.sum())
# print("w gradient sum of square dif:", np.square(my_wg - torch_wg).sum())

#print()
#print(my_wg)
#print()
#print(torch_wg)

output all close: True
input gradients all close: True


In [8]:
"""
FlattenLayer test
"""

n_input_channels = 3
n_samples = 2
height = 5
width = 5

input_data = np.random.rand(n_samples, n_input_channels, height, width).astype(np.float32)
input_data_torch = torch.from_numpy(input_data).float()
input_data_torch.requires_grad = True
output_gradient = np.random.rand(n_samples, n_input_channels * height * width).astype(np.float32)

my_flatten = Flatten()
my_out = my_flatten.forward(input_data)
my_out_g = my_flatten.backward(output_gradient)

torch_flatten = torch.nn.Flatten()
torch_out = torch_flatten(input_data_torch)
torch_out_np = torch_out.detach().numpy()

torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
torch_input_g = input_data_torch.grad.detach().numpy()

print("output all close:", np.allclose(my_out, torch_out_np))
print("input gradients all close:", np.allclose(my_out_g, torch_input_g))


output all close: True
input gradients all close: True


In [9]:
"""
Conv2dWithLoops test
"""

batch_size = 5
n_input_channels = 4
n_output_channels = 2
width = 3
height = 5

kernel_size = 3
stride = 1
padding = 1

output_width = (width + 2 * padding - kernel_size) // stride + 1
output_height = (height + 2 * padding - kernel_size) // stride + 1

input_data = np.random.rand(batch_size, n_input_channels, width, height).astype(np.float32)
input_data_torch = torch.from_numpy(input_data).float()
input_data_torch.requires_grad = True
output_gradient = np.random.rand(batch_size, n_output_channels, output_width, output_height).astype(np.float32)

torch_conv = torch.nn.Conv2d(n_input_channels, n_output_channels, kernel_size, stride, padding)

my_conv = Conv2dWithLoops(n_input_channels, n_output_channels, kernel_size, stride, padding)
my_conv.weights = torch_conv.weight.detach().numpy()
my_conv.bias = torch_conv.bias.detach().numpy().reshape(-1, 1)

my_out = my_conv.forward(input_data)

torch_out = torch_conv(input_data_torch)
torch_out_np = torch_out.detach().numpy()
torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
torch_input_g = input_data_torch.grad.detach().numpy()
torch_wg = torch_conv.weight.grad.detach().numpy()
torch_bg = torch_conv.bias.grad.detach().numpy().reshape(-1, 1)

# print(torch_conv.weight.shape, torch_conv.bias.shape)
# print(my_conv.weights.shape, my_conv.bias.shape)
my_input_g = my_conv.backward(output_gradient)

atol=1e-6

print("output all close:", np.allclose(my_out, torch_out_np, atol=atol))

print("weights gradients all close:", np.allclose(my_conv.weights_gradient, torch_wg, atol=atol ))

print("bias gradients all close:", np.allclose(my_conv.bias_gradient, torch_bg, atol=atol))

print("input gradients all close:", np.allclose(my_input_g, torch_input_g, atol=atol))

output all close: True
weights gradients all close: True
bias gradients all close: True
input gradients all close: True


In [2]:
"""
Conv2d test
"""

batch_size = 2
n_input_channels = 2
n_output_channels = 2
width = 4
height = 4

kernel_size = 2
stride = 1
padding = 0

output_width = (width + 2 * padding - kernel_size) // stride + 1
output_height = (height + 2 * padding - kernel_size) // stride + 1

input_data = np.random.rand(batch_size, n_input_channels, height, width).astype(np.float32)
input_data_torch = torch.from_numpy(input_data).float()
input_data_torch.requires_grad = True
output_gradient = np.random.rand(batch_size, n_output_channels, output_height, output_width).astype(np.float32)

torch_conv = torch.nn.Conv2d(n_input_channels, n_output_channels, kernel_size, stride, padding)

my_conv = Conv2d(n_input_channels, n_output_channels, kernel_size, stride, padding)
my_conv.weights = torch_conv.weight.detach().numpy()
if my_conv.bias is not None:
    my_conv.bias = torch_conv.bias.detach().numpy().reshape(-1, 1)

my_out = my_conv.forward(input_data)

torch_out = torch_conv(input_data_torch)
torch_out_np = torch_out.detach().numpy()
torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
torch_input_g = input_data_torch.grad.detach().numpy()
torch_wg = torch_conv.weight.grad.detach().numpy()
if my_conv.bias is not None:
    torch_bg = torch_conv.bias.grad.detach().numpy().reshape(-1, 1)

# print(torch_conv.weight.shape, torch_conv.bias.shape)
# print(my_conv.weights.shape, my_conv.bias.shape)
my_input_g = my_conv.backward_as_matrix_multiplication(output_gradient)


atol=1e-6

print("output all close:", np.allclose(my_out, torch_out_np, atol=atol))

print("weights gradients all close:", np.allclose(my_conv.weights_gradient, torch_wg, atol=atol ))
if my_conv.bias is not None:
    print("bias gradients all close:", np.allclose(my_conv.bias_gradient, torch_bg, atol=atol))

print("input gradients all close:", np.allclose(my_input_g, torch_input_g, atol=atol))

output all close: True
weights gradients all close: True
bias gradients all close: True
input gradients all close: True


In [3]:
"""
Conv2d test 2
"""

batch_size = 3
n_input_channels = 1
n_output_channels = 64
width = 32
height = 32

kernel_size = 7
stride = 2
padding = 3

output_width = (width + 2 * padding - kernel_size) // stride + 1
output_height = (height + 2 * padding - kernel_size) // stride + 1

input_data = np.random.rand(batch_size, n_input_channels, height, width).astype(np.float32)
input_data_torch = torch.from_numpy(input_data).float()
input_data_torch.requires_grad = True
output_gradient = np.random.rand(batch_size, n_output_channels, output_height, output_width).astype(np.float32)

torch_conv = torch.nn.Conv2d(n_input_channels, n_output_channels, kernel_size, stride, padding, bias=False)

my_conv = Conv2d(n_input_channels, n_output_channels, kernel_size, stride, padding, bias=False)
my_conv.weights = torch_conv.weight.detach().numpy()
if my_conv.bias is not None:
    my_conv.bias = torch_conv.bias.detach().numpy().reshape(my_conv.bias.shape)

my_out = my_conv.forward(input_data)

torch_out = torch_conv(input_data_torch)
torch_out_np = torch_out.detach().numpy()
torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
torch_input_g = input_data_torch.grad.detach().numpy()
torch_wg = torch_conv.weight.grad.detach().numpy()
if my_conv.bias is not None:
    torch_bg = torch_conv.bias.grad.detach().numpy().reshape(my_conv.bias.shape)

# print(torch_conv.weight.shape, torch_conv.bias.shape)
# print(my_conv.weights.shape, my_conv.bias.shape)
my_input_g = my_conv.backward_as_matrix_multiplication(output_gradient)


atol=1e-6

print("output all close:", np.allclose(my_out, torch_out_np, atol=atol))

print("weights gradients all close:", np.allclose(my_conv.weights_gradient, torch_wg, atol=atol ))
if my_conv.bias is not None:
    print("bias gradients all close:", np.allclose(my_conv.bias_gradient, torch_bg, atol=atol))

print("input gradients all close:", np.allclose(my_input_g, torch_input_g, atol=atol))

output all close: True
weights gradients all close: True
input gradients all close: True


In [13]:
"""
MaxPool2d test
"""

batch_size = 2
n_channels = 3
height = 6
width = 4

kernel_size = 2
stride = 1
padding = 0

output_width = (width + 2 * padding - kernel_size) // stride + 1
output_height = (height + 2 * padding - kernel_size) // stride + 1

input_data = np.random.rand(batch_size, n_channels, height, width).astype(np.float32)
input_data_torch = torch.from_numpy(input_data).float()
input_data_torch.requires_grad = True
output_gradient = np.random.rand(batch_size, n_channels, output_height, output_width).astype(np.float32)

torch_pool = torch.nn.MaxPool2d(kernel_size, stride, padding)

my_pool = MaxPool2d(kernel_size, stride, padding)

my_out = my_pool.forward(input_data)

torch_out = torch_pool(input_data_torch)
torch_out_np = torch_out.detach().numpy()
torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
torch_input_g = input_data_torch.grad.detach().numpy()

my_input_g = my_pool.backward(output_gradient)

atol=1e-6

print("output all close:", np.allclose(my_out, torch_out_np, atol=atol))
print("input gradients all close:", np.allclose(my_input_g, torch_input_g, atol=atol))

output all close: True
input gradients all close: True


In [14]:
"""
MaxPool2d test 2
"""

batch_size = 10
n_channels = 3
height = 16
width = 16

kernel_size = 3
stride = 2
padding = 1

output_width = (width + 2 * padding - kernel_size) // stride + 1
output_height = (height + 2 * padding - kernel_size) // stride + 1

input_data = np.random.rand(batch_size, n_channels, height, width).astype(np.float32)
input_data_torch = torch.from_numpy(input_data).float()
input_data_torch.requires_grad = True
output_gradient = np.random.rand(batch_size, n_channels, output_height, output_width).astype(np.float32)

torch_pool = torch.nn.MaxPool2d(kernel_size, stride, padding)

my_pool = MaxPool2d(kernel_size, stride, padding)

my_out = my_pool.forward(input_data)

torch_out = torch_pool(input_data_torch)
torch_out_np = torch_out.detach().numpy()
torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
torch_input_g = input_data_torch.grad.detach().numpy()

my_input_g = my_pool.backward(output_gradient)

atol=1e-6

print("output all close:", np.allclose(my_out, torch_out_np, atol=atol))
print("input gradients all close:", np.allclose(my_input_g, torch_input_g, atol=atol))

output all close: True
input gradients all close: True


In [5]:
"""
BottleNeckLayer test
"""

batch_size = 5
in_channels = 8
bottleneck_depth = 2
width = 6
height = 6

expansion_factor = 4
n_output_channels = bottleneck_depth * expansion_factor

for stride_for_downsampling in (1, 2):  # Checking both cases: no downsampling and downsampling
    print(f"stride = {stride_for_downsampling}")
    input_data = np.random.rand(batch_size, in_channels, width, height).astype(np.float32)
    input_data_torch = torch.from_numpy(input_data).float()
    input_data_torch.requires_grad = True

    if stride_for_downsampling == 1:
        output_width = width
        output_height = height
    if stride_for_downsampling == 2:
        output_width = width // stride_for_downsampling
        output_height = height // stride_for_downsampling
    output_gradient = np.random.rand(batch_size, n_output_channels, output_width, output_height).astype(np.float32)

    torch_bottleneck = Bottleneck_torch(in_channels, bottleneck_depth, stride_for_downsampling)
    my_bottleneck = Bottleneck(in_channels, bottleneck_depth, stride_for_downsampling)

    my_bottleneck.conv1.weights = torch_bottleneck.conv1.weight.detach().numpy().reshape(my_bottleneck.conv1.weights.shape)
    my_bottleneck.conv2.weights = torch_bottleneck.conv2.weight.detach().numpy()
    my_bottleneck.conv3.weights = torch_bottleneck.conv3.weight.detach().numpy()

    my_bottleneck.bn1.gamma = torch_bottleneck.bn1.weight.detach().numpy().reshape(my_bottleneck.bn1.gamma.shape)
    my_bottleneck.bn1.beta = torch_bottleneck.bn1.bias.detach().numpy().reshape(my_bottleneck.bn1.beta.shape)
    my_bottleneck.bn1.mean = torch_bottleneck.bn1.running_mean.detach().numpy().reshape(my_bottleneck.bn1.mean.shape)
    my_bottleneck.bn1.var = torch_bottleneck.bn1.running_var.detach().numpy().reshape(my_bottleneck.bn1.var.shape)
    torch_bottleneck.bn1.momentum = 1.0

    my_bottleneck.bn2.gamma = torch_bottleneck.bn2.weight.detach().numpy().reshape(my_bottleneck.bn2.gamma.shape)
    my_bottleneck.bn2.beta = torch_bottleneck.bn2.bias.detach().numpy().reshape(my_bottleneck.bn2.beta.shape)
    my_bottleneck.bn2.mean = torch_bottleneck.bn2.running_mean.detach().numpy().reshape(my_bottleneck.bn2.mean.shape)
    my_bottleneck.bn2.var = torch_bottleneck.bn2.running_var.detach().numpy().reshape(my_bottleneck.bn2.var.shape)
    torch_bottleneck.bn2.momentum = 1.0

    my_bottleneck.bn3.gamma = torch_bottleneck.bn3.weight.detach().numpy().reshape(my_bottleneck.bn3.gamma.shape)
    my_bottleneck.bn3.beta = torch_bottleneck.bn3.bias.detach().numpy().reshape(my_bottleneck.bn3.beta.shape)
    my_bottleneck.bn3.mean = torch_bottleneck.bn3.running_mean.detach().numpy().reshape(my_bottleneck.bn3.mean.shape)
    my_bottleneck.bn3.var = torch_bottleneck.bn3.running_var.detach().numpy().reshape(my_bottleneck.bn3.var.shape)
    torch_bottleneck.bn3.momentum = 1.0


    if my_bottleneck.conv_to_match_dimensions:
        my_bottleneck.conv_to_match_dimensions.weights = torch_bottleneck.conv_to_match_dimensions.weight.detach().numpy()
        my_bottleneck.bn_for_residual.gamma = torch_bottleneck.bn_for_residual.weight.detach().numpy().reshape(my_bottleneck.bn_for_residual.gamma.shape)
        my_bottleneck.bn_for_residual.beta = torch_bottleneck.bn_for_residual.bias.detach().numpy().reshape(my_bottleneck.bn_for_residual.beta.shape)
        my_bottleneck.bn_for_residual.mean = torch_bottleneck.bn_for_residual.running_mean.detach().numpy().reshape(my_bottleneck.bn_for_residual.mean.shape)
        my_bottleneck.bn_for_residual.var = torch_bottleneck.bn_for_residual.running_var.detach().numpy().reshape(my_bottleneck.bn_for_residual.var.shape)
        torch_bottleneck.bn_for_residual.momentum = 1.0
    
    my_bottleneck.eval()
    torch_bottleneck.eval()

    my_out = my_bottleneck.forward(input_data)
    torch_out = torch_bottleneck(input_data_torch)

    torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
    torch_input_g = input_data_torch.grad.detach().numpy()

    my_input_g = my_bottleneck.backward(output_gradient)

    atol = 1e-2
    print("output all close:", np.allclose(my_out, torch_out.detach().numpy(), atol=atol))
    print("input gradients all close:", np.allclose(my_input_g, torch_input_g, atol=atol))
    print("conv1 weights gradients all close:", np.allclose(my_bottleneck.conv1.weights_gradient, torch_bottleneck.conv1.weight.grad.detach().numpy(), atol=atol))
    print("conv2 weights gradients all close:", np.allclose(my_bottleneck.conv2.weights_gradient, torch_bottleneck.conv2.weight.grad.detach().numpy(), atol=atol))
    print("conv3 weights gradients all close:", np.allclose(my_bottleneck.conv3.weights_gradient, torch_bottleneck.conv3.weight.grad.detach().numpy(), atol=atol))
    if my_bottleneck.conv_to_match_dimensions:
        print("conv_to_match_dimensions weights gradients all close:", np.allclose(my_bottleneck.conv_to_match_dimensions.weights_gradient, torch_bottleneck.conv_to_match_dimensions.weight.grad.detach().numpy(), atol=atol))
    
    print("bn3 gamma gradients all close:", np.allclose(my_bottleneck.bn3.gamma_gradient, torch_bottleneck.bn3.weight.grad.detach().numpy(), atol=atol))  
    print("bn3 beta gradients all close:", np.allclose(my_bottleneck.bn3.beta_gradient, torch_bottleneck.bn3.bias.grad.detach().numpy(), atol=atol))  
    print()

stride = 1
output all close: True
input gradients all close: False
conv1 weights gradients all close: False
conv2 weights gradients all close: False
conv3 weights gradients all close: False
bn3 gamma gradients all close: False
bn3 beta gradients all close: False

stride = 2
output all close: True
input gradients all close: False
conv1 weights gradients all close: False
conv2 weights gradients all close: False
conv3 weights gradients all close: False
conv_to_match_dimensions weights gradients all close: False
bn3 gamma gradients all close: False
bn3 beta gradients all close: False



In [17]:
"""
resnet 101 test
"""

batch_size = 10
height = width = 32
n_channels = 1

input_data = np.random.rand(batch_size, n_channels, height, width).astype(np.float32)
input_data_torch = torch.from_numpy(input_data).float()
input_data_torch.requires_grad = True
output_gradient = np.random.rand(batch_size, 10).astype(np.float32)

torch_resnet = resnet101_torch(10, 1)
torch_resnet.eval()
my_resnet = resnet101(10, 1)
my_resnet.eval()

my_resnet.clone_weights_from_torch(torch_resnet)

my_out = my_resnet.forward(input_data)

torch_out = torch_resnet(input_data_torch)
torch_out_np = torch_out.detach().numpy()

torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
torch_input_g = input_data_torch.grad.detach().numpy()

my_input_g = my_resnet.backward(output_gradient)

atol=1e-3

print("output all close:", np.allclose(my_out, torch_out_np, atol=atol))
print("input gradients all close:", np.allclose(my_input_g, torch_input_g, atol=atol))

print("fc weights gradients all close:", np.allclose(my_resnet.fc.weights_gradient, torch_resnet.fc.weight.grad.detach().numpy().T, atol=atol))
print("fc bias gradients all close:", np.allclose(my_resnet.fc.bias_gradient, torch_resnet.fc.bias.grad.detach().numpy(), atol=atol))

output all close: True
input gradients all close: False
fc weights gradients all close: True
fc bias gradients all close: True


In [4]:
"""
resnet 101 without batchnormtest
"""

batch_size = 10
height = width = 32
n_channels = 1

input_data = np.random.rand(batch_size, n_channels, height, width).astype(np.float32)
input_data_torch = torch.from_numpy(input_data).float()
input_data_torch.requires_grad = True
output_gradient = np.random.rand(batch_size, 10).astype(np.float32)

torch_resnet = resnet101_torch_without_batchnorm(10, 1)
torch_resnet.eval()
my_resnet = resnet101_np_without_batchnorm(10, 1)
my_resnet.eval()

my_resnet.clone_weights_from_torch(torch_resnet)

my_out = my_resnet.forward(input_data)

torch_out = torch_resnet(input_data_torch)
torch_out_np = torch_out.detach().numpy()

torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
torch_input_g = input_data_torch.grad.detach().numpy()

my_input_g = my_resnet.backward(output_gradient)

atol=1e-3

print("output all close:", np.allclose(my_out, torch_out_np, atol=atol))
print("input gradients all close:", np.allclose(my_input_g, torch_input_g, atol=atol))

print("fc weights gradients all close:", np.allclose(my_resnet.fc.weights_gradient, torch_resnet.fc.weight.grad.detach().numpy().T, atol=atol))
print("fc bias gradients all close:", np.allclose(my_resnet.fc.bias_gradient, torch_resnet.fc.bias.grad.detach().numpy(), atol=atol))

output all close: True
input gradients all close: True
fc weights gradients all close: True
fc bias gradients all close: True


In [4]:
"""
conv1 test
"""

batch_size = 10
height = width = 32
n_channels = 1

input_data = np.random.rand(batch_size, n_channels, height, width).astype(np.float32)
input_data_torch = torch.from_numpy(input_data).float()
input_data_torch.requires_grad = True
output_gradient = np.random.rand(batch_size, 64, 16, 16).astype(np.float32)

torch_resnet = resnet101_torch(10, 1)
torch_resnet.eval()
my_resnet = resnet101(10, 1)
my_resnet.eval()

my_resnet.clone_weights_from_torch(torch_resnet)

my_out = my_resnet.conv1.forward(input_data)

torch_out = torch_resnet.conv1(input_data_torch)
torch_out_np = torch_out.detach().numpy()

torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
torch_input_g = input_data_torch.grad.detach().numpy()

my_input_g = my_resnet.conv1.backward(output_gradient)

atol=1e-3

print("output all close:", np.allclose(my_out, torch_out_np, atol=atol))
print("input gradients all close:", np.allclose(my_input_g, torch_input_g, atol=atol))

output all close: True
input gradients all close: True


In [9]:
# """
# conv2_x test
# """


# batch_size = 10
# height = width = 8
# n_channels = 64

# input_data = np.random.rand(batch_size, n_channels, height, width).astype(np.float32)
# input_data_torch = torch.from_numpy(input_data).float()
# input_data_torch.requires_grad = True
# output_gradient = np.random.rand(batch_size, 256, 8, 8).astype(np.float32)

# torch_resnet = resnet101_torch(10, 1)
# torch_resnet.eval()
# my_resnet = resnet101(10, 1)
# my_resnet.eval()

# my_resnet.clone_weights_from_torch(torch_resnet)

# my_out = my_resnet.conv2_x.forward(input_data)

# torch_out = torch_resnet.conv2_x(input_data_torch)
# torch_out_np = torch_out.detach().numpy()

# torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
# torch_input_g = input_data_torch.grad.detach().numpy()

# my_input_g = my_resnet.conv2_x.backward(output_gradient)

# atol=1e-3

# print("output all close:", np.allclose(my_out, torch_out_np, atol=atol))
# print("input gradients all close:", np.allclose(my_input_g, torch_input_g, atol=atol))

output all close: False
input gradients all close: False


In [19]:
# """
# conv3_x test
# """

# batch_size = 10
# height = width = 8
# n_channels = 256

# input_data = np.random.rand(batch_size, n_channels, height, width).astype(np.float32)
# input_data_torch = torch.from_numpy(input_data).float()
# input_data_torch.requires_grad = True
# output_gradient = np.random.rand(batch_size, 512, 4, 4).astype(np.float32)

# torch_resnet = resnet101_torch(10, 1)
# my_resnet = resnet101(10, 1)

# my_resnet.clone_weights_from_torch(torch_resnet)

# my_out = my_resnet.conv3_x.forward(input_data)

# torch_out = torch_resnet.conv3_x(input_data_torch)
# torch_out_np = torch_out.detach().numpy()

# torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
# torch_input_g = input_data_torch.grad.detach().numpy()

# my_input_g = my_resnet.conv3_x.backward(output_gradient)

# atol=1e-3

# print("output all close:", np.allclose(my_out, torch_out_np, atol=atol))
# print("input gradients all close:", np.allclose(my_input_g, torch_input_g, atol=atol))

In [20]:
# """
# conv4_x test
# """

# batch_size = 10
# height = width = 4
# n_channels = 512

# input_data = np.random.rand(batch_size, n_channels, height, width).astype(np.float32)
# input_data_torch = torch.from_numpy(input_data).float()
# input_data_torch.requires_grad = True
# output_gradient = np.random.rand(batch_size, 1024, 2, 2).astype(np.float32)

# torch_resnet = resnet101_torch(10, 1)
# my_resnet = resnet101(10, 1)

# my_resnet.clone_weights_from_torch(torch_resnet)

# my_out = my_resnet.conv4_x.forward(input_data)

# torch_out = torch_resnet.conv4_x(input_data_torch)
# torch_out_np = torch_out.detach().numpy()

# torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
# torch_input_g = input_data_torch.grad.detach().numpy()

# my_input_g = my_resnet.conv4_x.backward(output_gradient)

# atol=1e-3

# print("output all close:", np.allclose(my_out, torch_out_np, atol=atol))
# print("input gradients all close:", np.allclose(my_input_g, torch_input_g, atol=atol))

In [21]:
# """
# conv5_x test
# """

# batch_size = 10
# height = width = 2
# n_channels = 1024

# input_data = np.random.rand(batch_size, n_channels, height, width).astype(np.float32)
# input_data_torch = torch.from_numpy(input_data).float()
# input_data_torch.requires_grad = True
# output_gradient = np.random.rand(batch_size, 2048, 1, 1).astype(np.float32)

# torch_resnet = resnet101_torch(10, 1)
# my_resnet = resnet101(10, 1)

# my_resnet.clone_weights_from_torch(torch_resnet)

# my_out = my_resnet.conv5_x.forward(input_data)

# torch_out = torch_resnet.conv5_x(input_data_torch)
# torch_out_np = torch_out.detach().numpy()

# torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
# torch_input_g = input_data_torch.grad.detach().numpy()

# my_input_g = my_resnet.conv5_x.backward(output_gradient)

# atol=1e-3

# print("output all close:", np.allclose(my_out, torch_out_np, atol=atol))
# print("input gradients all close:", np.allclose(my_input_g, torch_input_g, atol=atol))

In [22]:
# """
# AdamOptimizer test
# """

# from torch.optim import Adam as Adam_torch

# n_input_features = 6
# n_output_features = 3
# batch_size = 5
# input_data = np.random.rand(batch_size, n_input_features).astype(np.float32)
# input_data_torch = torch.from_numpy(input_data).float()
# input_data_torch.requires_grad = True
# output_gradient = np.random.rand(batch_size, n_output_features).astype(np.float32)

# torch_fc = torch.nn.Linear(n_input_features, n_output_features)
# torch_out = torch_fc(input_data_torch)
# torch_out_np = torch_out.detach().numpy()
# torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
# torch_wg = torch_fc.weight.grad.detach().numpy().T
# torch_bg = torch_fc.bias.grad.detach().numpy().reshape(-1, 1).T
# torch_input_g = input_data_torch.grad.detach().numpy()


# my_fc = FullyConnectedLayer(n_input_features, n_output_features)
# my_fc.weights = torch_fc.weight.detach().numpy().T
# my_fc.bias = torch_fc.bias.detach().numpy().reshape(-1, 1).T
# my_out = my_fc.forward(input_data)
# my_input_g = my_fc.backward(output_gradient)
# my_wg = my_fc.weights_gradient
# my_bg = my_fc.bias_gradient

# atol=1e-3

# print("output all close:", np.allclose(my_out, torch_out_np, atol=atol))
# print("input gradients all close:", np.allclose(my_input_g, torch_input_g, atol=atol))
# print("before adam weights gradients all close:", np.allclose(my_wg, torch_wg, atol=atol))
# print("before adam bias gradients all close:", np.allclose(my_bg, torch_bg, atol=atol))
# print(my_wg, "\n", torch_wg)

# my_adam = AdamOptimizer(my_fc.get_trainable_layers(), 0.001, 0.9, 0.999, 1e-8)
# my_adam.step()

# torch_adam = Adam_torch(torch_fc.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-8)
# torch_adam.step()

# print("after adam weights gradients all close:", np.allclose(my_fc.weights_gradient, torch_fc.weight.grad.detach().numpy().T, atol=atol))
# print("after adam bias gradients all close:", np.allclose(my_fc.bias_gradient, torch_fc.bias.grad.detach().numpy().reshape(-1, 1).T, atol=atol))
# print(my_fc.weights_gradient, "\n", torch_fc.weight.grad.detach().numpy().T)

In [12]:
"""
BatchNorm test
"""

batch_size = 4
n_channels = 5
height = 8
width = 8

input_data = np.random.rand(batch_size, n_channels, height, width).astype(np.float32)
input_data_torch = torch.from_numpy(input_data).float()
input_data_torch.requires_grad = True
output_gradient = np.random.rand(batch_size, n_channels, height, width).astype(np.float32)

torch_bn = torch.nn.BatchNorm2d(n_channels, momentum=1)

my_bn = BatchNormalization2d(n_channels)

my_bn.gamma = torch_bn.weight.detach().numpy().reshape(1, n_channels, 1, 1)
my_bn.beta = torch_bn.bias.detach().numpy().reshape(1, n_channels, 1, 1)

my_bn.mean = torch_bn.running_mean.detach().numpy().reshape(1, n_channels, 1, 1)
my_bn.var = torch_bn.running_var.detach().numpy().reshape(1, n_channels, 1, 1)

my_out = my_bn.forward(input_data)

torch_out = torch_bn(input_data_torch)

torch_out_np = torch_out.detach().numpy()

torch_out.backward(torch.tensor(output_gradient), retain_graph=True)

torch_input_g = input_data_torch.grad.detach().numpy()

my_input_g = my_bn.backward(output_gradient)

atol=1e-2

print("output all close:", np.allclose(my_out, torch_out_np, atol=atol))
print("input gradients all close:", np.allclose(my_input_g, torch_input_g, atol=atol))
print("weights gradients all close:", np.allclose(my_bn.gamma_gradient, torch_bn.weight.grad.detach().numpy().reshape(1, n_channels, 1, 1), atol=atol))
print("bias gradients all close:", np.allclose(my_bn.beta_gradient, torch_bn.bias.grad.detach().numpy().reshape(1, n_channels, 1, 1), atol=atol))


output all close: True
input gradients all close: True
weights gradients all close: True
bias gradients all close: True


In [8]:
"""
Conv2d vs Conv2dWithLoops vs torch.nn.Conv2d time comparison forward and backward
"""

import time

n_input_channels = 4
n_output_channels = 2
width = 3
height = 5

kernel_size = 3
stride = 1
padding = 3

output_width = (width + 2 * padding - kernel_size) // stride + 1
output_height = (height + 2 * padding - kernel_size) // stride + 1

for batch_size in [1, 2, 4, 8, 16]:
    print("batch_size:", batch_size)

    input_data = np.random.rand(batch_size, n_input_channels, height, width).astype(np.float32)
    input_data_torch = torch.from_numpy(input_data).float()
    input_data_torch.requires_grad = True
    output_gradient = np.random.rand(batch_size, n_output_channels, output_height, output_width).astype(np.float32)

    torch_conv = torch.nn.Conv2d(n_input_channels, n_output_channels, kernel_size, stride, padding)

    my_conv_with_loops = Conv2dWithLoops(n_input_channels, n_output_channels, kernel_size, stride, padding)
    my_conv_with_loops.weights = torch_conv.weight.detach().numpy()

    my_conv = Conv2d(n_input_channels, n_output_channels, kernel_size, stride, padding)
    my_conv.weights = torch_conv.weight.detach().numpy()

    n_iterations = 1000

    start = time.time()
    for i in range(n_iterations):
        my_out = my_conv.forward(input_data)
    end = time.time()
    print(f"my_conv forward time: {end - start}")

    start = time.time()
    for i in range(n_iterations):
        my_out_with_loops = my_conv_with_loops.forward(input_data)
    end = time.time()
    print(f"my_conv_with_loops forward time: {end - start}")

    start = time.time()
    for i in range(n_iterations):
        torch_out = torch_conv(input_data_torch)
    end = time.time()
    print(f"torch_conv forward time: {end - start}")

    start = time.time()
    for i in range(n_iterations):
        my_input_g = my_conv.backward_as_matrix_multiplication(output_gradient)
    end = time.time()
    print(f"my_conv backward time: {end - start}")

    start = time.time()
    for i in range(n_iterations):
        my_input_g_with_loops = my_conv_with_loops.backward(output_gradient)
    end = time.time()
    print(f"my_conv_with_loops backward time: {end - start}")

    start = time.time()
    for i in range(n_iterations):
        torch_out.backward(torch.tensor(output_gradient), retain_graph=True)
    end = time.time()
    print(f"torch_conv backward time: {end - start}")

    print()

batch_size: 1
my_conv forward time: 0.5312726497650146
my_conv_with_loops forward time: 2.215890645980835
torch_conv forward time: 0.06669497489929199
my_conv backward time: 0.7423074245452881
my_conv_with_loops backward time: 2.638596773147583
torch_conv backward time: 0.1759324073791504

batch_size: 2
my_conv forward time: 0.5586578845977783
my_conv_with_loops forward time: 2.2504141330718994
torch_conv forward time: 0.17436599731445312
my_conv backward time: 1.2079191207885742
my_conv_with_loops backward time: 5.122586965560913
torch_conv backward time: 0.3848881721496582

batch_size: 4
my_conv forward time: 1.064781904220581
my_conv_with_loops forward time: 2.2000110149383545
torch_conv forward time: 0.1567244529724121
my_conv backward time: 2.2330682277679443
my_conv_with_loops backward time: 9.913957118988037
torch_conv backward time: 0.3933844566345215

batch_size: 8
my_conv forward time: 1.9885218143463135
my_conv_with_loops forward time: 2.341416597366333
torch_conv forward ti