In [2]:
import torch
from d2l import torch as d2l

In [3]:
# When the input has more than one channel (c_i > 1), we can think of the kernel as a cuboid of shape c_i * k_height * k_width, i.e. one two-dimensional kernel per input channel.
# For each channel we cross-correlate the channel's two-dimensional input tensor with the matching two-dimensional kernel tensor, and then sum the c_i results to yield a single two-dimensional tensor.
def corr2d_multi_in(X, K):
    """Cross-correlate a multi-channel input with a multi-channel kernel.

    Each input channel is paired with the kernel slice at the same index,
    the pair is passed to ``d2l.corr2d``, and the per-channel results are
    summed into a single output.

    Args:
        X: Input, indexed by channel along its first axis.
        K: Kernel, indexed by channel along its first axis. Must have the
            same number of channels as ``X``.

    Returns:
        The sum of the per-channel ``d2l.corr2d`` results.

    Raises:
        ValueError: If ``X`` and ``K`` have a different number of channels.
    """
    # zip() stops at the shorter argument, so without this check a channel
    # mismatch would silently drop channels instead of failing.
    if len(X) != len(K):
        raise ValueError(
            f"channel mismatch: input has {len(X)} channel(s), "
            f"kernel has {len(K)}"
        )
    return sum(d2l.corr2d(x, k) for x, k in zip(X, K))

In [4]:
# Two input channels, each 3x3: channel 0 holds 0..8 and channel 1 holds 1..9.
X = torch.stack((torch.arange(0.0, 9.0).reshape(3, 3),
                 torch.arange(1.0, 10.0).reshape(3, 3)))
# Matching two-channel 2x2 kernel: channel 0 holds 0..3 and channel 1 holds 1..4.
K = torch.stack((torch.arange(0.0, 4.0).reshape(2, 2),
                 torch.arange(1.0, 5.0).reshape(2, 2)))

corr2d_multi_in(X, K)

tensor([[ 56.,  72.],
        [104., 120.]])

In [5]:
# We can also have multiple output channels. Each output channel can be thought of as responding to a different set of features.
def corr_2d_multi_in_out(X, K):
    """Cross-correlate a multi-channel input with several multi-channel kernels.

    Iterates over the first axis of ``K``; each slice is used as the kernel
    for ``corr2d_multi_in`` on the same input ``X``.

    Args:
        X: Multi-channel input passed unchanged to ``corr2d_multi_in``.
        K: Kernels, one per output channel along the first axis.

    Returns:
        The per-kernel results stacked along a new leading dimension.
    """
    per_output_maps = []
    for kernel in K:
        per_output_maps.append(corr2d_multi_in(X, kernel))
    return torch.stack(per_output_maps, dim=0)

In [9]:
# 1 x 1 convolutional layer
# You can think of the 1 x 1 convolutional layer as constituting a fully connected layer applied at every single pixel location, transforming the c_i input values at that pixel into c_o output values. For example, we can begin with a stack of three channels and end up with a stack of two channels.
def corr2d_multi_in_out_1x1(X, K):
    """Apply a 1x1 multi-input, multi-output kernel as a matrix product.

    Args:
        X: Input tensor whose shape unpacks as (c_i, h, w).
        K: Kernel tensor with c_o as its first dimension; it is reshaped to
            (c_o, c_i), so it must hold exactly c_o * c_i elements.

    Returns:
        Tensor of shape (c_o, h, w).
    """
    in_channels, height, width = X.shape
    out_channels = K.shape[0]
    # Flatten the spatial dimensions so every column is one pixel location.
    pixels = X.reshape((in_channels, height * width))
    weights = K.reshape((out_channels, in_channels))
    # Matrix multiplication in the fully connected layer, then restore the
    # spatial layout.
    return (weights @ pixels).reshape((out_channels, height, width))

In [10]:
# Check that the matmul-based 1x1 implementation agrees with the general
# multi-input, multi-output cross-correlation on random data.
# A dedicated seeded generator makes the inputs reproducible on every run
# without modifying the global RNG state.
gen = torch.Generator().manual_seed(0)
X = torch.normal(0, 1, (3, 3, 3), generator=gen)
K = torch.normal(0, 1, (2, 3, 1, 1), generator=gen)
Y1 = corr2d_multi_in_out_1x1(X, K)
Y2 = corr_2d_multi_in_out(X, K)
# Compare element-wise with a tolerance: the two code paths may accumulate
# float32 rounding error in a different order.
assert torch.allclose(Y1, Y2, atol=1e-6)