### define

In [1]:
%autoreload 2
import torch
from torchvision import datasets, transforms
import argparse
from simple_conv_net_func import diff_mse
from simple_conv_net_func import conv2d_scalar, pool2d_scalar, relu_scalar, reshape_scalar, fc_layer_scalar
from simple_conv_net_func import conv2d_vector, pool2d_vector, relu_vector, reshape_vector, fc_layer_vector, im2col

In [2]:
# Shapes and hyperparameters shared by the layer-by-layer checks in this notebook.
N_batch = 4  # batch size, default 64
C = 1  # number of input channels
S = 28  # input image size (inputs are S x S)
B = 20  # number of filters (conv output channels)
K = 5  # kernel size
# NOTE(review): fc1_out is not referenced by any cell visible in this notebook — confirm it is still needed.
fc1_out = 500  # fc1 layer output units
N_classes = 10  # output width of the fc layer checked below
# NOTE(review): the three values below are not referenced by any visible cell;
# presumably they mirror the training script's settings — confirm.
epochs_num = 20
alpha = 0.01
momentum = 0.5

In [3]:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
device

device(type='cpu')

In [4]:
# Seed the global RNG so the random input (and the randomly initialised
# reference layers created in later cells) are identical on every
# Restart & Run All; the reported MSE values then become reproducible.
torch.manual_seed(0)
x = torch.randn(N_batch, C, S, S).to(device)
x.shape

torch.Size([4, 1, 28, 28])

In [5]:
x.get_device()

-1

## scalar

### conv

In [6]:
# Reference PyTorch conv layer. It is moved to `device` so that its parameters
# live on the same device as `x`; without this, `cl(x)` raises a device-mismatch
# error whenever CUDA is available (on CPU `.to(device)` is a no-op).
cl = torch.nn.Conv2d(in_channels=C, out_channels=B, kernel_size=K).to(device)
# The same weights/bias are handed to the custom implementation for comparison.
w_conv = cl.weight
b_conv = cl.bias
w_conv.shape, b_conv.shape

(torch.Size([20, 1, 5, 5]), torch.Size([20]))

In [7]:
with torch.no_grad():
    %time z_conv = conv2d_scalar(x, w_conv, b_conv, device)
z_conv.shape

CPU times: user 23.2 s, sys: 4.81 ms, total: 23.2 s
Wall time: 23.2 s


torch.Size([4, 20, 24, 24])

In [8]:
with torch.no_grad():
    %time torch_z_conv = cl(x)
torch_z_conv.shape

CPU times: user 784 µs, sys: 36 µs, total: 820 µs
Wall time: 599 µs


torch.Size([4, 20, 24, 24])

In [9]:
diff_mse(z_conv, torch_z_conv)

3.95232052990903e-15

### pool

In [10]:
with torch.no_grad():
    %time z_pool = pool2d_scalar(torch_z_conv, device)
z_pool.shape

CPU times: user 354 ms, sys: 4.11 ms, total: 359 ms
Wall time: 358 ms


torch.Size([4, 20, 12, 12])

In [11]:
with torch.no_grad():
    %time torch_z_pool = torch.max_pool2d(torch_z_conv, 2)
torch_z_pool.shape

CPU times: user 192 µs, sys: 10 µs, total: 202 µs
Wall time: 204 µs


torch.Size([4, 20, 12, 12])

In [12]:
diff_mse(z_pool, torch_z_pool)

0.0

### reshape

In [13]:
with torch.no_grad():
    %time z_reshaped = reshape_scalar(z_pool, device)
z_reshaped.shape

CPU times: user 90.4 ms, sys: 0 ns, total: 90.4 ms
Wall time: 89.9 ms


torch.Size([4, 2880])

In [14]:
with torch.no_grad():
    # %time torch_z_reshaped = torch.reshape(torch_z_pool, (N_batch, -1))
    %time torch_z_reshaped = torch_z_pool.view(N_batch, -1)
torch_z_reshaped.shape

CPU times: user 28 µs, sys: 1e+03 ns, total: 29 µs
Wall time: 31.2 µs


torch.Size([4, 2880])

In [15]:
diff_mse(z_reshaped, torch_z_reshaped)

0.0

### relu

In [16]:
with torch.no_grad():
    %time z_relu = relu_scalar(torch_z_reshaped, device)
z_relu.shape

CPU times: user 121 ms, sys: 58 µs, total: 121 ms
Wall time: 121 ms


torch.Size([4, 2880])

In [17]:
with torch.no_grad():
    %time torch_z_relu = torch.relu(torch_z_reshaped)
torch_z_relu.shape

CPU times: user 69 µs, sys: 4 µs, total: 73 µs
Wall time: 89.2 µs


torch.Size([4, 2880])

In [18]:
diff_mse(z_relu, torch_z_relu)

0.0

### fc

In [19]:
# Reference PyTorch fully-connected layer, placed on `device` so it can be
# applied to tensors derived from `x` when CUDA is available.
fc = torch.nn.Linear(torch_z_reshaped.size(-1), N_classes).to(device)
# The same weights/bias are handed to the custom implementation for comparison.
w_fc = fc.weight
b_fc = fc.bias

In [20]:
with torch.no_grad():
    %time z_fc = fc_layer_scalar(torch_z_reshaped, w_fc, b_fc, device)
z_fc.shape

CPU times: user 1.56 s, sys: 0 ns, total: 1.56 s
Wall time: 1.56 s


torch.Size([4, 10])

In [21]:
with torch.no_grad():
    %time torch_z_fc = fc(z_reshaped)
torch_z_fc.shape

CPU times: user 686 µs, sys: 42 µs, total: 728 µs
Wall time: 334 µs


torch.Size([4, 10])

In [22]:
diff_mse(z_fc, torch_z_fc)

1.6700226271315621e-13

## vector

#### im2col

In [23]:
with torch.no_grad():
    %time col = im2col(x[0], 5, device).t().contiguous()
col.shape

CPU times: user 7.91 ms, sys: 0 ns, total: 7.91 ms
Wall time: 7.53 ms


torch.Size([25, 576])

In [24]:
with torch.no_grad():
    %time torch_col = torch.functional.F.unfold(x, 5)[0]
torch_col.shape

CPU times: user 181 µs, sys: 11 µs, total: 192 µs
Wall time: 195 µs


torch.Size([25, 576])

In [25]:
diff_mse(col, torch_col)

0.0

### conv

In [26]:
# Fresh reference conv layer for the vectorized checks, placed on `device` so
# that `cl(x)` works when CUDA is available (no-op on CPU).
cl = torch.nn.Conv2d(in_channels=C, out_channels=B, kernel_size=K).to(device)
# The same weights/bias are handed to the custom implementation for comparison.
w_conv = cl.weight
b_conv = cl.bias

In [27]:
with torch.no_grad():
    %time z_conv = conv2d_vector(x, w_conv, b_conv, device)
z_conv.shape

CPU times: user 96.2 ms, sys: 0 ns, total: 96.2 ms
Wall time: 29.7 ms


torch.Size([4, 20, 24, 24])

In [28]:
with torch.no_grad():
    %time torch_z_conv = cl(x)
torch_z_conv.shape

CPU times: user 570 µs, sys: 39 µs, total: 609 µs
Wall time: 389 µs


torch.Size([4, 20, 24, 24])

In [29]:
diff_mse(z_conv, torch_z_conv)

3.57074997486286e-15

### pool

In [30]:
with torch.no_grad():
    %time z_pool = pool2d_vector(torch_z_conv, device)
z_pool.shape

CPU times: user 141 ms, sys: 46 µs, total: 141 ms
Wall time: 140 ms


torch.Size([4, 20, 12, 12])

In [31]:
with torch.no_grad():
    %time torch_z_pool = torch.max_pool2d(torch_z_conv, 2)
torch_z_pool.shape

CPU times: user 155 µs, sys: 11 µs, total: 166 µs
Wall time: 168 µs


torch.Size([4, 20, 12, 12])

In [32]:
diff_mse(z_pool, torch_z_pool)

0.0

### reshape

In [33]:
with torch.no_grad():
    %time z_reshaped = reshape_vector(z_pool, device)
z_reshaped.shape

CPU times: user 35 µs, sys: 3 µs, total: 38 µs
Wall time: 40.1 µs


torch.Size([4, 2880])

In [34]:
with torch.no_grad():
#     %time torch_z_reshaped = torch.reshape(torch_z_pool, (N_batch, -1))
    %time torch_z_reshaped = torch_z_pool.view(N_batch, -1)
torch_z_reshaped.shape

CPU times: user 29 µs, sys: 2 µs, total: 31 µs
Wall time: 33.1 µs


torch.Size([4, 2880])

In [35]:
diff_mse(z_reshaped, torch_z_reshaped)

0.0

### relu

In [36]:
with torch.no_grad():
    %time z_relu = relu_vector(z_reshaped, device)
z_relu.shape

CPU times: user 306 µs, sys: 22 µs, total: 328 µs
Wall time: 242 µs


torch.Size([4, 2880])

In [37]:
with torch.no_grad():
    %time torch_z_relu = torch.relu(z_reshaped)
torch_z_relu.shape

CPU times: user 111 µs, sys: 8 µs, total: 119 µs
Wall time: 75.1 µs


torch.Size([4, 2880])

In [38]:
diff_mse(z_relu, torch_z_relu)

0.0

### fc

In [39]:
# Reference PyTorch fully-connected layer for the vectorized checks, placed on
# `device` so it can be applied to `z_reshaped` when CUDA is available.
fc = torch.nn.Linear(z_reshaped.size(-1), N_classes).to(device)
# The same weights/bias are handed to the custom implementation for comparison.
w_fc = fc.weight
b_fc = fc.bias

In [40]:
with torch.no_grad():
    %time z_fc = fc_layer_vector(z_reshaped, w_fc, b_fc, device)
z_fc.shape

CPU times: user 331 µs, sys: 24 µs, total: 355 µs
Wall time: 188 µs


torch.Size([4, 10])

In [41]:
with torch.no_grad():
    %time torch_z_fc = fc(z_reshaped)
torch_z_fc.shape

CPU times: user 189 µs, sys: 14 µs, total: 203 µs
Wall time: 179 µs


torch.Size([4, 10])

In [42]:
diff_mse(z_fc, torch_z_fc)

1.4093761092331806e-15

## test

Scalar model, 10 batches of size 1: too expensive to run here

Even under `no_grad` (no backward pass), it is ~300 times slower than the vector model

In [43]:
# %time !python ./simple_conv_net_train.py --log-interval=1 --epochs 1 --type scalar --batch-size 1 --batch-limit 10

Vector model, 10 batches of size 64

In [44]:
%time !python ./simple_conv_net_train.py --log-interval=1 --epochs 1 --type vector --batch-limit 10


Test set: Average loss: -0.1930, Accuracy: 6252/10000 (62.52%)

CPU times: user 4.98 s, sys: 829 ms, total: 5.81 s
Wall time: 9min 52s


Built-in model, 10 batches of size 64

In [45]:
%time !python ./simple_conv_net_train.py --log-interval=1 --epochs 1 --batch-limit 10


Test set: Average loss: -0.1930, Accuracy: 6252/10000 (62.52%)

CPU times: user 17.5 ms, sys: 12.7 ms, total: 30.2 ms
Wall time: 2.22 s


## train

full built-in model train

In [46]:
%time !python ./simple_conv_net_train.py --log-interval=100 --epochs 20 --save-model


Test set: Average loss: -0.9235, Accuracy: 9592/10000 (95.92%)


Test set: Average loss: -0.9436, Accuracy: 9690/10000 (96.90%)


Test set: Average loss: -0.9617, Accuracy: 9788/10000 (97.88%)


Test set: Average loss: -0.9692, Accuracy: 9825/10000 (98.25%)


Test set: Average loss: -0.9654, Accuracy: 9800/10000 (98.00%)


Test set: Average loss: -0.9746, Accuracy: 9834/10000 (98.34%)


Test set: Average loss: -0.9769, Accuracy: 9854/10000 (98.54%)


Test set: Average loss: -0.9783, Accuracy: 9855/10000 (98.55%)


Test set: Average loss: -0.9786, Accuracy: 9855/10000 (98.55%)


Test set: Average loss: -0.9793, Accuracy: 9857/10000 (98.57%)


Test set: Average loss: -0.9801, Accuracy: 9862/10000 (98.62%)


Test set: Average loss: -0.9817, Accuracy: 9870/10000 (98.70%)


Test set: Average loss: -0.9826, Accuracy: 9872/10000 (98.72%)


Test set: Average loss: -0.9829, Accuracy: 9869/10000 (98.69%)




Test set: Average loss: -0.9841, Accuracy: 9881/10000 (98.81%)


Test set: Average loss: -0.9839, Accuracy: 9878/10000 (98.78%)


Test set: Average loss: -0.9838, Accuracy: 9873/10000 (98.73%)


Test set: Average loss: -0.9837, Accuracy: 9872/10000 (98.72%)


Test set: Average loss: -0.9834, Accuracy: 9867/10000 (98.67%)


Test set: Average loss: -0.9853, Accuracy: 9878/10000 (98.78%)

CPU times: user 3.9 s, sys: 662 ms, total: 4.56 s
Wall time: 5min 56s


full vector model train

It would take ~30 hours to train, so the cell below was not run. Let's compare the small run from the test section instead.

In [None]:
%time !python ./simple_conv_net_train.py --log-interval=100 --epochs 20 --type vector --save-model

## summary

After `1` epoch of just `10` batches of size `64` (only 640 images), we evaluated both the vector model and the built-in PyTorch model on the test set

|model|time|accuracy|
|-|-|-|
|vector|9min 52s|62.52%|
|pytorch|2.22s|62.52%|

The Python implementation, even in vectorized form, is almost 300 times slower than built-in PyTorch (592 s vs 2.22 s, i.e. ≈ 270×)

The accuracy is identical (62.52%), so the two models appear to behave the same way

The scalar model is much slower than the vector one, so there are no training results for comparison; however, the per-layer MSEs against PyTorch (at most ~1e-13) show that the implementation is correct

$$t_{scalar} \approx 300 \times t_{vector} \approx 90K \times t_{torch}$$
$$t_{vector} \approx 300 \times t_{torch}$$

All tests were run on an `Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz, 4 Cores, 8 Threads` machine