In [5]:
import cupy as cp
import time
import psutil
import csv
from torchvision import datasets, transforms

In [6]:
# Setup
process = psutil.Process()
output_csv = "resnet18_scratch_gpu_im2col.csv"

# Load CIFAR-10 Dataset (GPU Tensor)
transform = transforms.Compose([transforms.ToTensor()])
cifar10 = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
full_data = cp.stack([cp.array(img[0].numpy(), dtype=cp.float32) for img in cifar10])
full_data = full_data.reshape((-1, 3, 32, 32))  # shape: [50000, 3, 32, 32]



Files already downloaded and verified


In [7]:

# Utility: im2col Convolution
def im2col(input, kernel_size, stride, padding):
    N, C, H, W = input.shape
    KH, KW = kernel_size
    OH = (H + 2 * padding - KH) // stride + 1
    OW = (W + 2 * padding - KW) // stride + 1

    img_padded = cp.pad(input, ((0, 0), (0, 0), (padding, padding), (padding, padding)))
    col = cp.zeros((N, C, KH, KW, OH, OW), dtype=cp.float32)

    for y in range(KH):
        for x in range(KW):
            col[:, :, y, x, :, :] = img_padded[:, :, y:y + OH * stride:stride, x:x + OW * stride:stride]

    return col.transpose(0, 4, 5, 1, 2, 3).reshape(N * OH * OW, -1)

def conv2d_im2col(x, weight, bias, stride=1, padding=1):
    N, C, H, W = x.shape
    F, _, KH, KW = weight.shape
    x_col = im2col(x, (KH, KW), stride, padding)
    w_col = weight.reshape(F, -1).T
    out = x_col @ w_col + bias
    OH = (H + 2 * padding - KH) // stride + 1
    OW = (W + 2 * padding - KW) // stride + 1
    out = out.reshape(N, OH, OW, F).transpose(0, 3, 1, 2)
    return out

def relu(x):
    return cp.maximum(0, x)

def identity_block(x, W1, b1, W2, b2):
    x_shortcut = x
    x = relu(conv2d_im2col(x, W1, b1))
    x = conv2d_im2col(x, W2, b2)
    return relu(x + x_shortcut)

def downsample_block(x, W1, b1, W2, b2, W_short, b_short, stride):
    x_shortcut = conv2d_im2col(x, W_short, b_short, stride=stride, padding=0)
    x = relu(conv2d_im2col(x, W1, b1, stride=stride))
    x = conv2d_im2col(x, W2, b2)
    return relu(x + x_shortcut)

# ResNet18 Scratch GPU Model with im2col
class ResNet18CIFAR_GPU_Im2Col:
    def __init__(self):
        self.conv1_W = cp.random.randn(64, 3, 3, 3).astype(cp.float32) * 0.01
        self.conv1_b = cp.zeros(64, dtype=cp.float32)

        self.id1_W1, self.id1_b1 = cp.random.randn(64, 64, 3, 3).astype(cp.float32) * 0.01, cp.zeros(64)
        self.id1_W2, self.id1_b2 = cp.random.randn(64, 64, 3, 3).astype(cp.float32) * 0.01, cp.zeros(64)

        self.ds1_W1, self.ds1_b1 = cp.random.randn(128, 64, 3, 3).astype(cp.float32) * 0.01, cp.zeros(128)
        self.ds1_W2, self.ds1_b2 = cp.random.randn(128, 128, 3, 3).astype(cp.float32) * 0.01, cp.zeros(128)
        self.ds1_short_W, self.ds1_short_b = cp.random.randn(128, 64, 1, 1).astype(cp.float32) * 0.01, cp.zeros(128)

        self.id2_W1, self.id2_b1 = cp.random.randn(128, 128, 3, 3).astype(cp.float32) * 0.01, cp.zeros(128)
        self.id2_W2, self.id2_b2 = cp.random.randn(128, 128, 3, 3).astype(cp.float32) * 0.01, cp.zeros(128)

        self.ds2_W1, self.ds2_b1 = cp.random.randn(256, 128, 3, 3).astype(cp.float32) * 0.01, cp.zeros(256)
        self.ds2_W2, self.ds2_b2 = cp.random.randn(256, 256, 3, 3).astype(cp.float32) * 0.01, cp.zeros(256)
        self.ds2_short_W, self.ds2_short_b = cp.random.randn(256, 128, 1, 1).astype(cp.float32) * 0.01, cp.zeros(256)

        self.id3_W1, self.id3_b1 = cp.random.randn(256, 256, 3, 3).astype(cp.float32) * 0.01, cp.zeros(256)
        self.id3_W2, self.id3_b2 = cp.random.randn(256, 256, 3, 3).astype(cp.float32) * 0.01, cp.zeros(256)

        self.ds3_W1, self.ds3_b1 = cp.random.randn(512, 256, 3, 3).astype(cp.float32) * 0.01, cp.zeros(512)
        self.ds3_W2, self.ds3_b2 = cp.random.randn(512, 512, 3, 3).astype(cp.float32) * 0.01, cp.zeros(512)
        self.ds3_short_W, self.ds3_short_b = cp.random.randn(512, 256, 1, 1).astype(cp.float32) * 0.01, cp.zeros(512)

        self.id4_W1, self.id4_b1 = cp.random.randn(512, 512, 3, 3).astype(cp.float32) * 0.01, cp.zeros(512)
        self.id4_W2, self.id4_b2 = cp.random.randn(512, 512, 3, 3).astype(cp.float32) * 0.01, cp.zeros(512)

        self.fc_W = cp.random.randn(512, 10).astype(cp.float32) * 0.01
        self.fc_b = cp.zeros(10, dtype=cp.float32)

    def forward(self, x):
        x = relu(conv2d_im2col(x, self.conv1_W, self.conv1_b))
        x = identity_block(x, self.id1_W1, self.id1_b1, self.id1_W2, self.id1_b2)
        x = downsample_block(x, self.ds1_W1, self.ds1_b1, self.ds1_W2, self.ds1_b2, self.ds1_short_W, self.ds1_short_b, stride=2)
        x = identity_block(x, self.id2_W1, self.id2_b1, self.id2_W2, self.id2_b2)
        x = downsample_block(x, self.ds2_W1, self.ds2_b1, self.ds2_W2, self.ds2_b2, self.ds2_short_W, self.ds2_short_b, stride=2)
        x = identity_block(x, self.id3_W1, self.id3_b1, self.id3_W2, self.id3_b2)
        x = downsample_block(x, self.ds3_W1, self.ds3_b1, self.ds3_W2, self.ds3_b2, self.ds3_short_W, self.ds3_short_b, stride=2)
        x = identity_block(x, self.id4_W1, self.id4_b1, self.id4_W2, self.id4_b2)
        x = x.mean(axis=(2, 3))  # GAP
        return x @ self.fc_W + self.fc_b



In [8]:
# Model
model = ResNet18CIFAR_GPU_Im2Col()

# Logging
with open(output_csv, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Image_Index", "GPU_Memory_Used(MB)", "CPU_Usage(%)", "RAM_Usage(MB)", "Time_Per_Image(s)"])
    print("\n[CuPy Scratch ResNet + im2col] Starting forward pass on 50,000 CIFAR-10 images...")

    total_start = time.time()
    for i, image in enumerate(full_data):
        image = image[cp.newaxis, ...]  # [1, 3, 32, 32]
        start = time.time()
        output = model.forward(image)
        cp.cuda.Device(0).synchronize()
        end = time.time()

        gpu_mem_MB = (cp.cuda.runtime.memGetInfo()[1] - cp.cuda.runtime.memGetInfo()[0]) / (1024 ** 2)
        cpu_percent = process.cpu_percent(interval=0.01)
        ram_MB = process.memory_info().rss / (1024 ** 2)
        duration = end - start

        if i % 5000 == 0:
            print(f"Image {i}: GPU={gpu_mem_MB:.2f}MB, CPU={cpu_percent:.2f}%, RAM={ram_MB:.2f}MB, Time={duration:.4f}s")

        writer.writerow([i, f"{gpu_mem_MB:.2f}", f"{cpu_percent:.2f}", f"{ram_MB:.2f}", f"{duration:.6f}"])

    total_end = time.time()
    print(f"\nTotal time for 50,000 images: {total_end - total_start:.2f} seconds")
    print(f"Last output shape: {output.shape}")


[CuPy Scratch ResNet + im2col] Starting forward pass on 50,000 CIFAR-10 images...
Image 0: GPU=1998.94MB, CPU=0.00%, RAM=1126.38MB, Time=1.6318s
Image 5000: GPU=1998.94MB, CPU=0.00%, RAM=1126.38MB, Time=0.0093s
Image 10000: GPU=1998.94MB, CPU=0.00%, RAM=1126.38MB, Time=0.0088s
Image 15000: GPU=1998.94MB, CPU=0.00%, RAM=1126.38MB, Time=0.0082s
Image 20000: GPU=1998.94MB, CPU=0.00%, RAM=1126.38MB, Time=0.0085s
Image 25000: GPU=1998.94MB, CPU=0.00%, RAM=1126.38MB, Time=0.0081s
Image 30000: GPU=1998.94MB, CPU=0.00%, RAM=1126.38MB, Time=0.0082s
Image 35000: GPU=1998.94MB, CPU=0.00%, RAM=1126.38MB, Time=0.0093s
Image 40000: GPU=1998.94MB, CPU=0.00%, RAM=1126.38MB, Time=0.0091s
Image 45000: GPU=1998.94MB, CPU=0.00%, RAM=1126.38MB, Time=0.0088s

Total time for 50,000 images: 963.67 seconds
Last output shape: (1, 10)
