In [10]:
import cupy as cp
import time
import psutil
import os
import csv
from torchvision import datasets, transforms

In [11]:
# --------- Set Up Environment ---------
process = psutil.Process(os.getpid())
output_csv = "scratch_cnn_gpu.csv"

# --------- Load CIFAR-10 Data (Images Only) ---------
transform = transforms.Compose([transforms.ToTensor()])
cifar10 = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
full_data = cp.stack([cp.array(img[0].numpy(), dtype=cp.float32) for img in cifar10])

# --------- Define CNN Weights and Biases ---------
weights1 = cp.random.rand(8, 3, 3, 3).astype(cp.float32)
bias1 = cp.random.rand(8).astype(cp.float32)
weights2 = cp.random.rand(16, 8, 3, 3).astype(cp.float32)
bias2 = cp.random.rand(16).astype(cp.float32)
weights3 = cp.random.rand(32, 16, 3, 3).astype(cp.float32)
bias3 = cp.random.rand(32).astype(cp.float32)

Files already downloaded and verified


In [12]:
# --------- Utility: im2col ---------
def im2col(input_data, filter_h, filter_w, stride=1):
    C, H, W = input_data.shape
    out_h = (H - filter_h) // stride + 1
    out_w = (W - filter_w) // stride + 1

    col = cp.zeros((C, filter_h, filter_w, out_h, out_w), dtype=cp.float32)
    for y in range(filter_h):
        y_max = y + stride * out_h
        for x in range(filter_w):
            x_max = x + stride * out_w
            col[:, y, x, :, :] = input_data[:, y:y_max:stride, x:x_max:stride]

    col = col.transpose(3, 4, 0, 1, 2).reshape(out_h * out_w, -1)
    return col

# --------- CNN Operations with Im2Col ---------
def relu(x):
    return cp.maximum(0, x)

def conv2d(x, w, b, stride=1):
    FN, C, FH, FW = w.shape
    col = im2col(x, FH, FW, stride)       # shape: (out_h*out_w, C*FH*FW)
    w_col = w.reshape(FN, -1)             # shape: (FN, C*FH*FW)

    out = cp.dot(w_col, col.T) + b.reshape(-1, 1)  # shape: (FN, out_h*out_w)

    out_h = (x.shape[1] - FH) // stride + 1
    out_w = (x.shape[2] - FW) // stride + 1
    out = out.reshape(FN, out_h, out_w)
    return out

def max_pool2d(x, size=2, stride=2):
    c, h, w = x.shape
    out_h = h // stride
    out_w = w // stride
    pooled = cp.zeros((c, out_h, out_w), dtype=cp.float32)
    for ch in range(c):
        for i in range(out_h):
            for j in range(out_w):
                pooled[ch, i, j] = cp.max(x[ch, i*stride:i*stride+size, j*stride:j*stride+size])
    return pooled

def forward_pass(x, w1, b1, w2, b2, w3, b3):
    x = conv2d(x, w1, b1)
    x = relu(x)
    x = max_pool2d(x)
    x = conv2d(x, w2, b2)
    x = relu(x)
    x = max_pool2d(x)
    x = conv2d(x, w3, b3)
    x = relu(x)
    x = max_pool2d(x)
    return x

In [13]:
# --------- Logging CSV Output ---------
with open(output_csv, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Image_Index", "GPU_Memory_Used(MB)", "CPU_Usage(%)", "RAM_Usage(MB)", "Time_Per_Image(s)"])

    print("\n[CuPy Scratch CNN] Starting forward pass on 50,000 images using GPU (im2col)...")
    total_start = time.time()

    for i, image in enumerate(full_data):
        start = time.time()
        output = forward_pass(image, weights1, bias1, weights2, bias2, weights3, bias3)
        cp.cuda.Device(0).synchronize()
        end = time.time()

        # GPU memory usage
        free_mem, total_mem = cp.cuda.runtime.memGetInfo()
        gpu_mem_used_MB = (total_mem - free_mem) / (1024 ** 2)

        # CPU and RAM usage
        cpu_percent = process.cpu_percent(interval=0.01)
        ram_usage = process.memory_info().rss / (1024 ** 2)
        time_per_image = end - start

        if i % 5000 == 0:
            print(f"Image {i}: GPU Mem={gpu_mem_used_MB:.2f} MB, CPU={cpu_percent:.2f}%, RAM={ram_usage:.2f} MB, Time={time_per_image:.4f}s")

        writer.writerow([i, f"{gpu_mem_used_MB:.2f}", f"{cpu_percent:.2f}", f"{ram_usage:.2f}", f"{time_per_image:.6f}"])

    total_end = time.time()
    print(f"\nTotal time for 50,000 images: {total_end - total_start:.2f} seconds")
    print(f"Last output shape: {output.shape}")


[CuPy Scratch CNN] Starting forward pass on 50,000 images using GPU (im2col)...
Image 0: GPU Mem=1874.94 MB, CPU=0.00%, RAM=1109.29 MB, Time=0.3601s
Image 5000: GPU Mem=1874.94 MB, CPU=0.00%, RAM=1109.29 MB, Time=0.1170s
Image 10000: GPU Mem=1874.94 MB, CPU=0.00%, RAM=1109.29 MB, Time=0.1190s
Image 15000: GPU Mem=1874.94 MB, CPU=0.00%, RAM=1109.29 MB, Time=0.1172s
Image 20000: GPU Mem=1874.94 MB, CPU=0.00%, RAM=1109.29 MB, Time=0.1178s
Image 25000: GPU Mem=1874.94 MB, CPU=0.00%, RAM=1109.29 MB, Time=0.1217s
Image 30000: GPU Mem=1874.94 MB, CPU=0.00%, RAM=1109.29 MB, Time=0.1211s
Image 35000: GPU Mem=1874.94 MB, CPU=0.00%, RAM=1109.29 MB, Time=0.1195s
Image 40000: GPU Mem=1874.94 MB, CPU=0.00%, RAM=1109.29 MB, Time=0.1163s
Image 45000: GPU Mem=1874.94 MB, CPU=0.00%, RAM=1109.29 MB, Time=0.1175s

Total time for 50,000 images: 6590.93 seconds
Last output shape: (32, 2, 2)
