Single-Core Scratch Implementation

In [None]:
import numpy as np
import csv
import time
import psutil
from torchvision import datasets, transforms

# CIFAR-10 training split, each sample converted to a tensor by ToTensor.
transform = transforms.Compose([transforms.ToTensor()])
cifar10 = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
# Stack the image part of every (image, label) sample into one float32 array.
images = np.stack([
    np.asarray(sample[0].numpy(), dtype=np.float32)
    for sample in cifar10
])  # shape: [50000, 3, 32, 32]

# Rectified linear unit
def relu(x):
    """Return the element-wise maximum of 0 and ``x`` (negatives become 0)."""
    floor = 0
    return np.maximum(floor, x)

# Define im2col
def im2col(input_data, kernel_h, kernel_w, stride=1, padding=0):
    """Unfold the sliding windows of a 4-D batch into rows of a 2-D matrix.

    Args:
        input_data: Array indexed as (N, C, H, W).
        kernel_h: Window height.
        kernel_w: Window width.
        stride: Step between consecutive windows along H and W.
        padding: Number of zeros added on each side of H and W.

    Returns:
        Array of shape (N * out_h * out_w, C * kernel_h * kernel_w). Rows are
        ordered by (n, out_y, out_x) and columns by (c, ky, kx). Floating
        inputs keep their dtype; any other dtype yields float64.
    """
    N, C, H, W = input_data.shape
    out_h = (H + 2 * padding - kernel_h) // stride + 1
    out_w = (W + 2 * padding - kernel_w) // stride + 1
    img = np.pad(input_data, ((0, 0), (0, 0), (padding, padding), (padding, padding)), mode='constant')

    # Allocate in the input's own float precision. A bare np.zeros(...) is
    # float64 and would silently promote float32 data (and every later
    # matmul) to double precision.
    col_dtype = img.dtype if np.issubdtype(img.dtype, np.floating) else np.float64
    # np.empty is safe: the loops below assign every (y, x) slice in full.
    col = np.empty((N, C, kernel_h, kernel_w, out_h, out_w), dtype=col_dtype)
    for y in range(kernel_h):
        y_max = y + stride * out_h
        for x in range(kernel_w):
            x_max = x + stride * out_w
            # All windows' (y, x) kernel tap, gathered with one strided slice.
            col[:, :, y, x, :, :] = img[:, :, y:y_max:stride, x:x_max:stride]

    # (N, C, kh, kw, oh, ow) -> (N, oh, ow, C, kh, kw) -> one row per window.
    col = col.transpose(0, 4, 5, 1, 2, 3).reshape(N * out_h * out_w, -1)
    return col

# 2-D convolution expressed as a single matrix product
def conv2d_im2col(x, weight, bias, stride=1, padding=1):
    """Convolve ``x`` (N, C, H, W) with ``weight`` (F, C, KH, KW) plus ``bias``.

    The input is unfolded with ``im2col``, multiplied by the flattened
    filters, offset by ``bias`` (one value per filter) and returned in
    (N, F, OH, OW) layout.
    """
    batch, _, in_h, in_w = x.shape
    n_filters, _, k_h, k_w = weight.shape

    # Spatial size of the output feature map.
    out_h = (in_h + 2 * padding - k_h) // stride + 1
    out_w = (in_w + 2 * padding - k_w) // stride + 1

    patches = im2col(x, k_h, k_w, stride, padding)
    flat_filters = weight.reshape(n_filters, -1)
    flat_out = np.matmul(patches, flat_filters.T) + bias

    # Rows are ordered (n, oh, ow); move the filter axis next to the batch axis.
    nhwc = flat_out.reshape(batch, out_h, out_w, n_filters)
    return nhwc.transpose(0, 3, 1, 2)

# ResNet Blocks
class ResNet18CIFAR:
    """Forward-only, ResNet-18-style network built on NumPy convolutions.

    Layout as implemented here: a 3x3 stem convolution (3 -> 64 channels),
    two 64-channel identity blocks, then three stages that each apply a
    stride-2 downsample block followed by an identity block
    (64 -> 128 -> 256 -> 512), global average pooling and a 512 -> 10 linear
    layer. There are no normalisation layers and no training code; every
    weight is random (scaled by 0.01) and every bias is zero.
    """

    def __init__(self):
        # Initial Conv
        self.conv1_W, self.conv1_b = self._conv_params(64, 3, 3)

        # Define weights
        self.id_blocks = []     # (W1, b1, W2, b2) for the two 64-channel blocks
        self.ds_blocks = []     # (W1, b1, W2, b2, W_short, b_short) per stage
        self.ds_id_blocks = []  # (W1, b1, W2, b2) identity block after each downsample

        # Identity blocks
        for _ in range(2):
            W1, b1 = self._conv_params(64, 64, 3)
            W2, b2 = self._conv_params(64, 64, 3)
            self.id_blocks.append((W1, b1, W2, b2))

        # Downsample + identity: (64→128), (128→256), (256→512)
        channels = [(64, 128), (128, 256), (256, 512)]
        for in_c, out_c in channels:
            W1, b1 = self._conv_params(out_c, in_c, 3)
            W2, b2 = self._conv_params(out_c, out_c, 3)
            W_short, b_short = self._conv_params(out_c, in_c, 1)
            self.ds_blocks.append((W1, b1, W2, b2, W_short, b_short))

            # These weights used to be re-sampled inside forward() on every
            # call, which made the output of a fixed model non-deterministic
            # and put random-number generation inside the timed region.
            id_W1, id_b1 = self._conv_params(out_c, out_c, 3)
            id_W2, id_b2 = self._conv_params(out_c, out_c, 3)
            self.ds_id_blocks.append((id_W1, id_b1, id_W2, id_b2))

        # Final FC
        self.fc_W = np.random.randn(512, 10).astype(np.float32) * 0.01
        self.fc_b = np.zeros(10, dtype=np.float32)

    @staticmethod
    def _conv_params(out_c, in_c, ksize):
        """Return float32 (weight, bias) for an ``out_c x in_c x ksize x ksize`` conv."""
        weight = np.random.randn(out_c, in_c, ksize, ksize).astype(np.float32) * 0.01
        bias = np.zeros(out_c, dtype=np.float32)
        return weight, bias

    def identity_block(self, x, W1, b1, W2, b2):
        """Apply conv -> ReLU -> conv, add the unchanged input, then ReLU."""
        # x is only rebound below, never modified in place, so the skip
        # connection can reference it without a copy.
        x_shortcut = x
        x = relu(conv2d_im2col(x, W1, b1, stride=1, padding=1))
        x = conv2d_im2col(x, W2, b2, stride=1, padding=1)
        return relu(x + x_shortcut)

    def downsample_block(self, x, W1, b1, W2, b2, W_short, b_short, stride=2):
        """Residual block whose first conv and shortcut conv both use ``stride``."""
        # The shortcut conv (unpadded) brings the input to the main path's shape.
        x_shortcut = conv2d_im2col(x, W_short, b_short, stride=stride, padding=0)
        x = relu(conv2d_im2col(x, W1, b1, stride=stride, padding=1))
        x = conv2d_im2col(x, W2, b2, stride=1, padding=1)
        return relu(x + x_shortcut)

    def forward(self, x):
        """Run the network on ``x`` of shape (N, 3, H, W); returns (N, 10) scores.

        Uses only weights created in ``__init__``, so repeated calls on the
        same instance and input give the same result.
        """
        x = relu(conv2d_im2col(x, self.conv1_W, self.conv1_b, stride=1, padding=1))
        for W1, b1, W2, b2 in self.id_blocks:
            x = self.identity_block(x, W1, b1, W2, b2)
        for ds_params, id_params in zip(self.ds_blocks, self.ds_id_blocks):
            x = self.downsample_block(x, *ds_params, stride=2)
            x = self.identity_block(x, *id_params)
        x = x.mean(axis=(2, 3))  # GAP
        return x @ self.fc_W + self.fc_b

model = ResNet18CIFAR()

# Logging
# Per-image forward-pass benchmark: one CSV row per image.
# NOTE(review): nothing here limits NumPy/BLAS threading; confirm thread
# limits externally if a strictly single-core measurement is required.

# psutil.cpu_percent() reports utilisation since the previous call; the very
# first call has no baseline and returns a meaningless 0.0, so prime it here.
psutil.cpu_percent()

with open('resnet18_cpu_forwardpass_cifar10.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Image_Index", "CPU (%)", "RAM (MB)", "Time (s)", "Throughput (img/s)"])
    for i, image in enumerate(images):
        if i % 1000 == 0:
            print(f"Processing image {i}...")
        x = image[None, :, :, :]  # add a batch axis of size 1

        # perf_counter is the monotonic, high-resolution clock meant for
        # timing short intervals; time.time() is a wall clock.
        start = time.perf_counter()
        _ = model.forward(x)
        elapsed = time.perf_counter() - start

        cpu = psutil.cpu_percent()  # system-wide, since the previous call
        ram = psutil.virtual_memory().used / (1024 ** 2)  # system-wide used memory, MB
        throughput = 1 / elapsed if elapsed > 0 else 0

        writer.writerow([i, cpu, ram, elapsed, throughput])

Multi-Core Scratch Implementation

In [None]:
import numpy as np
import csv
import time
import psutil
import multiprocessing
from torchvision import datasets, transforms
from concurrent.futures import ProcessPoolExecutor

# Load the CIFAR-10 training split (the stray `==` that stood here was a
# syntax error and has been replaced by this comment).
transform = transforms.Compose([transforms.ToTensor()])
cifar10 = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
# NOTE(review): this runs at import time; with the "spawn" start method each
# worker process re-imports the module and would repeat this load - confirm.
images = np.stack([np.array(img[0].numpy(), dtype=np.float32) for img in cifar10])  # [50000, 3, 32, 32]


def relu(x):
    """Return the element-wise maximum of 0 and ``x`` (negatives become 0)."""
    floor = 0
    return np.maximum(floor, x)

def im2col(input_data, kernel_h, kernel_w, stride=1, padding=0):
    """Unfold the sliding windows of a 4-D batch into rows of a 2-D matrix.

    Args:
        input_data: Array indexed as (N, C, H, W).
        kernel_h: Window height.
        kernel_w: Window width.
        stride: Step between consecutive windows along H and W.
        padding: Number of zeros added on each side of H and W.

    Returns:
        Array of shape (N * out_h * out_w, C * kernel_h * kernel_w). Rows are
        ordered by (n, out_y, out_x) and columns by (c, ky, kx). Floating
        inputs keep their dtype; any other dtype yields float64.
    """
    N, C, H, W = input_data.shape
    out_h = (H + 2 * padding - kernel_h) // stride + 1
    out_w = (W + 2 * padding - kernel_w) // stride + 1
    img = np.pad(input_data, ((0, 0), (0, 0), (padding, padding), (padding, padding)), mode='constant')

    # Allocate in the input's own float precision. A bare np.zeros(...) is
    # float64 and would silently promote float32 data (and every later
    # matmul) to double precision.
    col_dtype = img.dtype if np.issubdtype(img.dtype, np.floating) else np.float64
    # np.empty is safe: the loops below assign every (y, x) slice in full.
    col = np.empty((N, C, kernel_h, kernel_w, out_h, out_w), dtype=col_dtype)
    for y in range(kernel_h):
        y_max = y + stride * out_h
        for x in range(kernel_w):
            x_max = x + stride * out_w
            # All windows' (y, x) kernel tap, gathered with one strided slice.
            col[:, :, y, x, :, :] = img[:, :, y:y_max:stride, x:x_max:stride]

    # (N, C, kh, kw, oh, ow) -> (N, oh, ow, C, kh, kw) -> one row per window.
    col = col.transpose(0, 4, 5, 1, 2, 3).reshape(N * out_h * out_w, -1)
    return col

def conv2d_im2col(x, weight, bias, stride=1, padding=1):
    """Convolve ``x`` (N, C, H, W) with ``weight`` (F, C, KH, KW) plus ``bias``.

    The input is unfolded with ``im2col``, multiplied by the flattened
    filters, offset by ``bias`` (one value per filter) and returned in
    (N, F, OH, OW) layout.
    """
    batch, _, in_h, in_w = x.shape
    n_filters, _, k_h, k_w = weight.shape

    # Spatial size of the output feature map.
    out_h = (in_h + 2 * padding - k_h) // stride + 1
    out_w = (in_w + 2 * padding - k_w) // stride + 1

    patches = im2col(x, k_h, k_w, stride, padding)
    flat_filters = weight.reshape(n_filters, -1)
    flat_out = np.matmul(patches, flat_filters.T) + bias

    # Rows are ordered (n, oh, ow); move the filter axis next to the batch axis.
    nhwc = flat_out.reshape(batch, out_h, out_w, n_filters)
    return nhwc.transpose(0, 3, 1, 2)



class ResNet18CIFAR:
    """Forward-only, ResNet-18-style network built on NumPy convolutions.

    Layout as implemented here: a 3x3 stem convolution (3 -> 64 channels),
    two 64-channel identity blocks, then three stages that each apply a
    stride-2 downsample block followed by an identity block
    (64 -> 128 -> 256 -> 512), global average pooling and a 512 -> 10 linear
    layer. There are no normalisation layers and no training code; every
    weight is random (scaled by 0.01) and every bias is zero.
    """

    def __init__(self):
        self.conv1_W, self.conv1_b = self._conv_params(64, 3, 3)

        self.id_blocks = []     # (W1, b1, W2, b2) for the two 64-channel blocks
        self.ds_blocks = []     # (W1, b1, W2, b2, W_short, b_short) per stage
        self.ds_id_blocks = []  # (W1, b1, W2, b2) identity block after each downsample

        for _ in range(2):
            W1, b1 = self._conv_params(64, 64, 3)
            W2, b2 = self._conv_params(64, 64, 3)
            self.id_blocks.append((W1, b1, W2, b2))

        channels = [(64, 128), (128, 256), (256, 512)]
        for in_c, out_c in channels:
            W1, b1 = self._conv_params(out_c, in_c, 3)
            W2, b2 = self._conv_params(out_c, out_c, 3)
            W_short, b_short = self._conv_params(out_c, in_c, 1)
            self.ds_blocks.append((W1, b1, W2, b2, W_short, b_short))

            # These weights used to be re-sampled inside forward() on every
            # call, which made the output of a fixed model non-deterministic
            # and put random-number generation inside the timed region.
            id_W1, id_b1 = self._conv_params(out_c, out_c, 3)
            id_W2, id_b2 = self._conv_params(out_c, out_c, 3)
            self.ds_id_blocks.append((id_W1, id_b1, id_W2, id_b2))

        self.fc_W = np.random.randn(512, 10).astype(np.float32) * 0.01
        self.fc_b = np.zeros(10, dtype=np.float32)

    @staticmethod
    def _conv_params(out_c, in_c, ksize):
        """Return float32 (weight, bias) for an ``out_c x in_c x ksize x ksize`` conv."""
        weight = np.random.randn(out_c, in_c, ksize, ksize).astype(np.float32) * 0.01
        bias = np.zeros(out_c, dtype=np.float32)
        return weight, bias

    def identity_block(self, x, W1, b1, W2, b2):
        """Apply conv -> ReLU -> conv, add the unchanged input, then ReLU."""
        # x is only rebound below, never modified in place, so the skip
        # connection can reference it without a copy.
        x_shortcut = x
        x = relu(conv2d_im2col(x, W1, b1, stride=1, padding=1))
        x = conv2d_im2col(x, W2, b2, stride=1, padding=1)
        return relu(x + x_shortcut)

    def downsample_block(self, x, W1, b1, W2, b2, W_short, b_short, stride=2):
        """Residual block whose first conv and shortcut conv both use ``stride``."""
        # The shortcut conv (unpadded) brings the input to the main path's shape.
        x_shortcut = conv2d_im2col(x, W_short, b_short, stride=stride, padding=0)
        x = relu(conv2d_im2col(x, W1, b1, stride=stride, padding=1))
        x = conv2d_im2col(x, W2, b2, stride=1, padding=1)
        return relu(x + x_shortcut)

    def forward(self, x):
        """Run the network on ``x`` of shape (N, 3, H, W); returns (N, 10) scores.

        Uses only weights created in ``__init__``, so repeated calls on the
        same instance and input give the same result.
        """
        x = relu(conv2d_im2col(x, self.conv1_W, self.conv1_b, stride=1, padding=1))
        for W1, b1, W2, b2 in self.id_blocks:
            x = self.identity_block(x, W1, b1, W2, b2)
        for ds_params, id_params in zip(self.ds_blocks, self.ds_id_blocks):
            x = self.downsample_block(x, *ds_params, stride=2)
            x = self.identity_block(x, *id_params)
        x = x.mean(axis=(2, 3))  # global average pooling over H and W
        return x @ self.fc_W + self.fc_b



_WORKER_MODEL = None  # per-process model cache, filled on first use


def process_image(i):
    """Time one forward pass on ``images[i]`` and return a CSV row.

    Args:
        i: Index into the module-level ``images`` array.

    Returns:
        ``[i, cpu_percent, ram_mb, elapsed_seconds, images_per_second]``.
    """
    import time
    import psutil

    # Build the network once per worker process instead of once per image:
    # constructing it samples every weight again, which costs far more than
    # it should when repeated for each of the images.
    global _WORKER_MODEL
    if _WORKER_MODEL is None:
        _WORKER_MODEL = ResNet18CIFAR()
    model = _WORKER_MODEL

    x = images[i][None, :, :, :]  # add a batch axis of size 1

    # perf_counter is the monotonic, high-resolution clock meant for timing.
    start = time.perf_counter()
    _ = model.forward(x)
    elapsed = time.perf_counter() - start

    # System-wide utilisation since this process's previous call; the first
    # call in each worker has no baseline and returns 0.0.
    cpu = psutil.cpu_percent(interval=None)
    ram = psutil.virtual_memory().used / (1024 ** 2)  # system-wide used memory, MB
    throughput = 1 / elapsed if elapsed > 0 else 0

    return [i, cpu, ram, elapsed, throughput]


if __name__ == "__main__":
    # One worker process per CPU reported by multiprocessing.
    num_workers = multiprocessing.cpu_count()
    columns = ["Image_Index", "CPU (%)", "RAM (MB)", "Time (s)", "Throughput (img/s)"]

    with open('resnet18_multicore_scratch_window.csv', mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(columns)

        with ProcessPoolExecutor(max_workers=num_workers) as executor:
            # map() hands back rows in submission order, one per image index.
            rows = executor.map(process_image, range(len(images)))
            for i, row in enumerate(rows):
                if not i % 1000:
                    print(f"Completed {i} images...")
                writer.writerow(row)

Single-Core PyTorch Implementation

In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
import time
import psutil
import csv
import os
from torchvision.models import resnet18

# Pin PyTorch to one intra-op thread and one inter-op thread
NUM_THREADS = 1
torch.set_num_threads(NUM_THREADS)
torch.set_num_interop_threads(NUM_THREADS)
print(f"[CPU mode] PyTorch using {torch.get_num_threads()} thread")

# Everything runs on the CPU
device = torch.device("cpu")

# Per-channel normalisation applied after ToTensor
CHANNEL_MEAN = (0.5, 0.5, 0.5)
CHANNEL_STD = (0.5, 0.5, 0.5)
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(CHANNEL_MEAN, CHANNEL_STD),
])

# CIFAR-10 train/test splits and their loaders
DATA_ROOT = './data'
BATCH_SIZE = 64

trainset = torchvision.datasets.CIFAR10(
    root=DATA_ROOT, train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=BATCH_SIZE, shuffle=True)

testset = torchvision.datasets.CIFAR10(
    root=DATA_ROOT, train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=BATCH_SIZE, shuffle=False)

# torchvision ResNet-18 with a 10-way head; the stem is swapped for a 3x3
# stride-1 convolution and the max-pool is replaced by a pass-through.
model = resnet18(num_classes=10)
model.conv1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3,
                        stride=1, padding=1, bias=False)
model.maxpool = nn.Identity()
model = model.to(device)

# Cross-entropy loss, SGD with momentum
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Accuracy Function
def calculate_accuracy(loader):
    """Return the top-1 accuracy (in percent) of the global ``model`` on ``loader``.

    The model is put in eval mode for the pass and then returned to whatever
    mode it was in before the call, even if evaluation raises.

    Args:
        loader: Iterable yielding ``(images, labels)`` batches.

    Returns:
        Accuracy as a float in [0, 100]; 0.0 if ``loader`` yields no samples.
    """
    was_training = model.training
    model.eval()
    correct = 0
    total = 0
    try:
        with torch.no_grad():
            for inputs, targets in loader:
                targets = targets.to(device)
                outputs = model(inputs.to(device))
                _, predicted = torch.max(outputs, 1)
                correct += (predicted == targets).sum().item()
                total += targets.size(0)
    finally:
        # Restore the caller's mode instead of forcing train mode.
        model.train(was_training)
    if total == 0:
        return 0.0
    return 100 * correct / total

# CSV File
csv_file = 'resnet18_single_core_log_pytorch.csv'

# Training
num_epochs = 10
last_row = []

# psutil.cpu_percent(interval=None) measures utilisation since the previous
# call; the first call has no baseline and returns a meaningless 0.0, so
# prime it before the first epoch.
psutil.cpu_percent(interval=None)

for epoch in range(num_epochs):
    total_train = 0
    correct_train = 0
    epoch_loss = 0.0
    batch_count = 0
    start_epoch_time = time.perf_counter()

    for batch_idx, (images, labels) in enumerate(trainloader):
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # detach() replaces the legacy `.data` access for reading predictions.
        _, predicted = torch.max(outputs.detach(), 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()
        # Overwritten every batch: after the loop this holds the loss of the
        # epoch's final batch, not an epoch average.
        epoch_loss = loss.item()
        batch_count += 1

    end_epoch_time = time.perf_counter()
    elapsed = end_epoch_time - start_epoch_time  # training time for the whole epoch
    train_acc = 100 * correct_train / total_train
    test_acc = calculate_accuracy(testloader)  # not included in `elapsed`
    cpu = psutil.cpu_percent(interval=None)  # system-wide, since the previous call
    ram = psutil.Process(os.getpid()).memory_info().rss / (1024 ** 2)  # this process's RSS, MB

    last_row = [
        epoch + 1, batch_count, f"{epoch_loss:.4f}",
        f"{train_acc:.2f}", f"{test_acc:.2f}",
        f"{cpu:.2f}", f"{ram:.2f}", f"{elapsed:.6f}"
    ]

    print(f"[Epoch {epoch+1}] Final: Loss={epoch_loss:.4f}, "
          f"Train Acc={train_acc:.2f}%, Test Acc={test_acc:.2f}%, "
          f"CPU={cpu:.2f}%, RAM={ram:.2f}MB, Time={elapsed:.6f}s")


# Only the row of the last completed epoch is written.
# NOTE(review): the "Batch_Index" column receives the epoch's batch count and
# "Time_Per_Batch(s)" receives the whole-epoch time - confirm the intended
# labels before changing either, since downstream readers may depend on them.
with open(csv_file, mode='w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Epoch", "Batch_Index", "Loss", "Training_Accuracy(%)", "Testing_Accuracy(%)",
                     "CPU_Usage(%)", "RAM_Usage(MB)", "Time_Per_Batch(s)"])
    writer.writerow(last_row)


Multi-Core PyTorch Implementation

In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
import time
import psutil
import csv
import os
from torchvision.models import resnet18

# Use physical CPU cores
# psutil.cpu_count(logical=False) returns None when the physical core count
# cannot be determined; passing None to torch.set_num_threads raises, so fall
# back to the logical count and finally to 1.
num_threads = psutil.cpu_count(logical=False) or os.cpu_count() or 1
torch.set_num_threads(num_threads)
torch.set_num_interop_threads(num_threads)
print(f"[CPU mode] PyTorch using {torch.get_num_threads()} threads")

# Device
device = torch.device("cpu")

# Transform
# One mean/std entry per RGB channel, matching the single-core script. The
# previous single-element tuples broadcast to the same values.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Datasets
# NOTE(review): batch_size=1 here versus 64 in the single-core script, so the
# two runs are not directly comparable - confirm this is intended.
trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=1, shuffle=True)

testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=1, shuffle=False)

# Model
# torchvision ResNet-18 with a 10-way head; the stem is replaced by a 3x3
# stride-1 convolution and the max-pool by a pass-through.
model = resnet18(num_classes=10)
model.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
model.maxpool = nn.Identity()
model = model.to(device)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Accuracy Function
def calculate_accuracy(loader):
    """Return the top-1 accuracy (in percent) of the global ``model`` on ``loader``.

    The model is put in eval mode for the pass and then returned to whatever
    mode it was in before the call, even if evaluation raises.

    Args:
        loader: Iterable yielding ``(images, labels)`` batches.

    Returns:
        Accuracy as a float in [0, 100]; 0.0 if ``loader`` yields no samples.
    """
    was_training = model.training
    model.eval()
    correct = 0
    total = 0
    try:
        with torch.no_grad():
            for inputs, targets in loader:
                targets = targets.to(device)
                outputs = model(inputs.to(device))
                _, predicted = torch.max(outputs, 1)
                correct += (predicted == targets).sum().item()
                total += targets.size(0)
    finally:
        # Restore the caller's mode instead of forcing train mode.
        model.train(was_training)
    if total == 0:
        return 0.0
    return 100 * correct / total

# CSV File
csv_file = 'resnet18_cpu_log.csv'
with open(csv_file, mode='w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Epoch", "Batch_Index", "Loss", "Training_Accuracy(%)", "Testing_Accuracy(%)",
                     "CPU_Usage(%)", "RAM_Usage(MB)", "Time_Per_Batch(s)", "Throughput(img/sec)"])

# Training
num_epochs = 10

# psutil.cpu_percent(interval=None) measures utilisation since the previous
# call; the first call has no baseline and returns a meaningless 0.0, so
# prime it before the first logged batch.
psutil.cpu_percent(interval=None)

for epoch in range(num_epochs):
    total_train = 0
    correct_train = 0
    for batch_idx, (images, labels) in enumerate(trainloader):
        start_time = time.perf_counter()

        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # detach() replaces the legacy `.data` access for reading predictions.
        _, predicted = torch.max(outputs.detach(), 1)
        batch_size = labels.size(0)
        total_train += batch_size
        correct_train += (predicted == labels).sum().item()
        train_acc = 100 * correct_train / total_train  # running accuracy within this epoch

        elapsed = time.perf_counter() - start_time
        # Use the actual batch size so the figure stays in images/sec if the
        # loader's batch_size is changed; guard against a zero interval.
        throughput = batch_size / elapsed if elapsed > 0 else 0
        global_idx = batch_idx + 1

        # Log the first batch, every 10000th batch and the last batch of each epoch.
        should_log = (batch_idx == 0) or (global_idx % 10000 == 0) or (global_idx == len(trainloader))

        if should_log:
            loss_value = loss.item()
            test_acc = calculate_accuracy(testloader)  # full test-set pass
            cpu = psutil.cpu_percent(interval=None)  # system-wide, since the previous call
            ram = psutil.Process(os.getpid()).memory_info().rss / (1024 ** 2)  # this process's RSS, MB

            # Write to CSV
            with open(csv_file, mode='a', newline='') as f:
                writer = csv.writer(f)
                writer.writerow([
                    epoch + 1, global_idx, f"{loss_value:.4f}",
                    f"{train_acc:.2f}", f"{test_acc:.2f}",
                    f"{cpu:.2f}", f"{ram:.2f}", f"{elapsed:.6f}", f"{throughput:.2f}"
                ])

            print(f"[Epoch {epoch+1}] Batch {global_idx}: Loss={loss_value:.4f}, "
                  f"Train Acc={train_acc:.2f}%, Test Acc={test_acc:.2f}%, "
                  f"CPU={cpu:.2f}%, RAM={ram:.2f}MB, Time={elapsed:.6f}s, "
                  f"Throughput={throughput:.2f} img/sec")

print(f"Training complete. Log saved to: {csv_file}")
