In [1]:
import numpy as np
import resource
import time

## KNN

#### 1. Numpy

In [2]:
import numpy as np
import time
import resource
from collections import Counter
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

# --- Load MNIST ---
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X, y = mnist["data"], mnist["target"].astype(np.int64)

# Normalize to [0, 1]
X = X / 255.0

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=1000, test_size=100, stratify=y, random_state=42)

# --- KNN Logic ---
def euclidean_distance(a, b):
    return np.sqrt(np.sum((a - b)**2, axis=1))

def knn_numpy(X_train, y_train, X_test, k=5):
    predictions = []
    for test_point in X_test:
        distances = euclidean_distance(X_train, test_point)
        nearest = np.argsort(distances)[:k]
        nearest_labels = y_train[nearest]
        predictions.append(Counter(nearest_labels).most_common(1)[0][0])
    return np.array(predictions)

# --- Peak RAM ---
def print_peak_ram_usage():
    usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print(f"Peak RAM Usage: {usage / 1024:.2f} MB")

# --- Memory Bandwidth ---
def test_bandwidth_numpy(X_train, X_test):
    start = time.time()
    _ = np.sum((X_train - X_test[0]) ** 2, axis=1)
    end = time.time()
    bytes_moved = (X_train.nbytes + X_test[0].nbytes) * 2
    bandwidth = bytes_moved / (end - start) / 1e9
    print(f"Estimated Memory Bandwidth: {bandwidth:.2f} GB/s")

# --- Run Benchmark ---
start = time.time()
y_pred = knn_numpy(X_train, y_train, X_test)
end = time.time()
accuracy = np.mean(y_pred == y_test)

print(f"KNN-Numpy Accuracy (MNIST): {accuracy:.2f}")
print(f"Execution Time: {end - start:.4f} seconds")
print_peak_ram_usage()
test_bandwidth_numpy(X_train, X_test)

KNN-Numpy Accuracy (MNIST): 0.96
Execution Time: 0.2646 seconds
Peak RAM Usage: 1508.25 MB
Estimated Memory Bandwidth: 5.22 GB/s


#### 2. PyTorch

In [3]:
import torch
import time
import resource
from collections import Counter
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

# --- Load MNIST from OpenML ---
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X, y = mnist["data"], mnist["target"].astype(int)

# Normalize and split
X = X / 255.0
X_train_np, X_test_np, y_train_np, y_test_np = train_test_split(X, y, train_size=1000, test_size=100, stratify=y, random_state=42)

# --- Convert to PyTorch tensors (CPU-only) ---
device = "cpu"
X_train = torch.tensor(X_train_np, dtype=torch.float32, device=device)
y_train = torch.tensor(y_train_np, dtype=torch.int64, device=device)
X_test = torch.tensor(X_test_np, dtype=torch.float32, device=device)
y_test = torch.tensor(y_test_np, dtype=torch.int64, device=device)

# --- Euclidean distance ---
def euclidean_distance(a, b):
    return torch.sqrt(torch.sum((a - b) ** 2, dim=1))

# --- KNN Logic ---
def knn_torch(X_train, y_train, X_test, k=5):
    predictions = []
    for i in range(X_test.shape[0]):
        test_point = X_test[i]
        distances = euclidean_distance(X_train, test_point)
        nearest = torch.topk(distances, k=k, largest=False).indices
        nearest_labels = y_train[nearest]
        predicted = torch.mode(nearest_labels).values.item()
        predictions.append(predicted)
    return torch.tensor(predictions)

# --- Peak RAM ---
def print_peak_ram_usage():
    usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print(f"[PyTorch] Peak RAM Usage: {usage / 1024:.2f} MB")

# --- Memory Bandwidth ---
def test_bandwidth_torch(X_train, X_test):
    start = time.time()
    _ = torch.sum((X_train - X_test[0]) ** 2, dim=1)
    end = time.time()

    bytes_moved = (X_train.element_size() * X_train.nelement() +
                   X_test[0].element_size() * X_test[0].nelement()) * 2
    bandwidth = bytes_moved / (end - start) / 1e9
    print(f"[PyTorch] Estimated Memory Bandwidth: {bandwidth:.2f} GB/s")

# --- Benchmark ---
start = time.time()
y_pred = knn_torch(X_train, y_train, X_test, k=5)
end = time.time()

accuracy = (y_pred == y_test).float().mean().item()
print(f"KNN-PyTorch Accuracy (MNIST): {accuracy:.2f}")
print(f"Execution Time: {end - start:.4f} seconds")
print_peak_ram_usage()
test_bandwidth_torch(X_train, X_test)

KNN-PyTorch Accuracy (MNIST): 0.96
Execution Time: 0.1857 seconds
[PyTorch] Peak RAM Usage: 2732.95 MB
[PyTorch] Estimated Memory Bandwidth: 3.75 GB/s


#### 3. C/C++

In [4]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Fetch and prepare MNIST
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X, y = mnist["data"], mnist["target"].astype(int)
X = X / 255.0

# Split and save to CSV
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=1000, test_size=100, stratify=y, random_state=42)

np.savetxt("X_train.csv", X_train, delimiter=",")
np.savetxt("y_train.csv", y_train, fmt='%d')
np.savetxt("X_test.csv", X_test, delimiter=",")
np.savetxt("y_test.csv", y_test, fmt='%d')

In [5]:
%%writefile knn_cpu.cpp
#include <iostream>
#include <vector>
#include <cmath>
#include <algorithm>
#include <map>
#include <chrono>
#include <fstream>
#include <sstream>
#include <sys/resource.h>

using namespace std;
using namespace std::chrono;

// Load CSV into 2D vector
vector<vector<double>> load_csv_data(const string& filename, int rows, int cols) {
    vector<vector<double>> data(rows, vector<double>(cols));
    ifstream file(filename);
    string line;
    for (int i = 0; i < rows && getline(file, line); ++i) {
        stringstream ss(line);
        string val;
        for (int j = 0; j < cols && getline(ss, val, ','); ++j) {
            data[i][j] = stod(val);
        }
    }
    return data;
}

// Load label CSV into 1D vector
vector<int> load_csv_labels(const string& filename, int rows) {
    vector<int> labels(rows);
    ifstream file(filename);
    string line;
    for (int i = 0; i < rows && getline(file, line); ++i) {
        labels[i] = stoi(line);
    }
    return labels;
}

double euclidean_distance(const vector<double>& a, const vector<double>& b) {
    double sum = 0.0;
    for (size_t i = 0; i < a.size(); ++i) {
        double diff = a[i] - b[i];
        sum += diff * diff;
    }
    return sqrt(sum);
}

int knn_predict(const vector<vector<double>>& X_train, const vector<int>& y_train,
                const vector<double>& x_test, int k) {
    vector<pair<double, int>> distances;
    for (size_t i = 0; i < X_train.size(); ++i) {
        double dist = euclidean_distance(X_train[i], x_test);
        distances.push_back({dist, y_train[i]});
    }

    sort(distances.begin(), distances.end());

    map<int, int> class_count;
    for (int i = 0; i < k; ++i) {
        class_count[distances[i].second]++;
    }

    int prediction = -1, max_count = -1;
    for (const auto& pair : class_count) {
        if (pair.second > max_count) {
            max_count = pair.second;
            prediction = pair.first;
        }
    }
    return prediction;
}

void print_peak_ram_usage() {
    struct rusage usage;
    getrusage(RUSAGE_SELF, &usage);
    cout << "Peak RAM Usage: " << usage.ru_maxrss / 1024.0 << " MB" << endl;
}

double estimate_bandwidth(const vector<vector<double>>& X_train, const vector<double>& x_test) {
    auto start = high_resolution_clock::now();
    for (size_t i = 0; i < X_train.size(); ++i) {
        double sum = 0.0;
        for (size_t j = 0; j < x_test.size(); ++j) {
            double diff = X_train[i][j] - x_test[j];
            sum += diff * diff;
        }
        volatile double result = sqrt(sum);
    }
    auto end = high_resolution_clock::now();
    double elapsed = duration_cast<nanoseconds>(end - start).count() / 1e9;

    size_t bytes = X_train.size() * x_test.size() * sizeof(double) * 2;
    return bytes / elapsed / 1e9;
}

int main() {
    int num_train = 1000;
    int num_test = 100;
    int num_features = 784;
    int k = 5;

    vector<vector<double>> X_train = load_csv_data("X_train.csv", num_train, num_features);
    vector<int> y_train = load_csv_labels("y_train.csv", num_train);
    vector<vector<double>> X_test = load_csv_data("X_test.csv", num_test, num_features);
    vector<int> y_test = load_csv_labels("y_test.csv", num_test);

    vector<int> y_pred;
    auto start = high_resolution_clock::now();

    for (int i = 0; i < num_test; ++i) {
        int pred = knn_predict(X_train, y_train, X_test[i], k);
        y_pred.push_back(pred);
    }

    auto stop = high_resolution_clock::now();
    auto duration = duration_cast<milliseconds>(stop - start);
    int correct = 0;
    for (int i = 0; i < num_test; ++i) {
        if (y_pred[i] == y_test[i]) correct++;
    }

    cout << "KNN-C++ Accuracy (MNIST): " << static_cast<double>(correct) / num_test << endl;
    cout << "Execution Time: " << duration.count() << " ms" << endl;
    print_peak_ram_usage();
    double bandwidth = estimate_bandwidth(X_train, X_test[0]);
    cout << "Estimated Memory Bandwidth: " << bandwidth << " GB/s" << endl;

    return 0;
}

Writing knn_cpu.cpp


In [6]:
!g++ -O2 knn_cpu.cpp -o knn_cpu
!./knn_cpu

KNN-C++ Accuracy (MNIST): 0.96
Execution Time: 124 ms
Peak RAM Usage: 2745.22 MB
Estimated Memory Bandwidth: 10.4639 GB/s


##MATRIX MULTIPLICATION

####Numpy

In [7]:
import numpy as np
import time
import tracemalloc
import psutil
import os

def estimate_bandwidth(A, B, time_seconds):
    bytes_read = A.nbytes + B.nbytes
    gb_read = bytes_read / 1e9
    return gb_read / time_seconds if time_seconds > 1e-6 else 0.0

def benchmark_dense(n=1024):
    print(" Dense Matrix Multiplication (A @ B)")

    A = np.random.rand(n, n).astype(np.float32)
    B = np.random.rand(n, n).astype(np.float32)

    tracemalloc.start()
    start_time = time.time()
    C = A @ B
    end_time = time.time()
    _, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()

    exec_time = end_time - start_time
    bandwidth = estimate_bandwidth(A, B, exec_time)

    print(f"Shape: ({n}, {n}) x ({n}, {n})")
    print(f"Execution Time: {exec_time:.4f} seconds")
    print(f"Peak RAM Usage: {peak / 1e6:.2f} MB")
    print(f"Estimated Memory Bandwidth: {bandwidth:.2f} GB/s")

def benchmark_batched(batch_size=64, n=128):
    print("\n Batched Matrix Multiplication (np.matmul(A, B))")

    A = np.random.rand(batch_size, n, n).astype(np.float32)
    B = np.random.rand(batch_size, n, n).astype(np.float32)

    tracemalloc.start()
    start_time = time.time()
    C = np.matmul(A, B)  # or A @ B in NumPy 1.10+
    end_time = time.time()
    _, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()

    exec_time = end_time - start_time
    bandwidth = estimate_bandwidth(A, B, exec_time)

    print(f"Shape: ({batch_size}, {n}, {n}) x ({batch_size}, {n}, {n})")
    print(f"Execution Time: {exec_time:.4f} seconds")
    print(f"Peak RAM Usage: {peak / 1e6:.2f} MB")
    print(f"Estimated Memory Bandwidth: {bandwidth:.2f} GB/s")

def main():
    benchmark_dense(n=1024)
    benchmark_batched(batch_size=64, n=128)

if __name__ == "__main__":
    main()


 Dense Matrix Multiplication (A @ B)
Shape: (1024, 1024) x (1024, 1024)
Execution Time: 0.0381 seconds
Peak RAM Usage: 4.20 MB
Estimated Memory Bandwidth: 0.22 GB/s

 Batched Matrix Multiplication (np.matmul(A, B))
Shape: (64, 128, 128) x (64, 128, 128)
Execution Time: 0.0078 seconds
Peak RAM Usage: 4.20 MB
Estimated Memory Bandwidth: 1.07 GB/s


####PyTorch

In [8]:
import torch
import time
import tracemalloc
import psutil
import os

def estimate_bandwidth(A, B, time_seconds):
    bytes_read = A.element_size() * (A.numel() + B.numel())
    gb_read = bytes_read / 1e9
    return gb_read / time_seconds if time_seconds > 1e-6 else 0.0

def benchmark_dense(n=1024):
    print(" Dense Matrix Multiplication on CPU (A @ B)")

    A = torch.rand(n, n, dtype=torch.float32)
    B = torch.rand(n, n, dtype=torch.float32)

    tracemalloc.start()
    start_time = time.time()
    C = A @ B
    end_time = time.time()
    _, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()

    exec_time = end_time - start_time
    bandwidth = estimate_bandwidth(A, B, exec_time)

    print(f"Shape: ({n}, {n}) x ({n}, {n})")
    print(f"Execution Time: {exec_time:.4f} seconds")
    print(f"Peak RAM Usage: {peak / 1e6:.2f} MB")
    print(f"Estimated Memory Bandwidth: {bandwidth:.2f} GB/s")

def benchmark_batched(batch_size=64, n=128):
    print("\n Batched Matrix Multiplication on CPU (A @ B)")

    A = torch.rand(batch_size, n, n, dtype=torch.float32)
    B = torch.rand(batch_size, n, n, dtype=torch.float32)

    tracemalloc.start()
    start_time = time.time()
    C = torch.bmm(A, B)  # or torch.matmul
    end_time = time.time()
    _, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()

    exec_time = end_time - start_time
    bandwidth = estimate_bandwidth(A, B, exec_time)

    print(f"Shape: ({batch_size}, {n}, {n}) x ({batch_size}, {n}, {n})")
    print(f"Execution Time: {exec_time:.4f} seconds")
    print(f"Peak RAM Usage: {peak / 1e6:.2f} MB")
    print(f"Estimated Memory Bandwidth: {bandwidth:.2f} GB/s")

def main():
    benchmark_dense(n=1024)
    benchmark_batched(batch_size=64, n=128)

if __name__ == "__main__":
    main()


 Dense Matrix Multiplication on CPU (A @ B)
Shape: (1024, 1024) x (1024, 1024)
Execution Time: 0.0610 seconds
Peak RAM Usage: 0.00 MB
Estimated Memory Bandwidth: 0.14 GB/s

 Batched Matrix Multiplication on CPU (A @ B)
Shape: (64, 128, 128) x (64, 128, 128)
Execution Time: 0.0091 seconds
Peak RAM Usage: 0.00 MB
Estimated Memory Bandwidth: 0.92 GB/s


####C++

In [9]:
%%writefile matrix_multiplication_cpp.cpp
#include <iostream>
#include <vector>
#include <chrono>
#include <cstdlib>
#include <sys/resource.h>
#include <iomanip>

using namespace std;
using namespace chrono;

long get_peak_ram_usage_mb() {
    struct rusage usage;
    getrusage(RUSAGE_SELF, &usage);
    return usage.ru_maxrss / 1024;
}

void dense_matmul(const vector<vector<float>>& A,
                  const vector<vector<float>>& B,
                  vector<vector<float>>& C,
                  int n) {
    for (int i = 0; i < n; ++i)
        for (int j = 0; j < n; ++j)
            for (int k = 0; k < n; ++k)
                C[i][j] += A[i][k] * B[k][j];
}

void batched_matmul(const vector<vector<vector<float>>>& A,
                    const vector<vector<vector<float>>>& B,
                    vector<vector<vector<float>>>& C,
                    int batch, int n) {
    for (int b = 0; b < batch; ++b)
        for (int i = 0; i < n; ++i)
            for (int j = 0; j < n; ++j)
                for (int k = 0; k < n; ++k)
                    C[b][i][j] += A[b][i][k] * B[b][k][j];
}

double estimate_bandwidth(long bytes, double seconds) {
    return (bytes / 1e9) / seconds;
}

void run_dense(int n) {
    cout << " Dense Matrix Multiplication (C++ std::vector)\n";

    vector<vector<float>> A(n, vector<float>(n));
    vector<vector<float>> B(n, vector<float>(n));
    vector<vector<float>> C(n, vector<float>(n, 0.0f));

    for (auto& row : A)
        for (auto& val : row) val = static_cast<float>(rand()) / RAND_MAX;
    for (auto& row : B)
        for (auto& val : row) val = static_cast<float>(rand()) / RAND_MAX;

    long bytes = 2L * n * n * sizeof(float);

    auto start = high_resolution_clock::now();
    dense_matmul(A, B, C, n);
    auto end = high_resolution_clock::now();

    double duration = chrono::duration<double>(end - start).count();
    double bandwidth = estimate_bandwidth(bytes, duration);
    long peak_mem = get_peak_ram_usage_mb();

    cout << fixed << setprecision(4);
    cout << "Size: " << n << " x " << n << endl;
    cout << "Execution Time: " << duration << " seconds\n";
    cout << "Peak RAM Usage: " << peak_mem << " MB\n";
    cout << "Estimated Memory Bandwidth: " << bandwidth << " GB/s\n";
}

void run_batched(int batch, int n) {
    cout << "\n Batched Matrix Multiplication (C++ std::vector)\n";

    vector<vector<vector<float>>> A(batch, vector<vector<float>>(n, vector<float>(n)));
    vector<vector<vector<float>>> B(batch, vector<vector<float>>(n, vector<float>(n)));
    vector<vector<vector<float>>> C(batch, vector<vector<float>>(n, vector<float>(n, 0.0f)));

    for (auto& mat : A)
        for (auto& row : mat)
            for (auto& val : row) val = static_cast<float>(rand()) / RAND_MAX;

    for (auto& mat : B)
        for (auto& row : mat)
            for (auto& val : row) val = static_cast<float>(rand()) / RAND_MAX;

    long bytes = 2L * batch * n * n * sizeof(float);

    auto start = high_resolution_clock::now();
    batched_matmul(A, B, C, batch, n);
    auto end = high_resolution_clock::now();

    double duration = chrono::duration<double>(end - start).count();
    double bandwidth = estimate_bandwidth(bytes, duration);
    long peak_mem = get_peak_ram_usage_mb();

    cout << fixed << setprecision(4);
    cout << "Batch Size: " << batch << ", Matrix Size: " << n << " x " << n << endl;
    cout << "Execution Time: " << duration << " seconds\n";
    cout << "Peak RAM Usage: " << peak_mem << " MB\n";
    cout << "Estimated Memory Bandwidth: " << bandwidth << " GB/s\n";
}

int main() {
    srand(42);
    run_dense(512);       // You can change to 1024 for larger benchmarks
    run_batched(32, 64);  // You can increase if memory allows
    return 0;
}


Writing matrix_multiplication_cpp.cpp


In [10]:
!g++ -O2 -std=c++17 matrix_multiplication_cpp.cpp -o matmul_cpp


In [11]:
!./matmul_cpp

 Dense Matrix Multiplication (C++ std::vector)
Size: 512 x 512
Execution Time: 0.2166 seconds
Peak RAM Usage: 2745 MB
Estimated Memory Bandwidth: 0.0097 GB/s

 Batched Matrix Multiplication (C++ std::vector)
Batch Size: 32, Matrix Size: 64 x 64
Execution Time: 0.0099 seconds
Peak RAM Usage: 2745 MB
Estimated Memory Bandwidth: 0.1060 GB/s


##CNN

Conv1: 1 → 16 channels, 3×3, ReLU

MaxPool

Conv2: 16 → 32 channels, 3×3, ReLU

MaxPool

Flatten

FC1: 512 → 128

FC2: 128 → 10 (class scores)

####Numpy

In [12]:
import numpy as np
import time
import tracemalloc

def relu(x):
    return np.maximum(0, x)

def maxpool(x, size=2, stride=2):
    n, c, h, w = x.shape
    out_h = (h - size) // stride + 1
    out_w = (w - size) // stride + 1
    out = np.zeros((n, c, out_h, out_w))
    for i in range(out_h):
        for j in range(out_w):
            x_slice = x[:, :, i*stride:i*stride+size, j*stride:j*stride+size]
            out[:, :, i, j] = np.max(x_slice, axis=(2, 3))
    return out

def conv2d(x, w, b, stride=1, padding=0):
    n, c_in, h, w_in = x.shape
    c_out, _, k, _ = w.shape
    h_out = (h + 2*padding - k) // stride + 1
    w_out = (w_in + 2*padding - k) // stride + 1

    x_padded = np.pad(x, ((0,0), (0,0), (padding, padding), (padding, padding)), mode='constant')
    out = np.zeros((n, c_out, h_out, w_out))

    for i in range(h_out):
        for j in range(w_out):
            x_slice = x_padded[:, :, i*stride:i*stride+k, j*stride:j*stride+k]
            for f in range(c_out):
                out[:, f, i, j] = np.sum(x_slice * w[f, :, :, :], axis=(1,2,3))
    return out + b[None, :, None, None]

def flatten(x):
    return x.reshape(x.shape[0], -1)

def fully_connected(x, w, b):
    return np.dot(x, w.T) + b

def estimate_bandwidth(arrays, time_seconds):
    bytes_total = sum([a.nbytes for a in arrays])
    return (bytes_total / 1e9) / time_seconds if time_seconds > 1e-6 else 0.0

def main():
    np.random.seed(42)
    batch_size = 8
    x = np.random.rand(batch_size, 1, 64, 64).astype(np.float32)

    # Conv1: 1→16, kernel 3x3
    w1 = np.random.rand(16, 1, 3, 3).astype(np.float32)
    b1 = np.random.rand(16).astype(np.float32)

    # Conv2: 16→32, kernel 3x3
    w2 = np.random.rand(32, 16, 3, 3).astype(np.float32)
    b2 = np.random.rand(32).astype(np.float32)

    # 🛠 Fix: Compute correct input size after conv+pool layers
    # After Conv1 → Pool → Conv2 → Pool: shape becomes (batch, 32, 16, 16)
    fc1_in = 32 * 16 * 16  # = 8192
    w_fc1 = np.random.rand(128, fc1_in).astype(np.float32)
    b_fc1 = np.random.rand(128).astype(np.float32)

    w_fc2 = np.random.rand(10, 128).astype(np.float32)
    b_fc2 = np.random.rand(10).astype(np.float32)

    # === Benchmark Start ===
    tracemalloc.start()
    start_time = time.time()

    out = conv2d(x, w1, b1, stride=1, padding=1)
    out = relu(out)
    out = maxpool(out)

    out = conv2d(out, w2, b2, stride=1, padding=1)
    out = relu(out)
    out = maxpool(out)

    out = flatten(out)
    out = relu(fully_connected(out, w_fc1, b_fc1))
    out = fully_connected(out, w_fc2, b_fc2)

    end_time = time.time()
    _, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()

    exec_time = end_time - start_time
    ram_mb = peak / 1e6
    bandwidth = estimate_bandwidth([x, w1, w2, out], exec_time)

    print(" Large CNN - NumPy (CPU)")
    print(f"Execution Time: {exec_time:.4f} sec")
    print(f"Peak RAM Usage: {ram_mb:.2f} MB")
    print(f"Estimated Memory Bandwidth: {bandwidth:.2f} GB/s")
    print(f"Output shape: {out.shape}")

if __name__ == "__main__":
    main()


 Large CNN - NumPy (CPU)
Execution Time: 6.4119 sec
Peak RAM Usage: 8.94 MB
Estimated Memory Bandwidth: 0.00 GB/s
Output shape: (8, 10)


####PyTorch

In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import tracemalloc
import psutil
import os

class CNNModel(nn.Module):
    def __init__(self):
        super(CNNModel, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)   # (B, 1, 64, 64) → (B, 16, 64, 64)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)  # (B, 16, 32, 32) → (B, 32, 32, 32)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)         # halves H and W
        self.fc1 = nn.Linear(32 * 16 * 16, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))  # → (B, 16, 32, 32)
        x = self.pool(F.relu(self.conv2(x)))  # → (B, 32, 16, 16)
        x = x.view(x.size(0), -1)             # → (B, 8192)
        x = F.relu(self.fc1(x))               # → (B, 128)
        x = self.fc2(x)                       # → (B, 10)
        return x

def estimate_bandwidth(tensors, exec_time):
    total_bytes = sum([t.element_size() * t.numel() for t in tensors])
    return (total_bytes / 1e9) / exec_time if exec_time > 1e-6 else 0.0

def main():
    torch.manual_seed(42)
    device = torch.device("cpu")
    batch_size = 8

    model = CNNModel().to(device)
    x = torch.randn(batch_size, 1, 64, 64, device=device)

    model.eval()

    tracemalloc.start()
    start_time = time.time()

    with torch.no_grad():
        output = model(x)

    end_time = time.time()
    _, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()

    exec_time = end_time - start_time
    ram_mb = peak / 1e6
    bandwidth = estimate_bandwidth([x, output], exec_time)

    print(" CNN - PyTorch (CPU)")
    print(f"Execution Time: {exec_time:.4f} sec")
    print(f"Peak RAM Usage: {ram_mb:.2f} MB")
    print(f"Estimated Memory Bandwidth: {bandwidth:.2f} GB/s")
    print(f"Output Shape: {output.shape}")

if __name__ == "__main__":
    main()


 CNN - PyTorch (CPU)
Execution Time: 0.1256 sec
Peak RAM Usage: 0.00 MB
Estimated Memory Bandwidth: 0.00 GB/s
Output Shape: torch.Size([8, 10])


**Comparing metrics by improving Metrics**

Model   --   BatchSize	 --   Winner

Small   	--     (< 32)	   --     CPU

Medium   --     (64–128)	 --    Depends

Large   -- 	   ( >128) --	        GPU


####Numpy with increased batch size

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import time

class CNNModel(nn.Module):
    def __init__(self):
        super(CNNModel, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(32 * 16 * 16, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))  # → (B, 16, 32, 32)
        x = self.pool(F.relu(self.conv2(x)))  # → (B, 32, 16, 16)
        x = x.view(x.size(0), -1)             # → (B, 8192)
        x = F.relu(self.fc1(x))               # → (B, 128)
        x = self.fc2(x)                       # → (B, 10)
        return x

def estimate_bandwidth(tensors, exec_time):
    total_bytes = sum([t.element_size() * t.numel() for t in tensors])
    return (total_bytes / 1e9) / exec_time if exec_time > 1e-6 else 0.0

def benchmark(device, batch_size):
    print(f"\n Running on: {device.upper()} with batch size {batch_size}")
    torch.manual_seed(42)
    model = CNNModel().to(device)
    x = torch.randn(batch_size, 1, 64, 64, device=device)

    model.eval()

    # Warm-up
    with torch.no_grad():
        _ = model(x)

    if device == 'cuda':
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.synchronize()

    start_time = time.time()
    with torch.no_grad():
        output = model(x)
    if device == 'cuda':
        torch.cuda.synchronize()
    end_time = time.time()

    exec_time = end_time - start_time
    mem = torch.cuda.max_memory_allocated() / 1e6 if device == 'cuda' else 0
    bandwidth = estimate_bandwidth([x, output], exec_time)

    print(f"Execution Time: {exec_time:.4f} sec")
    print(f"Peak {'GPU' if device=='cuda' else 'RAM'} Memory Usage: {mem:.2f} MB")
    print(f"Estimated Memory Bandwidth: {bandwidth:.2f} GB/s")
    print(f"Output Shape: {output.shape}")

if __name__ == "__main__":
    batch_size = 128
    benchmark('cpu', batch_size)

    if torch.cuda.is_available():
        benchmark('cuda', batch_size)
    else:
        print(" GPU not available. Only CPU benchmark run.")



 Running on: CPU with batch size 128
Execution Time: 0.1753 sec
Peak RAM Memory Usage: 0.00 MB
Estimated Memory Bandwidth: 0.01 GB/s
Output Shape: torch.Size([128, 10])
 GPU not available. Only CPU benchmark run.


####PyTorch with increased batch size

In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import tracemalloc

class CNNModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 16, 3, padding=1)
        self.conv2 = nn.Conv2d(16, 32, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(32 * 16 * 16, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)

def estimate_bandwidth(tensors, time_sec):
    total_bytes = sum([t.element_size() * t.numel() for t in tensors])
    return (total_bytes / 1e9) / time_sec if time_sec > 0 else 0.0

def main():
    batch_size = 128
    model = CNNModel().to("cpu")
    x = torch.randn(batch_size, 1, 64, 64)

    model.eval()

    with torch.no_grad():
        _ = model(x)  # warm-up

    tracemalloc.start()
    start = time.time()
    with torch.no_grad():
        output = model(x)
    end = time.time()
    _, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()

    exec_time = end - start
    ram_mb = peak / 1e6
    bandwidth = estimate_bandwidth([x, output], exec_time)

    print(" CNN - PyTorch (CPU)")
    print(f"Execution Time: {exec_time:.4f} sec")
    print(f"Peak RAM Usage: {ram_mb:.2f} MB")
    print(f"Estimated Memory Bandwidth: {bandwidth:.2f} GB/s")
    print(f"Output Shape: {output.shape}")

if __name__ == "__main__":
    main()


 CNN - PyTorch (CPU)
Execution Time: 0.2223 sec
Peak RAM Usage: 0.00 MB
Estimated Memory Bandwidth: 0.01 GB/s
Output Shape: torch.Size([128, 10])


####C++ with increased batch size

In [17]:
%%writefile cnn_forward_openmp.cpp

#include <iostream>
#include <vector>
#include <cmath>
#include <chrono>
#include <iomanip>
#include <omp.h>

using namespace std;
using namespace chrono;

typedef vector<vector<vector<vector<float>>>> Tensor4D;

float relu(float x) { return x > 0 ? x : 0; }

Tensor4D conv2d_omp(const Tensor4D& x, const Tensor4D& w, const vector<float>& b, int stride = 1, int padding = 0) {
    int N = x.size(), C_in = x[0].size(), H = x[0][0].size(), W = x[0][0][0].size();
    int C_out = w.size(), K = w[0][0].size();
    int H_out = (H + 2 * padding - K) / stride + 1;
    int W_out = (W + 2 * padding - K) / stride + 1;

    Tensor4D out(N, vector<vector<vector<float>>>(C_out, vector<vector<float>>(H_out, vector<float>(W_out, 0.0f))));

    #pragma omp parallel for collapse(4)
    for (int n = 0; n < N; ++n)
        for (int cout = 0; cout < C_out; ++cout)
            for (int i = 0; i < H_out; ++i)
                for (int j = 0; j < W_out; ++j) {
                    float sum = 0.0;
                    for (int cin = 0; cin < C_in; ++cin)
                        for (int ki = 0; ki < K; ++ki)
                            for (int kj = 0; kj < K; ++kj) {
                                int row = i * stride + ki - padding;
                                int col = j * stride + kj - padding;
                                if (row >= 0 && col >= 0 && row < H && col < W)
                                    sum += x[n][cin][row][col] * w[cout][cin][ki][kj];
                            }
                    out[n][cout][i][j] = sum + b[cout];
                }
    return out;
}

Tensor4D relu4D(const Tensor4D& x) {
    Tensor4D out = x;
    #pragma omp parallel for collapse(4)
    for (int n = 0; n < x.size(); ++n)
        for (int c = 0; c < x[0].size(); ++c)
            for (int i = 0; i < x[0][0].size(); ++i)
                for (int j = 0; j < x[0][0][0].size(); ++j)
                    out[n][c][i][j] = relu(x[n][c][i][j]);
    return out;
}

Tensor4D maxpool2D(const Tensor4D& x, int size = 2, int stride = 2) {
    int N = x.size(), C = x[0].size(), H = x[0][0].size(), W = x[0][0][0].size();
    int H_out = (H - size) / stride + 1;
    int W_out = (W - size) / stride + 1;
    Tensor4D out(N, vector<vector<vector<float>>>(C, vector<vector<float>>(H_out, vector<float>(W_out, 0.0f))));

    #pragma omp parallel for collapse(4)
    for (int n = 0; n < N; ++n)
        for (int c = 0; c < C; ++c)
            for (int i = 0; i < H_out; ++i)
                for (int j = 0; j < W_out; ++j) {
                    float max_val = -1e9;
                    for (int ki = 0; ki < size; ++ki)
                        for (int kj = 0; kj < size; ++kj)
                            max_val = max(max_val, x[n][c][i * stride + ki][j * stride + kj]);
                    out[n][c][i][j] = max_val;
                }

    return out;
}

vector<vector<float>> flatten(const Tensor4D& x) {
    int N = x.size(), C = x[0].size(), H = x[0][0].size(), W = x[0][0][0].size();
    vector<vector<float>> out(N, vector<float>(C * H * W));
    #pragma omp parallel for collapse(2)
    for (int n = 0; n < N; ++n)
        for (int i = 0; i < C * H * W; ++i)
            out[n][i] = x[n][i / (H * W)][(i / W) % H][i % W];
    return out;
}

vector<vector<float>> dense(const vector<vector<float>>& x, const vector<vector<float>>& W, const vector<float>& b) {
    int N = x.size(), D = W.size(), F = W[0].size();
    vector<vector<float>> out(N, vector<float>(D, 0.0f));
    #pragma omp parallel for collapse(2)
    for (int n = 0; n < N; ++n)
        for (int d = 0; d < D; ++d) {
            float sum = 0;
            for (int f = 0; f < F; ++f)
                sum += x[n][f] * W[d][f];
            out[n][d] = sum + b[d];
        }
    return out;
}

int main() {
    const int B = 128;  // Bigger batch size
    const int C = 1, H = 28, W = 28;
    const int K1 = 3, K2 = 3;

    Tensor4D input(B, vector<vector<vector<float>>>(C, vector<vector<float>>(H, vector<float>(W, 1.0f))));

    Tensor4D w1(8, vector<vector<vector<float>>>(1, vector<vector<float>>(K1, vector<float>(K1, 0.1f))));
    vector<float> b1(8, 0.1f);

    Tensor4D w2(16, vector<vector<vector<float>>>(8, vector<vector<float>>(K2, vector<float>(K2, 0.1f))));
    vector<float> b2(16, 0.1f);

    vector<vector<float>> w_fc(10, vector<float>(16 * 5 * 5, 0.1f));
    vector<float> b_fc(10, 0.1f);

    cout << fixed << setprecision(4);
    auto start = high_resolution_clock::now();

    auto out1 = conv2d_omp(input, w1, b1, 1, 1);
    auto act1 = relu4D(out1);
    auto pool1 = maxpool2D(act1);

    auto out2 = conv2d_omp(pool1, w2, b2, 1, 1);
    auto act2 = relu4D(out2);
    auto pool2 = maxpool2D(act2);

    auto flat = flatten(pool2);
    auto logits = dense(flat, w_fc, b_fc);

    auto end = high_resolution_clock::now();
    double duration = chrono::duration<double>(end - start).count();


    cout << "CNN Forward Pass (C++ OpenMP, Batch: " << B << ")\n";
    cout << "Execution Time: " << duration << " seconds\n";
    cout << "Output Shape: (" << logits.size() << ", " << logits[0].size() << ")\n";

    return 0;
}


Writing cnn_forward_openmp.cpp


In [18]:
!g++ -O2 -fopenmp cnn_forward_openmp.cpp -o cnn_openmp


In [19]:
!./cnn_openmp

CNN Forward Pass (C++ OpenMP, Batch: 128)
Execution Time: 0.1547 seconds
Output Shape: (128, 10)


##Gradient Computation

####Numpy

In [20]:
import numpy as np
import time
import psutil
import tracemalloc
import os

def quadratic_function(x):
    return np.sum(x ** 2)

def compute_gradient_cpu(f, x, h=1e-5):
    grad = np.zeros_like(x)
    fx = f(x)
    for i in range(x.size):
        x_ph = x.copy()
        x_mh = x.copy()
        x_ph[i] += h
        x_mh[i] -= h
        grad[i] = (f(x_ph) - f(x_mh)) / (2 * h)
    return grad

def estimate_bandwidth(arrays, exec_time):
    total_bytes = sum(a.nbytes for a in arrays)
    return (total_bytes / 1e9) / exec_time if exec_time > 1e-6 else 0.0

def main():
    dim = 10_000
    x = np.random.rand(dim).astype(np.float64)

    # Warm-up
    _ = compute_gradient_cpu(quadratic_function, x.copy())

    # Start performance tracking
    tracemalloc.start()
    process = psutil.Process(os.getpid())
    start_time = time.time()

    grad = compute_gradient_cpu(quadratic_function, x.copy())

    end_time = time.time()
    _, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()

    exec_time = end_time - start_time
    peak_ram = peak / 1e6  # MB
    bandwidth = estimate_bandwidth([x, grad], exec_time)

    print(" CPU Gradient Computation - NumPy")
    print(f"Input Dimension: {dim}")
    print(f"Execution Time: {exec_time:.4f} sec")
    print(f"Peak RAM Usage: {peak_ram:.2f} MB")
    print(f"Estimated Memory Bandwidth: {bandwidth:.4f} GB/s")
    print(f"Gradient Norm: {np.linalg.norm(grad):.4f}")

if __name__ == "__main__":
    main()


 CPU Gradient Computation - NumPy
Input Dimension: 10000
Execution Time: 0.7264 sec
Peak RAM Usage: 0.53 MB
Estimated Memory Bandwidth: 0.0002 GB/s
Gradient Norm: 115.0822


####Numpy
**Increase the dimension**

In [21]:
import numpy as np
import time
import psutil
import tracemalloc
import os

def quadratic_function(x):
    return np.sum(x ** 2)

def compute_gradient_cpu(f, x, h=1e-5):
    grad = np.zeros_like(x)
    for i in range(x.size):
        x_ph = x.copy()
        x_mh = x.copy()
        x_ph[i] += h
        x_mh[i] -= h
        grad[i] = (f(x_ph) - f(x_mh)) / (2 * h)
    return grad

def estimate_bandwidth(arrays, exec_time):
    total_bytes = sum(a.nbytes for a in arrays)
    return (total_bytes / 1e9) / exec_time if exec_time > 1e-6 else 0.0

def main():
    dim = 100_000
    num_iters = 5
    x = np.random.rand(dim).astype(np.float64)

    tracemalloc.start()
    start_time = time.time()

    for _ in range(num_iters):
        grad = compute_gradient_cpu(quadratic_function, x.copy())

    end_time = time.time()
    _, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()

    exec_time = end_time - start_time
    peak_ram = peak / 1e6
    bandwidth = estimate_bandwidth([x, grad], exec_time)

    print(" Scaled Gradient Computation - NumPy")
    print(f"Input Dimension: {dim}")
    print(f"Iterations: {num_iters}")
    print(f"Execution Time: {exec_time:.4f} sec")
    print(f"Peak RAM Usage: {peak_ram:.2f} MB")
    print(f"Estimated Memory Bandwidth: {bandwidth:.4f} GB/s")
    print(f"Gradient Norm: {np.linalg.norm(grad):.4f}")

if __name__ == "__main__":
    main()


 Scaled Gradient Computation - NumPy
Input Dimension: 100000
Iterations: 5
Execution Time: 202.5222 sec
Peak RAM Usage: 6.54 MB
Estimated Memory Bandwidth: 0.0000 GB/s
Gradient Norm: 364.5356


####PyTorch

In [22]:
import torch
import time
import tracemalloc
import psutil
import os

def quadratic_function(x):
    return torch.sum(x ** 2)

def compute_gradient_cpu(f, x, h=1e-5):
    grad = torch.zeros_like(x)
    for i in range(x.size(0)):
        x_ph = x.clone()
        x_mh = x.clone()
        x_ph[i] += h
        x_mh[i] -= h
        grad[i] = (f(x_ph) - f(x_mh)) / (2 * h)
    return grad

def estimate_bandwidth(tensors, exec_time):
    total_bytes = sum([t.element_size() * t.numel() for t in tensors])
    return (total_bytes / 1e9) / exec_time if exec_time > 1e-6 else 0.0

def main():
    dim = 100_000
    num_iters = 3
    device = torch.device("cpu")

    x = torch.rand(dim, dtype=torch.float64, device=device)

    tracemalloc.start()
    process = psutil.Process(os.getpid())
    start_time = time.time()

    for _ in range(num_iters):
        grad = compute_gradient_cpu(quadratic_function, x.clone())

    end_time = time.time()
    _, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()

    exec_time = end_time - start_time
    peak_ram = peak / 1e6  # MB
    bandwidth = estimate_bandwidth([x, grad], exec_time)

    print(" Gradient Computation - PyTorch (CPU)")
    print(f"Input Dimension     : {dim}")
    print(f"Iterations          : {num_iters}")
    print(f"Execution Time      : {exec_time:.4f} sec")
    print(f"Peak RAM Usage      : {peak_ram:.2f} MB")
    print(f"Estimated Bandwidth : {bandwidth:.4f} GB/s")
    print(f"Gradient Norm       : {torch.linalg.norm(grad):.4f}")

if __name__ == "__main__":
    main()


 Gradient Computation - PyTorch (CPU)
Input Dimension     : 100000
Iterations          : 3
Execution Time      : 156.2450 sec
Peak RAM Usage      : 0.17 MB
Estimated Bandwidth : 0.0000 GB/s
Gradient Norm       : 364.6636


####C++

In [23]:
%%writefile gradient_cpu.cpp
// Paste the full code here

#include <iostream>
#include <vector>
#include <cmath>
#include <chrono>
#include <numeric>

using namespace std;
using namespace chrono;

double quadratic_function(const vector<double>& x) {
    double sum = 0.0;
    for (double val : x)
        sum += val * val;
    return sum;
}

vector<double> compute_gradient_cpu(double (*f)(const vector<double>&), const vector<double>& x, double h = 1e-5) {
    int n = x.size();
    vector<double> grad(n);
    for (int i = 0; i < n; ++i) {
        vector<double> x_ph = x;
        vector<double> x_mh = x;
        x_ph[i] += h;
        x_mh[i] -= h;
        grad[i] = (f(x_ph) - f(x_mh)) / (2 * h);
    }
    return grad;
}

double estimate_bandwidth(const vector<double>& x, const vector<double>& grad, double time_sec) {
    size_t bytes = (x.size() + grad.size()) * sizeof(double);
    return (bytes / 1e9) / time_sec;
}

double norm(const vector<double>& v) {
    double sum_sq = 0.0;
    for (double val : v)
        sum_sq += val * val;
    return sqrt(sum_sq);
}

int main() {
    int dim = 100000;
    int iters = 3;
    vector<double> x(dim);

    for (int i = 0; i < dim; ++i)
        x[i] = static_cast<double>(rand()) / RAND_MAX;

    auto start = high_resolution_clock::now();

    vector<double> grad;
    for (int i = 0; i < iters; ++i)
        grad = compute_gradient_cpu(quadratic_function, x);

    auto end = high_resolution_clock::now();
    double elapsed = duration_cast<duration<double>>(end - start).count();
    double bandwidth = estimate_bandwidth(x, grad, elapsed);

    cout << " Gradient Computation - C++ (CPU)" << endl;
    cout << "Input Dimension     : " << dim << endl;
    cout << "Iterations          : " << iters << endl;
    cout << "Execution Time      : " << elapsed << " sec" << endl;
    cout << "Estimated Bandwidth : " << bandwidth << " GB/s" << endl;
    cout << "Gradient Norm       : " << norm(grad) << endl;

    return 0;
}


Writing gradient_cpu.cpp


In [24]:
!g++ -O2 gradient_cpu.cpp -o gradient_cpu

In [25]:
!./gradient_cpu

 Gradient Computation - C++ (CPU)
Input Dimension     : 100000
Iterations          : 3
Execution Time      : 332.439 sec
Estimated Bandwidth : 4.81292e-06 GB/s
Gradient Norm       : 365.052
