### 1. Get System Info

In [1]:
import cupy as cp
import numpy as np

# Lift NumPy's print threshold so printed arrays are never summarised.
np.set_printoptions(threshold=np.inf)

# Ask the CUDA runtime for the current device and its property table.
device_id = cp.cuda.runtime.getDevice()
props = cp.cuda.runtime.getDeviceProperties(device_id)

# Extract the fields reported below; the device name is returned as bytes.
device_name = props['name'].decode('utf-8')
cc_major, cc_minor = props['major'], props['minor']
total_mem = props['totalGlobalMem'] / (1024 ** 3)  # bytes divided by 1024**3

# Report name, compute capability and total memory, in that order.
print(f"Device Name: {device_name}")
print(f"Compute Capability: {cc_major}.{cc_minor}")
print(f"Total Memory: {total_mem:.2f} GB")

Device Name: Tesla V100-PCIE-16GB
Compute Capability: 7.0
Total Memory: 15.77 GB


### 2. Load Dataset

In [2]:
import tensorflow as tf
import cupy as cp
# Fetch MNIST through Keras as (images, labels) pairs for train and test.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Scale the pixel values by 1/255.
x_train = x_train / 255.0
x_test = x_test / 255.0

# Flatten every 28x28 image and transpose, so each sample is one column.
pixels_per_image = 28 * 28
x_train_flat = x_train.reshape(x_train.shape[0], pixels_per_image).T
x_test_flat = x_test.reshape(x_test.shape[0], pixels_per_image).T

# One-hot encode the labels: pick one row of a 10x10 identity matrix per
# label, then transpose so each sample is one column.
y_train = cp.eye(10)[y_train].T
y_test = cp.eye(10)[y_test].T

# Keep the leading 80% of the training columns for training and hold out
# the trailing 20% for validation (same cut for inputs and labels).
train_size = int(0.8 * x_train_flat.shape[1])  # 48000
x_val, x_train_flat = x_train_flat[:, train_size:], x_train_flat[:, :train_size]
y_val, y_train = y_train[:, train_size:], y_train[:, :train_size]

print("X_train Shape: ", x_train_flat.shape)
print("X_Val Shape: ", x_val.shape)
print("X_test Shape: ", x_test_flat.shape)

print("Y_Train Shape: ", y_train.shape)
print("Y_Val Shape: ", y_val.shape)

print("Y_Test Shape: ", y_test.shape)

# Training-loop settings shared by the models below.
epochs = 10
batch_size = 64

2025-05-30 18:59:09.088665: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-05-30 18:59:09.270750: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-30 18:59:10.565469: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2025-05-30 18:59:10.565564: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such 

X_train Shape:  (784, 48000)
X_Val Shape:  (784, 12000)
X_test Shape:  (784, 10000)
Y_Train Shape:  (10, 48000)
Y_Val Shape:  (10, 12000)
Y_Test Shape:  (10, 10000)


### 3. spn_0. The object-oriented approach with sequential forward and backprop

In [3]:
from spn_0.spn import SPN as SPN_0
import cupy as cp

# Instantiate the spn_0 model, then seed CuPy's RNG before any node is created.
model_0 = SPN_0()
cp.random.seed(42)

input_features = 784 # for MNIST dataset
hidden_nodes = 703
output_nodes = 10

# Integer ids used in the connection loops below.
first_output_id = hidden_nodes + 1
last_node_id = hidden_nodes + output_nodes

# Manually create the network structure: one 'Relu' node per hidden unit.
for _ in range(hidden_nodes):
    model_0.create_node('Relu', input_features, 'input')

# Connect ids 1..hidden_nodes so every id feeds every larger id.
for src in range(1, hidden_nodes):
    for dst in range(src + 1, first_output_id):
        model_0.add_connection(src, dst)

# Snapshot the node list before the output nodes are created, so each
# output node is only connected from the nodes that existed at this point.
inputs = model_0.input_nodes.copy()
for _ in range(output_nodes):
    new_node = model_0.create_node('None', input_features, 'input')
    model_0.output_nodes.append(new_node)
    model_0.vertices[new_node].output_size = 1

    for node in inputs:
        model_0.add_connection(node, new_node)

# Connect ids first_output_id..last_node_id the same way: each id feeds
# every larger id in that range.
for src in range(first_output_id, last_node_id):
    for dst in range(src + 1, last_node_id + 1):
        model_0.add_connection(src, dst)

model_0.compile()
#model_0.visualize()

Extract weights before training

In [4]:
# Extract weights from spn_0 into a single dense matrix.

# Copy the weights and collect the bias of every node listed in model_0.input_nodes.
source_vertices = [model_0.vertices[node] for node in model_0.input_nodes]
spn_weights = [vertex.weights.copy() for vertex in source_vertices]
biases = [vertex.bias for vertex in source_vertices]

# Every row is padded out to the width of the last node's weight array.
max_pad = spn_weights[-1].shape[1]

# Pad along axis 1 only: one leading column filled with the node's bias,
# and trailing zeros up to max_pad.
padded_rows = [
    cp.pad(
        weights,
        pad_width=((0, 0), (1, max_pad - weights.shape[1])),
        mode='constant',
        constant_values=(bias, 0),
    )
    for weights, bias in zip(spn_weights, biases)
]

# Stack the padded rows; column 0 holds the biases.
spn_weights = cp.vstack(padded_rows)

Run Model

In [5]:
# NOTE(review): the spn_0 run is disabled. With this line commented out,
# train_metrics_0 / val_metrics_0 / test_metrics_0 are never defined, and model_0
# is only used as the source of the initial weights extracted above.
#train_metrics_0, val_metrics_0, test_metrics_0 = model_0.execute(epochs, batch_size, x_train_flat, y_train, x_val, y_val, x_test_flat, y_test)

### 4. spn_1. The block-based forward approach with sequential backprop in cupy

Use spn_0 weights

In [6]:
# Give spn_1 its own copy of the extracted weight matrix; spn_weights itself is
# read again later when the spn_2 parameter blocks are sliced out of it.
spn_1_weights = spn_weights.copy()

In [7]:
from spn_1.spn import SPN as SPN_1

# spn_1 receives the input width, the combined hidden + output node count,
# and the number of output nodes.
total_nodes_1 = hidden_nodes + output_nodes
model_1 = SPN_1(input_features, total_nodes_1, output_nodes)

# Start from the weights extracted from spn_0, then compile.
model_1.set_weights(spn_1_weights)
model_1.compile()

In [8]:
# Prepend a row of ones to every input matrix so that column 0 of the weight
# matrix (the bias column) is multiplied by 1.
#
# The x_* matrices are produced with NumPy in the data-loading cell, while the
# ones rows are CuPy arrays. They are moved to the device explicitly with
# cp.asarray instead of depending on cp.vstack accepting host arrays.
# cp.asarray leaves arrays that are already on the device untouched.
ones_column_train = cp.ones((1, x_train_flat.shape[1]))
x_train_flat_with_ones = cp.vstack((ones_column_train, cp.asarray(x_train_flat)))

ones_column_val = cp.ones((1, x_val.shape[1]))
x_val_with_ones = cp.vstack((ones_column_val, cp.asarray(x_val)))

ones_column_test = cp.ones((1, x_test_flat.shape[1]))
x_test_flat_with_ones = cp.vstack((ones_column_test, cp.asarray(x_test_flat)))

# Adam hyper-parameters shared by the models trained below.
alpha = 0.001
beta_1 = 0.9
beta_2 = 0.999
epsilon = 1e-8
t = 1  # Timestep

In [9]:
# NOTE(review): the spn_1 training and test runs are disabled. With these lines
# commented out, train_metrics_1 / val_metrics_1 / test_metrics_1 are never defined.
#train_metrics_1, val_metrics_1 = model_1.fit(x_train_flat_with_ones, y_train, x_val_with_ones, y_val, epochs, batch_size, alpha, beta_1, beta_2, epsilon)
#test_metrics_1 = model_1.test(x_test_flat_with_ones, y_test)

### 5. spn_2. Block-based forward and backprop approach in pytorch

In [10]:
import torch

def cp_to_torch_tensor(cp_array):
    """Return the contents of `cp_array` as a float32 torch tensor on the GPU.

    The array is first brought to host memory with `cp.asnumpy`, copied into a
    new torch tensor, cast with `.float()` and finally moved to the current
    CUDA device with `.cuda()`.
    """
    host_array = cp.asnumpy(cp_array)
    host_tensor = torch.tensor(host_array)
    return host_tensor.float().cuda()

In [11]:
from torch.utils.data import TensorDataset, DataLoader

# Convert the column-major (features, N) matrices to torch and transpose them
# so that every row is one sample.
X_train = cp_to_torch_tensor(x_train_flat).T  # [N, 784]
X_val = cp_to_torch_tensor(x_val).T
X_test = cp_to_torch_tensor(x_test_flat).T

# The labels are one-hot encoded, so after the transpose they are [N, 10]
# float tensors (not class-index vectors).
Y_train = cp_to_torch_tensor(y_train).T  # [N, 10]
Y_val = cp_to_torch_tensor(y_val).T
Y_test = cp_to_torch_tensor(y_test).T

train_dataset = TensorDataset(X_train, Y_train)
val_dataset = TensorDataset(X_val, Y_val)

# Shuffle the training set with a dedicated, seeded generator so that the batch
# order is the same on every Restart & Run All. The seed matches the
# cp.random.seed(42) used when building spn_0.
shuffle_generator = torch.Generator().manual_seed(42)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=shuffle_generator)

# Validation and test data are iterated in their stored order.
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(TensorDataset(X_test, Y_test), batch_size=batch_size)

In [12]:
from spn_2.spn import SPN as SPN_2
    
# spn_2 receives the input width, the combined hidden + output node count and
# the number of output nodes, then is moved to the GPU.
# NOTE(review): the meaning of the positional False is not visible here; confirm in spn_2.spn.
total_nodes_2 = hidden_nodes + output_nodes
model_2 = SPN_2(input_features, total_nodes_2, output_nodes, False).cuda()

In [13]:
import torch.nn as nn

# Slice the dense spn_0 weight matrix into the list of parameter blocks that
# is attached to model_2, plus one bias vector.
spn_2_weights = []

# For every row: one past the index of its right-most non-zero column.
# argmax over the column-reversed "!= 0" mask gives that column's distance from the right edge.
limits = spn_weights.shape[1] - cp.argmax(spn_weights[:, ::-1] != 0, axis=1)
start = 1  # column 0 holds the biases, so weight columns begin at 1
old_limit = 0

# Copy the limits to host once, so the loop compares and slices with plain
# Python ints instead of 0-d device arrays on every iteration.
for limit in cp.asnumpy(limits).tolist():
    if limit > old_limit:
        old_limit = limit
        # Newly covered columns [start, limit), taken for the rows from
        # index (limit - 1 - input_features) to the end.
        block = spn_weights[limit - 1 - input_features:, start:limit]
        spn_2_weights.append(nn.Parameter(torch.as_tensor(block.copy().get()).float().cuda()))
        start = limit

# Column 0 of the matrix is the per-row bias.
spn_2_biases = nn.Parameter(torch.as_tensor(spn_weights[:, 0].get()).float().cuda())

In [14]:
# Attach the parameters sliced from the spn_0 weight matrix to model_2.
# NOTE(review): extend() appends, so running this cell a second time adds the
# same blocks again; presumably model_2.weights starts out empty — confirm in
# spn_2.spn and re-create model_2 before re-running.
model_2.weights.extend(spn_2_weights)
model_2.biases = spn_2_biases

In [15]:
import torch.optim as optim

# Trace model_2 with a random example batch of shape (batch_size, input_features).
# model_2 is rebound to the traced module from here on.
example_batch_2 = torch.randn(batch_size, input_features).cuda()
model_2 = torch.jit.trace(model_2, example_batch_2)

# Adam over the traced module's parameters, using the shared hyper-parameters.
adam_settings_2 = dict(lr=alpha, betas=(beta_1, beta_2), eps=epsilon)
optimizer = optim.Adam(model_2.parameters(), **adam_settings_2)
criterion = nn.CrossEntropyLoss()

  if input_start_idx < nodes - output_size:


In [16]:
from utils import train

# Run the shared training routine for model_2 and unpack its three results.
train_metrics_2, val_metrics_2, test_metrics_2 = train(
    model_2,
    train_loader,
    val_loader,
    test_loader,
    epochs,
    optimizer,
    criterion,
)

Epoch: 1 Total_Time: 119.7386 Average_Time_per_batch: 0.1597 Train_Accuracy: 0.9249 Train_Loss: 0.2447 Val_Accuracy: 0.9597 Val_Loss: 0.1292
Epoch: 2 Total_Time: 117.7434 Average_Time_per_batch: 0.1570 Train_Accuracy: 0.9705 Train_Loss: 0.0958 Val_Accuracy: 0.9713 Val_Loss: 0.0970
Epoch: 3 Total_Time: 118.1292 Average_Time_per_batch: 0.1575 Train_Accuracy: 0.9795 Train_Loss: 0.0645 Val_Accuracy: 0.9735 Val_Loss: 0.0890
Epoch: 4 Total_Time: 118.0554 Average_Time_per_batch: 0.1574 Train_Accuracy: 0.9853 Train_Loss: 0.0458 Val_Accuracy: 0.9729 Val_Loss: 0.0943
Epoch: 5 Total_Time: 117.7072 Average_Time_per_batch: 0.1569 Train_Accuracy: 0.9881 Train_Loss: 0.0370 Val_Accuracy: 0.9729 Val_Loss: 0.0954
Epoch: 6 Total_Time: 117.8271 Average_Time_per_batch: 0.1571 Train_Accuracy: 0.9906 Train_Loss: 0.0300 Val_Accuracy: 0.9778 Val_Loss: 0.0892
Epoch: 7 Total_Time: 117.9259 Average_Time_per_batch: 0.1572 Train_Accuracy: 0.9917 Train_Loss: 0.0235 Val_Accuracy: 0.9777 Val_Loss: 0.0867
Epoch: 8 Tota

In [17]:
from spn_3.spn import SPN as SPN_3
    
# spn_3 is constructed with the same arguments as spn_2 and moved to the GPU.
# No spn_0 weights are assigned to it in the cells shown here.
# NOTE(review): the meaning of the positional False is not visible here; confirm in spn_3.spn.
total_nodes_3 = hidden_nodes + output_nodes
model_3 = SPN_3(input_features, total_nodes_3, output_nodes, False).cuda()

In [18]:
# Trace model_3 with a random example batch of shape (batch_size, input_features).
# model_3 is rebound to the traced module from here on.
example_batch_3 = torch.randn(batch_size, input_features).cuda()
model_3 = torch.jit.trace(model_3, example_batch_3)

# A new Adam optimizer and loss for model_3; both rebind the names used for model_2.
adam_settings_3 = dict(lr=alpha, betas=(beta_1, beta_2), eps=epsilon)
optimizer = optim.Adam(model_3.parameters(), **adam_settings_3)
criterion = nn.CrossEntropyLoss()

In [19]:
# Run the shared training routine for model_3 on the same loaders and epochs as model_2.
train_metrics_3, val_metrics_3, test_metrics_3 = train(
    model_3,
    train_loader,
    val_loader,
    test_loader,
    epochs,
    optimizer,
    criterion,
)

Epoch: 1 Total_Time: 217.9252 Average_Time_per_batch: 0.2906 Train_Accuracy: 0.9269 Train_Loss: 0.2418 Val_Accuracy: 0.9618 Val_Loss: 0.1225
Epoch: 2 Total_Time: 121.6688 Average_Time_per_batch: 0.1622 Train_Accuracy: 0.9698 Train_Loss: 0.0979 Val_Accuracy: 0.9694 Val_Loss: 0.1034
Epoch: 3 Total_Time: 121.4814 Average_Time_per_batch: 0.1620 Train_Accuracy: 0.9789 Train_Loss: 0.0665 Val_Accuracy: 0.9731 Val_Loss: 0.0947
Epoch: 4 Total_Time: 120.2777 Average_Time_per_batch: 0.1604 Train_Accuracy: 0.9854 Train_Loss: 0.0469 Val_Accuracy: 0.9724 Val_Loss: 0.0984
Epoch: 5 Total_Time: 121.1118 Average_Time_per_batch: 0.1615 Train_Accuracy: 0.9879 Train_Loss: 0.0365 Val_Accuracy: 0.9706 Val_Loss: 0.1089
Epoch: 6 Total_Time: 120.5235 Average_Time_per_batch: 0.1607 Train_Accuracy: 0.9910 Train_Loss: 0.0279 Val_Accuracy: 0.9728 Val_Loss: 0.1033
Epoch: 7 Total_Time: 120.2340 Average_Time_per_batch: 0.1603 Train_Accuracy: 0.9915 Train_Loss: 0.0267 Val_Accuracy: 0.9752 Val_Loss: 0.0947
Epoch: 8 Tota