In [1]:
import cupy as cp 

x = cp.array([[1, 2, 3],
            [4, 5, 6],
            [7, 8, 9]])

y = cp.array([1,0,0])

print(x.shape)
print(y.shape)


(3, 3)
(3,)


In [2]:
! nvidia-smi

Wed Feb 26 02:21:49 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.120                Driver Version: 550.120        CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce GTX 1650        Off |   00000000:01:00.0 Off |                  N/A |
| N/A   49C    P3             13W /   30W |      65MiB /   4096MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
# lets decide the number of hiiden neurons we want
# we only we use on hidden  layer
n_neurons = 6

num_of_example , num_of_features = x.shape 

# initialize the weights and biases
W1 = cp.random.randn(num_of_features, n_neurons)
b1 = cp.zeros((n_neurons,)) 

print(W1)
print(b1)
print(W1.shape)
print(b1.shape)



[[-2.35713845e-01  9.35595471e-01  1.07025822e+00 -1.78587871e-01
   7.93042990e-01  1.68280696e-03]
 [-1.09309289e+00 -1.36785874e+00  3.62140521e-01  1.86794945e+00
  -6.78621715e-01  2.15288539e-01]
 [-1.79336067e-01  1.61288816e+00 -1.23014474e+00  9.78876973e-01
   1.33692201e+00 -1.26535825e+00]]
[0. 0. 0. 0. 0. 0.]
(3, 6)
(6,)


In [4]:
# the forward pass (linear transformation)

z1 = cp.dot(x, W1) + b1

print(z1)
print(z1.shape)

[[ -2.95990783   3.03854246  -1.89589496   6.49394195   3.44656558
   -3.36381486]
 [ -7.48433624   6.58041712  -1.28913296  14.4986576    7.80059543
   -6.50897557]
 [-12.00876465  10.12229178  -0.68237096  22.50337326  12.15462528
   -9.65413628]]
(3, 6)


In [5]:
# activation function
## adding the non linearity to the linear transformation

a1 = cp.tanh(z1)

print(a1)
print(a1.shape)



[[-0.994643    0.9954208  -0.95588464  0.99999542  0.99797259 -0.99760813]
 [-0.99999937  0.99999615 -0.85889929  1.          0.99999966 -0.99999556]
 [-1.          1.         -0.59305861  1.          1.         -0.99999999]]
(3, 6)


In [6]:
# the output of the network
out_layer_neurons = 1
W2 = cp.random.randn(n_neurons, out_layer_neurons)
b2 = cp.zeros((out_layer_neurons,))

print(W2)
print(b2)
print(W2.shape)
print(b2.shape)


[[-0.77732016]
 [ 0.40734385]
 [ 0.61643084]
 [-0.22694915]
 [-0.15064317]
 [-0.32521864]]
[0.]
(6, 1)
(1,)


In [7]:
# second forward pass
z2 = cp.dot(a1 , W2) + b2 
print(z2)
print(z2.shape)



[[0.53655271]
 [0.60283487]
 [0.76671071]]
(3, 1)


In [8]:
## adding the non linearity to the linear transformation 
## this type we will use sigmoid activation function since this is a binary classification problem 

def sigmoid(x):
    return 1 / (1 + cp.exp(-x))


a2 = sigmoid(z2)

print(a2)
print(a2.shape)


[[0.63101013]
 [0.64630461]
 [0.68280893]]
(3, 1)


In [9]:
## next we must calculate the loss
## we will use binary cross entropy loss function , again becuse this is a binary classification problem

def binary_cross_entropy(y_true, y_pred):
    epsilon = 1e-15
    y_pred = cp.clip(y_pred, epsilon, 1 - epsilon)
    loss = -cp.mean(y_true * cp.log(y_pred) + (1 - y_true) * cp.log(1 - y_pred))
    return loss

loss = binary_cross_entropy(y, a2)

print(loss)

0.8497300432839547


In [11]:
## now we must optimize (minmize this loss function) by the power of chain rule and gradient descent 

def binary_cross_entropy_derivative(y_true, y_pred):
    epsilon = 1e-15
    y_pred = cp.clip(y_pred, epsilon, 1 - epsilon)
    return -((y_true / y_pred) - ((1 - y_true) / (1 - y_pred)))

d_loss_d_a2 = binary_cross_entropy_derivative(y, a2)
print(d_loss_d_a2)

[[-1.58476062  2.71010148  2.71010148]
 [-1.54725803  2.82729159  2.82729159]
 [-1.46453856  3.15267383  3.15267383]]


In [12]:
def sigmoid_derivative(x):
    return x*(1 - x) 

d_a2_d_z2 = sigmoid_derivative(a2)

print(d_a2_d_z2)


[[0.23283635]
 [0.22859496]
 [0.2165809 ]]


In [13]:
out_layer_grad = d_loss_d_a2 * d_a2_d_z2 
print(out_layer_grad)

[[-0.36898987  0.63101013  0.63101013]
 [-0.35369539  0.64630461  0.64630461]
 [-0.31719107  0.68280893  0.68280893]]


In [15]:
## TO simply we can use the this function and get the same result :

def out_layer_grad(y_true, y_pred):
    epsilon = 1e-15
    
    y_pred = cp.clip(y_pred, epsilon, 1 - epsilon)
    return (y_pred - y_true)

# Reshape y to match a2
y = y.reshape(-1, 1)  # Ensure y has shape (3,1)

# Compute the gradient of the loss with respect to a2
grad_a2 = out_layer_grad(y, a2)

print("Gradient w.r.t. a2:", grad_a2)
print(grad_a2.shape)


Gradient w.r.t. a2: [[-0.36898987]
 [ 0.64630461]
 [ 0.68280893]]
(3, 1)


In [17]:
## lets move to the backprop of the hidden layer:
def tanh_derivative(x):
    return 1 - cp.tanh(x)**2 


grad_z2 = tanh_derivative(a2) * grad_a2

print(grad_z2)
print(grad_z2.shape)

[[-0.25379174]
 [ 0.43692618]
 [ 0.44242244]]
(3, 1)


In [18]:
# now we need  to calcukate the grad for the parametrs in the hidden layer (W2 , B2)
dW2 = cp.dot( a1.T, grad_z2) / num_of_example
db2 = cp.sum(grad_z2, axis=0, keepdims=True) / num_of_example

print(dW2)
print(dW2.shape)

print(db2)
print(db2.shape)

[[-0.20897206]
 [ 0.20890579]
 [-0.13168747]
 [ 0.20851935]
 [ 0.20869043]
 [-0.20872066]]
(6, 1)
[[0.20851896]]
(1, 1)


In [20]:
grad_a1 = cp.dot(grad_z2, W2.T)

print(grad_a1)
print(grad_a1.shape)


[[ 0.19727744 -0.10338051 -0.15644506  0.05759782  0.03823199  0.0825378 ]
 [-0.33963153  0.17797919  0.26933477 -0.09916002 -0.06581995 -0.14209654]
 [-0.34390389  0.18021806  0.27272284 -0.1004074  -0.06664792 -0.14388402]]
(3, 6)


In [21]:
dz1  = grad_a1 * tanh_derivative(a1)
print(dz1)
print(dz1.shape)

[[ 0.08352926 -0.04372067 -0.07021089  0.02418977  0.01610609  0.0347902 ]
 [-0.14263667  0.07474713  0.13898085 -0.04164467 -0.0276427  -0.0596773 ]
 [-0.14443081  0.07568696  0.19550908 -0.04216853 -0.02799042 -0.0604276 ]]
(3, 6)


In [22]:
# now we need  to calcukate the grad for the parametrs in the hidden layer (W2 , B2)
dW1 = cp.dot( x.T, dz1) / num_of_example
db1 = cp.sum(dz1, axis=0, keepdims=True) / num_of_example

print(dW2)
print(dW2.shape)

print(db2)
print(db2.shape)

[[-0.20897206]
 [ 0.20890579]
 [-0.13168747]
 [ 0.20851935]
 [ 0.20869043]
 [-0.20872066]]
(6, 1)
[[0.20851896]]
(1, 1)


In [23]:
lr = 0.01

W1 -= lr * dW1
b1 -= lr * db1.ravel()  # Fix here
W2 -= lr * dW2
b2 -= lr * db2.ravel()  # No need to fix since b2 is already (1,)


print("new W1:", W1)
print("new b1:", b1)
print("new W2:", W2)
print("new bw:", b2)



new W1: [[-0.2307204   0.93297855  1.0640773  -0.17712931  0.79401098  0.00377251]
 [-1.08742099 -1.37083137  0.35507867  1.86960676 -0.67752197  0.21766263]
 [-0.1729857   1.60955981 -1.23808752  0.98073302  1.33815351 -1.26269978]]
new b1: [ 0.00067846 -0.00035571 -0.00088093  0.00019874  0.00013176  0.00028438]
new W2: [[-0.77523044]
 [ 0.4052548 ]
 [ 0.61774772]
 [-0.22903434]
 [-0.15273008]
 [-0.32313143]]
new bw: [-0.00208519]


In [24]:
# Forward pass
z1 = cp.dot(x, W1) + b1
a1 = cp.tanh(z1)

z2 = cp.dot(a1, W2) + b2
a2 = sigmoid(z2)

# Compute new loss
loss = binary_cross_entropy(y, a2)

print("New loss:", loss)


New loss: 0.8614092666656665


In [43]:
import time
import cupy as cp

# Set up synthetic dataset on GPU
num_of_example = 1000
x = cp.random.randn(num_of_example, 3)
y = (cp.random.rand(num_of_example, 1) > 0.5).astype(cp.float32)

# Define sigmoid and loss functions
def sigmoid(x):
    return 1 / (1 + cp.exp(-x))

def binary_cross_entropy(y_true, y_pred):
    return -cp.mean(y_true * cp.log(y_pred) + (1 - y_true) * cp.log(1 - y_pred))

def tanh_derivative(x):
    return 1 - cp.tanh(x) ** 2

def out_layer_grad(y_true, y_pred):
    return -(y_true / y_pred - (1 - y_true) / (1 - y_pred))

# Initialize weights and biases on GPU
W1 = cp.random.randn(3, 6)
b1 = cp.zeros((6,))
W2 = cp.random.randn(6, 1)
b2 = cp.zeros((1,))

# Hyperparameters
lr = 0.01
epochs = 1000  # Number of training iterations

# Start timing
start_time = time.time()

for epoch in range(epochs):
    # Forward pass
    z1 = cp.dot(x, W1) + b1
    a1 = cp.tanh(z1)

    z2 = cp.dot(a1, W2) + b2
    a2 = sigmoid(z2)

    # Compute loss
    loss = binary_cross_entropy(y, a2)

    # Backpropagation
    grad_a2 = out_layer_grad(y, a2)
    grad_z2 = grad_a2 * a2 * (1 - a2)  # More efficient sigmoid derivative

    dW2 = cp.dot(a1.T, grad_z2) / num_of_example
    db2 = cp.sum(grad_z2, axis=0, keepdims=True) / num_of_example

    grad_z1 = cp.dot(grad_z2, W2.T) * tanh_derivative(a1)

    dW1 = cp.dot(x.T, grad_z1) / num_of_example
    db1 = cp.sum(grad_z1, axis=0, keepdims=True) / num_of_example

    # Parameter updates
    W1 -= lr * dW1
    b1 -= lr * db1.ravel()
    W2 -= lr * dW2
    b2 -= lr * db2.ravel()

    # Print loss every 10 epochs
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.get()}")  # Move loss to CPU

# End timing
end_time = time.time()
print(f"Total Training Time: {end_time - start_time:.4f} seconds")


Epoch 0, Loss: 0.7050632553774739
Epoch 10, Loss: 0.7037360046511018
Epoch 20, Loss: 0.7025508787021367
Epoch 30, Loss: 0.7014925155240381


Epoch 40, Loss: 0.7005471705657625
Epoch 50, Loss: 0.6997025583526734
Epoch 60, Loss: 0.6989477068245045
Epoch 70, Loss: 0.6982728239990537
Epoch 80, Loss: 0.697669176421448
Epoch 90, Loss: 0.6971289787579569
Epoch 100, Loss: 0.6966452938311406
Epoch 110, Loss: 0.6962119423614923
Epoch 120, Loss: 0.6958234216727721
Epoch 130, Loss: 0.6954748326281223
Epoch 140, Loss: 0.6951618140870238
Epoch 150, Loss: 0.6948804842053002
Epoch 160, Loss: 0.6946273879385744
Epoch 170, Loss: 0.6943994501513866
Epoch 180, Loss: 0.6941939337777024
Epoch 190, Loss: 0.6940084025223086
Epoch 200, Loss: 0.6938406876356007
Epoch 210, Loss: 0.6936888583357151
Epoch 220, Loss: 0.6935511954913448
Epoch 230, Loss: 0.6934261682156063
Epoch 240, Loss: 0.6933124130557684
Epoch 250, Loss: 0.6932087154954724
Epoch 260, Loss: 0.6931139935152831
Epoch 270, Loss: 0.6930272829840494
Epoch 280, Loss: 0.6929477246777714
Epoch 290, Loss: 0.6928745527445644
Epoch 300, Loss: 0.6928070844540737
Epoch 310, Loss: 0.6927447110874381

In [44]:
import time
import torch 
import torch.nn as nn 
import torch.optim as optim
import numpy as np

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Generate synthetic dataset
num_of_example = 1000
x = np.random.randn(num_of_example, 3)
y = (np.random.rand(num_of_example, 1) > 0.5).astype(np.float32)

# Convert input data to tensors and move to device
tensor_x = torch.tensor(x, dtype=torch.float32, device=device)
tensor_y = torch.tensor(y, dtype=torch.float32, device=device)

# Define model
class MyNet(nn.Module):
    def __init__(self):
        super(MyNet, self).__init__()
        self.fc1 = nn.Linear(3, 6)  # input: 3, output: 6
        self.tanh = nn.Tanh()
        self.fc2 = nn.Linear(6, 1)  # input: 6, output: 1
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        z1 = self.fc1(x)
        a1 = self.tanh(z1)
        z2 = self.fc2(a1)
        a2 = self.sigmoid(z2)
        return a2 

# Initialize model and move to device
model = MyNet().to(device)

# Set random seed
np.random.seed(42)

# Define weights and biases
W1 = np.random.randn(3, 6)
b1 = np.zeros((6,))
W2 = np.random.randn(6, 1)
b2 = np.zeros((1,))

# Assign weights and biases, ensuring they're on the same device
model.fc1.weight.data = torch.tensor(W1.T, dtype=torch.float32, device=device)
model.fc1.bias.data = torch.tensor(b1, dtype=torch.float32, device=device)
model.fc2.weight.data = torch.tensor(W2.T, dtype=torch.float32, device=device)
model.fc2.bias.data = torch.tensor(b2, dtype=torch.float32, device=device)

# Define loss function and optimizer
criterion = nn.BCELoss()  
optimizer = optim.SGD(model.parameters(), lr=0.01)

try:
    # Start timing
    start_time = time.time()

    # Training loop
    epochs = 1000
    for epoch in range(epochs):
        optimizer.zero_grad()   # Reset gradients
        output = model(tensor_x)  # Forward pass
        loss = criterion(output, tensor_y)
        loss.backward()  # Backward pass
        optimizer.step()  # Update parameters

        if epoch % 10 == 0:
            print(f"Epoch {epoch}, Loss: {loss.item():.4f}", flush=True)

    # Ensure GPU operations finish before stopping timer
    if torch.cuda.is_available():
        torch.cuda.synchronize()

    # End timing
    end_time = time.time()
    print(f"Total Training Time: {end_time - start_time:.4f} seconds", flush=True)

except Exception as e:
    print(f"Error occurred: {e}")


Epoch 0, Loss: 0.8592
Epoch 10, Loss: 0.8459
Epoch 20, Loss: 0.8338
Epoch 30, Loss: 0.8229
Epoch 40, Loss: 0.8129


Epoch 50, Loss: 0.8038
Epoch 60, Loss: 0.7956
Epoch 70, Loss: 0.7882
Epoch 80, Loss: 0.7815
Epoch 90, Loss: 0.7754
Epoch 100, Loss: 0.7698
Epoch 110, Loss: 0.7648
Epoch 120, Loss: 0.7603
Epoch 130, Loss: 0.7562
Epoch 140, Loss: 0.7524
Epoch 150, Loss: 0.7490
Epoch 160, Loss: 0.7459
Epoch 170, Loss: 0.7431
Epoch 180, Loss: 0.7405
Epoch 190, Loss: 0.7382
Epoch 200, Loss: 0.7361
Epoch 210, Loss: 0.7341
Epoch 220, Loss: 0.7323
Epoch 230, Loss: 0.7306
Epoch 240, Loss: 0.7291
Epoch 250, Loss: 0.7277
Epoch 260, Loss: 0.7264
Epoch 270, Loss: 0.7252
Epoch 280, Loss: 0.7241
Epoch 290, Loss: 0.7231
Epoch 300, Loss: 0.7222
Epoch 310, Loss: 0.7213
Epoch 320, Loss: 0.7205
Epoch 330, Loss: 0.7197
Epoch 340, Loss: 0.7190
Epoch 350, Loss: 0.7183
Epoch 360, Loss: 0.7177
Epoch 370, Loss: 0.7171
Epoch 380, Loss: 0.7165
Epoch 390, Loss: 0.7160
Epoch 400, Loss: 0.7155
Epoch 410, Loss: 0.7151
Epoch 420, Loss: 0.7146
Epoch 430, Loss: 0.7142
Epoch 440, Loss: 0.7138
Epoch 450, Loss: 0.7134
Epoch 460, Loss: 0.71