In [12]:
import cupy as cp 

x = cp.array([[1, 2, 3],
            [4, 5, 6],
            [7, 8, 9],
            [10, 11, 12]])

y = cp.array([1,0,0, 1])

print(x.shape)
print(y.shape)


(4, 3)
(4,)


In [13]:
class ActivationFunctions:
    @staticmethod
    def relu(x):
        return cp.maximum(0, x)

    @staticmethod
    def sigmoid(x):
        return 1 / (1 + cp.exp(-x))

    @staticmethod
    def relu_derivative(x):
        return (x > 0).astype(cp.float32)

    @staticmethod
    def sigmoid_derivative(x):
        sig = ActivationFunctions.sigmoid(x)
        return sig * (1 - sig)
    
    @staticmethod
    def tanh(x):
        return cp.tanh(x)
    
    @staticmethod
    def tanh_derivative(x):
        return 1 - cp.tanh(x)**2

    
    
    

In [14]:
# ✅ Create a test input (CuPy array)
x = cp.array([-2.0, -1.0, 0.0, 1.0, 2.0])

# ✅ Test ReLU
relu_output = ActivationFunctions.relu(x)
relu_derivative_output = ActivationFunctions.relu_derivative(x)

# ✅ Test Sigmoid
sigmoid_output = ActivationFunctions.sigmoid(x)
sigmoid_derivative_output = ActivationFunctions.sigmoid_derivative(x)

# ✅ Test Tanh
tanh_output = ActivationFunctions.tanh(x)
tanh_derivative_output = ActivationFunctions.tanh_derivative(x)

# ✅ Print results
print("Input:", x)
print("ReLU Output:", relu_output)
print("ReLU Derivative:", relu_derivative_output)
print("Sigmoid Output:", sigmoid_output)
print("Sigmoid Derivative:", sigmoid_derivative_output)
print("tanh Output:", tanh_output)
print("tanh Derivative:", tanh_derivative_output)

Input: [-2. -1.  0.  1.  2.]
ReLU Output: [0. 0. 0. 1. 2.]
ReLU Derivative: [0. 0. 0. 1. 1.]
Sigmoid Output: [0.11920292 0.26894142 0.5        0.73105858 0.88079708]
Sigmoid Derivative: [0.10499359 0.19661193 0.25       0.19661193 0.10499359]
tanh Output: [-0.96402758 -0.76159416  0.          0.76159416  0.96402758]
tanh Derivative: [0.07065082 0.41997434 1.         0.41997434 0.07065082]


In [15]:
import cupy as cp  # Assuming you're using CuPy for GPU acceleration

class Linear:
    def __init__(self,
                 in_features: int,
                 out_features: int,
                 bias: bool = True,
                 initializer: str = "he"):
        
        self.in_features = in_features
        self.out_features = out_features
        
        # Initialize weights
        if initializer == 'he':
            scale = cp.sqrt(2.0 / in_features)  # Fixed denominator
        elif initializer == 'xavier':
            scale = cp.sqrt(1.0 / in_features)  # Fixed denominator
        else:  # Plain initialization
            scale = 1.0

        self.weights = cp.random.randn(out_features, in_features) * scale
        self.bias = cp.zeros((out_features,)) if bias else None

        # Initialize gradients
        self.dweights = cp.zeros_like(self.weights)
        self.dbias = cp.zeros_like(self.bias) if bias else None

    def forward(self, x: cp.ndarray) -> cp.ndarray:
        self.x = x 
        return cp.dot(x, self.weights.T) + (self.bias if self.bias is not None else 0)  # Ensures correct shape


In [16]:
x = cp.array([[1, 2, 3],
            [4, 5, 6],
            [7, 8, 9],
            [10, 11, 12]])

layer = Linear(3, 2)  # 3 input features, 2 output features
out  = layer.forward(x)

print(out)



[[ -4.65469258   1.27288809]
 [ -8.42404719   2.15758172]
 [-12.19340181   3.04227535]
 [-15.96275642   3.92696897]]


In [17]:
class Linear:
    def __init__(self, in_features: int, out_features: int, bias: bool = True, initializer: str = "he"):
        self.in_features = in_features
        self.out_features = out_features
        
        # Initialize weights
        if initializer == 'he':
            scale = cp.sqrt(2.0 / in_features)
        elif initializer == 'xavier':
            scale = cp.sqrt(1.0 / in_features)
        else:
            scale = 1.0

        self.weights = cp.random.randn(out_features, in_features) * scale
        self.bias = cp.zeros((out_features,)) if bias else None

        # Initialize gradients
        self.dweights = cp.zeros_like(self.weights)
        self.dbias = cp.zeros_like(self.bias) if bias else None

    def forward(self, x: cp.ndarray) -> cp.ndarray:
        """ Forward pass: Computes Y = XW^T + b """
        self.x = x  # Store input for backprop
        return cp.dot(x, self.weights.T) + (self.bias if self.bias is not None else 0)

    def backward(self, upstream_grad: cp.ndarray) -> cp.ndarray:
        """
        Backward pass (backpropagation)

        Args:
            upstream_grad: Gradient from subsequent layer, shape (batch_size, output_dim)

        Returns:
            Gradient with respect to input, shape (batch_size, input_dim)
        """
        # Compute gradients
        self.dweights = cp.dot(upstream_grad.T, self.x)  # (out_features, in_features)
        if self.bias is not None:
            self.dbias = cp.sum(upstream_grad, axis=0)    # (out_features,)

        # Compute gradient for input
        dx = cp.dot(upstream_grad, self.weights)         # (batch_size, in_features)

        return dx

    def update(self, learning_rate: float):
        """Update weights using computed gradients"""
        self.weights -= learning_rate * self.dweights
        if self.bias is not None:
            self.bias -= learning_rate * self.dbias

    @property
    def parameters(self):
        """Return weights and biases"""
        return {'weights': self.weights, 'bias': self.bias}

    @property
    def gradients(self):
        """Return current gradients"""
        return {'dweights': self.dweights, 'dbias': self.dbias}


In [18]:
import cupy as cp

# Define input
x = cp.array([[1, 2, 3],
              [4, 5, 6],
              [7, 8, 9],
              [10, 11, 12]])  # (4, 3) -> batch_size=4, input_dim=3

# Create Linear layer: input_dim=3, output_dim=2
linear_layer = Linear(in_features=3, out_features=2, bias=True)

# Forward pass
output = linear_layer.forward(x)
print("Forward Output:\n", output)

# Define upstream gradient (random values for testing)
upstream_grad = cp.random.randn(4, 2)  # Same shape as output

# Backward pass
grad_input = linear_layer.backward(upstream_grad)
print("\nGradient w.r.t Input:\n", grad_input)

# Print gradients of weights and bias
print("\nWeight Gradient:\n", linear_layer.dweights)
print("\nBias Gradient:\n", linear_layer.dbias)

# Update parameters
learning_rate = 0.01
linear_layer.update(learning_rate)

# Print updated weights and bias
print("\nUpdated Weights:\n", linear_layer.weights)
print("\nUpdated Bias:\n", linear_layer.bias)


Forward Output:
 [[ -4.87847309   0.66462138]
 [-10.52705953   1.35894429]
 [-16.17564598   2.0532672 ]
 [-21.82423242   2.74759012]]

Gradient w.r.t Input:
 [[ 0.46778272  1.01007504  3.05260214]
 [-0.12136245 -0.21295474 -0.61117007]
 [-0.14084134 -0.21826389 -0.60295544]
 [ 0.00515197  0.10965762  0.39644441]]

Weight Gradient:
 [[ 3.90744476  2.56820989  1.22897503]
 [28.01494466 30.66171602 33.30848737]]

Bias Gradient:
 [-1.33923487  2.64677135]

Updated Weights:
 [[-0.21507349 -0.44379736 -1.30103759]
 [-0.28958477 -0.25804497 -0.14078077]]

Updated Bias:
 [ 0.01339235 -0.02646771]


In [19]:
class Activation:
    def __init__(self, activation: str):
        self.activation = activation
        self.x = None  # Store input for backprop

        # Set activation function and its derivative
        if activation == "relu":
            self.func = ActivationFunctions.relu
            self.derivative = ActivationFunctions.relu_derivative
        elif activation == "sigmoid":
            self.func = ActivationFunctions.sigmoid
            self.derivative = ActivationFunctions.sigmoid_derivative
        elif activation == "tanh":
            self.func = ActivationFunctions.tanh
            self.derivative = ActivationFunctions.tanh_derivative
        else:
            raise ValueError(f"Unsupported activation: {activation}")

    def forward(self, x):
        """ Forward pass: Apply activation function """
        self.x = x  # Store for backward pass
        return self.func(x)

    def backward(self, upstream_grad):
        """ Backward pass: Apply activation derivative """
        return upstream_grad * self.derivative(self.x)


In [20]:
# Define input
x = cp.array([[1, 2, 3],
              [4, 5, 6],
              [7, 8, 9],
              [10, 11, 12]])  # (4, 3)

# Create layers
linear_layer = Linear(in_features=3, out_features=2, bias=True)
activation_layer = Activation("tanh")  # Can be "sigmoid" or "tanh"

# Forward pass
linear_output = linear_layer.forward(x)
activated_output = activation_layer.forward(linear_output)

print("Activated Output:\n", activated_output)

# Backward pass (simulating loss gradient)
upstream_grad = cp.random.randn(4, 2)
grad_activated = activation_layer.backward(upstream_grad)
grad_input = linear_layer.backward(grad_activated)

print("\nGradient w.r.t Input:\n", grad_input)


Activated Output:
 [[-0.90983665 -0.52384797]
 [-0.99214963 -0.8567783 ]
 [-0.99934236 -0.96262925]
 [-0.99994509 -0.99064311]]

Gradient w.r.t Input:
 [[ 0.10420543 -0.15336635  0.17225569]
 [ 0.04014839 -0.06070295  0.04583223]
 [ 0.01057647 -0.01566041  0.01628363]
 [ 0.00072666 -0.00108212  0.00104036]]


In [21]:
class Sequential:
    def __init__(self, *layers):
        """
        A simple sequential model to stack layers.

        Args:
            *layers: A list of layers (Linear, Activation, etc.)
        """
        self.layers = layers

    def forward(self, x):
        """ Forward pass through all layers """
        for layer in self.layers:
            x = layer.forward(x)
        return x

    def backward(self, upstream_grad):
        """ Backward pass through all layers in reverse order """
        for layer in reversed(self.layers):
            upstream_grad = layer.backward(upstream_grad)
        return upstream_grad

    def update(self, learning_rate):
        """ Update weights of layers that have parameters (Linear layers) """
        for layer in self.layers:
            if hasattr(layer, "update"):
                layer.update(learning_rate)


In [22]:
# Define input
x = cp.array([[1, 2, 3],
              [4, 5, 6],
              [7, 8, 9],
              [10, 11, 12]])  # (4, 3)

# Create a simple feedforward network
model = Sequential(
    Linear(in_features=3, out_features=5, bias=True),
    Activation("relu"),
    Linear(in_features=5, out_features=2, bias=True),
    Activation("sigmoid")
)

# Forward pass
output = model.forward(x)
print("Model Output:\n", output)

# Backward pass (simulating loss gradient)
upstream_grad = cp.random.randn(4, 2)
model.backward(upstream_grad)

# Update weights
model.update(learning_rate=0.01)


Model Output:
 [[4.02194951e-01 7.33736526e-01]
 [5.24591193e-02 9.58006111e-01]
 [5.84850073e-03 9.94863931e-01]
 [7.01209019e-04 9.99399263e-01]]


In [23]:
class BCE:
    def __init__(self):
        pass
    
    def forward(self, output, target):
        # Binary Cross-Entropy loss: -target * log(output) - (1 - target) * log(1 - output)
        epsilon = 1e-15  # To avoid log(0) errors
        output = cp.clip(output, epsilon, 1 - epsilon)
        loss = -cp.mean(target * cp.log(output) + (1 - target) * cp.log(1 - output))
        self.output = output
        self.target = target
        return loss
    
    def backward(self):
        # Gradient of BCE loss with respect to output
        grad_input = (self.output - self.target) / (self.output * (1 - self.output))
        return grad_input


In [25]:

# Training Setup
x = cp.array([[1, 2, 3],
              [4, 5, 6],
              [7, 8, 9],
              [10, 11, 12]])  # (4, 3)

# Target labels for binary classification (simulated)
y = cp.array([[1], [0], [1], [0]])  # (4, 1)

# Create a simple feedforward network
model = Sequential(
    Linear(in_features=3, out_features=5, bias=True),
    Activation("relu"),
    Linear(in_features=5, out_features=1, bias=True),  # Single output for binary classification
    Activation("sigmoid")  # Sigmoid for binary output
)

# Initialize BCE loss
loss_fn = BCE()

# Hyperparameters
learning_rate = 0.01
epochs = 1000

# Training Loop
for epoch in range(epochs):
    # Forward pass
    output = model.forward(x)

    # Compute loss
    loss = loss_fn.forward(output, y)

    # Backward pass
    upstream_grad = loss_fn.backward()  # Gradient of loss w.r.t. output
    model.backward(upstream_grad)

    # Update weights
    model.update(learning_rate)

    # Print loss for every 100 epochs
    if epoch % 100 == 0:
        print(f"Epoch {epoch}/{epochs}, Loss: {loss:.4f}")

Epoch 0/1000, Loss: 0.9514
Epoch 100/1000, Loss: 0.5961
Epoch 200/1000, Loss: 0.5877
Epoch 300/1000, Loss: 0.5874
Epoch 400/1000, Loss: 0.5872
Epoch 500/1000, Loss: 0.5870
Epoch 600/1000, Loss: 0.5869
Epoch 700/1000, Loss: 0.5869
Epoch 800/1000, Loss: 0.5869
Epoch 900/1000, Loss: 0.5869
