In [1]:
import cupy as cp 

# Toy feature matrix: four samples (rows) with three features each, values 1..12
x = cp.arange(1, 13).reshape(4, 3)

# One binary label per sample
y = cp.array([1, 0, 0, 1])

# Show both shapes, one per line
print(x.shape, y.shape, sep="\n")


(4, 3)
(4,)


In [4]:
class ActivationFunctions:
    """Namespace of element-wise activation functions and their derivatives.

    Every method is static and takes the pre-activation array ``x``; the
    ``*_derivative`` methods return the derivative evaluated at ``x``.
    """

    # ---- ReLU ----
    @staticmethod
    def relu(x):
        """Return ``max(0, x)`` element-wise."""
        return cp.maximum(0, x)

    @staticmethod
    def relu_derivative(x):
        """Return 1.0 where ``x > 0`` and 0.0 elsewhere, as float32."""
        positive = x > 0
        return positive.astype(cp.float32)

    # ---- Sigmoid ----
    @staticmethod
    def sigmoid(x):
        """Return the logistic function ``1 / (1 + exp(-x))`` element-wise."""
        denominator = 1 + cp.exp(-x)
        return 1 / denominator

    @staticmethod
    def sigmoid_derivative(x):
        """Return ``s * (1 - s)`` where ``s = sigmoid(x)``."""
        s = ActivationFunctions.sigmoid(x)
        return s * (1 - s)

    # ---- Tanh ----
    @staticmethod
    def tanh(x):
        """Return the hyperbolic tangent of ``x`` element-wise."""
        return cp.tanh(x)

    @staticmethod
    def tanh_derivative(x):
        """Return ``1 - tanh(x)**2`` element-wise."""
        t = cp.tanh(x)
        return 1 - t**2

    
    
    

In [6]:
# Sample points on both sides of zero (CuPy array)
x = cp.array([-2.0, -1.0, 0.0, 1.0, 2.0])

# Evaluate each activation together with its derivative at the sample points
relu_output, relu_derivative_output = (
    ActivationFunctions.relu(x),
    ActivationFunctions.relu_derivative(x),
)
sigmoid_output, sigmoid_derivative_output = (
    ActivationFunctions.sigmoid(x),
    ActivationFunctions.sigmoid_derivative(x),
)
tanh_output, tanh_derivative_output = (
    ActivationFunctions.tanh(x),
    ActivationFunctions.tanh_derivative(x),
)

# Report: the input first, then value/derivative pairs per activation
print("Input:", x)
print("ReLU Output:", relu_output)
print("ReLU Derivative:", relu_derivative_output)
print("Sigmoid Output:", sigmoid_output)
print("Sigmoid Derivative:", sigmoid_derivative_output)
print("tanh Output:", tanh_output)
print("tanh Derivative:", tanh_derivative_output)

Input: [-2. -1.  0.  1.  2.]
ReLU Output: [0. 0. 0. 1. 2.]
ReLU Derivative: [0. 0. 0. 1. 1.]
Sigmoid Output: [0.11920292 0.26894142 0.5        0.73105858 0.88079708]
Sigmoid Derivative: [0.10499359 0.19661193 0.25       0.19661193 0.10499359]
tanh Output: [-0.96402758 -0.76159416  0.          0.76159416  0.96402758]
tanh Derivative: [0.07065082 0.41997434 1.         0.41997434 0.07065082]


In [41]:
import cupy as cp  # Assuming you're using CuPy for GPU acceleration

class Linear:
    """Fully connected layer (forward pass only): ``y = x @ W.T + b``."""

    def __init__(self,
                 in_features: int,
                 out_features: int,
                 bias: bool = True,
                 initializer: str = "he"):
        """Create the layer's parameters and zeroed gradient buffers.

        Args:
            in_features: Size of the last axis of the input.
            out_features: Size of the last axis of the output.
            bias: When False, no bias vector is created (``bias`` and
                ``dbias`` are None).
            initializer: 'he' scales the normal samples by
                sqrt(2 / in_features), 'xavier' by sqrt(1 / in_features);
                any other value leaves them unscaled.
        """
        self.in_features = in_features
        self.out_features = out_features

        # Pick the standard deviation applied to the normal samples
        if initializer in ('he', 'xavier'):
            numerator = 2.0 if initializer == 'he' else 1.0
            scale = cp.sqrt(numerator / in_features)
        else:
            # Plain initialization: unscaled standard normal
            scale = 1.0

        # Parameters: W has shape (out_features, in_features)
        self.weights = cp.random.randn(out_features, in_features) * scale
        self.bias = cp.zeros((out_features,)) if bias else None

        # Gradient buffers mirror the parameter shapes
        self.dweights = cp.zeros_like(self.weights)
        self.dbias = cp.zeros_like(self.bias) if bias else None

    def forward(self, x: cp.ndarray) -> cp.ndarray:
        """Cache ``x`` on the instance and return ``x @ W.T`` plus the bias (if any)."""
        self.x = x
        offset = 0 if self.bias is None else self.bias
        return cp.dot(x, self.weights.T) + offset


In [42]:
# 4x3 toy batch holding the integers 1..12 row by row
x = cp.arange(1, 13).reshape(4, 3)

# Map 3 input features to 2 outputs and run a single forward pass
layer = Linear(in_features=3, out_features=2)
out = layer.forward(x)

print(out)



[[-1.32285007 -0.58949284]
 [-1.8907843  -2.54665149]
 [-2.45871853 -4.50381014]
 [-3.02665276 -6.46096879]]


In [45]:
class Linear:
    """Fully connected layer computing ``Y = X W^T + b`` with manual backprop.

    Attributes:
        weights: Array of shape (out_features, in_features).
        bias: Array of shape (out_features,), or None when ``bias=False``.
        dweights / dbias: Gradients written by the most recent ``backward()``
            call (zeros until then; ``dbias`` is None when there is no bias).
        x: Input cached by the most recent ``forward()`` call, or None if
            ``forward()`` has not been called yet.
    """

    def __init__(self, in_features: int, out_features: int, bias: bool = True, initializer: str = "he"):
        """Create parameters and zeroed gradient buffers.

        Args:
            in_features: Size of the last axis of the input.
            out_features: Size of the last axis of the output.
            bias: When False, no bias vector is created.
            initializer: 'he' scales the normal samples by
                sqrt(2 / in_features), 'xavier' by sqrt(1 / in_features);
                any other value leaves them unscaled.
        """
        self.in_features = in_features
        self.out_features = out_features

        # Input cache for backprop. Defined up front (like Activation.x) so
        # backward() can detect a missing forward pass instead of failing
        # with an AttributeError.
        self.x = None

        # Initialize weights
        if initializer == 'he':
            scale = cp.sqrt(2.0 / in_features)
        elif initializer == 'xavier':
            scale = cp.sqrt(1.0 / in_features)
        else:
            scale = 1.0

        self.weights = cp.random.randn(out_features, in_features) * scale
        self.bias = cp.zeros((out_features,)) if bias else None

        # Initialize gradients
        self.dweights = cp.zeros_like(self.weights)
        self.dbias = cp.zeros_like(self.bias) if bias else None

    def forward(self, x: cp.ndarray) -> cp.ndarray:
        """Forward pass: computes Y = XW^T + b and caches ``x`` for backward().

        Args:
            x: Input whose last axis has length ``in_features``.

        Returns:
            ``x @ weights.T`` plus the bias when the layer has one.
        """
        self.x = x  # Store input for backprop
        return cp.dot(x, self.weights.T) + (self.bias if self.bias is not None else 0)

    def backward(self, upstream_grad: cp.ndarray) -> cp.ndarray:
        """
        Backward pass (backpropagation)

        Stores the parameter gradients in ``dweights`` / ``dbias`` and returns
        the gradient with respect to the cached input.

        Args:
            upstream_grad: Gradient from subsequent layer, shape (batch_size, output_dim)

        Returns:
            Gradient with respect to input, shape (batch_size, input_dim)

        Raises:
            RuntimeError: If called before any ``forward()`` call, since the
                weight gradient needs the cached input.
        """
        if self.x is None:
            raise RuntimeError(
                "Linear.backward() called before forward(): no cached input available"
            )

        # Compute gradients
        self.dweights = cp.dot(upstream_grad.T, self.x)  # (out_features, in_features)
        if self.bias is not None:
            self.dbias = cp.sum(upstream_grad, axis=0)    # (out_features,)

        # Compute gradient for input
        dx = cp.dot(upstream_grad, self.weights)         # (batch_size, in_features)

        return dx

    def update(self, learning_rate: float):
        """Apply one in-place gradient-descent step using the stored gradients."""
        self.weights -= learning_rate * self.dweights
        if self.bias is not None:
            self.bias -= learning_rate * self.dbias

    @property
    def parameters(self):
        """Return weights and biases as a dict with keys 'weights' and 'bias'."""
        return {'weights': self.weights, 'bias': self.bias}

    @property
    def gradients(self):
        """Return current gradients as a dict with keys 'dweights' and 'dbias'."""
        return {'dweights': self.dweights, 'dbias': self.dbias}


In [48]:
import cupy as cp

# Input batch: 4 samples x 3 features, holding the integers 1..12
x = cp.arange(1, 13).reshape(4, 3)

# Layer under test: 3 inputs -> 2 outputs, with a bias vector
linear_layer = Linear(in_features=3, out_features=2, bias=True)

# Forward pass
output = linear_layer.forward(x)
print("Forward Output:\n", output)

# Random stand-in for the gradient arriving from a downstream layer;
# it has to match the forward output's shape
upstream_grad = cp.random.randn(*output.shape)

# Backward pass: fills the layer's gradients and returns d(loss)/d(input)
grad_input = linear_layer.backward(upstream_grad)
print("\nGradient w.r.t Input:\n", grad_input)

# Parameter gradients stored on the layer by backward()
print("\nWeight Gradient:\n", linear_layer.dweights)
print("\nBias Gradient:\n", linear_layer.dbias)

# One gradient-descent step
learning_rate = 0.01
linear_layer.update(learning_rate)

# Parameters after the step
print("\nUpdated Weights:\n", linear_layer.weights)
print("\nUpdated Bias:\n", linear_layer.bias)


Forward Output:
 [[ -6.19577508   2.3351226 ]
 [-14.32964619   6.19233303]
 [-22.46351731  10.04954346]
 [-30.59738842  13.90675389]]

Gradient w.r.t Input:
 [[-1.14042808 -1.28208691 -1.0282569 ]
 [-0.37664654 -0.58789628 -0.47720214]
 [ 1.59104384  0.88767229  0.68070948]
 [-0.01415871  0.74768519  0.62611587]]

Weight Gradient:
 [[-3.62594284 -3.36871256 -3.11148227]
 [12.96040288 13.21238065 13.46435842]]

Bias Gradient:
 [0.25723028 0.25197777]

Updated Weights:
 [[-0.27652006 -1.27884993 -1.054859  ]
 [ 0.4270596   0.27663676  0.18566903]]

Updated Bias:
 [-0.0025723  -0.00251978]


In [49]:
class Activation:
    """Layer that applies a named activation and backpropagates through it."""

    def __init__(self, activation: str):
        """Bind the function/derivative pair selected by ``activation``.

        Args:
            activation: One of "relu", "sigmoid" or "tanh".

        Raises:
            ValueError: If ``activation`` is not one of the supported names.
        """
        self.activation = activation
        self.x = None  # Input cached by forward() for use in backward()

        # (name, function, derivative) for every supported activation
        choices = (
            ("relu", ActivationFunctions.relu, ActivationFunctions.relu_derivative),
            ("sigmoid", ActivationFunctions.sigmoid, ActivationFunctions.sigmoid_derivative),
            ("tanh", ActivationFunctions.tanh, ActivationFunctions.tanh_derivative),
        )
        for name, func, derivative in choices:
            if activation == name:
                self.func = func
                self.derivative = derivative
                break
        else:
            # No supported name matched
            raise ValueError(f"Unsupported activation: {activation}")

    def forward(self, x):
        """Cache ``x`` and return the activation applied to it."""
        self.x = x
        return self.func(x)

    def backward(self, upstream_grad):
        """Return ``upstream_grad`` times the derivative at the cached input."""
        local_slope = self.derivative(self.x)
        return upstream_grad * local_slope


In [51]:
# Input batch: 4 samples x 3 features, holding the integers 1..12
x = cp.arange(1, 13).reshape(4, 3)

# Two-stage pipeline: affine map followed by a non-linearity
linear_layer = Linear(in_features=3, out_features=2, bias=True)
activation_layer = Activation("tanh")  # "relu" and "sigmoid" are also supported

# Forward pass through both stages
linear_output = linear_layer.forward(x)
activated_output = activation_layer.forward(linear_output)

print("Activated Output:\n", activated_output)

# Backward pass, using random values in place of a real loss gradient
upstream_grad = cp.random.randn(*activated_output.shape)
grad_activated = activation_layer.backward(upstream_grad)
grad_input = linear_layer.backward(grad_activated)

print("\nGradient w.r.t Input:\n", grad_input)


Activated Output:
 [[ 0.9998911  -0.99980217]
 [ 1.         -1.        ]
 [ 1.         -1.        ]
 [ 1.         -1.        ]]

Gradient w.r.t Input:
 [[-5.89217561e-04  2.08099719e-04 -6.22952602e-04]
 [-7.44122715e-11 -1.65746782e-10 -1.07999116e-10]
 [-2.36778475e-16  2.05251102e-16 -2.31760313e-16]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00]]


In [52]:
class Sequential:
    """Container that chains layers and runs them in order."""

    def __init__(self, *layers):
        """
        Store the layers that make up the model.

        Args:
            *layers: Layer objects (Linear, Activation, etc.) passed as
                positional arguments, in forward order.
        """
        self.layers = layers

    def forward(self, x):
        """ Feed ``x`` through every layer, first to last, and return the result """
        activation = x
        for stage in self.layers:
            activation = stage.forward(activation)
        return activation

    def backward(self, upstream_grad):
        """ Propagate ``upstream_grad`` through the layers, last to first """
        grad = upstream_grad
        for position in range(len(self.layers) - 1, -1, -1):
            grad = self.layers[position].backward(grad)
        return grad

    def update(self, learning_rate):
        """ Call ``update(learning_rate)`` on each layer that defines it """
        trainable = filter(lambda candidate: hasattr(candidate, "update"), self.layers)
        for stage in trainable:
            stage.update(learning_rate)


In [54]:
# Input batch: 4 samples x 3 features, holding the integers 1..12
x = cp.arange(1, 13).reshape(4, 3)

# Small feedforward network: 3 -> 5 (ReLU) -> 2 (sigmoid)
model = Sequential(
    Linear(3, 5, bias=True),
    Activation("relu"),
    Linear(5, 2, bias=True),
    Activation("sigmoid"),
)

# Forward pass
output = model.forward(x)
print("Model Output:\n", output)

# Backward pass, using random values in place of a real loss gradient
upstream_grad = cp.random.randn(*output.shape)
model.backward(upstream_grad)

# Single gradient-descent step on every layer that has parameters
model.update(learning_rate=0.01)


Model Output:
 

CUDARuntimeError: cudaErrorLaunchFailure: unspecified launch failure