In [9]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
%autoreload 2

In [64]:
import numpy as np
import torch
import simplegrad as sg
from simplegrad import Tensor
from simplegrad import ops
from ..my_tests import test_ops

ImportError: attempted relative import with no known parent package

In [50]:
# Scratch check of the axis-alignment rule: left-pad the input shape with 1s
# until it has as many axes as the output shape, then collect every size-1 axis.
ishape = (3, 1, 4)
oshape = (4, 3, 5, 4)
aligned = list(ishape)
aligned[:0] = [1] * (len(oshape) - len(ishape))
axes = tuple(idx for idx, extent in enumerate(aligned) if extent == 1)
aligned

[1, 3, 1, 4]

In [47]:
# Leading size-1 padding needed to align ishape with oshape.
# NOTE(review): relies on ishape/oshape left in the kernel by another cell; the
# recorded [] output implies len(oshape) <= len(ishape) when this was run.
[1] * (len(oshape) - len(ishape))

[]

In [102]:
def log(tensor):
    """
    Element-wise natural logarithm.

    forward: out = ln(x)
    backward: d.ln(x) / d.x = 1 / x, so the upstream gradient is divided
              element-wise by the input data.

    Args:
        tensor: input Tensor.

    Returns:
        A new Tensor holding ln(tensor.data), wired to `tensor` for backprop.
    """
    out = Tensor(np.log(tensor.data), requires_grad=tensor.requires_grad)

    def _backward():
        # Only accumulate into inputs that track gradients, like the other ops here.
        if tensor.requires_grad:
            tensor.grad += out.grad / tensor.data

    out._backward = _backward
    out._prev = {tensor, }
    return out

def exp(tensor):
    """
    Element-wise exponential.

    Since d.e^x / d.x = e^x, the backward pass reuses the forward result
    (out.data) as the local gradient.
    """
    tracks_grad = tensor.requires_grad
    out = Tensor(np.exp(tensor.data), requires_grad=tracks_grad)

    def _backward():
        if not tensor.requires_grad:
            return
        local_grad = out.data
        tensor.grad += local_grad * out.grad

    out._prev = {tensor}
    out._backward = _backward
    return out

def summation(tensor, axis=None, keepdims=False):
    """
    Sum `tensor` over `axis` (over every axis when `axis` is None).

    local_grad = d.sum(x) / d.xi = 1.
    therefore derivative of x is basically just out.grad
    broadcasted to the shape of the input tensor.

    Args:
        tensor: input Tensor.
        axis: None, an int, or a tuple of ints. Negative values count from
            the last axis, as in numpy.
        keepdims: keep the reduced axes as size-1 dimensions in the output.

    Returns:
        A new Tensor holding the sum, wired to `tensor` for backprop.
    """
    out = Tensor(np.sum(tensor.data, axis=axis, keepdims=keepdims), requires_grad=tensor.requires_grad)

    def _backward():
        if not tensor.requires_grad:
            return

        input_shape = tensor.data.shape
        ndim = len(input_shape)
        grad = out.grad

        if not keepdims and ndim > 0:
            # The reduced axes were dropped from out.grad; put them back as
            # size-1 axes so the gradient lines up with the input again.
            if axis is None:
                reduced = set(range(ndim))
            else:
                axes = (axis,) if isinstance(axis, (int, np.integer)) else tuple(axis)
                # Normalise negative axes; otherwise axis=-1 never matches an index.
                reduced = {int(ax) % ndim for ax in axes}
            restored_shape = tuple(
                1 if i in reduced else dim for i, dim in enumerate(input_shape)
            )
            grad = np.reshape(grad, restored_shape)

        tensor.grad += np.broadcast_to(grad, input_shape)

    out._backward = _backward
    out._prev = {tensor,}
    return out
    
def broadcast_to(tensor, shape):
    """
    Broadcast `tensor` to `shape` following numpy's broadcasting rules.

    this is interestingly the reverse of summation: the backward pass sums
    out.grad over every axis that was stretched (or newly prepended) and
    reshapes the result back to the input shape.

    Args:
        tensor: input Tensor.
        shape: target shape (int or any sequence of ints).

    Returns:
        `tensor` itself when it already has the target shape, otherwise a new
        Tensor holding the broadcast data, wired to `tensor` for backprop.
    """
    # Normalise so that a list-valued shape also hits the no-op path below.
    shape = (int(shape),) if isinstance(shape, (int, np.integer)) else tuple(shape)
    if tensor.shape == shape:  # Optimization: no-op if shapes match
        return tensor

    out = Tensor(np.broadcast_to(tensor.data, shape), requires_grad=tensor.requires_grad)

    def _backward():
        if not tensor.requires_grad:
            return

        ishape, oshape = tensor.data.shape, out.grad.shape
        # numpy only adds missing axes on the left, so left-pad the input
        # shape with 1s: in = (3, 1, 4), out = (2, 3, 5, 4) -> aligned = (1, 3, 1, 4)
        aligned = (1,) * (len(oshape) - len(ishape)) + tuple(ishape)
        # Only axes that actually grew from 1 to something larger need summing.
        stretched = tuple(
            i for i, (src, dst) in enumerate(zip(aligned, oshape))
            if src == 1 and dst != 1
        )
        grad = np.sum(out.grad, axis=stretched, keepdims=True)
        tensor.grad += np.reshape(grad, ishape)

    out._backward = _backward
    out._prev = {tensor,}
    return out
    
def logsumexp(tensor, axis=None, keepdims=False):
    """
    mathematical operations, applied to 1D vector:
    forward: log(e^z1 + e^z2 + ... + e^zn) = log(sum(e^zi))
    backward: local_grad[i] = e^zi / sum(e^zi)
    ------
    for numerical stability:
    forward: log(sum(e^zi))  = log(sum(e^(zi - zmax)) * e^zmax)
                             = log(sum(e^(zi - zmax))) + log(e^zmax)
    backward: e^zi/sum(e^zi) = e^(zi - zmax) / sum(e^(zi - zmax))

    Args:
        tensor: input Tensor.
        axis: None (reduce over everything), an int, or a tuple of ints.
        keepdims: keep the reduced axes as size-1 dimensions in the output.

    Returns:
        A new Tensor holding log(sum(exp(z))) over `axis`, wired to `tensor`
        for backprop.
    """
    max_z = np.max(tensor.data, axis=axis, keepdims=True)
    # A non-finite max (all -inf, or any +inf) would turn z - max into nan.
    # Shifting by 0 in those slices lets the result come out as -inf / +inf.
    max_z = np.where(np.isfinite(max_z), max_z, np.zeros_like(max_z))
    exp_stable_z = np.exp(tensor.data - max_z)
    stable_sum = np.sum(exp_stable_z, axis=axis, keepdims=keepdims)
    max_term = max_z if keepdims else np.squeeze(max_z, axis=axis)
    # log(0) = -inf is the intended answer for an all -inf slice; silence the warning.
    with np.errstate(divide="ignore"):
        data = np.log(stable_sum) + max_term
    out = Tensor(data, requires_grad=tensor.requires_grad)

    def _backward():
        if not tensor.requires_grad:
            return

        grad, denom = out.grad, stable_sum
        if axis is not None and not keepdims:
            # Re-insert the reduced axes so both operands broadcast against the
            # input. With axis=None they are scalars (or all-ones shaped when
            # keepdims=True) and already broadcast as they are.
            grad = np.expand_dims(grad, axis=axis)
            denom = np.expand_dims(denom, axis=axis)

        softmax_terms = exp_stable_z / denom
        tensor.grad += grad * softmax_terms

    out._backward = _backward
    out._prev = {tensor, }
    return out

"""
    1. note: id(i, j) = 1{i == j}
    mathematical operations, applied to 1D vector:
    
    forward: softmax(z)[i] = e^zi / sum(e^z)
    backward: since softmax is a vector-to-vector function,
              the local_grad we need to compute is a Jacobian:
        local_grad[i,j] = softmax(z)[i] * (id(i,j) - softmax(z)[j])
    
    
    2. for numerical stability. 
    forward: softmax(z)[i] = e^zi / sum(e^z)
             = e^(zi) / e^(logsumexp(z))
             = e^(zi - logsumexp(z))
             
    backward (vectorized): 
        let ID.shape = local_grad.shape = (N, N). 
            ID[i, j] = id(i, j).
            local_grad[i, j] = d.s[i] / d.z[j]
            
        local_grad[i, j] = out[i] * ID[i, j] - out[i] * out[j]
        local_grad[i, :] = out[i] * ID[i, :] - out[i] * out[:]
        local_grad[:, :] = out[:] * ID[:, :] - out[:, None] * out[None, :]
                         = diag(out) - outer(out, out)

        However, materializing the full (N, N) Jacobian like this is not efficient.
"""
def softmax(tensor, axis: "int | None" = None):
    """
    Numerically stable softmax along `axis` (over all elements when `axis` is None).

    Computed as exp(z - logsumexp(z)). It is composed only from ops that define
    their own backward (logsumexp, broadcast_to, Tensor subtraction, exp), so
    the chain rule produces the gradient without an explicit Jacobian.

    Args:
        tensor: input Tensor.
        axis: axis to normalise over, forwarded to logsumexp.

    Returns:
        The Tensor produced by exp(), with the same shape as `tensor`.
    """
    lse = logsumexp(tensor, axis=axis, keepdims=True)
    # Expand explicitly so the subtraction below sees two equal shapes.
    # NOTE(review): presumably needed because Tensor subtraction does not
    # un-broadcast gradients by itself - confirm against Tensor.__sub__.
    lse_broadcast = broadcast_to(lse, tensor.data.shape)
    log_softmax = tensor - lse_broadcast
    return exp(log_softmax)

In [112]:
# Run both LogSumExp suites; each prints its own per-case progress lines.
print("Testing LogSumExp operation...")
test_logsumexp()

print()
print("Testing LogSumExp specific cases...")
test_logsumexp_specific_cases()

print()
print("All LogSumExp tests completed successfully!")

Testing LogSumExp operation...
LogSumExp forward test 1 passed!
LogSumExp backward test 1 passed!
LogSumExp forward test 2 passed!
LogSumExp backward test 2 passed!
LogSumExp forward test 3 passed!
LogSumExp backward test 3 passed!
LogSumExp forward test 4 passed!
LogSumExp backward test 4 passed!
LogSumExp forward test 5 passed!
LogSumExp backward test 5 passed!
LogSumExp forward test 6 passed!
LogSumExp backward test 6 passed!
LogSumExp forward test 7 passed!
LogSumExp backward test 7 passed!
LogSumExp forward test 8 passed!
LogSumExp backward test 8 passed!
LogSumExp forward test 9 passed!
LogSumExp backward test 9 passed!
LogSumExp forward test 10 passed!
LogSumExp backward test 10 passed!

Testing LogSumExp specific cases...
LogSumExp specific case 1 (large identical values) passed!
LogSumExp specific case 2 (extreme value differences) passed!
LogSumExp specific case 3 (softmax relation) passed!

All LogSumExp tests completed successfully!


In [61]:
test_broadcast_to()

broadcast_to forward test 1 passed!
broadcast_to backward test 1 passed!
broadcast_to forward test 2 passed!
broadcast_to backward test 2 passed!
broadcast_to forward test 3 passed!
broadcast_to backward test 3 passed!
broadcast_to forward test 4 passed!
broadcast_to backward test 4 passed!
broadcast_to forward test 5 passed!
broadcast_to backward test 5 passed!
broadcast_to forward test 6 passed!
broadcast_to backward test 6 passed!
broadcast_to forward test 7 passed!
broadcast_to backward test 7 passed!
broadcast_to forward test 8 passed!
broadcast_to backward test 8 passed!
broadcast_to forward test 9 passed!
broadcast_to backward test 9 passed!
broadcast_to forward test 10 passed!
broadcast_to backward test 10 passed!


In [62]:
print("Testing Softmax operation...")
test_softmax()

Testing Softmax operation...
Softmax forward test 1 passed!
Softmax backward test 1 passed!
Softmax forward test 2 passed!
Softmax backward test 2 passed!
Softmax forward test 3 passed!
Softmax backward test 3 passed!
Softmax forward test 4 passed!
Softmax backward test 4 passed!
Softmax forward test 5 passed!
Softmax backward test 5 passed!
Softmax forward test 6 passed!
Softmax backward test 6 passed!
Softmax forward test 7 passed!
Softmax backward test 7 passed!
Softmax forward test 8 passed!
Softmax backward test 8 passed!
Softmax forward test 9 passed!
Softmax backward test 9 passed!
Softmax forward test 10 passed!
Softmax backward test 10 passed!


In [4]:
import numpy as np
import torch
import torch.nn.functional as F

from simplegrad.tensor import Tensor


def test_logsumexp():
    """Check logsumexp forward values and gradients against torch.logsumexp.

    Each case supplies the input array, the reduction axis (None, an int or a
    tuple of ints) and the keepdims flag. The torch side backpropagates an
    explicit tensor of ones; our side calls result.backward() with no argument
    (presumably seeded with ones as well - confirm in Tensor.backward).
    """
    # Define diverse test cases
    test_cases = [
        # Small values
        {"data": np.random.rand(3, 4) * 0.001, "axis": None, "keepdims": False},
        # Large values
        {"data": np.random.rand(3, 4) * 100, "axis": None, "keepdims": False},
        # Negative values
        {"data": np.random.rand(3, 4) * -10, "axis": None, "keepdims": False},
        # Mixed values
        {"data": np.random.rand(3, 4) * 2 - 1, "axis": None, "keepdims": False},
        # Single dimension reduction with keepdims=True
        {"data": np.random.rand(3, 4) * 2 - 1, "axis": 0, "keepdims": True},
        # Single dimension reduction with keepdims=False
        {"data": np.random.rand(3, 4) * 2 - 1, "axis": 1, "keepdims": False},
        # Multiple dimensions
        {"data": np.random.rand(2, 3, 4) * 2 - 1, "axis": None, "keepdims": False},
        # Multiple dimensions with specific axis
        {"data": np.random.rand(2, 3, 4) * 2 - 1, "axis": 1, "keepdims": False},
        # Multiple dimensions with tuple axis
        {"data": np.random.rand(2, 3, 4) * 2 - 1, "axis": (0, 2), "keepdims": False},
        # Multiple dimensions with tuple axis and keepdims=True
        {"data": np.random.rand(2, 3, 4) * 2 - 1, "axis": (0, 2), "keepdims": True}
    ]

    for i, test_case in enumerate(test_cases):
        data = test_case["data"]
        axis = test_case["axis"]
        keepdims = test_case["keepdims"]

        # Convert to tensors
        pt_x = torch.tensor(data, dtype=torch.float32, requires_grad=True)
        x = Tensor.from_torch(pt_x)

        # PyTorch reference. torch.logsumexp takes an int or a tuple of ints
        # for `dim`, so the only translation needed is None -> every axis.
        dims = tuple(range(pt_x.dim())) if axis is None else axis
        expected = torch.logsumexp(pt_x, dim=dims, keepdim=keepdims)

        # Our implementation
        result = logsumexp(x, axis=axis, keepdims=keepdims)

        # Check forward pass
        np.testing.assert_allclose(
            result.data,
            expected.detach().numpy(),
            rtol=1e-5, atol=1e-5,
            err_msg=f"Forward pass failed for test case {i+1}: data shape {data.shape}, axis {axis}, keepdims {keepdims}"
        )
        print(f"LogSumExp forward test {i+1} passed!")

        # Compute gradients
        grad_output = torch.ones_like(expected)
        expected.backward(grad_output)
        result.backward()

        # Check backward pass
        np.testing.assert_allclose(
            x.grad,
            pt_x.grad.detach().numpy(),
            rtol=1e-5, atol=1e-5,
            err_msg=f"Backward pass failed for test case {i+1}: data shape {data.shape}, axis {axis}, keepdims {keepdims}"
        )
        print(f"LogSumExp backward test {i+1} passed!")


def test_logsumexp_specific_cases():
    """Forward-only edge cases for logsumexp, checked against torch."""

    # Cases 1 and 2 share one recipe: reduce along axis 1 without keepdims and
    # compare with torch.logsumexp.
    #   1 - large identical values (would overflow without the max shift)
    #   2 - extreme spread between values
    stability_inputs = (
        (np.ones((3, 3)) * 1000,
         "LogSumExp specific case 1 (large identical values) passed!"),
        (np.array([[1e-10, 1e10], [1e-10, 1e-10]]),
         "LogSumExp specific case 2 (extreme value differences) passed!"),
    )
    for values, success_msg in stability_inputs:
        torch_in = torch.tensor(values, dtype=torch.float32, requires_grad=True)
        ours_in = Tensor.from_torch(torch_in)

        reference = torch.logsumexp(torch_in, dim=1, keepdim=False)
        actual = logsumexp(ours_in, axis=1, keepdims=False)

        np.testing.assert_allclose(
            actual.data, reference.detach().numpy(), rtol=1e-5, atol=1e-5
        )
        print(success_msg)

    # Case 3: softmax rebuilt as exp(z - logsumexp(z)) must match on both sides.
    values = np.random.rand(5, 10) * 2 - 1
    torch_in = torch.tensor(values, dtype=torch.float32, requires_grad=True)
    ours_in = Tensor.from_torch(torch_in)

    reference_softmax = torch.exp(torch_in - torch.logsumexp(torch_in, dim=1, keepdim=True))

    our_lse = logsumexp(ours_in, axis=1, keepdims=True)
    actual_softmax = np.exp(ours_in.data - our_lse.data)

    np.testing.assert_allclose(
        actual_softmax, reference_softmax.detach().numpy(), rtol=1e-5, atol=1e-5
    )
    print("LogSumExp specific case 3 (softmax relation) passed!")


In [28]:
def test_broadcast_to():
    """Compare broadcast_to (forward and backward) with torch's Tensor.expand.

    NOTE(review): a later cell in this notebook defines test_broadcast_to again;
    whichever cell ran last is the one a call resolves to.
    """
    # Input arrays paired with the shape they are broadcast to.
    test_cases = [
        # Broadcast single value to vector
        {"data": np.array([1.0]), "shape": (5,)},
        # Broadcast vector to matrix (adding explicit dimension for PyTorch compatibility)
        {"data": np.random.rand(1, 3), "shape": (3, 3)},
        # Broadcast row vector to matrix
        {"data": np.random.rand(1, 4), "shape": (3, 4)},
        # Broadcast column vector to matrix
        {"data": np.random.rand(3, 1), "shape": (3, 4)},
        # Broadcast scalar to matrix
        {"data": np.array([1.0]), "shape": (3, 4)},
        # Broadcast to higher dimensions
        {"data": np.random.rand(2, 1, 3), "shape": (2, 5, 3)},
        # No broadcasting (same shape)
        {"data": np.random.rand(2, 3), "shape": (2, 3)},
        # Broadcast in middle dimension
        {"data": np.random.rand(2, 1, 4), "shape": (2, 3, 4)},
        # Multiple dimensions broadcasted
        {"data": np.random.rand(1, 1, 3), "shape": (2, 4, 3)},
        # Broadcasting with various data values
        {"data": np.random.rand(1, 3) * 10 - 5, "shape": (4, 3)},
    ]

    for case_no, case in enumerate(test_cases, start=1):
        data, shape = case["data"], case["shape"]

        # Same values on both sides: torch as the reference, ours under test.
        pt_x = torch.tensor(data, dtype=torch.float32, requires_grad=True)
        x = Tensor(data, requires_grad=True)

        expected = pt_x.expand(shape)
        result = broadcast_to(x, shape)

        # Forward values must agree.
        np.testing.assert_allclose(
            result.data,
            expected.detach().numpy(),
            rtol=1e-5, atol=1e-6,
            err_msg=f"broadcast_to Forward pass failed for test case {case_no}: data shape {data.shape}, target shape {shape}"
        )
        print(f"broadcast_to forward test {case_no} passed!")

        # Push one random upstream gradient through both graphs.
        upstream = np.random.rand(*shape).astype(np.float32)
        expected.backward(torch.tensor(upstream))
        result.backward(upstream)

        # Input gradients must agree.
        np.testing.assert_allclose(
            x.grad,
            pt_x.grad.detach().numpy(),
            rtol=1e-5, atol=1e-5,
            err_msg=f"broadcast_to Backward pass failed for test case {case_no}: data shape {data.shape}, target shape {shape}"
        )
        print(f"broadcast_to backward test {case_no} passed!")

In [81]:
np.float32(np.random.rand())

np.float32(0.08273586)

In [85]:
# np.ndarray(shape) allocates without initialising, so the values displayed
# below are whatever happened to be in memory, not meaningful data.
np.ndarray(4)

array([1.e+15, 1.e-15, 1.e-15, 1.e+15])

In [88]:
torch.tensor(5.44)

tensor(5.4400)

In [98]:
# Single hand-run summation case: build the torch and simplegrad inputs from
# the same array, then reduce along axis 1 while keeping the reduced axis.
test_case = dict(
    data=np.random.rand(5, 5),
    axis=1,
    keepdims=True,
    name="Basic 2D, axis 1 with keepdims",
)
data, axis, keepdims, name = (
    test_case[key] for key in ("data", "axis", "keepdims", "name")
)

pt_x = torch.tensor(data, dtype=torch.float32, requires_grad=True)
x = Tensor(data, requires_grad=True)

result = summation(x, axis=axis, keepdims=keepdims)

In [99]:
x, result

(Tensor([[0.29151535 0.4114775  0.15306664 0.12506993 0.24554916]
  [0.9562889  0.5814308  0.21557261 0.2233879  0.9167526 ]
  [0.75078666 0.49505213 0.8327872  0.6036786  0.17937946]
  [0.42289641 0.71969664 0.03585196 0.6252484  0.32845837]
  [0.23447901 0.07065094 0.81188375 0.7233557  0.7745351 ]], requires_grad=True),
 Tensor([[1.2266786]
  [2.8934329]
  [2.8616838]
  [2.1321516]
  [2.6149046]], requires_grad=True))

In [100]:
pt_x.sum(dim=axis, keepdim=keepdims)

tensor([[1.2267],
        [2.8934],
        [2.8617],
        [2.1322],
        [2.6149]], grad_fn=<SumBackward1>)

In [110]:
def test_summation():
    """Test the summation operation with challenging cases.

    For every case the forward value and the input gradient are compared with
    torch's Tensor.sum. A mismatch is reported and the loop moves on to the
    next case, so one bad case does not hide the others.
    """
    print("\n=== TESTING SUMMATION ===")

    # Define challenging test cases
    test_cases = [
        # Basic cases
        {"data": np.random.rand(5, 5), "axis": None, "keepdims": False, "name": "Basic 2D, all axes"},
        {"data": np.random.rand(5, 5), "axis": 0, "keepdims": False, "name": "Basic 2D, axis 0"},
        {"data": np.random.rand(5, 5), "axis": 1, "keepdims": True, "name": "Basic 2D, axis 1 with keepdims"},

        # Extreme values
        {"data": np.random.rand(10, 10) * 1e10, "axis": None, "keepdims": False, "name": "Large values (1e10)"},
        {"data": np.random.rand(10, 10) * 1e-10, "axis": 0, "keepdims": True, "name": "Small values (1e-10)"},
        {"data": np.array([[1e15, 1e-15], [1e-15, 1e15]]), "axis": 1, "keepdims": False, "name": "Mixed extreme values"},

        # Large dimensions
        {"data": np.random.rand(1000, 5), "axis": 0, "keepdims": False, "name": "Large first dimension (1000x5)"},
        {"data": np.random.rand(5, 1000), "axis": 1, "keepdims": True, "name": "Large second dimension (5x1000)"},

        # Higher dimensions
        {"data": np.random.rand(10, 10, 10), "axis": (0, 2), "keepdims": False, "name": "3D with multiple axes"},
        {"data": np.random.rand(5, 5, 5, 5), "axis": (1, 2), "keepdims": True, "name": "4D with multiple axes and keepdims"},

        # Special patterns
        {"data": np.ones((20, 20)), "axis": None, "keepdims": False, "name": "All ones"},
        {"data": np.zeros((20, 20)), "axis": 0, "keepdims": True, "name": "All zeros"},
        {"data": np.eye(20), "axis": 1, "keepdims": False, "name": "Identity matrix"},

        # Edge cases
        {"data": np.array([1.0]), "axis": None, "keepdims": False, "name": "Single value"},
        {"data": np.random.rand(1, 1, 1, 1), "axis": (1, 2), "keepdims": True, "name": "Multiple singleton dimensions"},
    ]

    for i, test_case in enumerate(test_cases):
        data = test_case["data"]
        axis = test_case["axis"]
        keepdims = test_case["keepdims"]
        name = test_case["name"]

        # Create tensors
        pt_x = torch.tensor(data, dtype=torch.float32, requires_grad=True)
        x = Tensor(data, requires_grad=True)

        # PyTorch sum: `dim` accepts an int or a tuple of ints, so tuple axes
        # need no per-axis loop.
        expected = pt_x.sum(dim=axis, keepdim=keepdims)

        # Our summation
        result = summation(x, axis=axis, keepdims=keepdims)

        # Check forward pass
        try:
            np.testing.assert_allclose(
                result.data,
                expected.detach().numpy(),
                rtol=1e-5, atol=1e-5,
                err_msg="Summation forward pass failed"
            )
        except AssertionError as e:
            # Only comparison failures are reported; other errors should surface.
            print(f"  ✗ Forward pass failed: {e}")
            continue

        # Generate random gradient for backward pass.
        # np.random.rand() with no dimensions returns a plain Python float.
        grad_output_np = np.random.rand(*result.data.shape)
        if isinstance(grad_output_np, float):
            grad_output_np = np.float32(grad_output_np)
        else:
            grad_output_np = grad_output_np.astype(np.float32)
        grad_output_torch = torch.tensor(grad_output_np)

        # Compute gradients
        expected.backward(grad_output_torch)
        result.backward(grad_output_np)

        # Check backward pass
        try:
            np.testing.assert_allclose(
                x.grad,
                pt_x.grad.detach().numpy(),
                rtol=1e-4, atol=1e-5,
                err_msg="Summation backward pass failed"
            )
            result_msg = "Successful."
        except AssertionError as e:
            result_msg = "Failed."
            print(f"  ✗ Backward pass failed: {e}")

        # Reset gradients
        pt_x.grad = None
        x.grad = np.zeros_like(x.data)

        print(
            f"Test case {i+1}: {name}."
            f" {result_msg}"
        )

def test_broadcast_to():
    """Compare broadcast_to with torch's expand on a set of demanding inputs.

    Forward values and input gradients are both checked. A failing case is
    printed and the loop carries on with the next one.
    """
    print("\n=== TESTING BROADCAST_TO ===")

    # Input arrays, the shape to broadcast to, and a label for the report line.
    test_cases = [
        # Basic broadcasting
        {"data": np.random.rand(1), "shape": (10,), "name": "Scalar to vector"},
        {"data": np.random.rand(1, 5), "shape": (10, 5), "name": "Row to matrix"},
        {"data": np.random.rand(5, 1), "shape": (5, 10), "name": "Column to matrix"},

        # Extreme values
        {"data": np.random.rand(1, 3) * 1e9, "shape": (5, 3), "name": "Large values (1e9)"},
        {"data": np.random.rand(1, 3) * 1e-9, "shape": (5, 3), "name": "Small values (1e-9)"},
        {"data": np.array([[1e15], [1e-15]]), "shape": (2, 5), "name": "Mixed extreme values"},

        # Large dimensions
        {"data": np.random.rand(1, 5), "shape": (1000, 5), "name": "Broadcast to large first dim (1000)"},
        {"data": np.random.rand(5, 1), "shape": (5, 1000), "name": "Broadcast to large second dim (1000)"},

        # Higher dimensions
        {"data": np.random.rand(1, 5, 1), "shape": (10, 5, 8), "name": "3D broadcasting"},
        {"data": np.random.rand(1, 1, 1, 5), "shape": (7, 6, 5, 5), "name": "4D broadcasting"},

        # Multiple dimensions broadcasted
        {"data": np.random.rand(1, 1, 3), "shape": (8, 8, 3), "name": "Broadcasting multiple dimensions"},

        # Special patterns
        {"data": np.ones((1, 5)), "shape": (10, 5), "name": "Broadcasting ones"},
        {"data": np.zeros((1, 5)), "shape": (10, 5), "name": "Broadcasting zeros"},

        # No broadcasting (identity case)
        {"data": np.random.rand(5, 5), "shape": (5, 5), "name": "No broadcasting (same shape)"},

        # Edge cases
        {"data": np.array([1.0]), "shape": (1, 1, 1, 1), "name": "Scalar to higher dims"},
    ]

    for case_no, case in enumerate(test_cases, start=1):
        data, shape, name = case["data"], case["shape"], case["name"]

        pt_x = torch.tensor(data, dtype=torch.float32, requires_grad=True)
        x = Tensor(data, requires_grad=True)

        # torch reference: when the target has more axes than the input, give
        # the input explicit leading size-1 axes before expanding.
        missing_dims = len(shape) - pt_x.dim()
        if missing_dims > 0:
            reference_input = pt_x.reshape([1] * missing_dims + list(pt_x.shape))
        else:
            reference_input = pt_x
        expected = reference_input.expand(shape)

        result = broadcast_to(x, shape)

        # Forward comparison; on failure report it and skip the backward check.
        try:
            np.testing.assert_allclose(
                result.data,
                expected.detach().numpy(),
                rtol=1e-5, atol=1e-5,
                err_msg="broadcast_to forward pass failed"
            )
        except Exception as e:
            print(f"  ✗ Forward pass failed: {e}")
            continue

        # One random upstream gradient, fed to both graphs.
        upstream = np.random.rand(*shape).astype(np.float32)
        expected.backward(torch.tensor(upstream))
        result.backward(upstream)

        # Backward comparison.
        try:
            np.testing.assert_allclose(
                x.grad,
                pt_x.grad.detach().numpy(),
                rtol=1e-4, atol=1e-5,
                err_msg="broadcast_to backward pass failed"
            )
        except Exception as e:
            result_msg = "Failed."
            print(f"  ✗ Backward pass failed: {e}")
        else:
            result_msg = "Successful."

        # Clear accumulated gradients on both inputs.
        pt_x.grad = None
        x.grad = np.zeros_like(x.data)

        print(f"Test case {case_no}: {name}. {result_msg}")

def test_softmax():
    """Test the softmax operation with challenging cases.

    For every case the forward value and the input gradient are compared with
    torch.nn.functional.softmax. For axis=None the torch reference is computed
    on the flattened input and reshaped back, so it has the same shape as our
    result for inputs of any rank. A mismatch is reported and the loop moves
    on to the next case.
    """
    print("\n=== TESTING SOFTMAX ===")

    # Define challenging test cases
    test_cases = [
        # Basic cases
        {"data": np.random.rand(10), "axis": None, "name": "Basic 1D vector"},
        {"data": np.random.rand(5, 5), "axis": 1, "name": "Basic 2D, axis 1"},
        {"data": np.random.rand(5, 5), "axis": 0, "name": "Basic 2D, axis 0"},

        # Extreme values
        {"data": np.random.rand(10) * 1e9, "axis": None, "name": "Large values (1e9)"},
        {"data": np.random.rand(10) * 1e-9, "axis": None, "name": "Small values (1e-9)"},
        {"data": np.array([1e15, 1e-15, 0, -1e-15, -1e15]), "axis": None, "name": "Mixed extreme values"},

        # Numerical stability challenges
        {"data": np.array([1000, 0, -1000]), "axis": None, "name": "Very different values"},
        {"data": np.array([1e5, 1e5 + 1e-5]), "axis": None, "name": "Nearly identical large values"},
        {"data": np.ones(10) * 1e5, "axis": None, "name": "All identical large values"},

        # Large dimensions
        {"data": np.random.rand(1000, 5), "axis": 1, "name": "Large first dimension (1000x5)"},
        {"data": np.random.rand(5, 1000) * 1e9, "axis": 0, "name": "Large second dimension (5x1000)"},

        # Higher dimensions
        {"data": np.random.rand(10, 10, 10), "axis": 2, "name": "3D tensor, last axis"},
        {"data": np.random.rand(10, 10, 10), "axis": 1, "name": "3D tensor, middle axis"},
        {"data": np.random.rand(5, 5, 5, 5), "axis": 0, "name": "4D tensor, first axis"},

        # Special patterns
        {"data": np.zeros((10, 10)), "axis": 1, "name": "All zeros (uniform distribution)"},
        {"data": np.ones((10, 10)), "axis": 1, "name": "All ones (uniform distribution)"},
        {"data": np.eye(10), "axis": 1, "name": "Identity matrix"},

        # Edge cases
        {"data": np.array([42.0]), "axis": None, "name": "Single value (should be 1.0)"},
        {"data": np.zeros((1, 1, 1)), "axis": 1, "name": "Multiple singleton dimensions"},
    ]

    for i, test_case in enumerate(test_cases):
        data = test_case["data"]
        axis = test_case["axis"]
        name = test_case["name"]

        # Create tensors
        pt_x = torch.tensor(data, dtype=torch.float32, requires_grad=True)
        x = Tensor(data, requires_grad=True)

        # PyTorch softmax
        if axis is None:
            # Normalise over every element, then restore the input shape so the
            # reference lines up with our result.
            flattened = pt_x.reshape(-1)
            expected = torch.nn.functional.softmax(flattened, dim=0).reshape(pt_x.shape)
        else:
            expected = torch.nn.functional.softmax(pt_x, dim=axis)

        # Our softmax
        result = softmax(x, axis=axis)

        # Check forward pass
        try:
            np.testing.assert_allclose(
                result.data,
                expected.detach().numpy(),
                rtol=1e-5, atol=1e-5,
                err_msg="Softmax forward pass failed"
            )
        except AssertionError as e:
            # Only comparison failures are reported; other errors should surface.
            print(f"  ✗ Forward pass failed: {e}")
            continue

        # Generate random gradient for backward pass; it has the shape shared
        # by both outputs.
        grad_output_np = np.random.rand(*result.data.shape).astype(np.float32)
        grad_output_torch = torch.tensor(grad_output_np)

        # Compute gradients
        expected.backward(grad_output_torch)
        result.backward(grad_output_np)

        # Check backward pass
        try:
            np.testing.assert_allclose(
                x.grad,
                pt_x.grad.detach().numpy(),
                rtol=1e-4, atol=1e-5,
                err_msg="Softmax backward pass failed"
            )
            result_msg = "Successful."
        except AssertionError as e:
            result_msg = "Failed."
            print(f"  ✗ Backward pass failed: {e}")

        # Reset gradients
        pt_x.grad = None
        x.grad = np.zeros_like(x.data)

        print(
            f"Test case {i+1}: {name}."
            f" {result_msg}"
        )

In [111]:
# Run the three "challenging cases" suites defined in the cell above.
# NOTE(review): test_broadcast_to is defined twice in this notebook; this call
# resolves to whichever definition cell was executed last.
test_summation()
test_broadcast_to()
test_softmax()


=== TESTING SUMMATION ===
Test case 1: Basic 2D, all axes. Successful.
Test case 2: Basic 2D, axis 0. Successful.
Test case 3: Basic 2D, axis 1 with keepdims. Successful.
Test case 4: Large values (1e10). Successful.
Test case 5: Small values (1e-10). Successful.
Test case 6: Mixed extreme values. Successful.
Test case 7: Large first dimension (1000x5). Successful.
Test case 8: Large second dimension (5x1000). Successful.
Test case 9: 3D with multiple axes. Successful.
Test case 10: 4D with multiple axes and keepdims. Successful.
Test case 11: All ones. Successful.
Test case 12: All zeros. Successful.
Test case 13: Identity matrix. Successful.
Test case 14: Single value. Successful.
Test case 15: Multiple singleton dimensions. Successful.

=== TESTING BROADCAST_TO ===
Test case 1: Scalar to vector. Successful.
Test case 2: Row to matrix. Successful.
Test case 3: Column to matrix. Successful.
Test case 4: Large values (1e9). Successful.
Test case 5: Small values (1e-9). Successful.
Tes

In [47]:
X = torch.rand(2, 3, 4)

In [78]:
Y = torch.rand(4)
np.expand_dims(Y.numpy(), axis=(0,2)).shape

(1, 4, 1)

In [71]:
torch.logsumexp(X, dim=1, keepdims=True).numpy()

array([[[1.482567 , 1.7950699, 1.6493765, 1.4997053]],

       [[1.7428087, 1.6379938, 1.7570441, 1.3153229]]], dtype=float32)

In [73]:
# Reduce X along axis 1 with our logsumexp and inspect the raw ndarray.
# NOTE(review): the recorded output keeps the reduced axis (shape (2, 1, 4)),
# while the logsumexp defined in this notebook defaults to keepdims=False; the
# output presumably comes from an earlier version of the function - re-run to confirm.
Y = logsumexp(Tensor.from_torch(X), axis=1).data
Y

array([[[1.482567 , 1.7950699, 1.6493764, 1.4997053]],

       [[1.7428087, 1.6379938, 1.7570441, 1.3153229]]], dtype=float32)

In [74]:
np.squeeze(Y)

array([[1.482567 , 1.7950699, 1.6493764, 1.4997053],
       [1.7428087, 1.6379938, 1.7570441, 1.3153229]], dtype=float32)

In [62]:
X[0], X[1]

(tensor([[0.3181, 0.8340, 0.0570, 0.2979],
         [0.1563, 0.9888, 0.6898, 0.1264],
         [0.6209, 0.0290, 0.7662, 0.6925]]),
 tensor([[0.8639, 0.0777, 0.8982, 0.2250],
         [0.6347, 0.8138, 0.0857, 0.2969],
         [0.3748, 0.5920, 0.8112, 0.1204]]))