# Calculus for Neural Networks

This notebook contains PyTorch examples demonstrating calculus concepts essential for understanding neural networks.

## Table of Contents
1. [Derivatives](#derivatives)
2. [Partial Derivatives](#partial-derivatives)
3. [Chain Rule](#chain-rule)
4. [Gradient](#gradient)
5. [Hessian](#hessian)
6. [Jacobian](#jacobian)

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt

## Derivatives

**Formula:** $f'(x) = \lim_{h \to 0} \frac{f(x+h) - f(x)}{h}$

The derivative measures a function's instantaneous rate of change; it is the foundation of gradient-based learning.

In [None]:
# Differentiate a cubic polynomial with autograd and check the result by hand.
x = torch.tensor(2.0, requires_grad=True)

# f(x) = x³ + 2x² + x + 1, built from tensor operations on x so that
# backward() can differentiate it.
y = x ** 3 + 2 * x ** 2 + x + 1

# Backpropagate from the scalar output; afterwards x.grad holds dy/dx at x.
y.backward()
x_value = x.item()
print(f"f({x_value}) = {y.item()}")
print(f"f'({x_value}) = {x.grad.item()}")

# Analytic derivative for comparison: f'(x) = 3x² + 4x + 1
manual_derivative = 3 * x_value**2 + 4 * x_value + 1
print(f"Manual calculation: f'(2) = {manual_derivative}")

# Loss function derivative example
def simple_loss(w, x, y_true):
    """Return the squared error of the one-weight linear model ``w * x``.

    Args:
        w: Model weight.
        x: Input value.
        y_true: Target value the prediction is compared against.

    Returns:
        ``(w * x - y_true) ** 2``.
    """
    residual = w * x - y_true
    return residual ** 2

# Evaluate the loss for a single sample and differentiate it w.r.t. the weight.
w = torch.tensor(1.5, requires_grad=True)
x_val = 3.0
y_true = 10.0

loss = simple_loss(w, x_val, y_true)
# After backward(), w.grad holds d(loss)/dw.
loss.backward()

# Gradient descent moves against the gradient: a positive slope means the
# weight should shrink, anything else means it should grow.
direction = 'decrease' if w.grad > 0 else 'increase'

print(f"\nLoss: {loss.item():.3f}")
print(f"Gradient w.r.t. weight: {w.grad.item():.3f}")
print(f"Direction to move weight: {direction}")

## Partial Derivatives

**Formula:** $\frac{\partial f}{\partial x_i}$

Derivative with respect to one variable while holding others constant.

In [None]:
# Function of two variables: f(x, y) = x²y + xy²
x = torch.tensor(1.0, requires_grad=True)
y = torch.tensor(2.0, requires_grad=True)
z = (x ** 2) * y + x * (y ** 2)

# One backward pass fills in both partial derivatives.
z.backward()

x_value, y_value = x.item(), y.item()
print(f"f({x_value}, {y_value}) = {z.item()}")
# Analytically: ∂f/∂x = 2xy + y²
print(f"∂f/∂x = {x.grad.item()}")
# Analytically: ∂f/∂y = x² + 2xy
print(f"∂f/∂y = {y.grad.item()}")

# Partial derivatives of a scalar loss w.r.t. every parameter of an affine
# layer Y = X Wᵀ + b.
batch_size = 4
input_dim = 3
output_dim = 2
X = torch.randn(batch_size, input_dim)
W = torch.randn(output_dim, input_dim, requires_grad=True)
b = torch.randn(output_dim, requires_grad=True)

# Forward pass, reduced to a scalar so backward() needs no extra arguments.
Y = torch.matmul(X, W.T) + b
loss = Y.sum()
loss.backward()

# Each gradient has the same shape as the parameter it belongs to.
print(f"\nWeight gradients shape: {W.grad.shape}")
print(f"Bias gradients shape: {b.grad.shape}")
print("Each gradient shows how loss changes w.r.t. that parameter")

# Look at individual entries: one partial derivative per parameter element.
w00_grad = W.grad[0, 0].item()
b0_grad = b.grad[0].item()
print(f"∂loss/∂W[0,0] = {w00_grad:.3f}")
print(f"∂loss/∂b[0] = {b0_grad:.3f}")

## Chain Rule

**Formula:** $\frac{d}{dx}f(g(x)) = f'(g(x)) \cdot g'(x)$

Mathematical foundation of backpropagation.

In [None]:
# Chain rule on a three-stage composition f(g(h(x))) with
# h(x) = x², g(u) = u + 1, f(v) = v³.
x = torch.tensor(2.0, requires_grad=True)

h = x ** 2      # innermost stage
g = h + 1       # middle stage
f = g ** 3      # outermost stage

# Autograd multiplies the local derivatives of each stage for us.
f.backward()

x_value = x.item()
print(f"Input: {x_value}")
print(f"h(x) = x² = {h.item()}")
print(f"g(h) = h + 1 = {g.item()}")
print(f"f(g) = g³ = {f.item()}")
print(f"df/dx via chain rule: {x.grad.item()}")

# Hand-derived value for comparison:
# df/dx = df/dg * dg/dh * dh/dx = 3g² * 1 * 2x = 3(x²+1)² * 2x
inner = x_value**2 + 1
manual = 3 * inner**2 * 2 * x_value
print(f"Manual calculation: {manual}")

# Neural network chain rule
class SimpleNet(torch.nn.Module):
    """Two-layer network: Linear(2 -> 3), ReLU, then Linear(3 -> 1)."""

    def __init__(self):
        super().__init__()
        self.layer1 = torch.nn.Linear(2, 3)
        self.layer2 = torch.nn.Linear(3, 1)

    def forward(self, x):
        """Return ``layer2(relu(layer1(x)))``."""
        # Inner composition: affine map followed by the ReLU non-linearity.
        hidden = self.layer1(x).relu()
        # Outer composition: affine map producing the single output value.
        return self.layer2(hidden)

# Run one forward/backward pass through the network on random data.
net = SimpleNet()
x_input = torch.randn(1, 2)
target = torch.randn(1, 1)

output = net(x_input)
loss = torch.nn.functional.mse_loss(output, target)
# Backpropagation: the chain rule applied through layer2, the ReLU and layer1.
loss.backward()

# Show the first two gradient entries of the first weight row of each layer.
layer1_grad = net.layer1.weight.grad[0, :2]
layer2_grad = net.layer2.weight.grad[0, :2]

print(f"\nNetwork output: {output.item():.3f}")
print(f"Loss: {loss.item():.3f}")
print(f"Layer 1 weight gradients: {layer1_grad}")
print(f"Layer 2 weight gradients: {layer2_grad}")
print("Gradients computed via automatic chain rule application")

## Gradient

**Formula:** $\nabla f = \left[\frac{\partial f}{\partial x_1}, \frac{\partial f}{\partial x_2}, \ldots, \frac{\partial f}{\partial x_n}\right]$

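The gradient points in the direction of steepest increase, so gradient descent steps in the opposite direction ($-\nabla f$) to decrease the function.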

In [None]:
# Gradient descent on a 2D function (every visited point is stored in `path`)
import matplotlib.pyplot as plt
import numpy as np

def f(x, y):
    """Evaluate f(x, y) = x² + y² - 2x - 4y + 5.

    Completing the square gives (x - 1)² + (y - 2)², so the minimum value 0
    is attained at (1, 2).
    """
    # Accumulate left to right, in the same order as the formula.
    value = x**2 + y**2
    value = value - 2*x
    value = value - 4*y
    return value + 5

def gradient(x, y):
    """Return the closed-form gradient [2x - 2, 2y - 4] as a 1-D tensor."""
    df_dx = 2*x - 2
    df_dy = 2*y - 4
    return torch.tensor([df_dx, df_dy])

# Start at the origin and record every visited point in `path`.
position = torch.tensor([0.0, 0.0], requires_grad=True)
learning_rate = 0.1
path = [position.detach().clone()]

print("Gradient descent optimization:")
for step in range(10):
    # backward() accumulates into position.grad, so reset it first.
    if position.grad is not None:
        position.grad.zero_()

    # Evaluate the function at the current point and differentiate it.
    x, y = position
    loss = f(x, y)
    loss.backward()

    step_grad = position.grad
    print(f"Step {step}: pos=({x:.2f}, {y:.2f}), f={loss:.3f}, grad=({step_grad[0]:.2f}, {step_grad[1]:.2f})")

    # Gradient descent step; no_grad keeps the in-place update out of autograd.
    with torch.no_grad():
        position.sub_(learning_rate * position.grad)

    path.append(position.detach().clone())

    # Stop early once the gradient evaluated before this update is tiny.
    if position.grad.norm() < 0.01:
        break

print(f"\nFinal position: ({position[0]:.3f}, {position[1]:.3f})")
print("Theoretical minimum: (1.000, 2.000)")

# Use the magnitude of the input gradient as a per-feature importance score.
model = torch.nn.Linear(5, 1)
input_data = torch.randn(1, 5, requires_grad=True)
target = torch.randn(1, 1)

output = model(input_data)
loss = torch.nn.functional.mse_loss(output, target)
# requires_grad=True on the input makes backward() fill input_data.grad.
loss.backward()

# Drop the batch dimension so there is one score per input feature.
feature_importance = input_data.grad.abs().squeeze()
top_feature = feature_importance.argmax().item()
print(f"\nFeature importance (|gradient|): {feature_importance}")
print(f"Most important feature: {top_feature}")

## Hessian

**Formula:** $\mathbf{H}_{ij} = \frac{\partial^2 f}{\partial x_i \partial x_j}$

Matrix of second derivatives describing curvature.

In [None]:
# Computing Hessian for simple function
def quadratic_loss(x):
    """Return 0.5 * (x[0]² + 2·x[1]² + x[0]·x[1]) for a two-element input."""
    first, second = x[0], x[1]
    return 0.5 * (first**2 + 2*second**2 + first*second)

x = torch.tensor([1.0, 2.0], requires_grad=True)
loss = quadratic_loss(x)

# First derivatives. create_graph=True records the gradient computation
# itself, so the gradient can be differentiated a second time below.
grad = torch.autograd.grad(loss, x, create_graph=True)[0]

# Build the Hessian one row at a time: row i is the gradient of ∂loss/∂x_i.
# The size comes from x rather than a hard-coded 2. retain_graph=True keeps
# the graph alive for the next row.
hessian = torch.zeros(x.numel(), x.numel())
for i in range(x.numel()):
    grad2 = torch.autograd.grad(grad[i], x, retain_graph=True)[0]
    hessian[i] = grad2

print(f"Loss: {loss.item():.3f}")
print(f"Gradient: {grad}")
print(f"Hessian:\n{hessian}")

# Condition number analysis.
# A Hessian of a smooth function is symmetric, so use the symmetric solver:
# it returns real eigenvalues directly (by default it reads only the lower
# triangle). The condition number is the ratio of the largest to the smallest
# eigenvalue *magnitude*; a signed max/min ratio turns negative for an
# indefinite Hessian and would then be misreported as well-conditioned.
eigenvals = torch.linalg.eigvalsh(hessian)
eigen_magnitudes = eigenvals.abs()
condition_number = eigen_magnitudes.max() / eigen_magnitudes.min()
print(f"Condition number: {condition_number:.2f}")
print(f"Well-conditioned: {condition_number < 100}")

## Jacobian

**Formula:** $\mathbf{J}_{ij} = \frac{\partial f_i}{\partial x_j}$

Matrix of first derivatives for vector-valued functions.

In [None]:
# Vector-valued function example
def vector_function(x):
    """Map a 2-element tensor to [x0² + x1, x0·x1, sin(x0) + cos(x1)]."""
    x0, x1 = x[0], x[1]
    components = [
        x0**2 + x1,
        x0 * x1,
        torch.sin(x0) + torch.cos(x1),
    ]
    return torch.stack(components)

x = torch.tensor([1.0, 2.0], requires_grad=True)
y = vector_function(x)

# Assemble the Jacobian row by row: row i is the gradient of y[i] w.r.t. x.
jacobian = torch.zeros(3, 2)
for i, component in enumerate(y):
    # backward() accumulates into x.grad, so clear the previous row first.
    if x.grad is not None:
        x.grad.zero_()
    # All components share one graph; keep it alive for the remaining rows.
    component.backward(retain_graph=True)
    jacobian[i].copy_(x.grad)

print(f"Input: {x}")
print(f"Output: {y}")
print(f"Jacobian:\n{jacobian}")

# Jacobian of a linear layer's output with respect to its input batch.
layer = torch.nn.Linear(3, 2)
x_batch = torch.randn(1, 3, requires_grad=True)
y_batch = layer(x_batch)

# jacobian() calls `layer` on the input itself and differentiates every
# output element with respect to every input element.
jac = torch.autograd.functional.jacobian(func=layer, inputs=x_batch)
jac_shape = jac.shape  # (batch, output_dim, batch, input_dim)
print(f"NN Jacobian shape: {jac_shape}")