# Nonlinear Functions for Neural Networks

This notebook contains PyTorch examples demonstrating nonlinear activation functions.

## Table of Contents
1. [Hyperbolic Tangent (tanh)](#hyperbolic-tangent-tanh)
2. [Sigmoid](#sigmoid)
3. [ReLU](#relu)

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt

## Hyperbolic Tangent (tanh)

**Formula:** $\tanh(x) = \frac{e^x - e^{-x}}{e^x + e^{-x}}$

A zero-centered activation function: its outputs lie in $(-1, 1)$ and $\tanh(0) = 0$.

In [None]:
# Evaluate tanh, sigmoid and ReLU on one shared grid of 100 points in [-5, 5]
x = torch.linspace(start=-5, end=5, steps=100)
tanh_output = x.tanh()
sigmoid_output = x.sigmoid()
relu_output = x.relu()

# Report the observed output range of the two saturating activations
print(f"Tanh range: [{tanh_output.min():.3f}, {tanh_output.max():.3f}]")
print(f"Sigmoid range: [{sigmoid_output.min():.3f}, {sigmoid_output.max():.3f}]")

# Gradient comparison: differentiate each activation at x = 0 with autograd
x_test = torch.zeros((), requires_grad=True)
tanh_out = x_test.tanh()
tanh_out.backward()
tanh_grad = x_test.grad.item()

# A fresh leaf tensor is used so the sigmoid gradient is not accumulated
# on top of the tanh gradient stored in the previous tensor's .grad
x_test = torch.zeros((), requires_grad=True)
sigmoid_out = x_test.sigmoid()
sigmoid_out.backward()
sigmoid_grad = x_test.grad.item()

print(f"Tanh gradient at 0: {tanh_grad:.3f}")
print(f"Sigmoid gradient at 0: {sigmoid_grad:.3f}")

# Neural network with tanh
class TanhNet(torch.nn.Module):
    """Fully connected network (2 -> 10 -> 10 -> 1) with tanh after each hidden layer."""

    def __init__(self):
        super().__init__()
        nn = torch.nn
        # Modules are instantiated in forward order and stored under
        # ``self.layers`` as a single Sequential container.
        stack = [
            nn.Linear(2, 10),
            nn.Tanh(),
            nn.Linear(10, 10),
            nn.Tanh(),
            nn.Linear(10, 1),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the result."""
        return self.layers(x)

# Test gradient flow: backpropagate the network output to one random input
net = TanhNet()
x_input = torch.randn(1, 2).requires_grad_(True)
output = net(x_input)
# The output holds a single element, so backward() needs no explicit gradient
output.backward()

print(f"Input gradient norm: {torch.norm(x_input.grad):.3f}")
print("Tanh allows gradients to flow but can saturate for large inputs")

## Sigmoid

**Formula:** $\sigma(x) = \frac{1}{1 + e^{-x}}$

Outputs probability-like values between 0 and 1.

In [None]:
# Sigmoid for binary classification
def binary_classifier(x, w=None, b=None):
    """Simple binary classifier using sigmoid.

    Computes ``sigmoid(x @ w + b)``.

    Args:
        x: Input tensor whose last dimension matches the number of rows of
            ``w`` (2 for the default weights). A non-floating-point tensor
            is converted to the default float dtype first.
        w: Optional weights (tensor or nested list). Defaults to
            ``[[1.5], [-2.0]]``.
        b: Optional bias (tensor or list). Defaults to ``[0.5]``.

    Returns:
        Tensor of sigmoid outputs (values between 0 and 1) with the same
        dtype and device as the (possibly converted) input.
    """
    # Integer inputs cannot be matrix-multiplied with float weights
    if not torch.is_floating_point(x):
        x = x.to(torch.get_default_dtype())
    if w is None:
        w = [[1.5], [-2.0]]  # Weights
    if b is None:
        b = [0.5]            # Bias
    # Build the parameters on the input's dtype/device so float64 or
    # non-CPU inputs do not fail with a dtype/device mismatch in ``x @ w``.
    w = torch.as_tensor(w, dtype=x.dtype, device=x.device)
    b = torch.as_tensor(b, dtype=x.dtype, device=x.device)
    logits = x @ w + b
    return torch.sigmoid(logits)

# Test data: three samples with two features each
x_data = torch.tensor([[1.0, 2.0], [0.5, 1.0], [-1.0, 0.5]])
probabilities = binary_classifier(x_data)
# Threshold the probabilities at 0.5 to obtain hard 0/1 predictions
predictions = probabilities.gt(0.5).float()

# The logits are recomputed here with the same weights and bias the classifier uses
print(f"Logits: {(x_data @ torch.tensor([[1.5], [-2.0]]) + 0.5).squeeze()}")
print(f"Probabilities: {probabilities.squeeze()}")
print(f"Predictions: {predictions.squeeze()}")

# Vanishing gradient problem: sigmoid saturates far away from zero
x_extreme = torch.tensor([-10.0, 0.0, 10.0]).requires_grad_(True)
y = x_extreme.sigmoid()
# Summing gives a scalar loss whose gradient w.r.t. each input is sigmoid'(x)
loss = torch.sum(y)
loss.backward()

print(f"Sigmoid outputs for extreme inputs: {y}")
print(f"Gradients: {x_extreme.grad}")
print("Notice near-zero gradients for extreme inputs")

## ReLU

**Formula:** $\text{ReLU}(x) = \max(0, x)$

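A simple piecewise-linear nonlinearity: it passes positive inputs through unchanged and zeroes out negative ones. Because its gradient is exactly 1 for positive inputs, it does not saturate there, which makes deep networks easier to train.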

In [None]:
# ReLU and its variants, evaluated on a shared grid of 100 points in [-3, 3]
x = torch.linspace(-3, 3, 100)
relu = torch.relu(x)
leaky_relu = torch.nn.functional.leaky_relu(x, negative_slope=0.1)
elu = torch.nn.functional.elu(x)

# Measure the ReLU gradient with autograd at a positive grid point instead of
# hard-coding the expected value. The probe is a detached copy so that x
# itself is left untouched.
probe = x[75].detach().clone().requires_grad_(True)  # x[75] is positive
torch.relu(probe).backward()
print(f"ReLU gradient: {probe.grad.item()}")
# .item() prints the plain count rather than a tensor repr
print(f"ReLU negative outputs: {(relu < 0).sum().item()}")   # Should be 0

# Sparsity demonstration
def count_active_neurons(activations):
    """Return the fraction of entries in ``activations`` that are strictly positive.

    Despite the name, the result is a mean over all elements (a 0-dim
    float32 tensor), not an integer count.
    """
    is_active = torch.gt(activations, 0)
    return is_active.to(torch.float32).mean()

# Network with ReLU
class ReLUNet(torch.nn.Module):
    """Fully connected network (100 -> 200 -> 200 -> 10) that also exposes its ReLU outputs."""

    def __init__(self):
        super().__init__()
        nn = torch.nn
        # Modules are instantiated in forward order and stored under
        # ``self.layers`` as a single Sequential container.
        stack = [
            nn.Linear(100, 200),
            nn.ReLU(),
            nn.Linear(200, 200),
            nn.ReLU(),
            nn.Linear(200, 10),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Apply every layer in order.

        Returns:
            A tuple ``(output, activations)`` where ``activations`` is the
            list of tensors produced by each ReLU layer, in forward order.
        """
        relu_outputs = []
        hidden = x
        for module in self.layers:
            hidden = module(hidden)
            # Only post-ReLU tensors are recorded, not the linear outputs
            if isinstance(module, torch.nn.ReLU):
                relu_outputs.append(hidden)
        return hidden, relu_outputs

net = ReLUNet()
x_input = torch.randn((32, 100))  # Batch of 32 samples
output, activations = net(x_input)

# For each recorded ReLU layer, report the fraction of outputs that are
# strictly positive (i.e. the share of active units across the batch)
for i, act in enumerate(activations):
    sparsity = count_active_neurons(act)
    print(f"Layer {i+1} sparsity: {sparsity:.2%} neurons active")

# Gradient flow comparison
def test_gradient_flow(activation_fn, name):
    x = torch.randn(1, 10, requires_grad=True)
    y = activation_fn(x)
    loss = y.sum()
    loss.backward()
    return torch.norm(x.grad).item(), name

relu_grad, _ = test_gradient_flow(torch.relu, "ReLU")
tanh_grad, _ = test_gradient_flow(torch.tanh, "Tanh")
sigmoid_grad, _ = test_gradient_flow(torch.sigmoid, "Sigmoid")

print(f"Gradient norms - ReLU: {relu_grad:.3f}, Tanh: {tanh_grad:.3f}, Sigmoid: {sigmoid_grad:.3f}")