In [1]:
import numpy as np

# Linear activation function
def linear(z):
    """
    Identity (linear) activation: f(z) = z.

    Arguments:
    z -- A scalar or numpy array of pre-activation values.

    Returns:
    z itself, unmodified (the very same object is handed back, not a copy).
    """
    activation = z
    return activation

# Gradient of linear activation function
def linear_gradient(z):
    """
    Derivative of the linear activation, which equals 1 for every element.

    Arguments:
    z -- A scalar or numpy array of pre-activation values.

    Returns:
    An array filled with ones that mirrors the shape and dtype of z.
    """
    ones = np.ones_like(z)
    return ones

# Example usage: the linear activation hands its input back unchanged, and its
# gradient is 1 for every element regardless of the input's sign.
z = np.array([1, -2, 3])
print("Linear activation:", linear(z))  # Output: [ 1 -2  3]
print("Gradient of linear activation:", linear_gradient(z))  # Output: [1 1 1]

Linear activation: [ 1 -2  3]
Gradient of linear activation: [1 1 1]


In [2]:
# ReLU activation function
def relu(z):
    """
    Rectified Linear Unit activation: f(z) = max(0, z), applied element-wise.

    Arguments:
    z -- A scalar or numpy array of pre-activation values.

    Returns:
    The element-wise maximum of 0 and z, so negative entries become 0 and
    positive entries pass through unchanged.
    """
    floor = 0  # lower bound every element is compared against
    return np.maximum(floor, z)

# Gradient of ReLU activation function
def relu_gradient(z):
    """
    Computes the gradient of the ReLU activation function.

    Arguments:
    z -- A scalar, numpy array, or array-like (e.g. a list or nested list of
         numbers).

    Returns:
    A numpy array with the same shape as z holding 1 where z > 0 and 0
    elsewhere. The gradient at exactly z == 0 is reported as 0.
    """
    # Coerce first: comparing a plain Python list with `> 0` raises TypeError,
    # whereas relu() and linear_gradient() already accept array-likes.
    # np.asarray returns an existing ndarray as-is, so no copy is made for it.
    z = np.asarray(z)
    return np.where(z > 0, 1, 0)

# Example usage: negative entries are clamped to 0 by relu, and the gradient is
# 1 only at positions where z > 0.
z = np.array([1, -2, 3])
print("ReLU activation:", relu(z))  # Output: [1 0 3]
print("Gradient of ReLU activation:", relu_gradient(z))  # Output: [1 0 1]

ReLU activation: [1 0 3]
Gradient of ReLU activation: [1 0 1]


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define a simple deep network
class DeepNetwork(nn.Module):
    """
    Plain feed-forward network: `num_layers` blocks of Linear -> ReLU followed
    by one final Linear projection.

    Arguments:
    input_size -- Number of features in each input sample.
    hidden_size -- Width of every hidden Linear layer.
    num_layers -- Number of hidden (Linear + ReLU) blocks. With 0 the network
                  is a single Linear layer mapping input_size -> output_size.
    output_size -- Number of features produced by the final Linear layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        layers = []
        # Track the width feeding the next layer in a local instead of
        # overwriting the `input_size` argument.
        in_features = input_size
        for _ in range(num_layers):
            layers.append(nn.Linear(in_features, hidden_size))
            layers.append(nn.ReLU())
            in_features = hidden_size
        # The head consumes whatever width the stack actually ends with, so it
        # also matches the input when there are no hidden blocks
        # (num_layers == 0); hard-coding hidden_size here breaks that case.
        layers.append(nn.Linear(in_features, output_size))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """
        Runs x through the sequential stack.

        Arguments:
        x -- Input tensor whose last dimension has input_size features (the
             example in this notebook feeds shape (batch, input_size)).

        Returns:
        The output of the final Linear layer, with output_size features in
        the last dimension.
        """
        return self.net(x)

# Example architecture
input_size = 10
hidden_size = 64
num_layers = 10
output_size = 1

# Seed PyTorch's global RNG before anything random happens (weight init and the
# dummy data below) so the printed gradient norms are the same on every
# Restart & Run All.
torch.manual_seed(0)

# Create the model
model = DeepNetwork(input_size, hidden_size, num_layers, output_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Dummy data: one fixed batch of 64 random samples and random regression targets
x = torch.randn(64, input_size)
y = torch.randn(64, output_size)

# Training loop: full-batch updates on the same dummy batch every epoch
for epoch in range(100):
    optimizer.zero_grad()

    # Forward pass
    outputs = model(x)
    loss = criterion(outputs, y)

    # Backward pass
    loss.backward()

    # Check gradient norms for exploding/vanishing gradients: global L2 norm
    # over all parameter gradients, measured BEFORE clipping.
    total_norm = 0.0
    for p in model.parameters():
        if p.grad is None:
            # Parameters that received no gradient contribute nothing.
            continue
        param_norm = p.grad.detach().norm(2)
        total_norm += param_norm.item() ** 2
    total_norm = total_norm ** 0.5
    print(f'Epoch {epoch}, Gradient Norm: {total_norm}')

    # Gradient clipping to avoid exploding gradients; gradients are rescaled
    # in place only when their global norm exceeds max_norm.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

    # Optimizer step
    optimizer.step()


Epoch 0, Gradient Norm: 0.4523767410462905
Epoch 1, Gradient Norm: 0.41859387022787214
Epoch 2, Gradient Norm: 0.3879631561111073
Epoch 3, Gradient Norm: 0.35912419756485336
Epoch 4, Gradient Norm: 0.3324897984511198
Epoch 5, Gradient Norm: 0.30137015990803107
Epoch 6, Gradient Norm: 0.2712081427664562
Epoch 7, Gradient Norm: 0.2480034852777202
Epoch 8, Gradient Norm: 0.22443045397307265
Epoch 9, Gradient Norm: 0.20021969507660514
Epoch 10, Gradient Norm: 0.17710474789806716
Epoch 11, Gradient Norm: 0.15368450088441532
Epoch 12, Gradient Norm: 0.13202803219179884
Epoch 13, Gradient Norm: 0.10609386827604012
Epoch 14, Gradient Norm: 0.08347735423949314
Epoch 15, Gradient Norm: 0.06010956516261824
Epoch 16, Gradient Norm: 0.045946435071724655
Epoch 17, Gradient Norm: 0.05076420832922851
Epoch 18, Gradient Norm: 0.04771187647117514
Epoch 19, Gradient Norm: 0.08078144057309919
Epoch 20, Gradient Norm: 0.07998951140627654
Epoch 21, Gradient Norm: 0.097031759145595
Epoch 22, Gradient Norm: 0

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define a densely connected network (like DenseNet)
class DenseConnectedNetwork(nn.Module):
    """
    Fully connected network with DenseNet-style connectivity: every hidden
    layer receives the original input concatenated with the outputs of all
    hidden layers before it, and the output layer sees all of them.

    Arguments:
    input_size -- Number of features in each input sample.
    hidden_size -- Number of features each hidden layer produces.
    num_layers -- Number of hidden layers.
    output_size -- Number of features produced by the output layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.input_size = input_size

        # Hidden layer k is fed the input plus k earlier hidden outputs, so
        # its fan-in grows by hidden_size with each layer.
        fan_ins = [input_size + k * hidden_size for k in range(num_layers)]
        self.layers = nn.ModuleList([nn.Linear(width, hidden_size) for width in fan_ins])

        # The output layer consumes the input and every hidden output.
        total_width = input_size + num_layers * hidden_size
        self.output_layer = nn.Linear(total_width, output_size)

    def forward(self, x):
        """
        Arguments:
        x -- 2-D input tensor; features are concatenated along dim=1, so x
             needs input_size columns.

        Returns:
        The output layer applied to the concatenation of x and all hidden
        activations.
        """
        features = [x]
        for layer in self.layers:
            # Each layer reads everything produced so far, then contributes
            # its own ReLU activation to the pool.
            stacked = torch.cat(features, dim=1)
            features.append(torch.relu(layer(stacked)))
        return self.output_layer(torch.cat(features, dim=1))

# Example architecture
input_size = 10
hidden_size = 64
num_layers = 5
output_size = 1

# Seed PyTorch's global RNG before anything random happens (weight init and the
# dummy data below) so the printed gradient norms are the same on every
# Restart & Run All.
torch.manual_seed(0)

# Create the model
model = DenseConnectedNetwork(input_size, hidden_size, num_layers, output_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Dummy data: one fixed batch of 64 random samples and random regression targets
x = torch.randn(64, input_size)
y = torch.randn(64, output_size)

# Training loop with gradient monitoring: full-batch updates on the same dummy
# batch every epoch
for epoch in range(100):
    optimizer.zero_grad()

    # Forward pass
    outputs = model(x)
    loss = criterion(outputs, y)

    # Backward pass
    loss.backward()

    # Check gradient norms to monitor vanishing/exploding gradients: global L2
    # norm over all parameter gradients, measured BEFORE clipping.
    total_norm = 0.0
    for p in model.parameters():
        if p.grad is None:
            # Parameters that received no gradient contribute nothing.
            continue
        param_norm = p.grad.detach().norm(2)
        total_norm += param_norm.item() ** 2
    total_norm = total_norm ** 0.5
    print(f'Epoch {epoch}, Gradient Norm: {total_norm}')

    # Gradient clipping to avoid exploding gradients; gradients are rescaled
    # in place only when their global norm exceeds max_norm.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

    # Optimizer step
    optimizer.step()


Epoch 0, Gradient Norm: 1.262849431997687
Epoch 1, Gradient Norm: 0.9895228226073582
Epoch 2, Gradient Norm: 0.8545424414365269
Epoch 3, Gradient Norm: 0.7726841997672383
Epoch 4, Gradient Norm: 0.690335037833154
Epoch 5, Gradient Norm: 0.6046125273097865
Epoch 6, Gradient Norm: 0.5332848945094729
Epoch 7, Gradient Norm: 0.49437541775811966
Epoch 8, Gradient Norm: 0.48796102911596906
Epoch 9, Gradient Norm: 0.49088526416259065
Epoch 10, Gradient Norm: 0.4920652225906443
Epoch 11, Gradient Norm: 0.5007111581244583
Epoch 12, Gradient Norm: 0.5161000643457264
Epoch 13, Gradient Norm: 0.5281004086563275
Epoch 14, Gradient Norm: 0.5271068074467375
Epoch 15, Gradient Norm: 0.5189915161137928
Epoch 16, Gradient Norm: 0.5154748716831468
Epoch 17, Gradient Norm: 0.5112056764613432
Epoch 18, Gradient Norm: 0.5002191745376299
Epoch 19, Gradient Norm: 0.4948650537216334
Epoch 20, Gradient Norm: 0.4909484172941085
Epoch 21, Gradient Norm: 0.4787629475970402
Epoch 22, Gradient Norm: 0.46363765556804