In [None]:
import numpy as np
import torch
import torch.optim as optim
import tensorflow as tf
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.datasets import make_regression
import matplotlib.pyplot as plt

In [None]:
# Seed the NumPy, PyTorch and TensorFlow global RNGs so reruns are reproducible.
np.random.seed(42)
torch.manual_seed(42)
tf.random.set_seed(42)

# Toy data drawn from a standard normal: X has shape (10, 3), y has shape (10, 1).
X = np.random.randn(10, 3)
y = np.random.randn(10, 1)

In [None]:
def print_activations(X, W, b, activation='relu'):
    """Compute and print a dense layer's pre- and post-activation values.

    Args:
        X: Input array, multiplied with ``W`` via ``np.dot``.
        W: Weight array.
        b: Bias added to ``np.dot(X, W)``.
        activation: ``'relu'``, ``'sigmoid'`` or ``'tanh'``.

    Returns:
        Tuple ``(pre_activation, post_activation)``.

    Raises:
        ValueError: If ``activation`` is not one of the supported names. The
            pre-activation values have already been printed at that point.
    """
    supported = (
        ('relu', lambda z: np.maximum(0, z)),
        ('sigmoid', lambda z: 1 / (1 + np.exp(-z))),
        ('tanh', lambda z: np.tanh(z)),
    )

    pre_activation = np.dot(X, W) + b
    print("\n1. Pre-activated values:\n", pre_activation)

    # Pick the first entry whose name matches; None means "not supported".
    activate = next((fn for name, fn in supported if activation == name), None)
    if activate is None:
        raise ValueError("Unsupported activation function")

    post_activation = activate(pre_activation)
    print("\nPost-activated values:\n", post_activation)
    return pre_activation, post_activation

In [None]:
# Random weights of shape (3, 4) and bias of shape (4,) for one dense layer applied to X.
W = np.random.randn(3, 4)
b = np.random.randn(4)
# pre_act is reused later as the input to apply_relu_variants.
pre_act, post_act = print_activations(X, W, b, activation='relu')

In [None]:
def check_backprop():
    """Numerically verify autograd gradients of a small tanh MLP.

    Builds a 2-3-1 double-precision network and a random batch of 5 samples,
    then uses ``torch.autograd.gradcheck`` to compare analytical and
    finite-difference gradients of the MSE loss with respect to

    * the input batch, and
    * each trainable parameter of the model.

    The outcome of every check is printed; nothing is returned.
    """
    # functional_call lives in torch.func on PyTorch >= 2.0 and in
    # torch.nn.utils.stateless on 1.12 / 1.13.
    try:
        from torch.func import functional_call
    except ImportError:
        from torch.nn.utils.stateless import functional_call

    print("\n2. Checking backpropagation...")

    model = torch.nn.Sequential(
        torch.nn.Linear(2, 3),
        torch.nn.Tanh(),
        torch.nn.Linear(3, 1)
    ).double()

    X = torch.randn(5, 2, dtype=torch.double, requires_grad=True)
    y = torch.randn(5, 1, dtype=torch.double)

    # Same tolerances for every check; return False instead of raising.
    gradcheck_kwargs = dict(eps=1e-6, atol=1e-4, rtol=1e-4, raise_exception=False)

    test_input = torch.autograd.gradcheck(
        lambda x: torch.nn.functional.mse_loss(model(x), y),
        X,
        **gradcheck_kwargs
    )
    print("Input gradient check passed:", test_input)

    # The batch is a constant while parameters are being checked.
    batch = X.detach()

    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue

        def loss_wrt_param(candidate, _name=name):
            # Run the model with `candidate` substituted for the parameter so
            # the loss depends on `candidate` in the autograd graph. Copying
            # into param.data under no_grad would cut that dependency and
            # make the analytical gradient identically zero.
            output = functional_call(model, {_name: candidate}, (batch,))
            return torch.nn.functional.mse_loss(output, y)

        test_param = torch.autograd.gradcheck(
            loss_wrt_param,
            param.detach().clone().requires_grad_(True),
            **gradcheck_kwargs
        )
        print(f"Parameter {name} gradient check passed:", test_param)

In [None]:
# Run the gradient checks; results are printed by the function.
check_backprop()

In [None]:
def standardize_data(X):
    """Standardize the columns of ``X`` with ``StandardScaler`` and report the result.

    Prints the per-column mean and standard deviation of the transformed
    data and returns the transformed array.
    """
    print("\n3. Standardizing data...")
    X_std = StandardScaler().fit_transform(X)

    column_means = X_std.mean(axis=0)
    column_stds = X_std.std(axis=0)
    print("Mean after standardization:", column_means)
    print("Std after standardization:", column_stds)

    return X_std

In [None]:
# Standardized copy of X; the original X is left untouched.
X_std = standardize_data(X)

In [None]:
def normalize_data(X, feature_range=(10, 50)):
    """Rescale the columns of ``X`` into ``feature_range`` with ``MinMaxScaler``.

    Args:
        X: 2-D array-like accepted by ``MinMaxScaler.fit_transform``.
        feature_range: ``(min, max)`` target range for every column.
            Defaults to ``(10, 50)``, the range this notebook demonstrates.

    Returns:
        The rescaled array. The per-column minimum and maximum are printed
        so the target range can be checked by eye.
    """
    low, high = feature_range
    print(f"\n4. Normalizing data (min={low}, max={high})...")
    scaler = MinMaxScaler(feature_range=(low, high))
    X_norm = scaler.fit_transform(X)
    print("Min after normalization:", X_norm.min(axis=0))
    print("Max after normalization:", X_norm.max(axis=0))
    return X_norm

In [None]:
# Min-max scaled copy of X; the original X is left untouched.
X_norm = normalize_data(X)

In [None]:
def xavier_init(n_input, n_output):
    """Draw Xavier/Glorot-initialized weight matrices of shape (n_input, n_output).

    Two variants are sampled from NumPy's global RNG, in this order:

    * normal with mean 0 and std ``sqrt(2 / (n_input + n_output))``
    * uniform on ``[-limit, limit]`` with ``limit = sqrt(6 / (n_input + n_output))``

    Summary statistics of both matrices are printed.

    Returns:
        Tuple ``(weights_normal, weights_uniform)``.
    """
    print("\n5. Xavier initialization...")
    shape = (n_input, n_output)
    fan_sum = n_input + n_output

    # Normal variant first: the draw order determines the sampled values.
    weights_normal = np.random.normal(0, np.sqrt(2.0 / fan_sum), shape)

    bound = np.sqrt(6.0 / fan_sum)
    weights_uniform = np.random.uniform(-bound, bound, shape)

    print("Xavier Normal weights mean/std:", weights_normal.mean(), weights_normal.std())
    print("Xavier Uniform weights min/max:", weights_uniform.min(), weights_uniform.max())
    return weights_normal, weights_uniform

In [None]:
# Xavier-initialized (3, 4) weight matrices: normal and uniform variants.
weights_normal, weights_uniform = xavier_init(3, 4)

In [None]:
def apply_relu_variants(pre_activation, leaky_slope=0.01, prelu_alpha=0.25, elu_alpha=1.0):
    """Apply ReLU and several of its variants element-wise and print the results.

    Args:
        pre_activation: Array of pre-activation values.
        leaky_slope: Negative-side slope for Leaky ReLU (default 0.01).
        prelu_alpha: Negative-side slope used for the Parametric ReLU
            example (default 0.25). It is a fixed value here, not learned.
        elu_alpha: Scale of the negative branch of ELU (default 1.0).

    Returns:
        Dict with keys ``'relu'``, ``'leaky'``, ``'parametric'``, ``'elu'``
        and ``'swish'``, each mapping to an array shaped like the input.
    """
    # NOTE(review): the printed label reuses "5.", which xavier_init also
    # prints. Confirm the intended section numbering.
    print("\n5. Applying ReLU variants...")
    pre_activation = np.asarray(pre_activation)
    positive = pre_activation > 0

    relu = np.maximum(0, pre_activation)

    leaky = np.where(positive, pre_activation, pre_activation * leaky_slope)

    parametric = np.where(positive, pre_activation, pre_activation * prelu_alpha)

    # np.where evaluates both branches, so clamp the argument to <= 0 to keep
    # exp from overflowing on large positive inputs. expm1 also keeps
    # precision for values very close to zero.
    elu = np.where(positive, pre_activation,
                   elu_alpha * np.expm1(np.minimum(pre_activation, 0)))

    # Overflow-free sigmoid: exp is only ever taken of a non-positive number.
    decay = np.exp(-np.abs(pre_activation))
    sigmoid = np.where(pre_activation >= 0, 1 / (1 + decay), decay / (1 + decay))
    swish = pre_activation * sigmoid

    print("Original values:\n", pre_activation)
    print("\nReLU:\n", relu)
    print("\nLeaky ReLU:\n", leaky)
    print("\nParametric ReLU:\n", parametric)
    print("\nELU:\n", elu)
    print("\nSwish:\n", swish)

    return {'relu': relu, 'leaky': leaky, 'parametric': parametric, 'elu': elu, 'swish': swish}

In [None]:
# Apply every variant to the layer pre-activations computed earlier.
relu_results = apply_relu_variants(pre_act)

In [None]:
def learning_rate_decay(initial_lr=0.1, decay_type='exponential', steps=100, decay_rate=None):
    """Compute and plot a learning-rate schedule.

    The decay period is ``steps / 10``, so the schedule spans ten decay
    periods over the plotted range.

    * ``'exponential'``:  ``lr = initial_lr * decay_rate ** (step / period)``
    * ``'inverse_time'``: ``lr = initial_lr / (1 + decay_rate * step / period)``

    Args:
        initial_lr: Learning rate at step 0.
        decay_type: ``'exponential'`` or ``'inverse_time'``.
        steps: Number of steps to evaluate.
        decay_rate: Decay factor. ``None`` selects 0.96 for exponential decay
            and 1.0 for inverse-time decay.

    Returns:
        List of learning rates, one per step.

    Raises:
        ValueError: If ``decay_type`` is not supported. This is checked before
            any value is computed, so it also fires when ``steps`` is 0.
    """
    print(f"\n6. {decay_type} learning rate decay over {steps} steps:")

    # Validate up front: inside the loop the check is skipped when steps <= 0.
    if decay_type == 'exponential':
        rate = 0.96 if decay_rate is None else decay_rate
    elif decay_type == 'inverse_time':
        rate = 1.0 if decay_rate is None else decay_rate
    else:
        raise ValueError("Unsupported decay type")

    decay_period = steps / 10
    if decay_type == 'exponential':
        lr_values = [initial_lr * rate ** (step / decay_period) for step in range(steps)]
    else:
        lr_values = [initial_lr / (1 + rate * step / decay_period) for step in range(steps)]

    plt.figure(figsize=(10, 5))
    plt.plot(lr_values)
    plt.title(f"{decay_type} Learning Rate Decay")
    plt.xlabel("Step")
    plt.ylabel("Learning Rate")
    plt.grid()
    plt.show()

    return lr_values

In [None]:
# Plot both supported schedules with the default initial rate and step count.
lr_exp = learning_rate_decay(decay_type='exponential')
lr_inv = learning_rate_decay(decay_type='inverse_time')

In [None]:
def compare_optimizers(n_epochs=200, lr=0.01):
    """Train the same small MLP with several optimizers and plot the loss curves.

    A synthetic regression problem (100 samples, 5 features) is fitted with
    full-batch gradient steps by a 5-10-1 ReLU network. Every optimizer starts
    from an identical copy of one initial model, so differences between curves
    come from the optimizer and not from the random initialization.

    Args:
        n_epochs: Number of full-batch updates per optimizer (default 200).
        lr: Learning rate passed to every optimizer (default 0.01).

    Returns:
        Dict mapping optimizer name to its list of per-epoch MSE losses.
    """
    import copy  # local import: only used to clone the shared initial model

    print("\n7. Comparing optimizers...")

    X, y = make_regression(n_samples=100, n_features=5, noise=0.1, random_state=42)
    X = X.astype(np.float32)
    y = y.astype(np.float32).reshape(-1, 1)

    X_tensor = torch.tensor(X, dtype=torch.float32)
    y_tensor = torch.tensor(y, dtype=torch.float32)

    # One randomly initialized model; each optimizer trains its own deep copy.
    initial_model = torch.nn.Sequential(
        torch.nn.Linear(5, 10),
        torch.nn.ReLU(),
        torch.nn.Linear(10, 1)
    )

    optimizers = {
        'SGD': optim.SGD,
        'Adagrad': optim.Adagrad,
        'RMSprop': optim.RMSprop,
        'Adam': optim.Adam,
        'Adadelta': optim.Adadelta
    }

    results = {}

    plt.figure(figsize=(12, 6))

    for name, opt_class in optimizers.items():
        model = copy.deepcopy(initial_model)
        optimizer = opt_class(model.parameters(), lr=lr)

        losses = []
        for _ in range(n_epochs):
            optimizer.zero_grad()
            y_pred = model(X_tensor)
            loss = torch.nn.functional.mse_loss(y_pred, y_tensor)
            loss.backward()
            optimizer.step()
            # The recorded loss is the one measured before this update.
            losses.append(loss.item())

        results[name] = losses
        plt.plot(losses, label=name)
        print(f"{name}: Final loss = {losses[-1]:.4f}")

    plt.title("Optimizer Comparison")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.grid()
    plt.show()

    return results

In [None]:
# Per-optimizer loss histories, keyed by optimizer name.
optimizer_results = compare_optimizers()

In [None]:
def clip_gradients(gradients, method='value', threshold=1.0):
    """Clip a list of gradient arrays and print norms before and after.

    Args:
        gradients: Iterable of NumPy arrays.
        method: ``'value'`` clamps every element to ``[-threshold, threshold]``.
            ``'norm'`` rescales all arrays by one common factor so that the
            global L2 norm over all arrays does not exceed ``threshold``.
        threshold: Clipping threshold.

    Returns:
        List of clipped arrays, in the same order as ``gradients``.

    Raises:
        ValueError: If ``method`` is neither ``'value'`` nor ``'norm'``.
    """
    print(f"\n8. Gradient clipping ({method} with threshold={threshold})...")

    if method not in ('value', 'norm'):
        raise ValueError("Unsupported clipping method")

    if method == 'value':
        clipped = []
        for grad in gradients:
            clipped.append(np.clip(grad, -threshold, threshold))
    else:
        squared_total = sum(np.sum(grad**2) for grad in gradients)
        global_norm = np.sqrt(squared_total)
        # The factor is 1 when the global norm is already within the threshold.
        scale = threshold / max(global_norm, threshold)
        clipped = []
        for grad in gradients:
            clipped.append(grad * scale)

    norms_before = [np.linalg.norm(grad) for grad in gradients]
    norms_after = [np.linalg.norm(grad) for grad in clipped]
    print("Original gradient norms:", norms_before)
    print("Clipped gradient norms:", norms_after)

    return clipped

In [None]:
# Two random arrays of shapes (3, 4) and (4, 1) used as example gradients.
gradients = [np.random.randn(3,4), np.random.randn(4,1)]
# Clip the same gradients with both supported methods for comparison.
clipped_value = clip_gradients(gradients, method='value', threshold=1.0)
clipped_norm = clip_gradients(gradients, method='norm', threshold=1.0)

In [None]:
def hessian_free_optimization():
    """Demonstrate one Hessian-free (Newton-CG) step on a quadratic.

    Uses ``f(x) = x^T diag(1, 2, 3) x`` at a random double-precision point.
    The Newton direction ``v`` solving ``H v = -grad`` is found with conjugate
    gradient, using only Hessian-vector products from autograd. The Hessian
    is never formed. For this quadratic ``grad = H x``, so the exact answer
    is ``v = -x``.

    The solution is printed; nothing is returned.
    """
    print("\n9. Hessian-Free Optimization...")

    print("PyTorch implementation:")
    x = torch.randn(3, requires_grad=True, dtype=torch.double)

    def f(x):
        return x @ torch.diag(torch.tensor([1.0, 2.0, 3.0], dtype=torch.double)) @ x

    # create_graph=True keeps the gradient differentiable so hvp() can
    # differentiate it a second time.
    grad = torch.autograd.grad(f(x), x, create_graph=True)[0]

    def hvp(v):
        # v must be a constant here. If v carried a graph back to x (as -grad
        # does), autograd would also differentiate through v and return
        # H v + (dv/dx)^T grad instead of H v.
        return torch.autograd.grad(grad @ v.detach(), x, retain_graph=True)[0]

    def cg_solve(A, b, max_iter=10, tol=1e-6):
        """Solve A(x) = b by conjugate gradient, with A given as a mat-vec callable."""
        x = torch.zeros_like(b)
        r = b - A(x)
        p = r.clone()
        rsold = r @ r

        # Already solved (e.g. zero gradient): avoid the 0/0 step size below.
        if torch.sqrt(rsold) < tol:
            return x

        for i in range(max_iter):
            Ap = A(p)
            alpha = rsold / (p @ Ap)
            x = x + alpha * p
            r = r - alpha * Ap
            rsnew = r @ r
            if torch.sqrt(rsnew) < tol:
                break
            p = r + (rsnew / rsold) * p
            rsold = rsnew

        return x

    # Detach the right-hand side so the CG iterates are plain tensors.
    v = cg_solve(hvp, -grad.detach())
    print("Solution v:", v.detach().numpy())

In [None]:
# Run the Newton-CG demonstration; the solution is printed by the function.
hessian_free_optimization()

In [None]:
def hessian_analysis(f=None, point=None, eps=1e-5, tol=1e-4):
    """Classify a critical point from a finite-difference Hessian.

    The Hessian is estimated with nested central differences, symmetrized,
    and its eigenvalues decide the classification (second-derivative test).

    Args:
        f: Scalar function of a 1-D array. Defaults to
            ``x[0]**2 + x[1]**3 - x[1]**2``.
        point: Critical point to analyse. Defaults to ``[0, 2/3]``.
        eps: Finite-difference step.
        tol: Eigenvalues with absolute value at or below ``tol`` are treated
            as zero, because the finite-difference estimate is noisy.

    Prints the Hessian, its eigenvalues (ascending) and the conclusion.
    Nothing is returned.
    """
    print("\n10. Hessian matrix analysis...")

    if f is None:
        def f(x):
            return x[0]**2 + x[1]**3 - x[1]**2
    if point is None:
        point = [0, 2/3]

    critical_point = np.array(point, dtype=float)
    n = critical_point.size

    def unit_step(k):
        # Vector that moves only coordinate k by eps.
        step = np.zeros(n)
        step[k] = eps
        return step

    def partial_derivative(x, j):
        # Central difference of f along coordinate j at x.
        step = unit_step(j)
        return (f(x + step) - f(x - step)) / (2 * eps)

    hessian = np.zeros((n, n))
    for i in range(n):
        step_i = unit_step(i)
        for j in range(n):
            hessian[i, j] = (
                partial_derivative(critical_point + step_i, j)
                - partial_derivative(critical_point - step_i, j)
            ) / (2 * eps)

    # Round-off can leave the estimate slightly asymmetric. Symmetrizing
    # guarantees real eigenvalues and lets us use the symmetric solver.
    hessian = (hessian + hessian.T) / 2

    print("Hessian matrix:\n", hessian)

    eigenvalues = np.linalg.eigvalsh(hessian)
    print("Eigenvalues:", eigenvalues)

    has_positive = bool(np.any(eigenvalues > tol))
    has_negative = bool(np.any(eigenvalues < -tol))

    if has_positive and has_negative:
        print("Conclusion: Saddle point")
    elif np.all(eigenvalues > tol):
        print("Conclusion: Local minimum")
    elif np.all(eigenvalues < -tol):
        print("Conclusion: Local maximum")
    else:
        # A (near-)zero eigenvalue: the second-derivative test cannot decide.
        print("Conclusion: Inconclusive (zero eigenvalue within tolerance)")

In [None]:
# Classify the critical point hard-coded in hessian_analysis; the result is printed.
hessian_analysis()