In [36]:
import numpy as np
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

transform = transforms.ToTensor()

train_dataset = datasets.FashionMNIST(root='./data', train=True, transform=transform, download = True)
test_dataset = datasets.FashionMNIST(root='./data', train=False, transform=transform, download = True)

d(loss)/d(weights) = d(loss)/d(output) * d(output)/d(weights) = d_output * d(output)/d(weights)

As output = weights * input
d(output)/d(weights) = input

So,
d(loss)/d(weights) = d_output * input

similarly for bias term. 
d(loss)/d(bias) = d_output * 1

In [68]:
class DenseLayer:
    def __init__(self, input_dim, output_dim):
        # Initialize weights with small random values and biases with zeros
        self.weights = np.random.randn(input_dim, output_dim) * 0.01
        self.biases = np.zeros((1, output_dim))

        self.d_biases = None
        self.d_weights = None
    
    def forward(self, inputs):
        # Perform the forward pass
        self.inputs = inputs
        self.output = np.dot(inputs, self.weights) + self.biases
        return self.output
    
    def backward(self, d_output):
        # Compute the gradient for weights, biases, and inputs
        self.d_weights = np.dot(self.inputs.T, d_output)
        self.d_biases = np.sum(d_output, axis=0, keepdims=True)
        d_inputs = np.dot(d_output, self.weights.T)
        
        # # Update weights and biases
        # self.weights -= learning_rate * d_weights
        # self.biases -= learning_rate * d_biases
        
        return d_inputs

In [66]:
class BatchNormalization:
    def __init__(self, num_features, momentum=0.9, epsilon=1e-5):
        self.gamma = np.ones(num_features)  # Scale parameter
        self.beta = np.zeros(num_features)   # Shift parameter
        self.momentum = momentum
        self.epsilon = epsilon

        # Running averages for inference
        self.running_mean = np.zeros(num_features)
        self.running_var = np.ones(num_features)

        # To store intermediate values during forward pass
        self.input = None
        self.mean = None
        self.var = None

    def forward(self, x, training=True):
        if training:
            self.mean = np.mean(x, axis=0)
            self.var = np.var(x, axis=0)
            self.input = x  # Save input for backward pass

            # Normalize the input
            x_normalized = (x - self.mean) / np.sqrt(self.var + self.epsilon)

            # Update running averages
            self.running_mean = self.momentum * self.running_mean + (1 - self.momentum) * self.mean
            self.running_var = self.momentum * self.running_var + (1 - self.momentum) * self.var

            # Scale and shift
            return self.gamma * x_normalized + self.beta
        else:
            # During inference, use running averages
            x_normalized = (x - self.running_mean) / np.sqrt(self.running_var + self.epsilon)
            return self.gamma * x_normalized + self.beta

    def backward(self, d_output):
        N, D = self.input.shape  # Use stored input from forward pass

        # Calculate gradients
        dx_normalized = d_output * self.gamma

        # Gradients for gamma and beta
        dgamma = np.sum(d_output * (self.input - self.mean) / np.sqrt(self.var + self.epsilon), axis=0)
        dbeta = np.sum(d_output, axis=0)

        # Gradients for variance and mean
        dvar = np.sum(dx_normalized * (self.input - self.mean) * -0.5 * np.power(self.var + self.epsilon, -1.5), axis=0)
        dmean = np.sum(dx_normalized * -1 / np.sqrt(self.var + self.epsilon), axis=0) + dvar * np.mean(-2 * (self.input - self.mean), axis=0)

        # Gradients for input
        dx = (dx_normalized / np.sqrt(self.var + self.epsilon)) + (dvar * 2 * (self.input - self.mean) / N) + (dmean / N)

        # Store gradients for the optimizer
        self.d_gamma = dgamma
        self.d_beta = dbeta

        return dx


In [27]:
class ReLU:
    def __init__(self):
        self.cache = None

    # Forward pass
    def forward(self, X):
        self.cache = X
        return np.maximum(0, X)

    # Backward pass
    def backward(self, dout):
        # Gradient of ReLU: 1 where input > 0, else 0
        dX = dout * (self.cache > 0)
        return dX

In [29]:
import numpy as np

class Dropout:
    def __init__(self, dropout_rate=0.5):
        self.dropout_rate = dropout_rate
        self.mask = None

    # Forward pass
    def forward(self, X, training=True):
        if training:
            # Create a mask where each element is 0 with probability `dropout_rate`
            self.mask = (np.random.rand(*X.shape) > self.dropout_rate) / (1 - self.dropout_rate)
            # Apply the mask to the input
            return X * self.mask
        else:
            # During inference, do not apply dropout; use the original values
            return X

    # Backward pass
    def backward(self, dout):
        # During backpropagation, only propagate gradients through the neurons that were not dropped out
        return dout * self.mask


In [71]:
class Adam:
    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = {}
        self.v = {}
        self.t = 0

    def update(self, w, dw, id):
        if id not in self.m:
            self.m[id] = np.zeros_like(w)
            self.v[id] = np.zeros_like(w)

        self.t += 1
        self.m[id] = self.beta1 * self.m[id] + (1 - self.beta1) * dw
        self.v[id] = self.beta2 * self.v[id] + (1 - self.beta2) * (dw ** 2)

        m_hat = self.m[id] / (1 - self.beta1 ** self.t)
        v_hat = self.v[id] / (1 - self.beta2 ** self.t)

        w -= self.learning_rate * m_hat / (np.sqrt(v_hat) + self.epsilon)
        return w


In [31]:
class Softmax:
    def __init__(self):
        self.output = None

    def forward(self, logits):
        # Shift the logits by subtracting the max value for numerical stability
        exp_values = np.exp(logits - np.max(logits, axis=1, keepdims=True))
        # Normalize by dividing by the sum of exponentials along each row
        probabilities = exp_values / np.sum(exp_values, axis=1, keepdims=True)
        self.output = probabilities
        return probabilities

    def backward(self, dL_dout):
        # Gradient of the loss with respect to the input logits
        dL_dz = self.output * (dL_dout - np.sum(dL_dout * self.output, axis=1, keepdims=True))
        return dL_dz

In [45]:
def cross_entropy_loss(predictions, targets):
        # One-hot encode targets
        targets_one_hot = np.zeros_like(predictions)
        targets_one_hot[np.arange(len(targets)), targets] = 1
        return -np.mean(np.sum(targets_one_hot * np.log(predictions + 1e-12), axis=1))

def derivative_cross_entropy(predictions, targets):
        # One-hot encode targets
        targets_one_hot = np.zeros_like(predictions)
        targets_one_hot[np.arange(len(targets)), targets] = 1
        return predictions - targets_one_hot

In [77]:
class FeedForwardNetwork:
    def __init__(self, layers, optimizer, epochs=10, batch_size=64):
        self.layers = layers
        self.optimizer = optimizer
        self.epochs = epochs
        self.batch_size = batch_size
        # self.learning_rate = learning_rate

    def train(self, X_train, y_train):
            for epoch in range(self.epochs):
                # Shuffle the training data
                indices = np.arange(X_train.shape[0])
                np.random.shuffle(indices)
                X_train = X_train[indices]
                y_train = y_train[indices]

                for i in range(0, X_train.shape[0], self.batch_size):
                    X_batch = X_train[i:i + self.batch_size]
                    y_batch = y_train[i:i + self.batch_size]

                    # Forward pass through the network
                    input_data = X_batch
                    for layer in self.layers:
                        if isinstance(layer, Dropout):
                            input_data = layer.forward(input_data, training=True)
                        else:
                            input_data = layer.forward(input_data)

                    # Calculate loss and backward pass
                    loss = cross_entropy_loss(input_data, y_batch)
                    dL_dout = derivative_cross_entropy(input_data, y_batch)

                    # Backpropagation
                    for layer in reversed(self.layers):
                        dL_dout = layer.backward(dL_dout)

                    # Update weights using Adam
                    for layer in self.layers:
                        if hasattr(layer, 'weights'):
                            layer.weights = self.optimizer.update(layer.weights, layer.d_weights, id(layer.weights))
                            layer.biases = self.optimizer.update(layer.biases, layer.d_biases, id(layer.biases))

                print(f'Epoch {epoch + 1}/{self.epochs}, Loss: {loss:.4f}')


    def test(self, X_test, y_test):
            correct_predictions = 0
            total_samples = X_test.shape[0]

            # Forward pass through the network for the test set
            input_data = X_test
            for layer in self.layers:
                input_data = layer.forward(input_data)

            # Get predictions by taking the argmax of the output
            predictions = np.argmax(input_data, axis=1)

            # Compare predictions with true labels
            correct_predictions = np.sum(predictions == y_test)

            # Calculate accuracy
            accuracy = correct_predictions / total_samples
            print(f'Test Accuracy: {accuracy * 100:.2f}%')

In [76]:
layers = [
    DenseLayer(784, 128),
    ReLU(),
    BatchNormalization(128),
    Dropout(0.5),
    DenseLayer(128, 64),
    ReLU(),
    BatchNormalization(64),
    Dropout(0.5),
    DenseLayer(64, 32),
    ReLU(),
    BatchNormalization(32),
    Dropout(0.5),
    DenseLayer(32, 10),
    Softmax()
]

# Initialize Adam optimizer
adam = Adam(learning_rate=0.001)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


# Prepare training data
X_train, y_train = next(iter(train_loader))
X_train = X_train.numpy().reshape(-1, 28*28)  # Flatten the images
y_train = y_train.numpy()


X_test, y_test = next(iter(test_loader))
X_test = X_test.numpy().reshape(-1, 28 * 28)  # Flatten the images
y_test = y_test.numpy()



# Create and train the network
network = FeedForwardNetwork(layers=layers, optimizer=adam, epochs=1000, batch_size=64)
network.train(X_train, y_train)

network.test(X_test, y_test)

Epoch 1/1000, Loss: 2.3042
Epoch 2/1000, Loss: 2.2661
Epoch 3/1000, Loss: 2.2723
Epoch 4/1000, Loss: 2.2781
Epoch 5/1000, Loss: 2.2529
Epoch 6/1000, Loss: 2.2164
Epoch 7/1000, Loss: 2.2073
Epoch 8/1000, Loss: 2.1857
Epoch 9/1000, Loss: 2.1900
Epoch 10/1000, Loss: 2.1566
Epoch 11/1000, Loss: 2.0972
Epoch 12/1000, Loss: 2.0830
Epoch 13/1000, Loss: 2.0775
Epoch 14/1000, Loss: 2.0438
Epoch 15/1000, Loss: 2.0060
Epoch 16/1000, Loss: 1.9617
Epoch 17/1000, Loss: 1.9361
Epoch 18/1000, Loss: 1.8998
Epoch 19/1000, Loss: 1.8649
Epoch 20/1000, Loss: 1.8295
Epoch 21/1000, Loss: 1.7551
Epoch 22/1000, Loss: 1.7670
Epoch 23/1000, Loss: 1.7340
Epoch 24/1000, Loss: 1.7225
Epoch 25/1000, Loss: 1.6671
Epoch 26/1000, Loss: 1.6029
Epoch 27/1000, Loss: 1.6142
Epoch 28/1000, Loss: 1.5984
Epoch 29/1000, Loss: 1.5715
Epoch 30/1000, Loss: 1.5139
Epoch 31/1000, Loss: 1.4705
Epoch 32/1000, Loss: 1.3676
Epoch 33/1000, Loss: 1.3826
Epoch 34/1000, Loss: 1.3619
Epoch 35/1000, Loss: 1.3194
Epoch 36/1000, Loss: 1.3356
E