In [191]:
import numpy as np
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

transform = transforms.ToTensor()

train_dataset = datasets.FashionMNIST(root='./data', train=True, transform=transform, download = True)
test_dataset = datasets.FashionMNIST(root='./data', train=False, transform=transform, download = True)

In [192]:
class Dense:
    def __init__(self, input_dim, output_dim):
        self.weights = np.random.randn(input_dim, output_dim).astype(np.float64)
        self.biases = np.zeros((1, output_dim)).astype(np.float64)

    def forward(self, inputs):
        self.inputs = inputs
        return np.dot(inputs, self.weights) + self.biases

    def backward(self, d_output):
        # Compute gradients for weights and biases
        self.d_weights = np.dot(self.inputs.T, d_output)  # Shape: (input_dim, output_dim)
        self.d_biases = np.sum(d_output, axis=0, keepdims=True)  # Ensure shape: (1, output_dim)
        return np.dot(d_output, self.weights.T)


In [193]:
class BatchNormalization:
    def __init__(self, epsilon=1e-5, momentum=0.9):
        self.epsilon = epsilon
        self.momentum = momentum
        self.running_mean = 0
        self.running_var = 0

    def forward(self, inputs):
        self.inputs = inputs  # Store inputs for use in the backward pass
        self.mean = np.mean(inputs, axis=0)
        self.var = np.var(inputs, axis=0)
        self.normalized = (inputs - self.mean) / np.sqrt(self.var + self.epsilon)
        return self.normalized

    def backward(self, d_output):
        N, D = d_output.shape
        d_normalized = d_output / np.sqrt(self.var + self.epsilon)
        d_var = np.sum(d_output * (self.inputs - self.mean) * -0.5 * (self.var + self.epsilon)**(-1.5), axis=0)
        d_mean = np.sum(d_output * -1 / np.sqrt(self.var + self.epsilon), axis=0) + d_var * np.mean(-2 * (self.inputs - self.mean), axis=0)
        d_input = (d_normalized + d_var * 2 * (self.inputs - self.mean) / N + d_mean / N)
        return d_input


In [194]:
class ReLU:
    def forward(self, inputs):
        self.inputs = inputs
        return np.maximum(0, inputs)

    def backward(self, d_output):
        return d_output * (self.inputs > 0)


In [195]:
class Dropout:
    def __init__(self, rate):
        self.rate = rate

    def forward(self, inputs, training=True):
        if training:
            self.mask = np.random.binomial(1, 1 - self.rate, size=inputs.shape) / (1 - self.rate)
            return inputs * self.mask
        return inputs

    def backward(self, d_output):
        return d_output * self.mask


In [196]:
class Softmax:
    def forward(self, inputs):
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))
        return exp_values / np.sum(exp_values, axis=1, keepdims=True)

    def backward(self, predictions, labels):
        m = labels.shape[0]
        grad = predictions.copy()
        grad[range(m), labels] -= 1
        grad /= m
        return grad


In [197]:
class Adam:
    def __init__(self, learning_rate=0.0001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = {}  # Moving averages of gradients
        self.v = {}  # Moving averages of squared gradients
        self.t = 0   # Time step

    def update(self, param_name, param, grad):
        # Initialize m and v if not already done
        if param_name not in self.m:
            self.m[param_name] = np.zeros_like(grad)
            self.v[param_name] = np.zeros_like(grad)

        # Increment time step
        self.t += 1

        # Update biased first moment estimate
        self.m[param_name] = self.beta1 * self.m[param_name] + (1 - self.beta1) * grad

        # Update biased second raw moment estimate
        self.v[param_name] = self.beta2 * self.v[param_name] + (1 - self.beta2) * (grad ** 2)

        # Correct bias in first and second moment estimates
        m_hat = self.m[param_name] / (1 - self.beta1 ** self.t)
        v_hat = self.v[param_name] / (1 - self.beta2 ** self.t)

        # Update parameters
        update_value = self.learning_rate * m_hat / (np.sqrt(v_hat) + self.epsilon)
        return update_value



In [198]:
class NeuralNetwork:
    def __init__(self, architecture):
        self.layers = []
        self.build(architecture)

    def build(self, architecture):
        for layer in architecture:
            if layer['type'] == 'Dense':
                self.layers.append(Dense(layer['input_dim'], layer['output_dim']))
            elif layer['type'] == 'BatchNormalization':
                self.layers.append(BatchNormalization(layer['dim']))
            elif layer['type'] == 'ReLU':
                self.layers.append(ReLU())
            elif layer['type'] == 'Dropout':
                self.layers.append(Dropout(layer['rate']))
            elif layer['type'] == 'Softmax':
                self.layers.append(Softmax())
        
        self.optimizer = Adam()

    def forward(self, inputs):
        for layer in self.layers:
            inputs = layer.forward(inputs)
        return inputs
    
    
    def backward(self, d_output, labels=None):
        for layer in reversed(self.layers):
            if hasattr(layer, 'backward'):
                if isinstance(layer, Softmax) and labels is not None:
                    d_output = layer.backward(d_output, labels)  # Pass labels for softmax
                else:
                    d_output = layer.backward(d_output)  # No need for labels in other layers


    def update(self):
        for idx, layer in enumerate(self.layers):
            if isinstance(layer, Dense):
                # Create unique names for weights and biases
                weight_name = f"layer_{idx}_weights"
                bias_name = f"layer_{idx}_biases"
                
                # Print statements to confirm shapes (for debugging)
                # print(f"Updating weights: {layer.weights.shape}, d_weights: {layer.d_weights.shape}")
                # print(f"Updating biases: {layer.biases.shape}, d_biases: {layer.d_biases.shape}")
                
                # Correctly pass parameters to the optimizer, including the gradient
                layer.weights -= self.optimizer.update(weight_name, layer.weights, layer.d_weights)
                layer.biases -= self.optimizer.update(bias_name, layer.biases, layer.d_biases)


    def train(self, X, y, epochs, batch_size):
        for epoch in range(epochs):
            for i in range(0, len(X), batch_size):
                X_batch = X[i:i + batch_size]
                y_batch = y[i:i + batch_size]
                
                # Forward pass
                predictions = self.forward(X_batch)

                # Compute loss (e.g., categorical cross-entropy)
                loss = -np.mean(np.log(predictions[range(len(y_batch)), y_batch]))

                # Backward pass
                d_output = self.layers[-1].backward(predictions, y_batch)  # Use true labels here
                self.backward(d_output, y_batch)

                # Update weights
                self.update()

                if i % (batch_size * 10) == 0:  # Print every 10 batches
                    print(f"Epoch {epoch + 1}, Batch {i // batch_size}: Loss = {loss:.4f}")



    def test(self, X_test, y_test):
        predictions = self.forward(X_test)
        predicted_classes = np.argmax(predictions, axis=1)  # Get the index of the max log-probability
        accuracy = np.mean(predicted_classes == y_test)  # Calculate accuracy
        print(f"Test Accuracy: {accuracy * 100:.2f}%")


In [199]:
if __name__ == "__main__":
    # Load your FMNIST dataset here
    # X_train, y_train = ...

    architecture = [
        {'type': 'Dense', 'input_dim': 784, 'output_dim': 256},
        {'type': 'BatchNormalization', 'dim': 256},
        {'type': 'ReLU'},
        {'type': 'Dropout', 'rate': 0.2},
        {'type': 'Dense', 'input_dim': 256, 'output_dim': 128},
        {'type': 'BatchNormalization', 'dim': 128},
        {'type': 'ReLU'},
        {'type': 'Dropout', 'rate': 0.2},
        {'type': 'Dense', 'input_dim': 128, 'output_dim': 10},
        {'type': 'Softmax'},
    ]

    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


    # Prepare training data
    X_train, y_train = next(iter(train_loader))
    X_train = X_train.numpy().reshape(-1, 28*28)  # Flatten the images
    y_train = y_train.numpy()


    X_test, y_test = next(iter(test_loader))
    X_test = X_test.numpy().reshape(-1, 28 * 28)  # Flatten the images
    y_test = y_test.numpy()

    X_train = X_train.astype(np.float32) / 255.0
    X_test = X_test.astype(np.float32) / 255.0

    
    model = NeuralNetwork(architecture)
    model.train(X_train, y_train, epochs=50, batch_size=32)
    model.test(X_test, y_test)


Epoch 1, Batch 0: Loss = 2.3025
Epoch 2, Batch 0: Loss = 2.3007
Epoch 3, Batch 0: Loss = 2.2985
Epoch 4, Batch 0: Loss = 2.2980
Epoch 5, Batch 0: Loss = 2.2965
Epoch 6, Batch 0: Loss = 2.2951
Epoch 7, Batch 0: Loss = 2.2885
Epoch 8, Batch 0: Loss = 2.2860
Epoch 9, Batch 0: Loss = 2.2839
Epoch 10, Batch 0: Loss = 2.2830
Epoch 11, Batch 0: Loss = 2.2776
Epoch 12, Batch 0: Loss = 2.2747
Epoch 13, Batch 0: Loss = 2.2730
Epoch 14, Batch 0: Loss = 2.2649
Epoch 15, Batch 0: Loss = 2.2641
Epoch 16, Batch 0: Loss = 2.2629
Epoch 17, Batch 0: Loss = 2.2584
Epoch 18, Batch 0: Loss = 2.2567
Epoch 19, Batch 0: Loss = 2.2510
Epoch 20, Batch 0: Loss = 2.2478
Epoch 21, Batch 0: Loss = 2.2401
Epoch 22, Batch 0: Loss = 2.2373
Epoch 23, Batch 0: Loss = 2.2301
Epoch 24, Batch 0: Loss = 2.2291
Epoch 25, Batch 0: Loss = 2.2189
Epoch 26, Batch 0: Loss = 2.2197
Epoch 27, Batch 0: Loss = 2.2124
Epoch 28, Batch 0: Loss = 2.2062
Epoch 29, Batch 0: Loss = 2.1990
Epoch 30, Batch 0: Loss = 2.1978
Epoch 31, Batch 0: 