In [8]:
import numpy as np

class DenseLayer:
    def __init__(self, input_dim, output_dim):
        # Initialize weights and biases
        self.weights = np.random.randn(input_dim, output_dim) * 0.01
        self.bias = np.zeros((1, output_dim))

    def forward(self, X):
        # Linear forward pass
        self.input = X
        self.output = np.dot(X, self.weights) + self.bias
        return self.output

    def backward(self, d_out, learning_rate):
        # Compute gradients for weights and bias
        d_weights = np.dot(self.input.T, d_out)
        d_bias = np.sum(d_out, axis=0, keepdims=True)
        d_input = np.dot(d_out, self.weights.T)

        # Update weights and biases
        self.weights -= learning_rate * d_weights
        self.bias -= learning_rate * d_bias

        return d_input


In [9]:
class ReLU:
    def forward(self, X):
        # ReLU forward pass
        self.input = X
        return np.maximum(0, X)

    def backward(self, d_out):
        # Gradient for ReLU
        return d_out * (self.input > 0)


In [10]:
class BatchNormalization:
    def __init__(self, dim, epsilon=1e-5, momentum=0.9):
        self.epsilon = epsilon
        self.momentum = momentum
        self.gamma = np.ones((1, dim))
        self.beta = np.zeros((1, dim))
        self.running_mean = np.zeros((1, dim))
        self.running_var = np.zeros((1, dim))

    def forward(self, X, training=True):
        self.input = X  # Store the input for use in backward pass
        if training:
            self.mean = np.mean(X, axis=0)
            self.var = np.var(X, axis=0)
            self.X_normalized = (X - self.mean) / np.sqrt(self.var + self.epsilon)
            self.output = self.gamma * self.X_normalized + self.beta
            # Update running mean and variance
            self.running_mean = self.momentum * self.running_mean + (1 - self.momentum) * self.mean
            self.running_var = self.momentum * self.running_var + (1 - self.momentum) * self.var
        else:
            # Use running mean and variance during inference
            self.X_normalized = (X - self.running_mean) / np.sqrt(self.running_var + self.epsilon)
            self.output = self.gamma * self.X_normalized + self.beta
        return self.output

    def backward(self, d_out, learning_rate):
        # Gradient computation for batch normalization
        N, D = d_out.shape
        X_mu = self.input - self.mean
        std_inv = 1. / np.sqrt(self.var + self.epsilon)

        dX_norm = d_out * self.gamma
        dvar = np.sum(dX_norm * X_mu, axis=0) * -0.5 * std_inv**3
        dmean = np.sum(dX_norm * -std_inv, axis=0) + dvar * np.mean(-2. * X_mu, axis=0)

        dX = (dX_norm * std_inv) + (dvar * 2 * X_mu / N) + (dmean / N)
        dgamma = np.sum(d_out * self.X_normalized, axis=0)
        dbeta = np.sum(d_out, axis=0)

        # Update gamma and beta
        self.gamma -= learning_rate * dgamma
        self.beta -= learning_rate * dbeta

        return dX


In [11]:
class Dropout:
    def __init__(self, dropout_rate):
        self.dropout_rate = dropout_rate

    def forward(self, X, training=True):
        if training:
            # Create dropout mask
            self.mask = np.random.rand(*X.shape) > self.dropout_rate
            return X * self.mask / (1 - self.dropout_rate)
        else:
            # During inference, do nothing
            return X

    def backward(self, d_out):
        # Apply dropout mask to gradient
        return d_out * self.mask / (1 - self.dropout_rate)


In [12]:
class AdamOptimizer:
    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = {}
        self.v = {}
        self.t = 0

    def update(self, params, grads):
        self.t += 1
        updated_params = {}
        for key in params:
            if key not in self.m:
                self.m[key] = np.zeros_like(grads[key])
                self.v[key] = np.zeros_like(grads[key])
            
            # Update biased first moment estimate
            self.m[key] = self.beta1 * self.m[key] + (1 - self.beta1) * grads[key]
            # Update biased second raw moment estimate
            self.v[key] = self.beta2 * self.v[key] + (1 - self.beta2) * (grads[key]**2)
            # Correct bias in first moment
            m_hat = self.m[key] / (1 - self.beta1**self.t)
            # Correct bias in second raw moment
            v_hat = self.v[key] / (1 - self.beta2**self.t)
            # Update parameters
            updated_params[key] = params[key] - self.learning_rate * m_hat / (np.sqrt(v_hat) + self.epsilon)
        
        return updated_params


In [13]:
class Softmax:
    def forward(self, X):
        exps = np.exp(X - np.max(X, axis=1, keepdims=True))
        self.output = exps / np.sum(exps, axis=1, keepdims=True)
        return self.output

    def backward(self, d_out):
        return self.output * (d_out - np.sum(d_out * self.output, axis=1, keepdims=True))


In [14]:
import numpy as np
import pickle
from torchvision import datasets, transforms
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, random_split

# Custom classes for layers and optimizer are assumed to be already defined as in the provided code

# Hyperparameters
input_dim = 28 * 28  # Flatten 28x28 images
hidden_dim = 128     # Number of neurons in hidden layer
output_dim = 10      # Number of output classes (FashionMNIST has 10 classes)
learning_rate = 0.001
batch_size = 64
epochs = 10
dropout_rate = 0.5

# 1. Load and preprocess the FashionMNIST dataset
transform = transforms.ToTensor()
train_dataset = datasets.FashionMNIST(root='./data', train=True, transform=transform, download=True)
test_dataset = datasets.FashionMNIST(root='./data', train=False, transform=transform, download=True)

# Split training dataset into train and validation sets
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_data, val_data = random_split(train_dataset, [train_size, val_size])

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# 2. Define the network architecture
class FashionMNISTModel:
    def __init__(self):
        self.layers = [
            DenseLayer(input_dim, hidden_dim),
            BatchNormalization(hidden_dim),
            ReLU(),
            Dropout(dropout_rate),
            DenseLayer(hidden_dim, output_dim),
            Softmax()
        ]

    def forward(self, X, training=True):
        # Sequentially apply each layer
        for layer in self.layers:
            if isinstance(layer, Dropout):
                X = layer.forward(X, training)
            elif isinstance(layer, BatchNormalization):
                X = layer.forward(X, training)
            else:
                X = layer.forward(X)
        return X

    def backward(self, d_out, learning_rate):
        # Backward pass through each layer in reverse order
        for layer in reversed(self.layers):
            if isinstance(layer, (DenseLayer, BatchNormalization)):
                d_out = layer.backward(d_out, learning_rate)
            else:
                d_out = layer.backward(d_out)

# 3. Training loop
def train(model, train_loader, optimizer, epochs):
    for epoch in range(epochs):
        for batch_idx, (data, target) in enumerate(train_loader):
            # Flatten the input data
            data = data.view(data.size(0), -1).numpy()
            target = target.numpy()

            # Forward pass
            output = model.forward(data, training=True)

            # Compute loss (cross-entropy)
            loss = -np.sum(np.log(output[np.arange(len(target)), target])) / len(target)

            # Backward pass
            d_out = output
            d_out[np.arange(len(target)), target] -= 1
            d_out /= len(target)

            # Update weights using optimizer
            model.backward(d_out, optimizer.learning_rate)

        print(f'Epoch {epoch+1}/{epochs}, Loss: {loss:.4f}')

# 4. Evaluation function
def evaluate(model, data_loader):
    all_preds = []
    all_targets = []
    for data, target in data_loader:
        data = data.view(data.size(0), -1).numpy()
        target = target.numpy()

        output = model.forward(data, training=False)
        preds = np.argmax(output, axis=1)

        all_preds.extend(preds)
        all_targets.extend(target)

    accuracy = accuracy_score(all_targets, all_preds)
    return accuracy

# 5. Save model function
def save_model(model, filename):
    with open(filename, 'wb') as f:
        pickle.dump(model, f)

# 6. Instantiate and train the model
model = FashionMNISTModel()
optimizer = AdamOptimizer(learning_rate=learning_rate)

# Training the model
train(model, train_loader, optimizer, epochs)

# Evaluate the model on the validation set
val_accuracy = evaluate(model, val_loader)
print(f'Validation Accuracy: {val_accuracy:.4f}')

# Save the trained model
save_model(model, 'fashion_mnist_model.pkl')

# Evaluate the model on the test set
test_accuracy = evaluate(model, test_loader)
print(f'Test Accuracy: {test_accuracy:.4f}')


Epoch 1/10, Loss: 2.3476
Epoch 2/10, Loss: 2.3708
Epoch 3/10, Loss: 2.4276
Epoch 4/10, Loss: 2.4475
Epoch 5/10, Loss: 2.4427
Epoch 6/10, Loss: 2.4643
Epoch 7/10, Loss: 2.4825
Epoch 8/10, Loss: 2.5228
Epoch 9/10, Loss: 2.5561
Epoch 10/10, Loss: 2.5473
Validation Accuracy: 0.0030
Test Accuracy: 0.0024
