## Importing Necessary Libraries

In [1]:
import numpy as np
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torchvision import datasets, transforms
from tqdm import tqdm
import pickle

## Loading The dataset

In [2]:
# Data transformation to convert images to tensors and normalize
transform = transforms.ToTensor()
np.random.seed(42)
# Load the FashionMNIST dataset
train_data = datasets.FashionMNIST(root='./data', train=True, transform=transform, download=True)
test_data = datasets.FashionMNIST(root='./data', train=False, transform=transform, download=True)

# Split the dataset into training and validation sets
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

# Convert the datasets into numpy arrays
def dataset_to_numpy(dataset):
    X = []
    y = []
    for img, label in dataset:
        X.append(img.numpy().reshape(-1))  # Flatten the image
        y.append(label)
    return np.array(X), np.array(y)

# Prepare numpy arrays for training and validation sets
X_train, y_train = dataset_to_numpy(train_data)
X_val, y_val = dataset_to_numpy(val_data)
X_test, y_test = dataset_to_numpy(test_data)

print("Training data shape:", X_train.shape)
print("Validation data shape:", X_val.shape)
print("Test data shape:", X_test.shape)

Training data shape: (48000, 784)
Validation data shape: (12000, 784)
Test data shape: (10000, 784)


## Dense Layer With Adam

In [3]:
class Adam:
    def __init__(self, shape, learning_rate=0.005, epsilon=1e-8, beta1=0.9, beta2=0.999):
        self.v = np.zeros(shape)
        self.s = np.zeros(shape)
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.t = 0

    def update(self, w, grad_w):
        self.t += 1
        self.v = self.beta1 * self.v + (1 - self.beta1) * grad_w
        self.s = self.beta2 * self.s + (1 - self.beta2) * (grad_w ** 2)
        v_norm = self.v / (1 - self.beta1 ** self.t)
        s_norm = self.s / (1 - self.beta2 ** self.t)
        w -= self.learning_rate * v_norm / (np.sqrt(s_norm) + self.epsilon)
        return w

class DenseLayer:
    def __init__(self, input_size, output_size):
        self.weights = np.random.randn(input_size, output_size) * np.sqrt(2. / input_size)
        self.biases = np.zeros((1, output_size))
        self.weight_optimizer = Adam(self.weights.shape)
        self.bias_optimizer = Adam(self.biases.shape)

    def forward(self, X):
        self.input = X
        self.output = np.dot(X, self.weights) + self.biases
        return self.output

    def backward(self, d_out):
        grad_weights = np.dot(self.input.T, d_out)
        grad_biases = np.sum(d_out, axis=0, keepdims=True)
        d_input = np.dot(d_out, self.weights.T)

        # Update weights and biases using Adam optimizer
        self.weights = self.weight_optimizer.update(self.weights, grad_weights)
        self.biases = self.bias_optimizer.update(self.biases, grad_biases)
        return d_input


## Normalization (batch Normalization)

In [4]:
class BatchNormalization:
    def __init__(self, input_size, epsilon=1e-5):
        momentum=0.9
        self.epsilon = epsilon
        self.momentum = momentum
        self.gamma = np.ones((1, input_size))
        self.beta = np.zeros((1, input_size))
        self.moving_mean = np.zeros((1, input_size))
        self.moving_var = np.zeros((1, input_size))
        self.gamma_optimizer = Adam(self.gamma.shape)
        self.beta_optimizer = Adam(self.beta.shape)

    def forward(self, X, training=True):
        self.input = X
        if training:
            self.mean = np.mean(X, axis=0, keepdims=True)
            self.var = np.var(X, axis=0, keepdims=True)
            self.std = np.sqrt(self.var + self.epsilon)
            self.x_norm = (X - self.mean) / self.std
            self.moving_mean = self.momentum * self.moving_mean + (1 - self.momentum) * self.mean
            self.moving_var = self.momentum * self.moving_var + (1 - self.momentum) * self.var
        else:
            self.x_norm = (X - self.moving_mean) / np.sqrt(self.moving_var + self.epsilon)
        output = self.gamma * self.x_norm + self.beta
        return output

    def backward(self, d_out):
        m = self.input.shape[0]
        self.d_gamma = np.sum(d_out * self.x_norm, axis=0, keepdims=True)
        self.d_beta = np.sum(d_out, axis=0, keepdims=True)

        dx_norm = d_out * self.gamma
        d_var = np.sum(dx_norm * (self.input - self.mean) * -0.5 * (self.var + self.epsilon) ** (-1.5), axis=0)
        d_mean = np.sum(dx_norm * -1 / self.std, axis=0) + d_var * np.mean(-2 * (self.input - self.mean), axis=0)
        d_input = dx_norm / self.std + d_var * 2 * (self.input - self.mean) / m + d_mean / m

        # Update gamma and beta using Adam optimizer
        self.gamma = self.gamma_optimizer.update(self.gamma, self.d_gamma)
        self.beta = self.beta_optimizer.update(self.beta, self.d_beta)
        return d_input


## Activation (ReLU)

In [5]:
class ReLU:
    def forward(self, X):
        self.input = X
        return np.maximum(0, X)

    def backward(self, d_out):
        d_input = d_out.copy()
        d_input[self.input <= 0] = 0
        return d_input


## Regularization (Dropout)

In [6]:
class Dropout:
    def __init__(self, dropout_rate=0.5):
        self.rate = dropout_rate

    def forward(self, X, training=True):
        if training:
            self.mask = (np.random.rand(*X.shape) > self.rate) / (1 - self.rate)
            return X * self.mask
        else:
            return X

    def backward(self, d_out):
        return d_out * self.mask


## Regression (SoftMax)

In [7]:
class SoftmaxCrossEntropyLoss:
    def forward(self, logits, targets):
        # Compute softmax probabilities
        exp_values = np.exp(logits - np.max(logits, axis=1, keepdims=True))
        probabilities = exp_values / np.sum(exp_values, axis=1, keepdims=True)
        self.probabilities = probabilities

        # Compute cross-entropy loss
        samples = logits.shape[0]
        correct_logprobs = -np.log(self.probabilities[range(samples), targets])
        loss = np.sum(correct_logprobs) / samples
        return loss

    def backward(self, targets):
        samples = self.probabilities.shape[0]
        d_logits = self.probabilities.copy()
        d_logits[range(samples), targets] -= 1
        d_logits /= samples
        return d_logits


In [8]:
class NeuralNetwork:
    def __init__(self, layers):
        self.layers = layers

    def forward(self, X, training=True):
        for layer in self.layers:
            if isinstance(layer, (Dropout, BatchNormalization)):
                X = layer.forward(X, training)
            else:
                X = layer.forward(X)
        return X

    def backward(self, d_out):
        for layer in reversed(self.layers):
            d_out = layer.backward(d_out)


## Architecture

In [9]:
input_size = X_train.shape[1]  # Number of features
num_classes = len(np.unique(y_train))  # Number of classes

# Example architecture
layers = [
    DenseLayer(input_size, 128),
    BatchNormalization(128),
    ReLU(),
    Dropout(0.5),
    # DenseLayer(128, 64),
    # BatchNormalization(64),
    # ReLU(),
    # Dropout(0.5),
    DenseLayer(128, num_classes)
]

model = NeuralNetwork(layers)
loss_function = SoftmaxCrossEntropyLoss()
# print(input_size)

## Training The Model

In [10]:
epochs = 10
batch_size = 64
learning_rate = 0.005

# Placeholder lists to store metrics
train_losses, val_losses = [], []
train_accuracies, val_accuracies = [], []
val_macro_f1_scores = []

# Function to create mini-batches
def create_batches(X, y, batch_size):
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    for start_idx in range(0, X.shape[0] - batch_size + 1, batch_size):
        batch_indices = indices[start_idx:start_idx + batch_size]
        yield X[batch_indices], y[batch_indices]

for epoch in range(epochs):
    # Training phase
    model_layers = model.layers
    train_loss = 0
    correct_train = 0
    total_train = 0

    # for X_batch, y_batch in create_batches(X_train, y_train, batch_size): 
    for X_batch, y_batch in tqdm(create_batches(X_train, y_train, batch_size), 
                                 desc=f"Training Epoch {epoch+1}", 
                                 total=int(np.ceil(X_train.shape[0] / batch_size))):
        # Forward pass
        logits = model.forward(X_batch, training=True)
        loss = loss_function.forward(logits, y_batch)
        train_loss += loss

        # Backward pass
        d_out = loss_function.backward(y_batch)
        model.backward(d_out)

        # Predictions and accuracy
        predictions = np.argmax(loss_function.probabilities, axis=1)
        correct_train += np.sum(predictions == y_batch)
        total_train += y_batch.shape[0]
        # print(correct_train, total_train)

    avg_train_loss = train_loss / (X_train.shape[0] / batch_size)
    train_accuracy = correct_train / total_train
    train_losses.append(avg_train_loss)
    train_accuracies.append(train_accuracy)

    # Validation phase
    val_loss = 0
    correct_val = 0
    total_val = 0
    all_predictions = []
    all_targets = []

    # No need to shuffle validation data
    for X_batch, y_batch in create_batches(X_val, y_val, batch_size):
        # Forward pass
        logits = model.forward(X_batch, training=False)
        loss = loss_function.forward(logits, y_batch)
        val_loss += loss

        # Predictions and accuracy
        predictions = np.argmax(loss_function.probabilities, axis=1)
        correct_val += np.sum(predictions == y_batch)
        total_val += y_batch.shape[0]
        all_predictions.extend(predictions)
        all_targets.extend(y_batch)

    avg_val_loss = val_loss / (X_val.shape[0] / batch_size)
    val_accuracy = correct_val / total_val
    val_losses.append(avg_val_loss)
    val_accuracies.append(val_accuracy)

    # Calculate macro-F1 score
    macro_f1 = f1_score(all_targets, all_predictions, average='macro')
    val_macro_f1_scores.append(macro_f1)

    # Report metrics
    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
    print(f"Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Val Macro-F1: {macro_f1:.4f}\n")

print(train_losses)


Training Epoch 1: 100%|██████████| 750/750 [00:18<00:00, 41.09it/s] 


Epoch 1/10
Train Loss: 0.5747, Train Accuracy: 0.7941
Val Loss: 0.4091, Val Accuracy: 0.8505, Val Macro-F1: 0.8441



Training Epoch 2: 100%|██████████| 750/750 [00:06<00:00, 112.06it/s]


Epoch 2/10
Train Loss: 0.4614, Train Accuracy: 0.8333
Val Loss: 0.3848, Val Accuracy: 0.8592, Val Macro-F1: 0.8570



Training Epoch 3: 100%|██████████| 750/750 [00:06<00:00, 111.24it/s]


Epoch 3/10
Train Loss: 0.4278, Train Accuracy: 0.8478
Val Loss: 0.3728, Val Accuracy: 0.8614, Val Macro-F1: 0.8600



Training Epoch 4: 100%|██████████| 750/750 [00:12<00:00, 59.72it/s] 


Epoch 4/10
Train Loss: 0.4123, Train Accuracy: 0.8496
Val Loss: 0.3562, Val Accuracy: 0.8673, Val Macro-F1: 0.8650



Training Epoch 5: 100%|██████████| 750/750 [00:14<00:00, 50.46it/s]


Epoch 5/10
Train Loss: 0.3943, Train Accuracy: 0.8575
Val Loss: 0.3492, Val Accuracy: 0.8692, Val Macro-F1: 0.8678



Training Epoch 6: 100%|██████████| 750/750 [00:04<00:00, 155.06it/s]


Epoch 6/10
Train Loss: 0.3858, Train Accuracy: 0.8602
Val Loss: 0.3402, Val Accuracy: 0.8764, Val Macro-F1: 0.8751



Training Epoch 7: 100%|██████████| 750/750 [00:06<00:00, 109.97it/s]


Epoch 7/10
Train Loss: 0.3769, Train Accuracy: 0.8636
Val Loss: 0.3514, Val Accuracy: 0.8710, Val Macro-F1: 0.8680



Training Epoch 8: 100%|██████████| 750/750 [00:06<00:00, 116.30it/s]


Epoch 8/10
Train Loss: 0.3699, Train Accuracy: 0.8655
Val Loss: 0.3406, Val Accuracy: 0.8752, Val Macro-F1: 0.8747



Training Epoch 9: 100%|██████████| 750/750 [00:07<00:00, 105.33it/s]


Epoch 9/10
Train Loss: 0.3584, Train Accuracy: 0.8692
Val Loss: 0.3251, Val Accuracy: 0.8814, Val Macro-F1: 0.8815



Training Epoch 10: 100%|██████████| 750/750 [00:05<00:00, 130.27it/s]


Epoch 10/10
Train Loss: 0.3549, Train Accuracy: 0.8722
Val Loss: 0.3328, Val Accuracy: 0.8804, Val Macro-F1: 0.8803

[np.float64(0.5746926949775026), np.float64(0.4613905624695355), np.float64(0.4277792856682923), np.float64(0.4123108667140513), np.float64(0.39428277503210557), np.float64(0.38584172301478414), np.float64(0.3768922777271113), np.float64(0.36988420402859584), np.float64(0.35840337308198605), np.float64(0.3548978299264037)]


In [11]:
import pandas as pd
data = {
    'Train Loss': train_losses,
    'Val Loss': val_losses,
    'Train Accuracy': train_accuracies,
    'Val Accuracy': val_accuracies,
    'Val Macro F1': val_macro_f1_scores
}

metrics_df = pd.DataFrame(data)
metrics_df = metrics_df.style.set_table_styles(
    [{'selector': 'table', 'props': [('border', '1px solid black')]},
     {'selector': 'th', 'props': [('border', '1px solid black')]},
     {'selector': 'td', 'props': [('border', '1px solid black')]}]
)
# Display the table
metrics_df

Unnamed: 0,Train Loss,Val Loss,Train Accuracy,Val Accuracy,Val Macro F1
0,0.574693,0.409061,0.794083,0.850518,0.844064
1,0.461391,0.384785,0.833313,0.859208,0.857004
2,0.427779,0.372836,0.847833,0.86138,0.86
3,0.412311,0.356236,0.849583,0.867313,0.864962
4,0.394283,0.349244,0.8575,0.869235,0.86784
5,0.385842,0.34015,0.860167,0.87642,0.875119
6,0.376892,0.351366,0.863646,0.870989,0.868032
7,0.369884,0.340639,0.865521,0.875167,0.874712
8,0.358403,0.325087,0.869167,0.88135,0.881459
9,0.354898,0.332827,0.872188,0.880431,0.880274


## Saving the model

In [12]:
with open("1905105.pickle", "wb") as file:
    pickle.dump(model, file)

## Loading & Testing The Model

In [13]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# Function to evaluate the model on the test set
def evaluate_model_on_test(model, X_test, y_test, loss_fn):
    # Forward pass through the model to get logits
    logits = model.forward(X_test, training=False)  # Set training=False to disable dropout
    
    # Calculate test loss
    test_loss = loss_fn.forward(logits, y_test)     # Pass logits and targets to loss function
    probabilities = loss_fn.probabilities
    # Convert probabilities to class predictions
    predicted_classes = np.argmax(probabilities, axis=1)  # Specify axis=1 to get predictions per sample
    
    # Calculate test accuracy and macro-F1 score
    test_accuracy = accuracy_score(y_test, predicted_classes)
    test_macro_f1 = f1_score(y_test, predicted_classes, average='macro')
    
    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_test, predicted_classes)
    
    # Print results
    print("Test Loss:", test_loss)
    print("Test Accuracy:", test_accuracy)
    print("Test Macro-F1 Score:", test_macro_f1)
    print("Confusion Matrix:\n", conf_matrix)

loss_fn = SoftmaxCrossEntropyLoss()
with open("1905105.pickle", "rb") as file:
    loaded_model = pickle.load(file)
# loaded_model = model
evaluate_model_on_test(loaded_model, X_test, y_test, loss_fn=loss_fn)
print("Done")

Test Loss: 0.36087644626330423
Test Accuracy: 0.8729
Test Macro-F1 Score: 0.8725546236397171
Confusion Matrix:
 [[843   0   9  38   1   3 100   0   6   0]
 [  4 969   1  20   2   0   2   0   2   0]
 [ 14   1 807  15  73   0  87   0   3   0]
 [ 24   6   7 911  17   1  31   0   3   0]
 [  0   1 104  50 738   0 104   0   3   0]
 [  0   0   0   0   0 955   0  31   2  12]
 [142   1 103  35  51   1 653   0  14   0]
 [  0   0   0   0   0  21   0 931   0  48]
 [  2   1   2   4   3   4  15   3 966   0]
 [  0   0   0   0   0  14   1  29   0 956]]
Done
