# FEED FORWARD NEURAL NETWORK (MLP - Multilayer Perceptron)

**Modified code to use Intel XPU**
**More can be found here: https://www.intel.com/content/www/us/en/developer/articles/technical/introduction-to-intel-extension-for-tensorflow.html**
**https://intel.github.io/intel-extension-for-tensorflow/latest/docs/install/install_for_xpu.html**

In [None]:
!pip install --upgrade intel-extension-for-tensorflow[xpu]

In [None]:
!pip install pandas numpy matplotlib seaborn scikit-learn tensorflow keras


In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
#import intel_extension_for_tensorflow as itex

# Report the runtime environment so results can be tied to a TensorFlow build.
print("TensorFlow version:", tf.__version__)
# Message typos fixed ("TenserFlow executint" -> "TensorFlow executing").
print("TensorFlow executing eagerly: {}".format(tf.executing_eagerly()))

In [None]:
### Load Fashion MNIST; load_data() yields a (train, test) pair of (images, labels)
fashion_mnist = tf.keras.datasets.fashion_mnist
train_split, test_split = fashion_mnist.load_data()
train_and_validation_images, train_and_validation_labels = train_split
test_images, test_labels = test_split

# Human-readable class names; indexed by the integer label in later cells.
text_labels = [
    'T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
    'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot',
]


In [None]:
### Construct a validation set from the training set
# Hold out the last `validation_size` samples for validation and train on
# everything before them. The previous slice `[:-50000]` did not complement the
# validation slice: with the standard 60,000-sample Fashion MNIST training
# split it kept only the first 10,000 samples and silently dropped the
# 40,000 in between.
validation_size = 10000
validation_images = train_and_validation_images[-validation_size:, :, :]
validation_labels = train_and_validation_labels[-validation_size:]
training_images = train_and_validation_images[:-validation_size, :, :]
training_labels = train_and_validation_labels[:-validation_size]

In [None]:
### Show one randomly chosen training image together with its label
# Pick the sample and look up its label before any drawing happens.
random_index = np.random.randint(0, len(training_images))
numerical_label = training_labels[random_index]
text_description = text_labels[numerical_label]

plt.figure()
plt.imshow(training_images[random_index], cmap='gray_r')
plt.colorbar()
plt.title(f'Label: {numerical_label} ("{text_description}")')
plt.gca().grid(False)

In [None]:
### Show a 5x5 grid of randomly chosen training images titled with their class names
plt.figure(figsize=(10, 10))
for cell in range(1, 26):
    # Draw the sample first, then render it into grid position `cell`.
    random_index = np.random.randint(0, len(training_images))
    numerical_label = training_labels[random_index]
    text_description = text_labels[numerical_label]

    plt.subplot(5, 5, cell)
    plt.imshow(training_images[random_index], cmap='gray_r')
    plt.title(text_description)

    # Hide ticks and grid lines so only the image and its title remain.
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)

In [None]:
### Build the tf.data input pipelines for training and validation
batch_size = 128  ### Hyperparameter: batch size


def _scale_example(image, label):
    """Cast the image to float32 and divide by 255; cast the label to int32."""
    return tf.cast(image, tf.float32) / 255.0, tf.cast(label, tf.int32)


train_dataset = tf.data.Dataset.from_tensor_slices((training_images, training_labels))
# Shuffle with a buffer covering the whole training set: a buffer smaller than
# the dataset (previously batch_size * 10) only mixes nearby samples.
# Shuffling the raw arrays before the float32 conversion keeps the buffer small.
train_dataset = train_dataset.shuffle(buffer_size=len(training_images))
train_dataset = train_dataset.map(_scale_example)
train_dataset = train_dataset.batch(batch_size)

# Validation data gets the same preprocessing but is never shuffled.
validation_dataset = tf.data.Dataset.from_tensor_slices((validation_images, validation_labels))
validation_dataset = validation_dataset.map(_scale_example)
validation_dataset = validation_dataset.batch(batch_size)

In [None]:
### Define the Model: flatten -> Dense(256, ReLU) -> Dense(128, ReLU) -> Dense(10) logits
model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=(28, 28), name='flatten_input_layer'))
model.add(tf.keras.layers.Dense(256, activation=tf.nn.relu, name='first_hidden_layer'))
model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu, name='second_hidden_layer'))
# No activation on the output layer: the loss is configured with from_logits=True.
model.add(tf.keras.layers.Dense(10, name='hidden_to_logits'))

model.summary()


In [None]:
### Train the Model: epoch budget, optimizer, loss, and metric trackers
num_epochs = 50  ### Hyperparameter: number of epochs

optimizer = tf.keras.optimizers.Adam()
loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Running metrics, updated batch by batch inside the step functions.
train_loss = tf.keras.metrics.Mean(name='train_loss')
val_loss = tf.keras.metrics.Mean(name='val_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
val_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='val_accuracy')

# Histories: one entry is appended per epoch by the training loop.
train_losses, train_accuracies = [], []
val_losses, val_accuracies = [], []



In [None]:
### Function to train the model using tf.GradientTape
@tf.function
def train_step(image, label):
    """Run one optimization step on a batch and update the training metrics.

    Args:
        image: Batch of input images fed to `model`.
        label: Integer class labels for the batch.
    """
    with tf.GradientTape() as tape:
        # training=True matches the dropout / batch-norm train steps; this
        # model has no layers whose behavior depends on the flag.
        logits = model(image, training=True)
        loss = loss_function(label, logits)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    # Fold this batch into the running training loss and accuracy.
    train_loss(loss)
    train_accuracy(label, logits)

In [None]:
### Evaluate one validation batch and fold it into the validation metrics
@tf.function
def val_step(image, label):
    """Score a validation batch in inference mode; no weights are updated.

    Args:
        image: Batch of input images fed to `model`.
        label: Integer class labels for the batch.
    """
    batch_logits = model(image, training=False)

    val_loss(loss_function(label, batch_logits))
    val_accuracy(label, batch_logits)

In [None]:
### Train the baseline model and record per-epoch training/validation metrics
epoch_metrics = (train_loss, train_accuracy, val_loss, val_accuracy)
template = 'Epoch {:03d}, Loss: {:.03f}, Acc: {:.3%}, Val Loss: {:.03f}, Val Acc: {:.3%}'

for epoch in range(num_epochs):
    # Keras metrics keep accumulating until reset. Without this reset every
    # reported value is a running average over all epochs so far, which
    # flattens the learning curves and hides late-epoch overfitting.
    for metric in epoch_metrics:
        metric.reset_state()

    for images, labels in train_dataset:
        train_step(images, labels)

    for val_images, val_labels in validation_dataset:
        val_step(val_images, val_labels)

    print(template.format(epoch + 1,
                          train_loss.result(),
                          train_accuracy.result(),
                          val_loss.result(),
                          val_accuracy.result()))

    # Store this epoch's values for the plots and analysis cells.
    train_losses.append(train_loss.result())
    train_accuracies.append(train_accuracy.result())

    val_losses.append(val_loss.result())
    val_accuracies.append(val_accuracy.result())

In [None]:
### Plot training and validation loss/accuracy over epochs --- Answer -1
epochs = range(1, num_epochs + 1)

# One figure per metric; each overlays the training and validation curves.
curve_sets = (
    ('Loss', train_losses, val_losses),
    ('Accuracy', train_accuracies, val_accuracies),
)
for metric_name, train_curve, val_curve in curve_sets:
    plt.figure()
    plt.plot(epochs, train_curve, label=f'Training {metric_name}')
    plt.plot(epochs, val_curve, label=f'Validation {metric_name}')
    plt.title(f'Training vs Validation {metric_name}')
    plt.xlabel('Epoch')
    plt.ylabel(metric_name)
    plt.legend()

In [None]:
### Learning behavior analysis and validation vs test comparison ---- Answer -1
# Last-epoch metrics as plain Python floats.
final_train_loss, final_train_acc, final_val_loss, final_val_acc = (
    float(history[-1])
    for history in (train_losses, train_accuracies, val_losses, val_accuracies)
)

# Gaps are oriented so a positive value means training looks better than validation.
gap_loss = final_val_loss - final_train_loss
gap_acc = final_train_acc - final_val_acc

print(
    'Model WITHOUT Dropout:',
    'Final epoch:',
    f'  Train Loss: {final_train_loss:.4f}, Train Acc: {final_train_acc:.2%}',
    f'  Val   Loss: {final_val_loss:.4f}, Val   Acc: {final_val_acc:.2%}',
    f'  Generalization gap (Acc): {gap_acc:.2%}, (Loss): {gap_loss:.4f}',
    sep='\n',
)

# Snapshot the baseline histories (as new lists) and final numbers so the
# later comparison cells can refer to them under baseline_* names.
baseline_train_losses = list(train_losses)
baseline_train_accuracies = list(train_accuracies)
baseline_val_losses = list(val_losses)
baseline_val_accuracies = list(val_accuracies)
baseline_final_train_acc = final_train_acc
baseline_final_val_acc = final_val_acc
baseline_gap_acc = gap_acc

In [None]:
### Visualize baseline predictions on up to 25 random images from one training batch
images, labels = next(iter(train_dataset))

_logits = model(images, training=False)
predictions = tf.argmax(_logits, axis=1, output_type=tf.int32)

# Random order of positions inside the batch.
img_indexes = np.arange(images.numpy().shape[0])
np.random.shuffle(img_indexes)

plt.figure(figsize=(10,10))
# Never index past the batch: it may hold fewer than 25 images.
for i in range(min(25, len(img_indexes))):
    plt.subplot(5,5,i+1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)

    img_index = img_indexes[i]
    plt.imshow(images[img_index], cmap='gray_r')
    true_label = labels[img_index].numpy()
    predicted_label = predictions[img_index].numpy()
    # Blue title for a correct prediction, red for a wrong one.
    color = 'blue' if true_label == predicted_label else 'red'
    plt.title('TRUE: {}\nPREDICTED: {}'.format(text_labels[true_label],
                                    text_labels[predicted_label]),
                                    color=color)

# Lay out and render once, after all subplots exist. Calling show() inside
# the loop closed the grid after the first image and scattered the rest
# across separate figures.
plt.tight_layout()
plt.show()

In [None]:
### Convert the test set to tensors: float32 images divided by 255, int32 labels
tf_test_labels = tf.convert_to_tensor(test_labels, dtype=tf.int32)
test_pixels = tf.convert_to_tensor(test_images, dtype=tf.float32)
tf_test_images = test_pixels / 255.0

In [None]:
### Evaluate the baseline model on the whole test set in a single forward pass
test_logits = model(tf_test_images, training=False)

test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')
test_accuracy(tf_test_labels, test_logits)

test_loss = tf.keras.metrics.Mean(name='test_loss')
t_loss = loss_function(tf_test_labels, test_logits)
test_loss(t_loss)

print(f'Test Loss: {test_loss.result():.03f}, Test Accuracy: {test_accuracy.result():.3%}')

In [None]:
### Visualize baseline predictions on the first 25 test images
test_predictions = tf.argmax(test_logits, axis=1, output_type=tf.int32)
plt.figure(figsize=(10,10))
# Guard against a test set with fewer than 25 images.
for i in range(min(25, len(test_images))):
    plt.subplot(5,5,i+1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)

    plt.imshow(test_images[i], cmap='gray_r')
    true_label = test_labels[i]
    predicted_label = test_predictions[i].numpy()
    # Blue title for a correct prediction, red for a wrong one.
    color = 'blue' if true_label == predicted_label else 'red'
    plt.title('TRUE: {}\nPREDICTED: {}'.format(text_labels[true_label],
                                    text_labels[predicted_label]),
                                    color=color)

# Lay out and render once, after all subplots exist. Calling show() inside
# the loop closed the grid after the first image and scattered the rest
# across separate figures.
plt.tight_layout()
plt.show()

## Model with Dropout Layers --- Answer - 2


In [None]:
### Define the Model WITH Dropout layers (same dense stack as the baseline)
dropout_layers = [
    tf.keras.layers.Flatten(input_shape=(28, 28), name='flatten_input_layer'),
    tf.keras.layers.Dense(256, activation=tf.nn.relu, name='first_hidden_layer'),
    # 30% dropout after the first hidden layer
    tf.keras.layers.Dropout(0.3, name='dropout_1'),
    tf.keras.layers.Dense(128, activation=tf.nn.relu, name='second_hidden_layer'),
    # 30% dropout after the second hidden layer
    tf.keras.layers.Dropout(0.3, name='dropout_2'),
    tf.keras.layers.Dense(10, name='hidden_to_logits'),
]
model_dropout = tf.keras.Sequential(dropout_layers)

model_dropout.summary()

In [None]:
### Setup training for model WITH Dropout: its own optimizer, loss, and metrics
optimizer_dropout = tf.keras.optimizers.Adam()
loss_function_dropout = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Running metrics, updated batch by batch inside the dropout step functions.
train_loss_dropout = tf.keras.metrics.Mean(name='train_loss_dropout')
val_loss_dropout = tf.keras.metrics.Mean(name='val_loss_dropout')
train_accuracy_dropout = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy_dropout')
val_accuracy_dropout = tf.keras.metrics.SparseCategoricalAccuracy(name='val_accuracy_dropout')

# Histories: one entry is appended per epoch by the training loop.
train_losses_dropout, train_accuracies_dropout = [], []
val_losses_dropout, val_accuracies_dropout = [], []

In [None]:
### Training step for the dropout model
@tf.function
def train_step_dropout(image, label):
    """Apply one gradient update to `model_dropout` and update its training metrics.

    Args:
        image: Batch of input images.
        label: Integer class labels for the batch.
    """
    weights = model_dropout.trainable_variables
    with tf.GradientTape() as tape:
        # training=True keeps the Dropout layers active for this pass.
        batch_logits = model_dropout(image, training=True)
        batch_loss = loss_function_dropout(label, batch_logits)
    grads = tape.gradient(batch_loss, weights)
    optimizer_dropout.apply_gradients(zip(grads, weights))

    train_loss_dropout(batch_loss)
    train_accuracy_dropout(label, batch_logits)

@tf.function
def val_step_dropout(image, label):
    """Score a validation batch with `model_dropout` in inference mode.

    Args:
        image: Batch of input images.
        label: Integer class labels for the batch.
    """
    # training=False turns the Dropout layers off.
    batch_logits = model_dropout(image, training=False)

    val_loss_dropout(loss_function_dropout(label, batch_logits))
    val_accuracy_dropout(label, batch_logits)



In [None]:
### Train the model WITH Dropout
epoch_metrics_dropout = (train_loss_dropout, train_accuracy_dropout,
                         val_loss_dropout, val_accuracy_dropout)
template = 'Epoch {:03d}, Loss: {:.03f}, Acc: {:.3%}, Val Loss: {:.03f}, Val Acc: {:.3%}'

for epoch in range(num_epochs):
    # Keras metrics keep accumulating until reset. Without this reset every
    # reported value is a running average over all epochs so far rather
    # than a measurement of the current epoch.
    for metric in epoch_metrics_dropout:
        metric.reset_state()

    for images, labels in train_dataset:
        train_step_dropout(images, labels)

    for val_images, val_labels in validation_dataset:
        val_step_dropout(val_images, val_labels)

    print(template.format(epoch + 1,
                          train_loss_dropout.result(),
                          train_accuracy_dropout.result(),
                          val_loss_dropout.result(),
                          val_accuracy_dropout.result()))

    # Store this epoch's values for the plots and analysis cells.
    train_losses_dropout.append(train_loss_dropout.result())
    train_accuracies_dropout.append(train_accuracy_dropout.result())
    val_losses_dropout.append(val_loss_dropout.result())
    val_accuracies_dropout.append(val_accuracy_dropout.result())

In [None]:
### Comparison plots: Model WITHOUT Dropout vs Model WITH Dropout
epochs = range(1, num_epochs + 1)

plt.figure(figsize=(14, 5))

# Each panel: (subplot position, metric name, ((curve, line style, legend label), ...)).
# Blue = no dropout, red = with dropout; solid = training, dashed = validation.
panels = (
    (1, 'Loss', (
        (baseline_train_losses, 'b-', 'Train (No Dropout)'),
        (baseline_val_losses, 'b--', 'Val (No Dropout)'),
        (train_losses_dropout, 'r-', 'Train (With Dropout)'),
        (val_losses_dropout, 'r--', 'Val (With Dropout)'),
    )),
    (2, 'Accuracy', (
        (baseline_train_accuracies, 'b-', 'Train (No Dropout)'),
        (baseline_val_accuracies, 'b--', 'Val (No Dropout)'),
        (train_accuracies_dropout, 'r-', 'Train (With Dropout)'),
        (val_accuracies_dropout, 'r--', 'Val (With Dropout)'),
    )),
)
for position, metric_name, curves in panels:
    plt.subplot(1, 2, position)
    for curve, line_style, legend_label in curves:
        plt.plot(epochs, curve, line_style, label=legend_label, alpha=0.7)
    plt.title(f'{metric_name} Comparison: Without vs With Dropout')
    plt.xlabel('Epoch')
    plt.ylabel(metric_name)
    plt.legend()
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
### Quantitative comparison and analysis --- Answer - 2
separator = '=' * 70
print(separator)
print('COMPARISON: Model WITHOUT Dropout vs Model WITH Dropout')
print(separator)

# Last-epoch metrics of the baseline (no dropout) run
final_train_loss_no_dropout = float(baseline_train_losses[-1])
final_val_loss_no_dropout = float(baseline_val_losses[-1])
final_train_acc_no_dropout = float(baseline_train_accuracies[-1])
final_val_acc_no_dropout = float(baseline_val_accuracies[-1])
gap_loss_no_dropout = final_val_loss_no_dropout - final_train_loss_no_dropout
gap_acc_no_dropout = final_train_acc_no_dropout - final_val_acc_no_dropout

# Last-epoch metrics of the dropout run
final_train_loss_dropout = float(train_losses_dropout[-1])
final_val_loss_dropout = float(val_losses_dropout[-1])
final_train_acc_dropout = float(train_accuracies_dropout[-1])
final_val_acc_dropout = float(val_accuracies_dropout[-1])
gap_loss_dropout = final_val_loss_dropout - final_train_loss_dropout
gap_acc_dropout = final_train_acc_dropout - final_val_acc_dropout

# Same three-line report for both models.
model_reports = (
    ('\nModel WITHOUT Dropout:',
     final_train_acc_no_dropout, final_val_acc_no_dropout,
     final_train_loss_no_dropout, final_val_loss_no_dropout,
     gap_acc_no_dropout, gap_loss_no_dropout),
    ('\nModel WITH Dropout:',
     final_train_acc_dropout, final_val_acc_dropout,
     final_train_loss_dropout, final_val_loss_dropout,
     gap_acc_dropout, gap_loss_dropout),
)
for heading, train_acc, val_acc, train_l, val_l, acc_gap, loss_gap in model_reports:
    print(heading)
    print(f'  Train Acc: {train_acc:.2%}, Val Acc: {val_acc:.2%}')
    print(f'  Train Loss: {train_l:.4f}, Val Loss: {val_l:.4f}')
    print(f'  Generalization Gap (Acc): {acc_gap:.2%}, (Loss): {loss_gap:.4f}')

print()
print(separator)
print('IMPROVEMENT ANALYSIS')
print(separator)

# Positive gap improvements mean dropout narrowed the train/validation gap.
improvement_gap_acc = gap_acc_no_dropout - gap_acc_dropout
improvement_gap_loss = gap_loss_no_dropout - gap_loss_dropout
improvement_val_acc = final_val_acc_dropout - final_val_acc_no_dropout

print(
    '\nGeneralization Gap Reduction:',
    f'  Accuracy gap reduced by: {improvement_gap_acc:.2%}',
    f'  Loss gap reduced by: {improvement_gap_loss:.4f}',
    '\nValidation Performance Change:',
    f'  Validation accuracy change: {improvement_val_acc:+.2%}',
    sep='\n',
)

print()
print(separator)
print('HOW DROPOUT AFFECTS OVERFITTING')
print(separator)
print(
    '\nDropout is a regularization technique that:',
    '  1. Randomly deactivates neurons during training (30% in our case)',
    '  2. Forces the network to learn robust features',
    '  3. Prevents co-adaptation of neurons',
    '  4. Acts as an ensemble method',
    sep='\n',
)

# Verdict: did the accuracy gap shrink with dropout?
if not (gap_acc_dropout < gap_acc_no_dropout):
    print('\n⚠ Unexpected: Dropout did not reduce the generalization gap.')
    print('   Consider: adjusting dropout rate, training longer, or other regularization.')
else:
    print('\n✓ Dropout REDUCED overfitting:')
    print(f'    - Generalization gap decreased from {gap_acc_no_dropout:.2%} to {gap_acc_dropout:.2%}')
    print('    - Training-validation gap is now smaller')
    if final_val_acc_dropout > final_val_acc_no_dropout:
        print(f'    - Validation accuracy improved by {improvement_val_acc:.2%}')
    else:
        print(f'    - Slight trade-off: validation accuracy changed by {improvement_val_acc:.2%}')
        print('      (This is acceptable as the model generalizes better)')

print(
    '\nObservations from the plots:',
    '  • Training accuracy should be lower with dropout (neurons randomly disabled)',
    '  • Validation accuracy should be closer to training accuracy',
    '  • The gap between training and validation curves should be smaller',
    '  • Model with dropout is less prone to overfitting the training data',
    sep='\n',
)

In [None]:
### Test the Dropout model on test dataset and compare it with the baseline
test_logits_dropout = model_dropout(tf_test_images, training=False)

test_accuracy_dropout = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy_dropout')
test_accuracy_dropout(tf_test_labels, test_logits_dropout)

test_loss_dropout = tf.keras.metrics.Mean(name='test_loss_dropout')
t_loss_dropout = loss_function_dropout(tf_test_labels, test_logits_dropout)
test_loss_dropout(t_loss_dropout)

print()
print('=' * 70)
print('TEST SET PERFORMANCE COMPARISON')
print('=' * 70)

# Read each metric once; the baseline values come from the earlier test cell.
baseline_test_acc = test_accuracy.result()
baseline_test_loss = test_loss.result()
dropout_test_acc = test_accuracy_dropout.result()
dropout_test_loss = test_loss_dropout.result()

print(f'\nModel WITHOUT Dropout - Test Acc: {baseline_test_acc:.2%}, Test Loss: {baseline_test_loss:.4f}')
print(f'Model WITH Dropout    - Test Acc: {dropout_test_acc:.2%}, Test Loss: {dropout_test_loss:.4f}')
print(f'\nTest Accuracy Improvement: {(dropout_test_acc - baseline_test_acc):.2%}')

## Model with Batch Normalization --- Answer - 3


In [None]:
### Define the Model WITH Batch Normalization layers
# Each hidden block is Dense (no activation) -> BatchNormalization -> ReLU,
# so normalization is applied before the non-linearity.
batchnorm_layers = [
    tf.keras.layers.Flatten(input_shape=(28, 28), name='flatten_input_layer'),
    # Hidden block 1
    tf.keras.layers.Dense(256, activation=None, name='first_hidden_layer'),
    tf.keras.layers.BatchNormalization(name='batch_norm_1'),
    tf.keras.layers.Activation(tf.nn.relu, name='relu_1'),
    # Hidden block 2
    tf.keras.layers.Dense(128, activation=None, name='second_hidden_layer'),
    tf.keras.layers.BatchNormalization(name='batch_norm_2'),
    tf.keras.layers.Activation(tf.nn.relu, name='relu_2'),
    # Output logits
    tf.keras.layers.Dense(10, name='hidden_to_logits'),
]
model_batchnorm = tf.keras.Sequential(batchnorm_layers)

model_batchnorm.summary()

In [None]:
### Setup training for model WITH Batch Normalization: its own optimizer, loss, and metrics
optimizer_batchnorm = tf.keras.optimizers.Adam()
loss_function_batchnorm = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Running metrics, updated batch by batch inside the batch-norm step functions.
train_loss_batchnorm = tf.keras.metrics.Mean(name='train_loss_batchnorm')
val_loss_batchnorm = tf.keras.metrics.Mean(name='val_loss_batchnorm')
train_accuracy_batchnorm = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy_batchnorm')
val_accuracy_batchnorm = tf.keras.metrics.SparseCategoricalAccuracy(name='val_accuracy_batchnorm')

# Histories: one entry is appended per epoch by the training loop.
train_losses_batchnorm, train_accuracies_batchnorm = [], []
val_losses_batchnorm, val_accuracies_batchnorm = [], []

In [None]:
### Training step for the batch normalization model
@tf.function
def train_step_batchnorm(image, label):
    """Apply one gradient update to `model_batchnorm` and update its training metrics.

    Args:
        image: Batch of input images.
        label: Integer class labels for the batch.
    """
    weights = model_batchnorm.trainable_variables
    with tf.GradientTape() as tape:
        # training=True runs the BatchNormalization layers in training mode.
        batch_logits = model_batchnorm(image, training=True)
        batch_loss = loss_function_batchnorm(label, batch_logits)
    grads = tape.gradient(batch_loss, weights)
    optimizer_batchnorm.apply_gradients(zip(grads, weights))

    train_loss_batchnorm(batch_loss)
    train_accuracy_batchnorm(label, batch_logits)

@tf.function
def val_step_batchnorm(image, label):
    """Score a validation batch with `model_batchnorm` in inference mode.

    Args:
        image: Batch of input images.
        label: Integer class labels for the batch.
    """
    # training=False runs the BatchNormalization layers in inference mode.
    batch_logits = model_batchnorm(image, training=False)

    val_loss_batchnorm(loss_function_batchnorm(label, batch_logits))
    val_accuracy_batchnorm(label, batch_logits)


In [None]:
### Train the model WITH Batch Normalization
epoch_metrics_batchnorm = (train_loss_batchnorm, train_accuracy_batchnorm,
                           val_loss_batchnorm, val_accuracy_batchnorm)
template = 'Epoch {:03d}, Loss: {:.03f}, Acc: {:.3%}, Val Loss: {:.03f}, Val Acc: {:.3%}'

for epoch in range(num_epochs):
    # Keras metrics keep accumulating until reset. Without this reset every
    # reported value is a running average over all epochs so far rather
    # than a measurement of the current epoch.
    for metric in epoch_metrics_batchnorm:
        metric.reset_state()

    for images, labels in train_dataset:
        train_step_batchnorm(images, labels)

    for val_images, val_labels in validation_dataset:
        val_step_batchnorm(val_images, val_labels)

    print(template.format(epoch + 1,
                          train_loss_batchnorm.result(),
                          train_accuracy_batchnorm.result(),
                          val_loss_batchnorm.result(),
                          val_accuracy_batchnorm.result()))

    # Store this epoch's values for the plots and analysis cells.
    train_losses_batchnorm.append(train_loss_batchnorm.result())
    train_accuracies_batchnorm.append(train_accuracy_batchnorm.result())
    val_losses_batchnorm.append(val_loss_batchnorm.result())
    val_accuracies_batchnorm.append(val_accuracy_batchnorm.result())

In [None]:
### Comprehensive comparison plots: All three models
epochs = range(1, num_epochs + 1)

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Color and legend label per model, in plotting order.
model_styles = (
    ('b', 'No Regularization'),
    ('r', 'With Dropout'),
    ('g', 'With Batch Norm'),
)

# Each panel: (axes, data split, metric, line pattern, curves in model order).
# Training panels use solid lines, validation panels dashed lines.
panel_specs = (
    (axes[0, 0], 'Training', 'Loss', '-',
     (baseline_train_losses, train_losses_dropout, train_losses_batchnorm)),
    (axes[0, 1], 'Validation', 'Loss', '--',
     (baseline_val_losses, val_losses_dropout, val_losses_batchnorm)),
    (axes[1, 0], 'Training', 'Accuracy', '-',
     (baseline_train_accuracies, train_accuracies_dropout, train_accuracies_batchnorm)),
    (axes[1, 1], 'Validation', 'Accuracy', '--',
     (baseline_val_accuracies, val_accuracies_dropout, val_accuracies_batchnorm)),
)

for ax, split_name, metric_name, line_pattern, curves in panel_specs:
    for (color, legend_label), curve in zip(model_styles, curves):
        ax.plot(epochs, curve, color + line_pattern, label=legend_label, alpha=0.8, linewidth=2)
    ax.set_title(f'{split_name} {metric_name} Comparison', fontsize=14, fontweight='bold')
    ax.set_xlabel('Epoch', fontsize=12)
    ax.set_ylabel(metric_name, fontsize=12)
    ax.legend(fontsize=11)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
### Convergence Speed Analysis --- Answer - 3
import numpy as np

# Accuracy levels a model must reach to count as converged in this analysis.
target_train_acc = 0.85
target_val_acc = 0.80

print('=' * 80, 'CONVERGENCE SPEED ANALYSIS', '=' * 80, sep='\n')

# Locate the first epoch whose accuracy meets a threshold
def find_convergence_epoch(accuracies, threshold):
    """Return the 1-indexed epoch at which `threshold` is first reached.

    Args:
        accuracies: Per-epoch accuracy values; each must be convertible with float().
        threshold: Accuracy level to reach (compared with >=).

    Returns:
        The 1-indexed position of the first value >= threshold, or None when
        no value reaches it.
    """
    hits = (
        epoch_number
        for epoch_number, value in enumerate(accuracies, start=1)
        if float(value) >= threshold
    )
    return next(hits, None)

# First epoch at which each model reaches the training-accuracy target
train_conv_baseline = find_convergence_epoch(baseline_train_accuracies, target_train_acc)
train_conv_dropout = find_convergence_epoch(train_accuracies_dropout, target_train_acc)
train_conv_batchnorm = find_convergence_epoch(train_accuracies_batchnorm, target_train_acc)

# First epoch at which each model reaches the validation-accuracy target
val_conv_baseline = find_convergence_epoch(baseline_val_accuracies, target_val_acc)
val_conv_dropout = find_convergence_epoch(val_accuracies_dropout, target_val_acc)
val_conv_batchnorm = find_convergence_epoch(val_accuracies_batchnorm, target_val_acc)

print(f'\nEpochs to reach {target_train_acc:.0%} training accuracy:')
print(f'  No Regularization:  {train_conv_baseline if train_conv_baseline else "Not reached"} epochs')
print(f'  With Dropout:       {train_conv_dropout if train_conv_dropout else "Not reached"} epochs')
print(f'  With Batch Norm:    {train_conv_batchnorm if train_conv_batchnorm else "Not reached"} epochs')

print(f'\nEpochs to reach {target_val_acc:.0%} validation accuracy:')
print(f'  No Regularization:  {val_conv_baseline if val_conv_baseline else "Not reached"} epochs')
print(f'  With Dropout:       {val_conv_dropout if val_conv_dropout else "Not reached"} epochs')
print(f'  With Batch Norm:    {val_conv_batchnorm if val_conv_batchnorm else "Not reached"} epochs')

# Early-epoch accuracy gain: accuracy at the end of the early window minus
# accuracy after epoch 1. The window is clamped to the recorded history so
# runs shorter than 10 epochs no longer raise IndexError.
early_epochs = min(10,
                   len(baseline_train_accuracies),
                   len(train_accuracies_dropout),
                   len(train_accuracies_batchnorm))
baseline_early_improvement = float(baseline_train_accuracies[early_epochs-1]) - float(baseline_train_accuracies[0])
dropout_early_improvement = float(train_accuracies_dropout[early_epochs-1]) - float(train_accuracies_dropout[0])
batchnorm_early_improvement = float(train_accuracies_batchnorm[early_epochs-1]) - float(train_accuracies_batchnorm[0])

print(f'\nAccuracy improvement in first {early_epochs} epochs:')
print(f'  No Regularization:  {baseline_early_improvement:.2%}')
print(f'  With Dropout:       {dropout_early_improvement:.2%}')
print(f'  With Batch Norm:    {batchnorm_early_improvement:.2%}')

# Report a relative speed-up only when the baseline gain is positive: a zero
# gain would divide by zero and a negative gain makes the ratio meaningless.
if baseline_early_improvement > 0 and batchnorm_early_improvement > baseline_early_improvement:
    speedup = (batchnorm_early_improvement / baseline_early_improvement - 1) * 100
    print(f'\n✓ Batch Normalization converges {speedup:.1f}% faster in early epochs!')

In [None]:
### Training Stability Analysis --- Answer - 3
print()
print('=' * 80)
print('TRAINING STABILITY ANALYSIS')
print('=' * 80)


def _loss_std(history):
    """Standard deviation of a per-epoch loss history, computed on plain floats."""
    return np.std([float(value) for value in history])


# Spread of each loss curve across epochs
baseline_train_loss_std = _loss_std(baseline_train_losses)
dropout_train_loss_std = _loss_std(train_losses_dropout)
batchnorm_train_loss_std = _loss_std(train_losses_batchnorm)

baseline_val_loss_std = _loss_std(baseline_val_losses)
dropout_val_loss_std = _loss_std(val_losses_dropout)
batchnorm_val_loss_std = _loss_std(val_losses_batchnorm)

print(
    '\nTraining Loss Standard Deviation (lower = more stable):',
    f'  No Regularization:  {baseline_train_loss_std:.4f}',
    f'  With Dropout:       {dropout_train_loss_std:.4f}',
    f'  With Batch Norm:    {batchnorm_train_loss_std:.4f}',
    sep='\n',
)

print(
    '\nValidation Loss Standard Deviation (lower = more stable):',
    f'  No Regularization:  {baseline_val_loss_std:.4f}',
    f'  With Dropout:       {dropout_val_loss_std:.4f}',
    f'  With Batch Norm:    {batchnorm_val_loss_std:.4f}',
    sep='\n',
)

# Smoothness = mean absolute change between consecutive epochs
def calculate_smoothness(losses):
    """Return the mean absolute difference between consecutive loss values.

    Args:
        losses: Sliceable sequence of per-epoch losses; each value must be
            convertible with float().

    Returns:
        np.mean of |losses[k+1] - losses[k]| over all consecutive pairs.
    """
    step_sizes = [
        abs(float(later) - float(earlier))
        for earlier, later in zip(losses, losses[1:])
    ]
    return np.mean(step_sizes)

# Epoch-to-epoch smoothness of each model's training loss
baseline_smoothness, dropout_smoothness, batchnorm_smoothness = (
    calculate_smoothness(history)
    for history in (baseline_train_losses, train_losses_dropout, train_losses_batchnorm)
)

print(
    '\nTraining Loss Smoothness (lower = more stable):',
    f'  No Regularization:  {baseline_smoothness:.4f}',
    f'  With Dropout:       {dropout_smoothness:.4f}',
    f'  With Batch Norm:    {batchnorm_smoothness:.4f}',
    sep='\n',
)

# Relative reduction in epoch-to-epoch change, reported only when batch norm is smoother.
if batchnorm_smoothness < baseline_smoothness:
    improvement = (1 - batchnorm_smoothness / baseline_smoothness) * 100
    print(f'\n✓ Batch Normalization training is {improvement:.1f}% smoother!')

In [None]:
### Validation Accuracy and Generalization Comparison --- Answer - 3
print()
print('=' * 80)
print('VALIDATION ACCURACY AND GENERALIZATION COMPARISON')
print('=' * 80)

# Last-epoch metrics of the Batch Normalization model as plain floats
final_train_loss_batchnorm = float(train_losses_batchnorm[-1])
final_val_loss_batchnorm = float(val_losses_batchnorm[-1])
final_train_acc_batchnorm = float(train_accuracies_batchnorm[-1])
final_val_acc_batchnorm = float(val_accuracies_batchnorm[-1])

gap_loss_batchnorm = final_val_loss_batchnorm - final_train_loss_batchnorm
gap_acc_batchnorm = final_train_acc_batchnorm - final_val_acc_batchnorm

# One summary per model: (heading, train accuracy, validation accuracy, accuracy gap)
model_summaries = (
    ('\n--- Model WITHOUT Regularization ---',
     baseline_final_train_acc, baseline_final_val_acc, baseline_gap_acc),
    ('\n--- Model WITH Dropout ---',
     final_train_acc_dropout, final_val_acc_dropout, gap_acc_dropout),
    ('\n--- Model WITH Batch Normalization ---',
     final_train_acc_batchnorm, final_val_acc_batchnorm, gap_acc_batchnorm),
)
for heading, train_acc, val_acc, acc_gap in model_summaries:
    print(heading)
    print(f'  Train Acc: {train_acc:.2%}, Val Acc: {val_acc:.2%}')
    print(f'  Generalization Gap: {acc_gap:.2%}')

# Rank models by final validation accuracy, best first
print('\nValidation Accuracy Ranking:')
models_val_acc = [
    ('No Regularization', baseline_final_val_acc),
    ('Dropout', final_val_acc_dropout),
    ('Batch Normalization', final_val_acc_batchnorm),
]
models_val_acc_sorted = sorted(models_val_acc, key=lambda entry: entry[1], reverse=True)
for rank, (model_name, accuracy) in enumerate(models_val_acc_sorted, start=1):
    print(f'  {rank}. {model_name}: {accuracy:.2%}')

# Rank models by train-minus-validation accuracy gap, smallest first
print('\nGeneralization Gap Ranking (lower is better):')
models_gap = [
    ('No Regularization', baseline_gap_acc),
    ('Dropout', gap_acc_dropout),
    ('Batch Normalization', gap_acc_batchnorm),
]
models_gap_sorted = sorted(models_gap, key=lambda entry: entry[1])
for rank, (model_name, acc_gap) in enumerate(models_gap_sorted, start=1):
    print(f'  {rank}. {model_name}: {acc_gap:.2%}')

In [None]:
### Comprehensive Analysis: How Batch Normalization Affects Training --- Answer - 3
# Fixed explanatory text, with a measured "Result" line appended to a section
# only when the numbers from the earlier cells support it.
rule = '=' * 80

print()
print(rule)
print('HOW BATCH NORMALIZATION AFFECTS NEURAL NETWORK TRAINING')
print(rule)

print(
    '\n1. CONVERGENCE SPEED:',
    '   Batch Normalization normalizes inputs to each layer, which:',
    '   • Reduces internal covariate shift',
    '   • Allows higher learning rates to be used safely',
    '   • Accelerates training, especially in early epochs',
    sep='\n',
)
# Both models must have reached the target before their epochs can be compared.
if train_conv_batchnorm and train_conv_baseline and train_conv_batchnorm < train_conv_baseline:
    print(f'   ✓ Result: Reached target accuracy {train_conv_baseline - train_conv_batchnorm} epochs faster')

print(
    '\n2. TRAINING STABILITY:',
    '   Batch Normalization provides:',
    '   • More stable gradient flow through the network',
    '   • Reduced sensitivity to weight initialization',
    '   • Smoother loss curves with less variance',
    sep='\n',
)
if batchnorm_smoothness < baseline_smoothness:
    improvement = (1 - batchnorm_smoothness / baseline_smoothness) * 100
    print(f'   ✓ Result: {improvement:.1f}% reduction in training oscillations')

print(
    '\n3. REGULARIZATION EFFECT:',
    '   Batch Normalization acts as a regularizer:',
    '   • Adds noise through mini-batch statistics',
    '   • Reduces dependence on exact weight values',
    '   • Can reduce overfitting (similar to dropout)',
    sep='\n',
)
if gap_acc_batchnorm < baseline_gap_acc:
    improvement = baseline_gap_acc - gap_acc_batchnorm
    print(f'   ✓ Result: Generalization gap reduced by {improvement:.2%}')

print(
    '\n4. VALIDATION ACCURACY:',
    '   Batch Normalization often improves validation performance by:',
    '   • Better generalization through regularization',
    '   • More robust learned representations',
    '   • Reduced overfitting to training data',
    sep='\n',
)
if final_val_acc_batchnorm > baseline_final_val_acc:
    improvement = final_val_acc_batchnorm - baseline_final_val_acc
    print(f'   ✓ Result: Validation accuracy improved by {improvement:.2%}')

print()
print(rule)
print('KEY DIFFERENCES: Batch Normalization vs Dropout')
print(rule)
print(
    '\nBatch Normalization:',
    '  • Normalizes layer inputs during both training and inference',
    '  • Speeds up convergence by stabilizing learning',
    '  • Reduces internal covariate shift',
    '  • Can allow higher learning rates',
    '  • Has a regularization side-effect',
    sep='\n',
)
print(
    '\nDropout:',
    '  • Randomly drops neurons during training only',
    '  • Primarily a regularization technique',
    '  • Forces redundancy in learned representations',
    '  • May slow down convergence',
    '  • Stronger regularization effect',
    sep='\n',
)

print()
print(rule)
print('RECOMMENDATIONS')
print(rule)
print('\nBased on the analysis:')
# The recommendation depends on whether batch norm has the best (or tied-best)
# final validation accuracy of the three models.
recommendation = (
    (
        '✓ Batch Normalization achieved the best validation accuracy',
        '  → Recommended for this architecture and dataset',
        '  → Consider combining with Dropout for even better regularization',
    )
    if final_val_acc_batchnorm >= max(baseline_final_val_acc, final_val_acc_dropout)
    else (
        '• Batch Normalization improved convergence speed and stability',
        '  → Consider combining techniques or tuning hyperparameters',
    )
)
print(*recommendation, sep='\n')

print(
    '\nFor optimal results:',
    '  1. Use Batch Normalization for faster, more stable training',
    '  2. Add Dropout if stronger regularization is needed',
    '  3. Experiment with both techniques combined',
    '  4. Monitor validation performance to avoid overfitting',
    sep='\n',
)

In [None]:
### Test the Batch Normalization model on test dataset and compare all three models
test_logits_batchnorm = model_batchnorm(tf_test_images, training=False)

test_accuracy_batchnorm = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy_batchnorm')
test_accuracy_batchnorm(tf_test_labels, test_logits_batchnorm)

test_loss_batchnorm = tf.keras.metrics.Mean(name='test_loss_batchnorm')
t_loss_batchnorm = loss_function_batchnorm(tf_test_labels, test_logits_batchnorm)
test_loss_batchnorm(t_loss_batchnorm)

print()
print('=' * 80)
print('FINAL TEST SET PERFORMANCE COMPARISON')
print('=' * 80)

# (line prefix, accuracy metric, loss metric) per model; prefixes are padded
# so the "- Test Acc" columns line up.
report_rows = (
    ('\nNo Regularization      ', test_accuracy, test_loss),
    ('With Dropout           ', test_accuracy_dropout, test_loss_dropout),
    ('With Batch Norm        ', test_accuracy_batchnorm, test_loss_batchnorm),
)
for prefix, accuracy_metric, loss_metric in report_rows:
    print(f'{prefix}- Test Acc: {accuracy_metric.result():.2%}, Test Loss: {loss_metric.result():.4f}')

# Pick the model with the highest test accuracy
test_results = [
    ('No Regularization', float(test_accuracy.result())),
    ('Dropout', float(test_accuracy_dropout.result())),
    ('Batch Normalization', float(test_accuracy_batchnorm.result())),
]
best_model = max(test_results, key=lambda entry: entry[1])
print(f'\n✓ Best model on test set: {best_model[0]} with {best_model[1]:.2%} accuracy')