# Fashion-MNIST CNN Ensemble vs Single CNN
This notebook trains an ensemble of CNNs (bootstrap aggregation) and compares results with a single CNN.


In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.datasets import fashion_mnist
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [2]:
(X_train_full, y_train_full), (X_test_full, y_test_full) = fashion_mnist.load_data()

# Select first 50 for training and first 50 for testing
X_train, y_train = X_train_full[:50], y_train_full[:50]
X_test, y_test = X_test_full[:50], y_test_full[:50]

print("Train:", X_train.shape, y_train.shape)
print("Test:", X_test.shape, y_test.shape)


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
[1m29515/29515[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
[1m26421880/26421880[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
[1m5148/5148[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz
[1m4422102/4422102[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 0us/step
Train: (50, 28, 28) (50,)
Test: (50, 28, 28) (50,)


In [3]:
X_train = X_train.astype("float32") / 255.0
X_test  = X_test.astype("float32") / 255.0

X_train = X_train.reshape(-1, 28, 28, 1)
X_test  = X_test.reshape(-1, 28, 28, 1)

print("Train reshaped:", X_train.shape)
print("Test reshaped:", X_test.shape)


Train reshaped: (50, 28, 28, 1)
Test reshaped: (50, 28, 28, 1)


In [4]:
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

print("Train split:", X_tr.shape, y_tr.shape)
print("Val split:", X_val.shape, y_val.shape)


Train split: (40, 28, 28, 1) (40,)
Val split: (10, 28, 28, 1) (10,)


In [5]:
def build_cnn():
    model = models.Sequential([
        layers.Conv2D(32, (3,3), activation='relu', input_shape=(28,28,1)),
        layers.MaxPooling2D((2,2)),
        layers.Flatten(),
        layers.Dense(10, activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model


In [6]:
ensemble_models = []
n_models = 5
epochs = 3

for i in range(n_models):
    print(f"\nTraining model {i+1}/{n_models}")

    # Bootstrap sample (same size as training split)
    idx = np.random.choice(len(X_tr), size=len(X_tr), replace=True)
    X_boot = X_tr[idx]
    y_boot = np.array(y_tr)[idx]

    model = build_cnn()
    model.fit(X_boot, y_boot, epochs=epochs, verbose=0, validation_data=(X_val, y_val))

    ensemble_models.append(model)

print("\n‚úÖ Ensemble training complete!")



Training model 1/5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Training model 2/5

Training model 3/5

Training model 4/5

Training model 5/5

‚úÖ Ensemble training complete!


In [7]:
def ensemble_predict(models_list, X):
    probs = np.zeros((X.shape[0], 10))
    for m in models_list:
        probs += m.predict(X, verbose=0)
    probs /= len(models_list)  # average
    return np.argmax(probs, axis=1)

# Validation predictions
y_val_pred_ens = ensemble_predict(ensemble_models, X_val)
ens_val_acc = accuracy_score(y_val, y_val_pred_ens)

# Test predictions
y_test_pred_ens = ensemble_predict(ensemble_models, X_test)
ens_test_acc = accuracy_score(y_test, y_test_pred_ens)

print("‚úÖ Ensemble Validation Accuracy:", ens_val_acc)
print("‚úÖ Ensemble Test Accuracy:", ens_test_acc)




‚úÖ Ensemble Validation Accuracy: 0.1
‚úÖ Ensemble Test Accuracy: 0.22


In [8]:
single_model = build_cnn()
single_model.fit(X_tr, y_tr, epochs=epochs, verbose=0, validation_data=(X_val, y_val))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


<keras.src.callbacks.history.History at 0x7e7cb4bab8f0>

In [9]:
# Single model predictions
y_val_pred_single = np.argmax(single_model.predict(X_val, verbose=0), axis=1)
single_val_acc = accuracy_score(y_val, y_val_pred_single)

y_test_pred_single = np.argmax(single_model.predict(X_test, verbose=0), axis=1)
single_test_acc = accuracy_score(y_test, y_test_pred_single)

print("‚úÖ Single Model Validation Accuracy:", single_val_acc)
print("‚úÖ Single Model Test Accuracy:", single_test_acc)


‚úÖ Single Model Validation Accuracy: 0.6
‚úÖ Single Model Test Accuracy: 0.4


In [10]:
print("\nüìä Final Comparison")
print(f"Ensemble  - Val Acc: {ens_val_acc:.4f} | Test Acc: {ens_test_acc:.4f}")
print(f"Single    - Val Acc: {single_val_acc:.4f} | Test Acc: {single_test_acc:.4f}")

if ens_test_acc > single_test_acc:
    print("‚úÖ Ensemble performs better on test set.")
elif ens_test_acc < single_test_acc:
    print("‚úÖ Single model performs better on test set.")
else:
    print("‚úÖ Both perform equally on test set.")



üìä Final Comparison
Ensemble  - Val Acc: 0.1000 | Test Acc: 0.2200
Single    - Val Acc: 0.6000 | Test Acc: 0.4000
‚úÖ Single model performs better on test set.
