In [3]:
"""
Programming Exercise 3: Unsupervised Pretraining
MSDS-534-B01: Deep Learning
Pranav Arora
August 10, 2025

Implementation of Greedy Layer-wise Unsupervised Pretraining Protocol
Based on concepts from Goodfellow et al. (2016), Deep Learning, Section 15.1

Note: Fixed recompilation issue during training by splitting into two phases
"""

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model
from sklearn.preprocessing import StandardScaler

def create_autoencoder_layer(input_dim, encoding_dim, activation='relu'):
    """
    Creates a single autoencoder for pretraining one layer
    """
    # encoder part
    encoder_input = layers.Input(shape=(input_dim,))
    encoded = layers.Dense(encoding_dim, activation=activation)(encoder_input)

    # decoder part
    decoded = layers.Dense(input_dim, activation='sigmoid')(encoded)

    # full autoencoder model
    autoencoder = Model(encoder_input, decoded)

    # separate encoder model for later use
    encoder = Model(encoder_input, encoded)

    return autoencoder, encoder

def greedy_layerwise_pretraining(X_train, layer_dims, epochs_per_layer=50, batch_size=32):
    """
    Implements greedy layer-wise unsupervised pretraining protocol

    Parameters:
    -----------
    X_train : numpy array
        Raw input training data
    layer_dims : list
        List of dimensions for each hidden layer
    epochs_per_layer : int
        Number of epochs to train each layer
    batch_size : int
        Batch size for training

    Returns:
    --------
    pretrained_model : keras Model
        The pretrained deep network
    layer_weights : list
        List of pretrained weights for each layer
    X_normalized : numpy array
        Normalized training data (needed for fine-tuning)
    """

    print("Starting Greedy Layer-wise Unsupervised Pretraining...")
    print(f"Network architecture: Input({X_train.shape[1]}) -> {' -> '.join(map(str, layer_dims))}")

    # normalize input data (important for stable training)
    scaler = StandardScaler()
    X_normalized = scaler.fit_transform(X_train)

    # store pretrained weights for each layer
    pretrained_weights = []
    pretrained_biases = []

    # current input for training (starts with raw data)
    current_input = X_normalized
    input_dim = X_train.shape[1]

    # Step 1: Greedy layer-wise pretraining
    for layer_idx, hidden_dim in enumerate(layer_dims):
        print(f"\n--- Pretraining Layer {layer_idx + 1} ---")
        print(f"Input dim: {input_dim}, Hidden dim: {hidden_dim}")

        # create autoencoder for this layer
        autoencoder, encoder = create_autoencoder_layer(input_dim, hidden_dim)

        # compile with reconstruction loss
        autoencoder.compile(
            optimizer='adam',
            loss='mse'  # reconstruction error
        )

        # train this layer's autoencoder
        history = autoencoder.fit(
            current_input, current_input,  # unsupervised: input=target
            epochs=epochs_per_layer,
            batch_size=batch_size,
            verbose=0,
            validation_split=0.1
        )

        # extract and save the encoder weights (what we actually want)
        encoder_weights = encoder.layers[1].get_weights()
        pretrained_weights.append(encoder_weights[0])  # weight matrix
        pretrained_biases.append(encoder_weights[1])   # bias vector

        print(f"Layer {layer_idx + 1} pretraining complete. Final loss: {history.history['loss'][-1]:.4f}")

        # transform current input through the encoder for next layer
        current_input = encoder.predict(current_input)
        input_dim = hidden_dim

    # Step 2: Build the full deep network with pretrained weights
    print("\n--- Building Deep Network with Pretrained Weights ---")

    # create the deep network
    model_input = layers.Input(shape=(X_train.shape[1],))
    x = model_input

    # add each layer with pretrained weights
    for layer_idx, hidden_dim in enumerate(layer_dims):
        layer = layers.Dense(
            hidden_dim,
            activation='relu',
            name=f'pretrained_layer_{layer_idx + 1}'
        )
        x = layer(x)

        # set the pretrained weights
        layer.set_weights([pretrained_weights[layer_idx], pretrained_biases[layer_idx]])

    # create the pretrained model (no output layer yet - depends on task)
    pretrained_model = Model(model_input, x, name='pretrained_deep_network')

    return pretrained_model, (pretrained_weights, pretrained_biases), X_normalized

def add_supervised_output_layer(pretrained_model, num_classes, freeze_pretrained=True):
    """
    Adds supervised output layer for fine-tuning

    Parameters:
    -----------
    pretrained_model : keras Model
        The pretrained network
    num_classes : int
        Number of output classes (or 1 for regression)
    freeze_pretrained : bool
        Whether to freeze pretrained layers during initial fine-tuning
    """

    # freeze pretrained layers if specified
    if freeze_pretrained:
        for layer in pretrained_model.layers:
            layer.trainable = False

    # add output layer for supervised task
    output = layers.Dense(num_classes, activation='softmax')(pretrained_model.output)

    # create final model
    final_model = Model(pretrained_model.input, output)

    return final_model

def fine_tune_network(model, X_train, y_train, epochs=50, unfreeze_after=25):
    """
    Fine-tunes the entire network on supervised task

    Parameters:
    -----------
    model : keras Model
        Model with pretrained weights and output layer
    X_train : numpy array
        Training data
    y_train : numpy array
        Training labels
    epochs : int
        Total epochs for fine-tuning
    unfreeze_after : int
        Epoch after which to unfreeze all layers
    """

    print("\n--- Fine-tuning on Supervised Task ---")

    # Phase 1: Train with frozen pretrained layers
    print("Phase 1: Training output layer only (frozen pretrained layers)")
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    history1 = model.fit(
        X_train, y_train,
        epochs=unfreeze_after,
        batch_size=32,
        validation_split=0.2,
        verbose=1
    )

    # Phase 2: Unfreeze and train all layers with lower learning rate
    print(f"\nPhase 2: Unfreezing all layers and fine-tuning entire network")
    for layer in model.layers:
        layer.trainable = True

    # recompile with lower learning rate for fine-tuning
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.0001),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    history2 = model.fit(
        X_train, y_train,
        epochs=epochs - unfreeze_after,
        batch_size=32,
        validation_split=0.2,
        verbose=1
    )

    # combine histories
    history = {
        'loss': history1.history['loss'] + history2.history['loss'],
        'accuracy': history1.history['accuracy'] + history2.history['accuracy'],
        'val_loss': history1.history['val_loss'] + history2.history['val_loss'],
        'val_accuracy': history1.history['val_accuracy'] + history2.history['val_accuracy']
    }

    # create a simple object to hold the combined history
    class CombinedHistory:
        def __init__(self, history_dict):
            self.history = history_dict

    return model, CombinedHistory(history)

# Alternative implementation using learning rate scheduler (more stable)
def fine_tune_network_with_scheduler(model, X_train, y_train, epochs=50, unfreeze_after=25):
    """
    Alternative fine-tuning with learning rate scheduler (avoids recompilation issues)
    """
    print("\n--- Fine-tuning on Supervised Task (with LR scheduler) ---")

    # unfreeze layers based on epochs
    def selective_unfreeze(epoch):
        if epoch == unfreeze_after:
            print(f"\nUnfreezing all layers at epoch {epoch}")
            for layer in model.layers:
                layer.trainable = True

    # learning rate schedule
    def lr_schedule(epoch):
        if epoch < unfreeze_after:
            return 0.001  # higher lr for output layer only
        else:
            return 0.0001  # lower lr for full network fine-tuning

    # callbacks
    lr_scheduler = keras.callbacks.LearningRateScheduler(lr_schedule)
    unfreeze_callback = keras.callbacks.LambdaCallback(
        on_epoch_begin=lambda epoch, logs: selective_unfreeze(epoch)
    )

    # compile once
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    # train with callbacks
    history = model.fit(
        X_train, y_train,
        epochs=epochs,
        batch_size=32,
        validation_split=0.2,
        callbacks=[lr_scheduler, unfreeze_callback],
        verbose=1
    )

    return model, history

# ============ MAIN EXECUTION EXAMPLE ============

if __name__ == "__main__":
    # simulate some data
    np.random.seed(42)
    n_samples = 1000
    n_features = 784  # like MNIST
    n_classes = 10

    # generate dummy data
    X_train = np.random.randn(n_samples, n_features)
    y_train = np.random.randint(0, n_classes, n_samples)

    # define architecture for pretraining
    hidden_layers = [512, 256, 128]  # progressively smaller layers

    # Step 1: Unsupervised pretraining (returns normalized data too)
    pretrained_model, weights, X_train_normalized = greedy_layerwise_pretraining(
        X_train,
        hidden_layers,
        epochs_per_layer=30
    )

    print("\nPretrained model summary:")
    pretrained_model.summary()

    # Step 2: Add supervised output layer
    final_model = add_supervised_output_layer(
        pretrained_model,
        num_classes=n_classes,
        freeze_pretrained=True
    )

    # Step 3: Fine-tune on supervised task
    # Use the two-phase approach (more stable than recompiling mid-training)
    trained_model, history = fine_tune_network(
        final_model,
        X_train_normalized,  # use normalized data
        y_train,
        epochs=50,
        unfreeze_after=25
    )

    # Alternative: use scheduler version if you prefer single training run
    # trained_model, history = fine_tune_network_with_scheduler(
    #     final_model,
    #     X_train_normalized,  # use normalized data
    #     y_train,
    #     epochs=50,
    #     unfreeze_after=25
    # )

    print("\n--- Pretraining Protocol Complete ---")
    print(f"Final training accuracy: {history.history['accuracy'][-1]:.4f}")
    print(f"Final validation accuracy: {history.history['val_accuracy'][-1]:.4f}")


Starting Greedy Layer-wise Unsupervised Pretraining...
Network architecture: Input(784) -> 512 -> 256 -> 128

--- Pretraining Layer 1 ---
Input dim: 784, Hidden dim: 512
Layer 1 pretraining complete. Final loss: 0.6224
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

--- Pretraining Layer 2 ---
Input dim: 512, Hidden dim: 256
Layer 2 pretraining complete. Final loss: 5.9051
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

--- Pretraining Layer 3 ---
Input dim: 256, Hidden dim: 128
Layer 3 pretraining complete. Final loss: 33.6777
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 

--- Building Deep Network with Pretrained Weights ---

Pretrained model summary:



--- Fine-tuning on Supervised Task ---
Phase 1: Training output layer only (frozen pretrained layers)
Epoch 1/25
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.1029 - loss: 19.2068 - val_accuracy: 0.0850 - val_loss: 6.9858
Epoch 2/25
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.1153 - loss: 6.6591 - val_accuracy: 0.0800 - val_loss: 4.0946
Epoch 3/25
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.1141 - loss: 4.2997 - val_accuracy: 0.0800 - val_loss: 3.8721
Epoch 4/25
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.0990 - loss: 4.1426 - val_accuracy: 0.0850 - val_loss: 3.7541
Epoch 5/25
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.1070 - loss: 4.2361 - val_accuracy: 0.0850 - val_loss: 3.7305
Epoch 6/25
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accura

'\nPSEUDO-CODE VERSION:\n\nfunction GreedyLayerwisePretraining(X_train, layer_dimensions):\n    X_normalized = Normalize(X_train)  # standardize input\n    pretrained_weights = []\n    current_input = X_normalized\n    \n    # Phase 1: Layer-wise pretraining\n    for each layer_dim in layer_dimensions:\n        # Create autoencoder for current layer\n        autoencoder = CreateAutoencoder(input_dim=current_input.shape, \n                                       hidden_dim=layer_dim)\n        \n        # Train autoencoder unsupervised (reconstruction task)\n        autoencoder.train(input=current_input, target=current_input)\n        \n        # Extract encoder portion and weights\n        encoder = autoencoder.get_encoder()\n        weights = encoder.get_weights()\n        pretrained_weights.append(weights)\n        \n        # Transform data through encoder for next layer\n        current_input = encoder.transform(current_input)\n    \n    # Phase 2: Build deep network\n    deep_networ