# Optimizing Neural Network Architecture and Hyperparameters



# Baseline Model Training Example

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error

# ---- Data ---------------------------------------------------------------
# Fetch the California Housing dataset and separate features from the target.
data = fetch_california_housing()
X = data.data
y = data.target

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# ---- Model --------------------------------------------------------------
# Three ReLU hidden layers (128 -> 64 -> 32) with dropout after the first two,
# followed by a single linear unit for the regression output.
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))

# Optimize mean squared error with Adam and also track mean absolute error.
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Stop once validation loss has not improved for 10 epochs and roll back to
# the weights of the best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# ---- Training -----------------------------------------------------------
# 20% of the training split is used for validation; 100 epochs is the upper
# bound, early stopping may end the run sooner.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping],
    verbose=1,
)

# ---- Evaluation ---------------------------------------------------------
# `evaluate` returns the compiled loss followed by the compiled metrics.
loss, mae = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Mean Absolute Error (MAE): {mae:.2f}")

# Recompute the test MSE from explicit predictions.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Test Mean Squared Error (MSE): {mse:.2f}")

# Plot training history
import matplotlib.pyplot as plt

plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Mean Squared Error')
plt.title('Training and Validation Loss')
plt.legend()
plt.show()


# Setup for Neural Architecture Search (NAS)

The following demonstrates the training within an annotated function to support NAS.

In [None]:
from typing import Optional, Annotated, Union
from dataclasses import dataclass, field

import raxpy
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error
 
# Fetch the California Housing dataset once at module level; `train` below
# reads the resulting splits as globals.
data = fetch_california_housing()
X = data.data
y = data.target

# Reserve 20% of the samples as the test set (seeded so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

@dataclass
class Layer:
    """One hidden block of the network: a Dense layer plus optional Dropout.

    The fields carry raxpy annotations so the class can be used as part of an
    annotated function signature.
    """

    # Dropout rate, annotated for raxpy with lb=0.0, ub=0.9.
    # A value of 0.0 (or less) means no Dropout layer is created.
    dropout: Annotated[float, raxpy.Float(lb=0.0, ub=0.9)]
    # Number of units in the Dense layer, annotated for raxpy with lb=8, ub=256.
    neuron_count: Annotated[int, raxpy.Integer(lb=8, ub=256)]
    # NOTE(review): not read by `create_layer`; the activation is always 'relu'.
    activation_type: bool

    def create_layer(self, input_shape):
        """Build the Keras layers for this block.

        Args:
            input_shape: Shape tuple forwarded to the Dense layer's
                `input_shape` argument.

        Returns:
            A list holding the Dense layer, followed by a Dropout layer when
            `dropout` is greater than zero.
        """
        dense = Dense(self.neuron_count, activation='relu', input_shape=input_shape)

        if self.dropout > 0.0:
            # Dropout for regularization
            return [dense, Dropout(self.dropout)]
        return [dense]


@dataclass
class LearningRateScheduleFixed:
    """Learning-rate option that keeps a single constant rate."""

    # Defaults to 0.001; annotated for raxpy with lb=0.0001, ub=0.01.
    learning_rate: Annotated[float, raxpy.Float(lb=0.0001, ub=0.01)] = 0.001

    def create(self):
        """Return the stored learning rate unchanged."""
        rate = self.learning_rate
        return rate


@dataclass
class LearningRateScheduleExponentialDecay:
    """Learning-rate option that builds a Keras `ExponentialDecay` schedule.

    Every field is forwarded unchanged to
    `tf.keras.optimizers.schedules.ExponentialDecay` by `create`.
    """

    # NOTE: field defaults must not end with a trailing comma. `= 1000,` is an
    # annotated assignment of the tuple `(1000,)`, which would then be handed
    # to ExponentialDecay in place of a number.

    # Starting learning rate; annotated for raxpy with lb=0.0001, ub=0.01.
    initial_learning_rate: Annotated[float, raxpy.Float(lb=0.0001, ub=0.01)] = 0.001
    # Decay every 1000 steps by default; annotated for raxpy with lb=100, ub=10000.
    decay_steps: Annotated[int, raxpy.Integer(lb=100, ub=10000)] = 1000
    # Multiply by 0.96 at each decay step by default; raxpy lb=0.5, ub=0.99.
    decay_rate: Annotated[float, raxpy.Float(lb=0.5, ub=0.99)] = 0.96
    # If True, decay in discrete steps.
    staircase: bool = True

    def create(self):
        """Return an `ExponentialDecay` schedule configured from this instance."""
        return tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate=self.initial_learning_rate,
            decay_steps=self.decay_steps,
            decay_rate=self.decay_rate,
            staircase=self.staircase,
        )
    


@dataclass
class OptimizerSGD:
    """Configuration for a Keras SGD optimizer."""

    # Learning-rate option; a fresh LearningRateScheduleFixed is used by default.
    learning_rate_schedule: Union[
        LearningRateScheduleFixed, LearningRateScheduleExponentialDecay
    ] = field(default_factory=LearningRateScheduleFixed)
    # Momentum factor for smoother convergence; raxpy lb=0.5, ub=0.95.
    momentum: Annotated[float, raxpy.Float(lb=0.5, ub=0.95)] = 0.9
    # Nesterov momentum for accelerated learning.
    nesterov: bool = True

    def create(self):
        """Return an `SGD` optimizer built from this configuration.

        The learning rate is whatever the schedule object's `create()` returns.
        """
        learning_rate = self.learning_rate_schedule.create()
        return SGD(
            learning_rate=learning_rate,
            momentum=self.momentum,
            nesterov=self.nesterov,
        )


@dataclass
class OptimizerADAM:
    """Configuration for a Keras Adam optimizer."""

    # Exponential decay rate for the 1st moment (mean of gradients); raxpy lb=0.8, ub=0.99.
    beta_1: Annotated[float, raxpy.Float(lb=0.8, ub=0.99)] = 0.9
    # Exponential decay rate for the 2nd moment (variance of gradients); raxpy lb=0.8, ub=0.9999.
    beta_2: Annotated[float, raxpy.Float(lb=0.8, ub=0.9999)] = 0.999
    # Forwarded to Adam's `amsgrad` argument.
    amsgrad: bool = False
    # Learning-rate option; a fresh LearningRateScheduleFixed is used by default.
    learning_rate_schedule: Union[
        LearningRateScheduleFixed, LearningRateScheduleExponentialDecay
    ] = field(default_factory=LearningRateScheduleFixed)

    def create(self):
        """Return an `Adam` optimizer built from this configuration.

        `epsilon` is fixed at 1e-07 (small constant to prevent division by
        zero); it is not part of the configurable fields.
        """
        learning_rate = self.learning_rate_schedule.create()
        return Adam(
            learning_rate=learning_rate,
            beta_1=self.beta_1,
            beta_2=self.beta_2,
            epsilon=1e-07,
            amsgrad=self.amsgrad,
        )


def _plot_loss_history(history):
    """Plot the per-epoch training and validation loss from a Keras History."""
    import matplotlib.pyplot as plt

    plt.plot(history.history['loss'], label='Train Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Training and Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Mean Squared Error')
    plt.legend()
    plt.show()


def train(
    batch_size:Annotated[int, raxpy.Integer(value_set=[8,16,32,64])],
    scaler_flag:bool,
    layer_1:Layer,
    layer_2:Layer,
    layer_3:Optional[Layer],
    layer_4:Optional[Layer],
    optimizer:Union[OptimizerADAM,OptimizerSGD]
):
    """Train a feed-forward regression network and return its test-set MSE.

    Reads the module-level `X_train`, `X_test`, `y_train` and `y_test` splits.

    Args:
        batch_size: Mini-batch size passed to `model.fit`.
        scaler_flag: True scales features with `StandardScaler`, False with
            `MinMaxScaler`. The scaler is fit on the training split only.
        layer_1: First hidden block (required).
        layer_2: Second hidden block (required).
        layer_3: Optional third hidden block; skipped when None.
        layer_4: Optional fourth hidden block; skipped when None.
        optimizer: Optimizer configuration; its `create()` result is passed to
            `model.compile`.

    Returns:
        The mean squared error on the test split, or `np.inf` when
        scikit-learn rejects the predictions with a ValueError.
    """
    # Scale the features (important for neural networks).
    scaler = StandardScaler() if scaler_flag else MinMaxScaler()
    X_train_t = scaler.fit_transform(X_train)
    X_test_t = scaler.transform(X_test)

    # The first two blocks are mandatory; the last two are included only when given.
    hidden_blocks = [layer_1, layer_2]
    hidden_blocks += [block for block in (layer_3, layer_4) if block is not None]

    # Chain the blocks: each block receives the previous block's unit count
    # as its input shape, starting from the number of features.
    layers = []
    input_shape = (X_train_t.shape[1],)
    for block in hidden_blocks:
        layers.extend(block.create_layer(input_shape))
        input_shape = (block.neuron_count,)
    layers.append(Dense(1))  # Output layer for regression

    # Define and compile the ANN model.
    model = Sequential(layers)
    model.compile(optimizer=optimizer.create(), loss='mse', metrics=['mae'])

    # Stop after 15 epochs without validation-loss improvement and restore the
    # weights from the best epoch.
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=15,
        restore_best_weights=True,
    )

    # Train the model (250 epochs is the upper bound).
    history = model.fit(
        X_train_t,
        y_train,
        validation_split=0.2,
        callbacks=[early_stopping],
        epochs=250,
        batch_size=batch_size,
        verbose=1,
    )

    # Evaluate the model on the test set.
    loss, mae = model.evaluate(X_test_t, y_test, verbose=0)
    print(f"Test Mean Absolute Error (MAE): {mae:.2f}")

    # Predict and calculate the Mean Squared Error. Only the metric call is
    # guarded: scikit-learn raises ValueError for non-finite predictions, and
    # such a run is reported as an infinitely bad score instead of aborting.
    y_pred = model.predict(X_test_t)
    try:
        mse = mean_squared_error(y_test, y_pred)
    except ValueError:
        print("Invalid Test Mean Squared Error (MSE)")
        return np.inf

    print(f"Test Mean Squared Error (MSE): {mse:.2f}")

    # Plotting is best-effort: a plotting problem must neither discard a valid
    # score nor abort the run.
    try:
        _plot_loss_history(history)
    except Exception as exc:
        print(f"Could not plot training history: {exc}")

    return mse


We can use the function defined above like any other Python function. This function trains an artificial neural network.

In [None]:
# Two hidden blocks (256 and 32 units), MinMaxScaler (scaler_flag=False),
# and an Adam optimizer with its default settings.
train(
    batch_size=32,
    scaler_flag=False,
    layer_1=Layer(dropout=0.2, neuron_count=256, activation_type=False),
    layer_2=Layer(dropout=0.5, neuron_count=32, activation_type=False),
    layer_3=None,
    layer_4=None,
    optimizer=OptimizerADAM(),
)

In [None]:
# Three hidden blocks (256, 64 and 32 units; no dropout on the third),
# MinMaxScaler (scaler_flag=False), and an SGD optimizer with its default settings.
train(
    batch_size=32,
    scaler_flag=False,
    layer_1=Layer(dropout=0.2, neuron_count=256, activation_type=False),
    layer_2=Layer(dropout=0.2, neuron_count=64, activation_type=False),
    layer_3=Layer(dropout=0.0, neuron_count=32, activation_type=False),
    layer_4=None,
    optimizer=OptimizerSGD(),
)

We can also use the function to conduct an experiment using a space-filling design to explore the effects of hyperparameters and network architecture settings on the test mean squared error returned by `train`.

In [None]:
# Run a raxpy experiment on `train`, requesting n_points=10, and keep both
# returned values (the inputs used and the corresponding outputs).
inputs, outputs = raxpy.perform_experiment(train,n_points=10)

In [None]:
# Display the outputs returned by the first experiment
# (presumably one `train` return value per point; confirm against raxpy).
outputs

In [None]:
# Inspect the entry at index 2 of the first experiment's inputs.
inputs[2]

In [None]:
# Display all inputs returned by the first experiment.
inputs

In [None]:
# Run a second raxpy experiment with the same arguments (n_points=10),
# storing the results under separate names.
inputs2, outputs2 = raxpy.perform_experiment(train,n_points=10)

In [None]:
# Display all inputs returned by the second experiment.
inputs2

In [None]:
# Display the outputs returned by the second experiment.
outputs2

In [None]:
# Inspect the entry at index 6 of the second experiment's inputs.
inputs2[6]

## Hyperopt Example

The following demonstrates using raxpy with HyperOpt.

In [None]:
import raxpy.adapters.hyperopt as rhp
from hyperopt import fmin, tpe

In [None]:
def optimize(
    init_sampling_points: Annotated[int, raxpy.Integer(lb=0, ub=50)],
    designer,
    max_points=100,
    f=train,
):
    """Minimize `f` with hyperopt's TPE, optionally seeded with a raxpy design.

    Args:
        init_sampling_points: Number of seed points to request from
            `designer`; when 0 (or less) no seed points are generated.
        designer: Callable invoked as `designer(input_space, init_sampling_points)`
            whose result is converted with `rhp.convert_design`.
        max_points: Passed to `fmin` as `max_evals`.
        f: Annotated function to optimize; defaults to `train`.

    Returns:
        A `(best, fn)` tuple: the value returned by `fmin` and the objective
        produced by `rhp.convert_to_hp(f)`.
    """
    seed_points = []
    if init_sampling_points > 0:
        # Build the seed design from the input space derived from f's signature.
        search_space = raxpy.function_spec.extract_input_space(f)
        seed_design = designer(search_space, init_sampling_points)
        seed_points = rhp.convert_design(seed_design)

    # Translate the annotated function into a hyperopt space and objective.
    hp_space, objective = rhp.convert_to_hp(f)

    best = fmin(
        fn=objective,
        space=hp_space,
        algo=tpe.suggest,
        max_evals=max_points,
        points_to_evaluate=seed_points,
    )
    return best, objective

In [None]:
# Seed the search with 10 points from raxpy's design generator and pass the
# sum of both counts to `optimize` as `max_points`.
explore_point_count = 10
exploit_point_count = 10
best_point, hp_f = optimize(
    init_sampling_points=explore_point_count,
    designer=raxpy.generate_design,
    max_points=explore_point_count + exploit_point_count,
    f=train,
)

In [None]:

def optmize_f(
    init_sample_points:Annotated[int, raxpy.Integer(lb=10, ub=100)],
    opt_points:Annotated[int, raxpy.Integer(lb=5, ub=100)],
):
    """Compare three ways of seeding the hyperopt search over `train`.

    Runs `optimize` three times: seeded with `raxpy.generate_design`, seeded
    with `raxpy.generate_random_design`, and unseeded. The best point of each
    run is then evaluated once more with the objective returned by `optimize`.

    Args:
        init_sample_points: Number of seed points for the two seeded runs.
            Annotated as an integer because it is used as a point count.
        opt_points: Value passed as `max_points` for the two seeded runs.

    Returns:
        A tuple `(p1_b, p2_b, p3_b)` with the objective value at the best
        point of the design-seeded, random-seeded and unseeded runs.
    """
    # NOTE(review): the seeded runs pass `opt_points` as max_points while the
    # unseeded run passes the sum. Whether the three runs get equal budgets
    # depends on whether hyperopt counts points_to_evaluate toward max_evals;
    # confirm against the hyperopt version in use.
    max_points = init_sample_points + opt_points

    point_1, fn = optimize(
        init_sampling_points=init_sample_points,
        designer=raxpy.generate_design,
        f=train,
        max_points=opt_points,
    )
    point_2, fn = optimize(
        init_sampling_points=init_sample_points,
        designer=raxpy.generate_random_design,
        f=train,
        max_points=opt_points,
    )
    point_3, fn = optimize(
        init_sampling_points=0,
        designer=raxpy.generate_random_design,
        f=train,
        max_points=max_points,
    )

    # Every run converts the same function (`train`), so the objective from
    # the last run is reused to score all three best points.
    p1_b = fn(point_1)
    p2_b = fn(point_2)
    p3_b = fn(point_3)

    return p1_b, p2_b, p3_b