# CNN model

## Notebook set-up

In [None]:
# Standard library imports
import json
import pickle
import time
import types
from pathlib import Path

# Third party imports
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from matplotlib.ticker import FormatStrFormatter

# Project imports
from ariel_data_preprocessing.data_generator_functions import make_training_datasets

Working directory: /mnt/arkk/kaggle/ariel-data-challenge


## 1. Initialize data generators

In [None]:
# Dataset settings. sample_size is reused further down as the first dimension
# of the model's input shape.
sample_size = 274
smoothing_window = 160

# Keyword arguments for the project's dataset factory. No data_file is passed;
# the override that was used previously is kept here for reference:
#data_file=f'{config.PROCESSED_DATA_DIRECTORY}/train-1100_smoothing-10-20-40-80-160.h5'
dataset_options = {
    'sample_size': sample_size,
    'smoothing_window': smoothing_window,
    'standardize_wavelengths': True,
}

# The factory returns the training, validation and evaluation datasets, in that order
training_dataset, validation_dataset, evaluation_dataset = make_training_datasets(
    **dataset_options
)

## 2. Model definition

In [None]:
def compile_model(
        sample_size: int,
        **hyperparameters
) -> tf.keras.Model:

    '''Builds and compiles the convolutional neural network regression model.

    The network is four Conv2D + MaxPooling2D stages, a flatten, one
    regularized ReLU dense layer and a 283-unit linear output layer.

    Args:
        sample_size: first dimension of the model input; inputs have shape
            (sample_size, 283, 1).
        **hyperparameters: must supply l1, l2, the first/second/third/fourth
            `*_filter_set` and `*_filter_size` values, dense_units,
            learning_rate, beta_one, beta_two, amsgrad, weight_decay and
            use_ema.

    Returns:
        The compiled tf.keras.Sequential model (Adam optimizer, mean squared
        error loss, RMSE metric).
    '''

    # Attribute-style access to the hyperparameter values
    hp = types.SimpleNamespace(**hyperparameters)

    # L1L2 penalty, applied to the kernel of the hidden dense layer only
    dense_penalty = tf.keras.regularizers.L1L2(l1=hp.l1, l2=hp.l2)

    # Assemble the layer stack in order, starting from the input
    stack = [tf.keras.layers.Input((sample_size, 283, 1))]

    # Four identical convolution + pooling stages; only the number of filters
    # and the kernel size differ from stage to stage
    for ordinal in ('first', 'second', 'third', 'fourth'):
        stack.append(
            tf.keras.layers.Conv2D(
                getattr(hp, f'{ordinal}_filter_set'),
                getattr(hp, f'{ordinal}_filter_size'),
                padding='same',
                kernel_initializer='he_uniform',
                bias_initializer=tf.keras.initializers.Constant(0.1),
                activation='relu',
            )
        )
        stack.append(tf.keras.layers.MaxPooling2D())

    # Regression head: flatten, hidden dense layer, linear output
    stack.append(tf.keras.layers.Flatten())
    stack.append(
        tf.keras.layers.Dense(
            hp.dense_units,
            kernel_regularizer=dense_penalty,
            kernel_initializer='he_uniform',
            bias_initializer=tf.keras.initializers.Constant(0.1),
            activation='relu',
        )
    )
    stack.append(
        tf.keras.layers.Dense(
            283,
            kernel_initializer=tf.keras.initializers.GlorotNormal(),
            bias_initializer=tf.keras.initializers.Constant(0.014689),
            activation='linear'
        )
    )

    model = tf.keras.Sequential(stack)

    # Adam optimizer, fully configured from the hyperparameters
    adam = tf.keras.optimizers.Adam(
        learning_rate=hp.learning_rate,
        beta_1=hp.beta_one,
        beta_2=hp.beta_two,
        amsgrad=hp.amsgrad,
        weight_decay=hp.weight_decay,
        use_ema=hp.use_ema
    )

    # Train on mean squared error and additionally report RMSE
    model.compile(
        optimizer=adam,
        loss=tf.keras.losses.MeanSquaredError(name='MSE'),
        metrics=[tf.keras.metrics.RootMeanSquaredError(name='RMSE')]
    )

    return model

In [10]:
# Hyperparameter values passed to compile_model(), grouped by what they configure
hyperparameters = {

    # Adam optimizer
    'learning_rate': 0.00032893709671884643,
    'beta_one': 0.5394897637095215,
    'beta_two': 0.6238511299185701,
    'amsgrad': False,
    'weight_decay': 0.016349602205981664,
    'use_ema': False,

    # L1L2 regularization of the hidden dense layer
    'l1': 0.5023691865516101,
    'l2': 0.722432138551213,

    # Number of filters in each convolutional layer
    'first_filter_set': 52,
    'second_filter_set': 60,
    'third_filter_set': 47,
    'fourth_filter_set': 47,

    # Kernel size of each convolutional layer
    'first_filter_size': 2,
    'second_filter_size': 5,
    'third_filter_size': 5,
    'fourth_filter_size': 6,

    # Width of the hidden dense layer
    'dense_units': 32,
}

# Build the compiled model and print its layer summary
model = compile_model(sample_size=sample_size, **hyperparameters)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 274, 283, 52)      260       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 137, 141, 52)     0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 137, 141, 60)      78060     
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 68, 70, 60)       0         
 2D)                                                             
                                                                 
 conv2d_2 (Conv2D)           (None, 68, 70, 47)        70547     
                                                                 
 max_pooling2d_2 (MaxPooling  (None, 34, 35, 47)       0

## 4. Single training run

In [None]:
# Training schedule
epochs = 500
batch_size = 4
steps = 55  # Batches per epoch

# Note: it takes 5 epochs to see every planet in the training set once

# steps * epochs * batch_size, in thousands; used in the log messages below
total_ksteps = (steps * epochs * batch_size) // 1000

# Number of trainable parameters, in thousands
n_params = np.sum([tf.keras.backend.count_params(p) for p in model.trainable_weights]) / 1000

# NOTE(review): model_save_file and training_results_save_file are not assigned
# anywhere in this cell; they must already be defined when it runs. The
# previously used model path is kept here for reference:
#model_save_file = f'{config.MODELS_DIRECTORY}/ariel_cnn-{n_params}k-{total_ksteps}ksteps-tf2.11.keras'

if Path(model_save_file).exists() and Path(training_results_save_file).exists():

    print(f'Found existing model for {total_ksteps} ksteps, skipping training.')

    # Load the existing model
    model = tf.keras.models.load_model(model_save_file)

    # Load existing training results
    with open(training_results_save_file, 'rb') as input_file:
        training_results = pickle.load(input_file)

else:

    print(f'Training model for {total_ksteps} ksteps')

    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=model_save_file,   # Path to save the best model
        monitor='val_RMSE',         # Validation RMSE, as named in compile_model()
        save_best_only=True,        # Only save the model if the monitored metric improves
        mode='min',                 # Lower RMSE is better
        verbose=1                   # Display messages when a new best model is saved
    )

    start_time = time.time()

    training_results = model.fit(
        training_dataset.batch(batch_size),
        validation_data=validation_dataset.batch(batch_size),
        epochs=epochs,
        steps_per_epoch=steps,
        validation_steps=100 // batch_size,  # Evaluate on 100 planets
        callbacks=[checkpoint_callback],
        verbose=1
    )

    print(f'Training complete in {(time.time() - start_time)/60:.1f} minutes')

    # Do not call model.save(model_save_file) here: that would overwrite the
    # best checkpoint written by the callback with the final-epoch weights.
    # Reload the best checkpoint instead, so that the in-memory model matches
    # what the "existing model" branch above loads on a later run.
    model = tf.keras.models.load_model(model_save_file)

    # NOTE(review): this pickles the whole History object returned by fit();
    # the plotting cell below only reads its .history dictionary.
    with open(training_results_save_file, 'wb') as output_file:
        pickle.dump(training_results, output_file)

Training model for 110 ksteps
Epoch 1/500

In [None]:
# Set-up a 1x2 figure: mean squared error loss on the left, RMSE on the right
fig, axs = plt.subplots(1, 2, figsize=(12, 4))

# Add the main title
fig.suptitle('CNN training curves', size='large')

# Plot training and validation loss
axs[0].set_title('Training loss (mean squared error)')
axs[0].plot(np.array(training_results.history['loss']), alpha=0.5, label='Training')
axs[0].plot(np.array(training_results.history['val_loss']), alpha=0.5, label='Validation')
axs[0].set_xlabel('Epoch')
axs[0].set_ylabel('loss')
axs[0].legend(loc='upper right')

# Plot training and validation RMSE on a log scale
axs[1].set_title('Root mean squared error')
axs[1].plot(training_results.history['RMSE'], alpha=0.5, label='Training')
axs[1].plot(training_results.history['val_RMSE'], alpha=0.5, label='Validation')
axs[1].set_xlabel('Epoch')
axs[1].set_ylabel('RMSE')
axs[1].set_yscale('log')

# The curves carry labels, so show them here as on the loss panel
axs[1].legend(loc='upper right')

# Tighten the layout and save the figure to disk
# NOTE(review): figures_dir and config are not defined in this cell, and the
# '8.4M' in the file name is hard-coded rather than derived from the model.
fig.tight_layout()
fig.savefig(
    f'{figures_dir}/03.2.4-ariel_cnn_training_curves_8.4M-{total_ksteps}ksteps.jpg',
    dpi=config.STD_FIG_DPI,
    bbox_inches='tight'
)

### 4.1. Upload model to Kaggle

In [None]:
# Notebook shell command: fetch the metadata of the gperdrizet/ariel-cnn
# dataset, using ./model as the target path
!kaggle datasets metadata gperdrizet/ariel-cnn -p ./model

# NOTE(review): the file content is decoded twice (json.load, then json.loads).
# That only works if the file holds a JSON-encoded string rather than a JSON
# object - confirm this matches what the kaggle CLI actually writes.
with open('model/dataset-metadata.json', 'r') as f:
    data = json.loads(json.load(f))

# Set the dataset id explicitly before re-uploading
data['id'] = 'gperdrizet/ariel-cnn'

# Write the metadata back; after this the file holds a plain JSON object
with open('model/dataset-metadata.json', "w") as f:
    json.dump(data, f)

# Notebook shell command: publish the contents of ./model/ as a new dataset version
!kaggle datasets version -p ./model/ -m "Updated ariel CNN"

## 5. Model evaluation (validation set)

In [None]:
samples = 10
planets = 550

# Limit the evaluation to the first `planets` elements of the dataset
evaluation_data = evaluation_dataset.take(planets)

# Collect signals and spectra in a single pass over the dataset. Iterating it
# once per array runs the input pipeline twice, and the two arrays can fall out
# of step if the pipeline is not deterministic across passes.
signals = []
spectra = []

for element in evaluation_data:
    signals.append(element[0].numpy())
    spectra.append(element[1].numpy())

signals = np.array(signals)
spectra = np.array(spectra)

print(f'Signals shape: {signals.shape}')
print(f'Spectra shape: {spectra.shape}')

### 5.1. Predictions

In [None]:
# np.save appends '.npy' to any path that does not already end with it, so the
# cache path has to carry the extension itself. Without it the file is written
# as 'predictions.npy' while is_file() and np.load look for 'predictions', and
# the cached predictions are never found.
predictions_file = '/kaggle/working/predictions.npy'

if Path(predictions_file).is_file():

    # Reuse predictions cached by an earlier run
    # NOTE(review): the cache is not tied to the current model or evaluation
    # sample - delete the file after retraining.
    spectrum_predictions = np.load(predictions_file)

else:

    # Run the model on each planet's stack of samples separately, so that the
    # first axis of the result indexes planets
    spectrum_predictions = np.array([
        model.predict(planet, batch_size=10, verbose=0)
        for planet in signals
    ])

    np.save(predictions_file, spectrum_predictions)

# Mean and spread of the predictions over axis 1 (the samples of each planet)
spectrum_predictions_avg = np.mean(spectrum_predictions, axis=1)
spectrum_predictions_std = np.std(spectrum_predictions, axis=1)

# Use the spectrum of each planet's first sample as its reference
# (assumes all samples of a planet share the same target spectrum - TODO confirm)
reference_spectra = spectra[:,0,:]

print(f'Spectrum predictions shape: {spectrum_predictions.shape}')
print(f'Spectrum predictions avg shape: {spectrum_predictions_avg.shape}')
print(f'Spectrum predictions std shape: {spectrum_predictions_std.shape}')

### 5.2. Plot

In [None]:
# Flatten every array once, so all panels plot matching 1-D inputs
true_flat = spectra.flatten()
predicted_flat = spectrum_predictions.flatten()
reference_flat = reference_spectra.flatten()
predicted_avg_flat = spectrum_predictions_avg.flatten()
predicted_std_flat = spectrum_predictions_std.flatten()

# Residuals of the individual sample predictions and of the per-planet average
residuals = predicted_flat - true_flat
avg_residual = predicted_avg_flat - reference_flat

# 2x2 grid of diagnostic panels, addressed as axs[0] .. axs[3]
fig, axs = plt.subplots(2, 2, figsize=(10,10))
axs = axs.flatten()

fig.suptitle('Spectral prediction evaluation')

# Panel 0: predicted vs true values
axs[0].set_title('Predicted vs true spectral signals')
axs[0].scatter(
    true_flat,
    predicted_flat,
    s=10,
    alpha=0.5,
    color='black',
    label='Sample predictions'
)

axs[0].scatter(
    reference_flat,
    predicted_avg_flat,
    s=2.5,
    alpha=0.5,
    color='red',
    label='Averaged prediction'
)

axs[0].set_xlim(0,0.1)
axs[0].set_ylim(0,0.1)
axs[0].set_aspect('equal')
axs[0].set_xlabel('True spectral signal')
axs[0].set_ylabel('Predicted spectral signal')
axs[0].legend(loc='best', markerscale=2)

# Panel 1: residuals against the true value
axs[1].set_title('Prediction residuals')
axs[1].scatter(
    true_flat,
    residuals,
    s=10,
    alpha=0.5,
    color='black',
    label='Sample predictions'
)

axs[1].scatter(
    reference_flat,
    avg_residual,
    s=2.5,
    alpha=0.5,
    color='red',
    label='Averaged prediction'
)

axs[1].set_xlabel('True spectral signal')
axs[1].set_ylabel('Prediction residuals')

# The two series carry labels, so show them here as on the first panel
axs[1].legend(loc='best', markerscale=2)

# Panel 2: residual of the averaged prediction vs spread across samples
axs[2].set_title('Mean fit residual vs sample sigma')
axs[2].scatter(avg_residual, predicted_std_flat, color='black', alpha=0.5, s=2.5)
axs[2].set_xlabel('Fit residual')
axs[2].set_ylabel('Standard deviation')

# Panel 3: distribution of the spread across samples
axs[3].set_title('Standard deviation of predictions')
axs[3].hist(predicted_std_flat, bins=100, color='black')
axs[3].xaxis.set_major_formatter(FormatStrFormatter('%.2e'))
axs[3].tick_params(axis='x', labelrotation=45)
axs[3].set_xlabel('Standard deviation')
axs[3].set_ylabel('Counts')

fig.tight_layout()

# Use pyplot's show(): Figure.show() is meant for GUI backends and only emits
# a warning on non-GUI ones
plt.show()