# Train model for upload to Kaggle

## Notebook set up

In [None]:
# Set notebook root to project root
from helper_functions import set_project_root

# Silence tensorflow, except for errors.
# The variable is set here, ahead of the tensorflow import further down.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Run on the GTX1080 GPU - fastest single worker/small memory performance.
# Only CUDA device index 0 is made visible to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Called before the local `configuration` import below; presumably it makes
# project-level modules resolvable - confirm against helper_functions.
set_project_root()

# Standard library imports
import pickle
import random
import time
from functools import partial
from pathlib import Path

# Third party imports
import h5py
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import optuna
import tensorflow as tf
from scipy.interpolate import griddata

# Local imports
import configuration as config

# Output locations: training figures and the Kaggle model artifacts.
# Both directories are created up front if they do not exist yet.
figures_dir = f'{config.FIGURES_DIRECTORY}/model_training'

for output_dir in (figures_dir, f'{config.MODELS_DIRECTORY}/kaggle'):
    Path(output_dir).mkdir(parents=True, exist_ok=True)

# Hyperparameters: best settings from ~400 Optuna optimization trials
# (see model_training/optimize_cnn.py)

# -- data sampling and batching
sample_size = 883
batch_size = 2
steps = 133

# -- network architecture
cnn_layers = 3
first_filter_set = 88
second_filter_set = 52
third_filter_set = 31
first_filter_size = 2
second_filter_size = 4
third_filter_size = 3
dense_units = 52

# -- dense layer regularization
l_one = 0.08483065154593984
l_two = 0.030378903662206216

# -- Adam optimizer
learning_rate = 0.0010050627801302561
beta_one = 0.7226139295714885
beta_two = 0.9265501644462477
amsgrad = True
weight_decay = 0.01645540788238239
use_ema = True

# Long training run
epochs = 1000

# Total number of optimizer steps, in thousands; used to tag output files
total_ksteps = (epochs * steps) // 1000

# The trained model and its training results share one file stem
model_save_file = f'{config.MODELS_DIRECTORY}/kaggle/ariel_cnn-6.3M-{total_ksteps}ksteps.keras'
training_results_save_file = f'{config.MODELS_DIRECTORY}/kaggle/ariel_cnn-6.3M-{total_ksteps}ksteps.pkl'

## 1. Data preparation

### 1.1. Load planet list

In [None]:
# Collect the planet IDs: they are the top-level keys of the training HDF5 file
training_data_file = f'{config.PROCESSED_DATA_DIRECTORY}/train.h5'

with h5py.File(training_data_file, 'r') as hdf:
    planet_ids = list(hdf.keys())

print(f'Found {len(planet_ids)} planets in training data.')

### 1.2. Data loader function

In [None]:
def data_loader(planet_ids: list, data_file: str, sample_size: int = 100):
    '''Generator that yields signal, spectrum pairs for training/validation/testing.

    Loops over the planets forever, reshuffling the planet order at the start
    of every pass. For each planet a random subset of frames is drawn without
    replacement and returned in its original frame order.

    Args:
        planet_ids (list): List of planet IDs to include in the generator.
            The list is copied, so the caller's list is never reordered.
        data_file (str): Path to the HDF5 file containing the data.
        sample_size (int, optional): Number of frames to draw from each planet. Defaults to 100.

    Yields:
        tuple: (sample, spectrum) where sample holds the `sample_size` selected
            rows of the planet's 'signal' dataset and spectrum is the planet's
            full 'spectrum' dataset.

    Raises:
        ValueError: If `planet_ids` is empty, or if `sample_size` is larger
            than the number of frames available for a planet.
    '''

    # Shuffle a private copy: np.random.shuffle works in place and would
    # otherwise silently reorder the list owned by the caller
    shuffled_ids = list(planet_ids)

    # Without this guard an empty list would make the loop below spin forever
    # without ever yielding
    if not shuffled_ids:
        raise ValueError('planet_ids is empty: nothing to load')

    with h5py.File(data_file, 'r') as hdf:

        while True:
            np.random.shuffle(shuffled_ids)

            for planet_id in shuffled_ids:

                signal = hdf[planet_id]['signal'][:]
                spectrum = hdf[planet_id]['spectrum'][:]

                frame_count = signal.shape[0]

                # random.sample raises the same exception type, but without
                # saying which planet was too short
                if sample_size > frame_count:
                    raise ValueError(
                        f'sample_size={sample_size} exceeds the {frame_count} '
                        f'frames available for planet {planet_id}'
                    )

                # Sorted so the sampled frames keep their original order
                indices = sorted(random.sample(range(frame_count), sample_size))
                sample = signal[indices, :]

                yield sample, spectrum

### 1.3. Prefill the arguments to `data_loader()`

In [None]:
# Bind every argument of data_loader() up front, so the resulting callable
# can be invoked with no arguments
loader_arguments = {
    'planet_ids': planet_ids,
    'data_file': f'{config.PROCESSED_DATA_DIRECTORY}/train.h5',
    'sample_size': sample_size,
}

training_data_generator = partial(data_loader, **loader_arguments)

### 1.4. Create TF dataset

In [None]:
# Wrap the generator in a TF dataset. Each element is a (signal sample,
# spectrum) pair with the shapes and dtypes declared below.
training_dataset = tf.data.Dataset.from_generator(
    training_data_generator,
    output_signature=(
        tf.TensorSpec(shape=(sample_size, config.WAVELENGTHS), dtype=tf.float64),
        # One-element tuple: `(config.WAVELENGTHS)` without the trailing
        # comma is just a parenthesised int, not a shape tuple
        tf.TensorSpec(shape=(config.WAVELENGTHS,), dtype=tf.float64)
    )
)

## 2. CNN model

### 2.1. Model definition

In [None]:
def compile_model(
        samples: int=sample_size,
        wavelengths: int=config.WAVELENGTHS,
        learning_rate: float=learning_rate,
        l1: float=l_one,
        l2: float=l_two,
        first_filter_set: int=first_filter_set,
        second_filter_set: int=second_filter_set,
        third_filter_set: int=third_filter_set,
        first_filter_size: int=first_filter_size,
        second_filter_size: int=second_filter_size,
        third_filter_size: int=third_filter_size,
        dense_units: int=dense_units,
        beta_one: float=beta_one,
        beta_two: float=beta_two,
        amsgrad: bool=amsgrad,
        weight_decay: float=weight_decay,
        use_ema: bool=use_ema
) -> tf.keras.Model:

    '''Builds and compiles the convolutional neural network regression model.

    The network is three Conv2D + MaxPooling2D stages, a Flatten layer, one
    L1L2-regularized dense layer and a linear output layer with one unit per
    wavelength. All defaults are the module-level values in effect when this
    function was defined.

    Args:
        samples (int): Number of frames per input sample (input height).
        wavelengths (int): Number of wavelengths (input width and output size).
        learning_rate (float): Adam learning rate.
        l1 (float): L1 penalty on the hidden dense layer's kernel.
        l2 (float): L2 penalty on the hidden dense layer's kernel.
        first_filter_set (int): Filter count of the first convolution.
        second_filter_set (int): Filter count of the second convolution.
        third_filter_set (int): Filter count of the third convolution.
        first_filter_size (int): Kernel size of the first convolution.
        second_filter_size (int): Kernel size of the second convolution.
        third_filter_size (int): Kernel size of the third convolution.
        dense_units (int): Units in the hidden dense layer.
        beta_one (float): Adam beta_1.
        beta_two (float): Adam beta_2.
        amsgrad (bool): Whether Adam uses the AMSGrad variant.
        weight_decay (float): Weight decay passed to Adam.
        use_ema (bool): Whether the optimizer applies an exponential moving
            average to the weights.

    Returns:
        tf.keras.Model: Compiled model with mean squared error loss and
            'MSE' and 'RMSE' metrics.
    '''

    # Set-up the L1L2 regularizer, used on the hidden dense layer's kernel only
    regularizer = tf.keras.regularizers.L1L2(l1=l1, l2=l2)

    # Define the model layers in order
    model = tf.keras.Sequential([
        # Single-channel 2D input: frames x wavelengths
        tf.keras.layers.Input((samples, wavelengths, 1)),
        tf.keras.layers.Conv2D(
            first_filter_set,
            first_filter_size,
            padding='same',
            activation='relu',
        ),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Conv2D(
            second_filter_set,
            second_filter_size,
            padding='same',
            activation='relu',
        ),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Conv2D(
            third_filter_set,
            third_filter_size,
            padding='same',
            activation='relu',
        ),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(
            dense_units,
            kernel_regularizer=regularizer,
            activation='relu',
        ),
        # Linear output: one regression target per wavelength
        tf.keras.layers.Dense(wavelengths, activation='linear')
    ])

    # Define the optimizer
    optimizer = tf.keras.optimizers.Adam(
        learning_rate=learning_rate,
        beta_1=beta_one,
        beta_2=beta_two,
        amsgrad=amsgrad,
        weight_decay=weight_decay,
        use_ema=use_ema
    )

    # Compile the model, specifying the type of loss to use during training
    # and any extra metrics to evaluate. MSE is tracked as a metric in its own
    # right: the training history only records the loss under the key 'loss',
    # so without this metric there is no 'MSE' series to read back.
    model.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.MeanSquaredError(name='MSE'),
        metrics=[
            tf.keras.metrics.MeanSquaredError(name='MSE'),
            tf.keras.metrics.RootMeanSquaredError(name='RMSE')
        ]
    )

    return model

In [None]:
# Build the model with the default (tuned) settings and print its layer summary
model = compile_model()
model.summary()

### 2.2. Training

In [None]:
print(f'Training model for {total_ksteps} ksteps')
start_time = time.time()

# The model is fit on the training dataset only. No validation data is passed,
# so no `validation_steps` argument is given either: Keras only uses it when
# validation data is supplied, and leaving it in suggests a validation pass
# that never runs.
training_results = model.fit(
    training_dataset.batch(batch_size),
    epochs=epochs,
    steps_per_epoch=steps,
    verbose=0
)

print(f'Training complete in {(time.time() - start_time)/60:.1f} minutes')

# Persist the trained model
model.save(model_save_file)

# Persist the training results next to the model.
# NOTE(review): this pickles the whole object returned by fit(), not just its
# `.history` dict - confirm that downstream readers need the full object.
with open(training_results_save_file, 'wb') as output_file:
    pickle.dump(training_results, output_file)

### 2.3. Training curve

In [None]:
# Set-up a 1x3 figure: training loss, mean squared error and root mean
# squared error, each against epoch
fig, axs = plt.subplots(1, 3, figsize=(12, 4))

# Add the main title
fig.suptitle('CNN training curves', size='large')

history = training_results.history

# One entry per panel: (title, history key, y-axis label, log-scale y axis)
panels = [
    ('Training loss (mean squared error)', 'loss', 'loss', False),
    ('Mean squared error', 'MSE', 'MSE', True),
    ('Root mean squared error', 'RMSE', 'RMSE', True),
]

for ax, (title, key, y_label, log_scale) in zip(axs, panels):
    ax.set_title(title)

    # Only draw the series that were actually recorded. Validation series
    # ('val_*') exist only when fit() was given validation data, and a metric
    # series exists only when that metric was compiled into the model;
    # indexing a missing key directly would raise KeyError.
    series_drawn = False

    for series_key, label in ((key, 'Training'), (f'val_{key}', 'Validation')):
        if series_key in history:
            ax.plot(np.array(history[series_key]), alpha=0.5, label=label)
            series_drawn = True

    # Make an empty panel self-explanatory rather than silently blank
    if not series_drawn:
        ax.text(
            0.5, 0.5, f"'{key}' not recorded",
            ha='center', va='center', transform=ax.transAxes
        )

    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)

    if log_scale:
        ax.set_yscale('log')

# The legend is shown on the first panel only; 'loss' is always recorded
axs[0].legend(loc='upper right')

# Save the figure
fig.tight_layout()
fig.savefig(
    f'{figures_dir}/03.4.1-kaggle_model_training_curve_{total_ksteps}ksteps.jpg',
    dpi=config.STD_FIG_DPI,
    bbox_inches='tight'
)