This kernel will show you **how to use AutoML with the SETI dataset**.

I used AutoKeras to perform AutoML. You can find information and examples on how to use AutoKeras on its [website](https://autokeras.com/).

Also, in order to use the SETI dataset correctly and to create dataloaders, I used this [kernel](https://www.kaggle.com/ayuraj/train-tensorflow-efficientnet-kfold-w-b) from Ayush Thakur.

For this demonstration, I will only use 10% of the dataset in order to get results faster.

Note: this code is not intended to win this competition, only to present a tutorial on how to use AutoML with the SETI dataset.

------------------

*AutoKeras:*

*Haifeng Jin, Qingquan Song, and Xia Hu. "Auto-keras: An efficient neural architecture search system." Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. ACM, 2019*

# Install AutoKeras

In [None]:
!pip install -q git+https://github.com/keras-team/keras-tuner.git
!pip install -q autokeras

# Import libraries

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, train_test_split

import tensorflow as tf
import tensorflow_addons as tfa

import autokeras as ak

# Set the configuration we will need during this demonstration

In [None]:
# Value handed to `map(num_parallel_calls=...)` and `prefetch(...)` in the
# dataloaders so that tf.data picks these settings dynamically at runtime
AUTOTUNE = tf.data.AUTOTUNE

# Settings shared by all the cells of this demonstration
CONFIGURATION = {
    # Seed reused by the folds, the subsets, the augmentation and the tuner
    "seed": 0,
    # Number of stratified folds
    "nbr_folds": 5,
    # Size the images are resized to before being fed to the model
    "img_width": 250,
    "img_height": 250,
    "batch_size": 16,
    # Upper bound only: training relies on the early stopping callback
    "epochs": 1000,
}

# Load the *train_labels.csv* file with pandas

In [None]:
# Read the training labels
df = pd.read_csv('../input/seti-breakthrough-listen/train_labels.csv')

# Path of each .npy file: the files are stored in sub-folders named after
# the first character of their id
df['img_path'] = [
    f'../input/seti-breakthrough-listen/train/{sample_id[0]}/{sample_id}.npy'
    for sample_id in df['id']
]

print(df.shape)

df.head()

# Estimate class weights based on the imbalance of the data set

In [None]:
# Number of samples of each class (0: negative, 1: positive)
neg, pos = (df.target.value_counts()[label] for label in (0, 1))

total = neg + pos

# Weights to correct the imbalance: each class is weighted by the inverse of
# its frequency.
# Scaling by total/2 helps keep the loss to a similar magnitude.
# The sum of the weights of all examples stays the same.
class_weights = {
    label: (1 / count) * (total / 2.0)
    for label, count in ((0, neg), (1, pos))
}

weight_for_0 = class_weights[0]
weight_for_1 = class_weights[1]

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

# Create folds

In [None]:
# Stratified splitter configured from the shared settings
kfold = StratifiedKFold(
    random_state=CONFIGURATION["seed"],
    shuffle=True,
    n_splits=CONFIGURATION["nbr_folds"],
)

# Tag each row with the number of the fold in which it is a validation row
for fold_id, (_train_rows, holdout_rows) in enumerate(kfold.split(df, df['target'])):
    df.loc[holdout_rows, 'fold'] = fold_id

# The column is filled fold by fold, so make sure it ends up as integers
df['fold'] = df['fold'].astype(int)

# Number of samples per (fold, target) pair
df.groupby(['fold', 'target']).size()

# Data loading and pre-processing functions

In [None]:
def load_npy(path):
    """Load one ``.npy`` file and return its on-target observations.

    Args:
        path: object whose ``.numpy()`` method returns the location of the
            file (the notebook calls this through ``tf.py_function``).

    Returns:
        A float32 array in which observations 0, 2 and 4 of the file are
        stacked along the last axis.
    """
    cadence = np.load(path.numpy()).astype(np.float32)

    # Only the on-target observations are kept; each one becomes a channel
    on_target = [cadence[idx] for idx in (0, 2, 4)]

    return np.dstack(on_target)

# Mapping functions applied to each dataset element (decorated with @tf.function)

@tf.function
def load_resize_data_augmentation(df_dict):
    """Turn one dataframe row into an augmented ``(image, label)`` pair.

    Args:
        df_dict: mapping holding at least the ``'img_path'`` and
            ``'target'`` entries of one row.

    Returns:
        Tuple ``(image, label)``: the resized and randomly augmented image,
        and the target cast to float32.
    """
    seed = CONFIGURATION["seed"]
    target_size = (CONFIGURATION['img_height'], CONFIGURATION['img_width'])

    # The numpy loader is plain Python code, so it is wrapped in py_function
    img = tf.py_function(load_npy, [df_dict['img_path']], [tf.float32])[0]
    # Declare the static shape of the loaded array explicitly
    img.set_shape((273, 256, 3))

    img = tf.image.resize(img, target_size)

    # Random augmentations: brightness, hue, then vertical flip
    img = tf.image.random_brightness(img, 0.2, seed=seed)
    img = tf.image.random_hue(img, 0.2, seed=seed)
    img = tf.image.random_flip_up_down(img, seed=seed)

    return img, tf.cast(df_dict['target'], tf.float32)

@tf.function
def load_resize_spec(df_dict):
    """Turn one dataframe row into an ``(image, label)`` pair, no augmentation.

    Args:
        df_dict: mapping holding at least the ``'img_path'`` and
            ``'target'`` entries of one row.

    Returns:
        Tuple ``(image, label)``: the resized image and the target cast to
        float32.
    """
    target_size = (CONFIGURATION['img_height'], CONFIGURATION['img_width'])

    # The numpy loader is plain Python code, so it is wrapped in py_function
    img = tf.py_function(load_npy, [df_dict['img_path']], [tf.float32])[0]
    # Declare the static shape of the loaded array explicitly
    img.set_shape((273, 256, 3))

    img = tf.image.resize(img, target_size)

    return img, tf.cast(df_dict['target'], tf.float32)

# Dataloaders with 10% of the dataset

In [None]:
def get_dataloaders(train_df, valid_df):
    """Build the batched training and validation ``tf.data`` pipelines.

    Args:
        train_df: dataframe of the training rows; its columns are turned
            into the per-element dictionary given to the mapping functions,
            which read ``'img_path'`` and ``'target'``.
        valid_df: dataframe of the validation rows, same columns.

    Returns:
        Tuple ``(trainloader, validloader)`` of batched, prefetched datasets.
        Only the training one is shuffled and augmented.
    """
    trainloader = tf.data.Dataset.from_tensor_slices(dict(train_df))
    validloader = tf.data.Dataset.from_tensor_slices(dict(valid_df))

    trainloader = (
        trainloader
        # Seeded like the other random operations of this notebook (folds,
        # subsets, augmentation, tuner) so that the shuffling is reproducible
        .shuffle(1024, seed=CONFIGURATION['seed'])
        .map(load_resize_data_augmentation, num_parallel_calls=AUTOTUNE)
        .batch(CONFIGURATION['batch_size'])
        .prefetch(AUTOTUNE)
    )

    # No shuffling and no augmentation for the validation data
    validloader = (
        validloader
        .map(load_resize_spec, num_parallel_calls=AUTOTUNE)
        .batch(CONFIGURATION['batch_size'])
        .prefetch(AUTOTUNE)
    )

    return trainloader, validloader

In [None]:
rdm_state = CONFIGURATION["seed"]

# Fold 0 is held out for validation, all the other folds are used for training
# (only one fold is used in this example; loop over the fold numbers to train
# models with different folds)
train_df = df[df.fold != 0].reset_index(drop=True)
valid_df = df[df.fold == 0].reset_index(drop=True)

# We will use a subset in order to accelerate the training time
# size of the subset -> 10 % of the whole dataset
# Delete these lines of code, from here...
train_subset, _ = train_test_split(
    train_df,
    train_size=0.1,
    stratify=train_df.target,
    random_state=rdm_state,
)
valid_subset, _ = train_test_split(
    valid_df,
    train_size=0.1,
    stratify=valid_df.target,
    random_state=rdm_state,
)
train_df = train_subset.reset_index(drop=True)
valid_df = valid_subset.reset_index(drop=True)
# ... to here, in order to use the whole dataset

# Build the tf.data pipelines from the two dataframes
trainloader, validloader = get_dataloaders(train_df, valid_df)

# Initialize the image classifier

In [None]:
# The parameters are documented here: https://autokeras.com/image_classifier/

# Upper bound on the number of different Keras models tried by the search
# (for example: 10)
max_number = 10

clf = ak.ImageClassifier(
    seed=CONFIGURATION["seed"],
    tuner="bayesian",
    max_trials=max_number,
    loss=tfa.losses.SigmoidFocalCrossEntropy(),
    metrics=[tf.keras.metrics.AUC(curve='ROC')],
    overwrite=True,
)

# Training of the image classifier

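The number of epochs is deliberately very high because an [**early stopping callback**](https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/EarlyStopping) is used.
Training stops as soon as the monitored quantity (the validation loss by default) has not improved for `patience` consecutive epochs, which is a sign that the model is starting to overfit.
So the exact number of epochs is not important, as long as it is large enough.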

If you don't use an early stopping callback, you should optimise the number of epochs to avoid overfitting and underfitting.

In [None]:
# Run the search: the classifier is fed with the tensorflow Datasets.
# The class weights are forwarded to compensate for the class imbalance, and
# each training stops after 3 epochs without improvement, keeping the best
# weights seen so far.
clf.fit(
    trainloader,
    validation_data=validloader,
    epochs=CONFIGURATION["epochs"],
    class_weight=class_weights,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(restore_best_weights=True, patience=3),
    ],
)

In [None]:
# Export as a Keras Model.
model = clf.export_model()

model.evaluate(validloader)

In [None]:
# Print the architecture of the exported model
model.summary()