In [None]:
import time, os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras import layers, metrics, losses, callbacks, regularizers
from tensorflow.python.client import device_lib
from sklearn.model_selection import train_test_split
from kaggle_datasets import KaggleDatasets
# Global matplotlib styling applied to every figure drawn in this notebook.
plt.rc('figure', autolayout=True)
plt.rc(
    'axes',
    labelweight='bold',
    labelsize='large',
    titleweight='bold',
    titlesize=18,
    titlepad=10,
)
plt.rc('image', cmap='magma')

# Report the runtime environment so results can be tied to library versions.
print("\n".join((
    "Python version: {}".format(sys.version_info),
    "Numpy version: {}".format(np.version.version),
    "Tensorflow version: {}".format(tf.__version__),
)))

In [None]:
# Download dataset
# ---------------------------
!unzip -q -n ../input/galaxy-zoo-the-galaxy-challenge/images_training_rev1.zip -d ../temp/
!unzip -q -n ../input/galaxy-zoo-the-galaxy-challenge/images_test_rev1.zip -d ../temp/     
labels_pd = pd.read_csv('../input/galaxy-zoo-the-galaxy-challenge/training_solutions_rev1.zip',compression='zip')

for dirname, _, filenames in os.walk('../input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
print("Dataset loading done!")

In [None]:
# Dataset preprocessing
# ---------------------
# Number of samples per batch produced by dataset_preprocessing().
BATCH_SIZE = 64
# Passed to Dataset.map(num_parallel_calls=...) and Dataset.prefetch(...) so that
# tf.data chooses the level of parallelism / buffer size itself.
AUTOTUNE = tf.data.AUTOTUNE
def get_image_and_features(image_path, training=True):
    """Load one galaxy image and, when training, its label vector.

    This function must run eagerly (for example through ``tf.py_function``)
    because the label lookup goes through the pandas frame ``labels_pd``.

    Args:
        image_path: Path of the image file, given as ``str``/``bytes`` or as a
            scalar string tensor. The file name (without extension) must be
            the integer GalaxyID.
        training: When truthy, the labels of the galaxy are looked up in
            ``labels_pd`` and returned together with the image.

    Returns:
        ``(image, features)`` when ``training`` is truthy, otherwise ``image``.
        ``image`` is a float32 RGB tensor centrally cropped/padded to 160x160;
        ``features`` is a float32 vector holding every ``labels_pd`` column
        after ``GalaxyID``.

    Raises:
        KeyError: If the GalaxyID parsed from the path is not in ``labels_pd``.
    """
    # channels=3 / expand_animations=False guarantee a 3-D RGB tensor whatever
    # the file format is, so every sample in a batch has the same shape.
    image = tf.io.decode_image(
        tf.io.read_file(image_path),
        channels=3,
        dtype=tf.dtypes.float32,
        expand_animations=False,
    )
    image = tf.image.resize_with_crop_or_pad(image, 160, 160)
    if not training:
        return image

    # Derive the id from the file name itself instead of a fixed position in
    # the path, so the lookup does not depend on how deep the data folder is.
    raw_path = image_path.numpy() if hasattr(image_path, 'numpy') else image_path
    path_str = raw_path.decode('utf-8') if isinstance(raw_path, bytes) else str(raw_path)
    galaxy_id = int(os.path.splitext(os.path.basename(path_str))[0])

    galaxy_row = labels_pd.loc[labels_pd['GalaxyID'] == galaxy_id]
    if galaxy_row.empty:
        raise KeyError("GalaxyID {} (from {}) has no entry in the labels table".format(galaxy_id, path_str))
    # Column 0 is the GalaxyID; everything after it is the regression target.
    features = tf.cast(galaxy_row.iloc[0, 1:].values, tf.float32)
    return image, features
    
def dataset_preprocessing(images_path, training=True, tpu=False):
    """Build a batched, prefetched ``tf.data`` pipeline from image paths.

    Args:
        images_path: Sequence (or 1-D string tensor) of image file paths.
        training: When True the pipeline yields ``(images, features)`` batches
            and reshuffles the samples on every iteration; when False it
            yields image batches only, in the order of ``images_path``.
        tpu: Currently unused; kept so existing calls keep working.

    Returns:
        A ``tf.data.Dataset`` batched with ``BATCH_SIZE`` and prefetched.
    """
    shuffle_buffer = 2048
    dataset = tf.data.Dataset.from_tensor_slices(images_path)
    if training:
        # Shuffle the (tiny) path strings *before* decoding: shuffling after
        # the map would keep `shuffle_buffer` decoded float images in memory
        # and delay the first batch until that many images have been loaded.
        dataset = dataset.shuffle(shuffle_buffer)
        dataset = dataset.map(
            lambda x: tf.py_function(func=get_image_and_features, inp=[x], Tout=(tf.float32, tf.float32)),
            num_parallel_calls=AUTOTUNE,
        )
    else:
        dataset = dataset.map(
            lambda x: tf.py_function(func=get_image_and_features, inp=[x, False], Tout=tf.float32),
            num_parallel_calls=AUTOTUNE,
        )

    dataset = dataset.batch(BATCH_SIZE)  # Set batch size
    dataset = dataset.prefetch(AUTOTUNE)  # Overlap data loading with model execution
    return dataset

# glob order depends on the filesystem, so sort first: only then does the
# seeded shuffle below give the same train/validation split on every run.
images_path = sorted(tf.io.gfile.glob('../temp/images_training_rev1/*'))
if not images_path:
    raise FileNotFoundError("No training images found in ../temp/images_training_rev1/")
seed = 54
tf.random.set_seed(seed)
images_path = tf.random.shuffle(images_path)
samples_size = len(images_path)
print("Number of total samples: {}".format(samples_size))

# Split train validation sets (80% / 20%)
splitIndex = int(samples_size * 0.8)
images_path_train = images_path[:splitIndex]
images_path_val = images_path[splitIndex:]

train_dataset = dataset_preprocessing(images_path_train)
print('Number of training batches: %d' % tf.data.experimental.cardinality(train_dataset))

# The validation set needs labels too, hence the default training=True.
val_dataset = dataset_preprocessing(images_path_val)
print('Number of validation batches: %d' % tf.data.experimental.cardinality(val_dataset))

print("Dataset preprocessing done!")

In [None]:
# Plot example
# --------------------
def plot_example(dataset, rows=2, cols=3):
    """Show a grid of images from the first batch of ``dataset``.

    Args:
        dataset: A ``tf.data.Dataset`` yielding ``(images, labels)`` batches.
        rows: Number of grid rows.
        cols: Number of grid columns.

    Returns:
        ``(image_shape, label_shape)``: the shapes of the first image and of
        the first label vector in the batch that was plotted.

    Raises:
        ValueError: If ``dataset`` yields no batch.
    """
    # Pull ONE batch and unpack both parts from it. Iterating the dataset
    # twice would load two batches and, for a shuffled dataset, pair images
    # with labels that belong to a different batch.
    batches = list(dataset.take(1).as_numpy_iterator())
    if not batches:
        raise ValueError("dataset is empty, nothing to plot")
    images, labels = batches[0][0], batches[0][1]

    # squeeze=False keeps `axes` 2-D even when rows == 1 or cols == 1.
    fig, axes = plt.subplots(rows, cols, figsize=(12, 6), squeeze=False)
    for idx, ax in enumerate(axes.flat):
        ax.grid(False)
        ax.axis('off')
        # The batch may hold fewer images than there are grid cells.
        if idx < len(images):
            ax.imshow(images[idx])
    plt.show()
    return images[0].shape, labels[0].shape
# Draw one sample grid and report the per-sample shapes returned by plot_example.
image_shape, features_num = plot_example(train_dataset)
print(
    "Images shape is: {}".format(image_shape),
    "Features length is: {}".format(features_num),
    sep="\n",
)


In [None]:
# Model
# ---------------------
def create_model(input_shape, use_augmentation=False, num_outputs=37):
    """Build the convolutional regression network for the galaxy images.

    The network is four Conv2D -> BatchNormalization -> MaxPooling2D ->
    Dropout(0.1) stages (16, 32, 64 and 128 filters) followed by two dense
    layers (512 and 256 units, Dropout(0.2) after each) and a sigmoid output.

    Args:
        input_shape: Shape of a single input image, e.g. ``(160, 160, 3)``.
        use_augmentation: When True, random flip / rotation / zoom / contrast
            layers are put in front of the network.
        num_outputs: Width of the sigmoid output layer. Defaults to 37, the
            value that was previously hard-coded.

    Returns:
        An uncompiled ``Sequential`` model named ``"galaxyClassifier"``.
    """
    def conv_block(filters, kernel_size, **conv_kwargs):
        # One convolutional stage of the network.
        return [
            layers.Conv2D(filters=filters, kernel_size=kernel_size, activation='relu', **conv_kwargs),
            layers.BatchNormalization(),
            layers.MaxPooling2D(pool_size=(2, 2)),
            layers.Dropout(0.1),
        ]

    model = tf.keras.models.Sequential(name="galaxyClassifier", layers=[
        *conv_block(16, (6, 6), input_shape=input_shape),
        *conv_block(32, (5, 5)),
        *conv_block(64, (3, 3)),
        *conv_block(128, (3, 3)),
        layers.Flatten(),
        layers.Dense(512, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(256, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(num_outputs, activation='sigmoid'),
    ])

    if use_augmentation:
        # Newer Keras exposes the preprocessing layers directly on `layers`;
        # older releases only have them under `layers.experimental.preprocessing`.
        aug = layers if hasattr(layers, "RandomFlip") else layers.experimental.preprocessing
        data_augmentation = tf.keras.models.Sequential(name='augmentation', layers=[
            aug.RandomFlip("horizontal_and_vertical", input_shape=input_shape),
            aug.RandomRotation(0.3),
            aug.RandomZoom(height_factor=0.2, width_factor=0.2),
            aug.RandomContrast(0.05)])
        model = tf.keras.models.Sequential(name="galaxyClassifier", layers=[
            data_augmentation,
            model])

    return model


# Build the augmented network for 160x160 RGB inputs.
image_shape = (160, 160, 3)
model = create_model(image_shape, use_augmentation=True)

# Regression setup: minimise MSE with Adam and track RMSE alongside it.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
loss_func = losses.MeanSquaredError()
model.compile(
    optimizer=optimizer,
    loss=loss_func,
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
)
model.summary()

In [None]:
# Training
# ---------------------
num_epochs = 50
verbose = True

# Callbacks. They are only defined here; add the ones to use to `callbacksInUse`.
reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=7, min_lr=1e-5)
def decay_schedule(epoch, lr):
    """Multiply the learning rate by 0.8 on every 10th epoch (except epoch 0)."""
    return lr * 0.8 if (epoch % 10 == 0) and (epoch != 0) else lr
lr_scheduler = callbacks.LearningRateScheduler(decay_schedule)
early_stop = callbacks.EarlyStopping(monitor='val_loss',mode='min',verbose=1,patience=7)
# The model is compiled with MSE loss and an RMSE metric only, so there is no
# 'val_accuracy' to monitor; the best model is the one with the lowest val_loss.
checkpoint = callbacks.ModelCheckpoint('best_model', save_best_only=True, monitor='val_loss', mode='min')
callbacksInUse = []

print('------- Training -------')
# No distribution-strategy scope here: `strategy` is not defined anywhere in
# the notebook, and a scope only matters while the model is being *created*.
# `use_multiprocessing` is dropped as well; it applies to generator / Sequence
# inputs only, not to tf.data datasets.
start = time.time()
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=num_epochs,
    callbacks=callbacksInUse,
    verbose=verbose,
)
end = time.time()
print("Total training took {:.2f} hours.".format((end - start)/3600))

# Plot learning curves
# Named `history_metrics` so that the imported keras `metrics` module is not shadowed.
history_metrics = history.history
fig, axes = plt.subplots(1, 2, figsize=(12,6))
axes[0].plot(history_metrics['root_mean_squared_error'], label='train_rmse')
axes[0].plot(history_metrics['val_root_mean_squared_error'], label='val_rmse')
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('RMSE')
axes[0].legend(loc='upper right')
axes[1].plot(history_metrics['loss'], label='train_loss')
axes[1].plot(history_metrics['val_loss'], label='val_loss')
axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('Loss')
axes[1].legend(loc='upper right')
plt.show()

In [None]:
# Evaluation
# -------------------
# The model is evaluated on the held-out validation split (the Kaggle test set
# has no labels), so the score is reported as a validation score.
# `results` is [loss, rmse]: the loss first, then the metric given to compile().
results = model.evaluate(val_dataset, verbose=1)
print("Validation RMSE: {:.3f}".format(results[1]))

In [None]:
# Persist the trained model
# -------------------------
# The file is named after the model and written in HDF5 format.
outPath = "{}.h5".format(model.name)
model.save(outPath)
print("Model is saved: {}".format(outPath))

In [None]:
# Predict on the unlabeled test images
# ------------------------------------
test_images_path = tf.io.gfile.glob('../temp/images_test_rev1/*')
# training=False: images only, no labels and no shuffling, so the rows of
# `result` line up with `test_images_path`.
test_dataset = dataset_preprocessing(test_images_path, training=False)
test_batches = tf.data.experimental.cardinality(test_dataset)
print('Number of test batches: %d' % test_batches)
print('------- Prediction -------')
result = model.predict(test_dataset, verbose=1)
print("result shape: {}".format(result.shape))


In [None]:
# Kaggle submission
# -------------------
predictions = np.asarray(result)
# Take the id from the file name itself rather than from a fixed path depth,
# and keep it numeric so the rows sort by id value instead of alphabetically.
galaxyIDs = np.array(
    [int(os.path.splitext(os.path.basename(v))[0]) for v in test_images_path],
    dtype=np.int64,
)
if len(galaxyIDs) != len(predictions):
    raise ValueError("Got {} predictions for {} test images".format(len(predictions), len(galaxyIDs)))

# Build the frame from the numeric predictions and add the id column afterwards;
# stacking ids and predictions in one array would turn every value into a string.
submission = pd.DataFrame(predictions, columns=labels_pd.columns[1:])
submission.insert(0, labels_pd.columns[0], galaxyIDs)
submission = submission.sort_values(by=['GalaxyID'])
print("submission data shape: {}".format(submission.shape))
submission.to_csv('submission.csv', index=False)
print("Submission file is saved succesfully!")