# Galaxy Zoo - The Galaxy Challenge

- INPUT: 299x299
- Model: Xception
- pooling: GlobalAverage
- OUTPUT: 37 classes, structured to reflect the Galaxy Zoo decision tree (built with the Functional API)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('../input'):
    for name in files:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

In [None]:
# Extract the training images into the working directory; -n skips files that already exist.
!unzip -n ../input/galaxy-zoo-the-galaxy-challenge/images_training_rev1.zip

In [None]:
# Extract the test images into the working directory; -n skips files that already exist.
!unzip -n ../input/galaxy-zoo-the-galaxy-challenge/images_test_rev1.zip

In [None]:
from tqdm import tqdm
import zipfile
import io
from PIL import Image
import matplotlib.pyplot as plt
%matplotlib inline
from skimage.transform import resize

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
AUTOTUNE = tf.data.experimental.AUTOTUNE

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.layers import Activation, Dropout, Flatten, Dense, BatchNormalization, GlobalMaxPooling2D
from tensorflow.keras import backend as K

In [None]:
# Allow up to 999 columns in notebook output so the wide label table is not
# truncated, then load the training targets and preview the first rows.
pd.set_option('display.max_columns', 999)
df = pd.read_csv('../input/galaxy-zoo-the-galaxy-challenge/training_solutions_rev1.zip')
df.head()

In [None]:
# Summary statistics for every column of the training-solutions table.
df.describe()

In [None]:
# df = df.sample(n=1000, random_state=0)

In [None]:
# Location of the competition data and of the training-image archive inside it.
DATA_DIR = '../input/galaxy-zoo-the-galaxy-challenge/'
zippath = DATA_DIR + 'images_training_rev1.zip'

# Image geometry.
ORIG_SIZE = 424  # presumably the side length of the raw images -- TODO confirm
IMG_SIZE = 299   # side length of the network input (224 and 424 were alternatives)

# Mini-batch size used by every dataset pipeline (16 was an alternative).
batch_size = 32

In [None]:
# Decode one sample image straight from the training archive and display it.
# The context manager closes the archive even if reading or decoding raises,
# which the previous open()/close() pair did not guarantee.
zippath = '../input/galaxy-zoo-the-galaxy-challenge/images_training_rev1.zip'
imgname = 'images_training_rev1/100023.jpg'
with zipfile.ZipFile(zippath) as z:
    # z.read() returns the whole member as bytes, so the decoded image does
    # not depend on the archive staying open.
    im = Image.open(io.BytesIO(z.read(imgname)))
    im_list = np.asarray(im)  # pixel array, inspected by the following cells

plt.imshow(im_list)
plt.show()

In [None]:
# Array shape of the decoded sample image.
im_list.shape

In [None]:
# Draw the first three channels of the sample image side by side, each with
# its own colorbar.
plt.figure(figsize=(16, 4))
for panel, channel in enumerate(range(3), start=1):
    plt.subplot(1, 3, panel)
    plt.imshow(im_list[:, :, channel])
    plt.colorbar()
plt.show()

## tf.data.Dataset を使う

In [None]:
def preprocess_image(image, augment_flag=False):
    """Decode JPEG bytes into an IMG_SIZE x IMG_SIZE, 3-channel image scaled by 1/255.

    Args:
        image: Encoded JPEG contents, e.g. the result of ``tf.io.read_file``.
        augment_flag: When true, apply a random left-right flip followed by a
            random up-down flip.

    Returns:
        The cropped (or padded) image tensor with pixel values divided by 255.
    """
    decoded = tf.image.decode_jpeg(image, channels=3)
    # Central crop (or pad) to the target size instead of rescaling.
    cropped = tf.image.resize_with_crop_or_pad(decoded, IMG_SIZE, IMG_SIZE)
    if augment_flag:
        cropped = tf.image.random_flip_left_right(cropped)  # horizontal flip
        cropped = tf.image.random_flip_up_down(cropped)     # vertical flip
    # Normalize to the [0, 1] range.
    return cropped / 255

In [None]:
def load_and_preprocess_image(path, augment_flag=True):
    """Read one extracted training JPEG and preprocess it.

    Args:
        path: Image id as a string (a scalar string tensor when mapped over a
            dataset). The file read is ``images_training_rev1/<path>.jpg``,
            relative to the working directory.
        augment_flag: Forwarded to ``preprocess_image``. Defaults to True so
            existing single-argument callers keep their random flips; pass
            False when building a validation pipeline so its metrics are
            deterministic.

    Returns:
        The preprocessed image tensor produced by ``preprocess_image``.
    """
    img_path = 'images_training_rev1/' + path + '.jpg'
    image = tf.io.read_file(img_path)
    return preprocess_image(image, augment_flag=augment_flag)

In [None]:
# import matplotlib.pyplot as plt

# label = str(df.iloc[1, 0])
# # label = '303732'
# # img_path = '../input/galaxy-zoo-the-galaxy-challenge/images_training_rev1/' + label + '.jpg'

# plt.imshow(load_and_preprocess_image(label))
# plt.grid(False)
# plt.title(label)
# plt.colorbar()
# plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# First column -> image ids as strings (they become file names);
# remaining columns -> regression targets.
galaxy_ids = df.values[:, 0].astype(int).astype(str)
targets = df.values[:, 1:]

# Hold out 20% of the rows for validation, with a fixed seed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    galaxy_ids, targets, test_size=0.2, random_state=0)

In [None]:
# Shapes of the id and target arrays for the training and held-out splits.
x_train.shape, x_test.shape, y_train.shape, y_test.shape

In [None]:
# Dataset yielding one training image id (string) per element.
path_ds = tf.data.Dataset.from_tensor_slices(x_train)

In [None]:
# Display the dataset's element spec.
path_ds

In [None]:
# Map each id to its loaded, preprocessed image; AUTOTUNE lets tf.data choose
# the level of parallelism.
image_ds = path_ds.map(load_and_preprocess_image, num_parallel_calls=AUTOTUNE)

In [None]:
# Training targets cast to float32, one row per element.
label_ds = tf.data.Dataset.from_tensor_slices(tf.cast(y_train, tf.float32))

In [None]:
# Pair every image with its target row as (image, label) elements.
image_label_ds = tf.data.Dataset.zip((image_ds, label_ds))

In [None]:
# Training input pipeline: shuffle through a 1000-element buffer, repeat
# indefinitely, batch, and prefetch so input preparation overlaps training.
# Dataset.shuffle(...).repeat() replaces the deprecated
# tf.data.experimental.shuffle_and_repeat transformation.
# Because the dataset never ends, model.fit must be given steps_per_epoch.
# (Optional: insert .cache() before the shuffle to avoid re-decoding images.)
ds = image_label_ds.shuffle(buffer_size=1000).repeat()
ds = ds.batch(batch_size)
ds = ds.prefetch(AUTOTUNE)

In [None]:
def _load_validation_image(path):
    """Read and preprocess one held-out image WITHOUT random flips.

    The training loader always augments; applying it to validation data makes
    validation metrics vary from run to run. preprocess_image defaults to
    augment_flag=False, so it is called directly here.
    """
    image = tf.io.read_file('images_training_rev1/' + path + '.jpg')
    return preprocess_image(image)


# Validation input pipeline: finite (no repeat), unshuffled, unaugmented.
path_ds_valid = tf.data.Dataset.from_tensor_slices(x_test)
image_ds_valid = path_ds_valid.map(_load_validation_image, num_parallel_calls=AUTOTUNE)
label_ds_valid = tf.data.Dataset.from_tensor_slices(tf.cast(y_test, tf.float32))
ds_valid = tf.data.Dataset.zip((image_ds_valid, label_ds_valid))
ds_valid = ds_valid.batch(batch_size)

In [None]:
# Shape of the held-out target array.
y_test.shape

In [None]:
# Pull a single batch from the training pipeline and print the image and
# label tensor shapes.
for x, y in ds.take(1):
    print(x.shape, y.shape)

## Model

In [None]:
# Build the network backbone.
from tensorflow.keras.applications import VGG16, ResNet50, ResNet50V2, MobileNetV2, EfficientNetB0, InceptionResNetV2, Xception
from tensorflow.keras import models, layers, regularizers

# ImageNet-pretrained Xception without its classification head;
# pooling='avg' applies global average pooling to the final feature map.
backbone_options = dict(
    weights='imagenet',
    include_top=False,
    pooling='avg',
    input_shape=(IMG_SIZE, IMG_SIZE, 3),
)
conv_base = Xception(**backbone_options)
conv_base.summary()

In [None]:
# Input shape expected by the backbone.
conv_base.input_shape

In [None]:
# Preview the label table again; its column order defines the 37 model outputs.
df.head()

In [None]:
# Functional API
from tensorflow.keras.layers import Input, Flatten, Dense, Concatenate, Multiply
from tensorflow.keras.activations import softmax, sigmoid
from tensorflow.keras import Model

inputs = Input(shape=(IMG_SIZE, IMG_SIZE, 3))
x = conv_base(inputs)
x = Flatten()(x)  # no-op on the already pooled 2-D features; kept so the layer graph is unchanged
x = Dense(37, kernel_initializer='he_normal')(x)

# Encode the Galaxy Zoo decision tree in the output layer: the answers of each
# question are a softmax, scaled by the probability of the answer that leads
# to that question, so each group sums to its parent's probability.
#
# Fix relative to the previous version: Q5 and Q6 were both scaled by answer
# 4.2. In the tree, Q4 "no" leads to Q5 and Q4 "yes" leads to Q10 -> Q11 -> Q5,
# so everyone who answers Q4 answers Q5 (Q5 sums to answer 2.2); Q6 is asked
# after Q5, Q7 and Q9 and is not conditioned on Q4 at all.
# NOTE(review): Q6 is modelled as summing to 1; confirm against the label
# table with df.iloc[:, 14:16].sum(axis=1).
x01 = softmax(x[:, :3])                              # Q1: sums to 1.0
x02 = Multiply()([softmax(x[:, 3: 5]), x01[:, 1]])   # Q2: sums to answer 1.2
x03 = Multiply()([softmax(x[:, 5: 7]), x02[:, 1]])   # Q3: sums to answer 2.2
x04 = Multiply()([softmax(x[:, 7: 9]), x02[:, 1]])   # Q4: sums to answer 2.2
x05 = Multiply()([softmax(x[:, 9:13]), x02[:, 1]])   # Q5: sums to answer 2.2
x06 = softmax(x[:, 13:15])                           # Q6: sums to 1.0
x07 = Multiply()([softmax(x[:, 15:18]), x01[:, 0]])  # Q7: sums to answer 1.1
x08 = Multiply()([softmax(x[:, 18:25]), x06[:, 0]])  # Q8: sums to answer 6.1
x09 = Multiply()([softmax(x[:, 25:28]), x02[:, 0]])  # Q9: sums to answer 2.1
x10 = Multiply()([softmax(x[:, 28:31]), x04[:, 0]])  # Q10: sums to answer 4.1
x11 = Multiply()([softmax(x[:, 31:37]), x04[:, 0]])  # Q11: sums to answer 4.1

outputs = Concatenate(axis=1)([x01, x02, x03, x04, x05, x06, x07, x08, x09, x10, x11])
model = Model(inputs=inputs, outputs=outputs)
model.summary()

In [None]:
# for layer in conv_base.layers:
#     print(layer.name)

In [None]:
# Freeze the whole backbone
# conv_base.trainable = False

# Freeze everything from the first layer up to a specific layer
# conv_base.trainable = True
# set_trainable = False
# for layer in conv_base.layers:
#     if layer.name == 'block14_sepconv1':
#         set_trainable = True
#     if set_trainable:
#         layer.trainable = True
#     else:
#         layer.trainable = False

# Number of trainable weight tensors in the model (nothing is frozen above).
len(model.trainable_weights)

In [None]:
def root_mean_squared_error(y_true, y_pred):
    """Return the square root of the mean squared difference over all elements.

    Args:
        y_true: Target tensor.
        y_pred: Prediction tensor, broadcast-compatible with ``y_true``.
    """
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))

In [None]:
# Build the optimizer and compile the model.
from tensorflow.keras import optimizers

# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
# 'acc' is kept because a later cell plots history["acc"], although accuracy
# has little meaning for this regression target.
# Alternatives tried: RMSprop/Adam at 1e-4, Adam at 1e-3, SGD(0.01) with and
# without momentum=0.9.
model.compile(loss='MeanSquaredError',
              optimizer=optimizers.RMSprop(learning_rate=1e-3),
              metrics=['acc', root_mean_squared_error])

In [None]:
%%time
# history = model.fit(ds, epochs=5, steps_per_epoch=10, use_multiprocessing=True, workers=8, 
#                     validation_steps=10, validation_data=ds_valid)
history = model.fit(ds, epochs=5, steps_per_epoch=x_train.shape[0]//batch_size, use_multiprocessing=True, workers=8, 
                    validation_steps=x_test.shape[0]//batch_size, validation_data=ds_valid)
# history = model.fit(ds, epochs=20, steps_per_epoch=61578//32+1, workers=0, validation_steps=1, validation_data=ds_valid, initial_epoch=2)

In [None]:
# Predict a single validation batch (steps=1) for the sanity check that follows.
# use_multiprocessing / workers are omitted: they do not apply to tf.data input.
y_pred = model.predict(ds_valid, steps=1, verbose=1)

In [None]:
# Print one prediction row, then the sum of its outputs within each of the
# eleven question groups.
idx = 2
question_slices = [
    slice(0, 3), slice(3, 5), slice(5, 7), slice(7, 9), slice(9, 13),
    slice(13, 15), slice(15, 18), slice(18, 25), slice(25, 28), slice(28, 31),
    slice(31, 37),
]
print(y_pred[idx])
print(*[sum(y_pred[idx, group]) for group in question_slices])

In [None]:
# Loss and metric values over the validation dataset.
model.evaluate(ds_valid)

In [None]:
# Save the trained model to my_model.h5 in the working directory.
model.save('my_model.h5')

In [None]:
# Per-epoch values of the loss and metrics recorded by fit().
history.history

In [None]:
# Plot training and validation RMSE per epoch.
rmse_series = (
    ("root_mean_squared_error", "train", "o"),
    ("val_root_mean_squared_error", "test", "x"),
)
for series_key, series_label, series_marker in rmse_series:
    plt.plot(history.history[series_key], label=series_label, ls="-", marker=series_marker)
plt.xlabel("epoch")
plt.ylabel("root_mean_squared_error")
plt.legend(loc="best")
plt.show()

In [None]:
# Plot acc and val_acc per epoch on a fixed 0-1 axis.
acc_series = (
    ("acc", "train", "o"),
    ("val_acc", "test", "x"),
)
for series_key, series_label, series_marker in acc_series:
    plt.plot(history.history[series_key], label=series_label, ls="-", marker=series_marker)
plt.xlabel("epoch")
plt.ylabel("Accuracy")
plt.ylim(0, 1)
plt.legend(loc="best")
plt.show()

## Submission

In [None]:
# Load the benchmark submission; its first column provides the test image ids
# and its remaining columns are overwritten with predictions further down.
submission = pd.read_csv('../input/galaxy-zoo-the-galaxy-challenge/all_ones_benchmark.zip')

In [None]:
# submission = submission.sample(n=1000, random_state=0)

In [None]:
# Number of rows and columns in the submission table.
submission.shape

In [None]:
def load_and_preprocess_test_image(path):
    """Read one extracted test JPEG and preprocess it without augmentation.

    Args:
        path: Image id as a string; the file read is
            ``images_test_rev1/<path>.jpg`` relative to the working directory.

    Returns:
        The preprocessed image tensor produced by ``preprocess_image``.
    """
    raw_jpeg = tf.io.read_file('images_test_rev1/' + path + '.jpg')
    return preprocess_image(raw_jpeg)

In [None]:
import matplotlib.pyplot as plt

# Preview the first test image after preprocessing, titled with its id.
label = str(submission.iloc[0, 0])

plt.imshow(load_and_preprocess_test_image(label))
plt.grid(False)
plt.title(label)
# Render explicitly, as the other plotting cells do, instead of using a bare
# print() to hide the title object's repr.
plt.show()

In [None]:
# Test input pipeline. predict() only consumes the images, so no placeholder
# label tensor is built or zipped in; prefetch overlaps loading with inference.
test_ids = submission.values[:, 0].astype(int).astype(str)
path_ds_test = tf.data.Dataset.from_tensor_slices(test_ids)
image_ds_test = path_ds_test.map(load_and_preprocess_test_image, num_parallel_calls=AUTOTUNE)
ds_test = image_ds_test.batch(batch_size).prefetch(AUTOTUNE)

In [None]:
# Predict every test image, in dataset order.
# use_multiprocessing / workers are omitted: they do not apply to tf.data input.
y_pred = model.predict(ds_test, verbose=1)

In [None]:
# Overwrite every column after the first with the predictions. Assigning by
# column label replaces the columns rather than writing into the existing
# arrays, so the float predictions are stored unchanged even if the benchmark
# columns were parsed as integers (NOTE(review): benchmark dtype not verified).
prediction_columns = submission.columns[1:]
submission[prediction_columns] = y_pred

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)