# In [None]:  (Jupyter notebook cell marker left over from the export)
import pandas as pd
import tensorflow as tf
import numpy as np

from random import randrange
from keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D, BatchNormalization

# Dataset root directory. The commented-out alternative below presumably points
# at a local copy of the same data -- confirm before switching.
ROOT_FOLDER = '../input/whale-categorization-playground'
# ROOT_FOLDER = 'data'
TRAIN_FOLDER = ROOT_FOLDER + '/train/train/'
TEST_FOLDER = ROOT_FOLDER + '/test/test/'
# Size every image is resized to when it is loaded.
WIDTH = 150
HEIGHT = 150

# train.csv provides the 'Image' (file name) and 'Id' (label) columns;
# sample_submission.csv provides the 'Image' names of the test set.
df_train = pd.read_csv(ROOT_FOLDER + '/train.csv')
df_test = pd.read_csv(ROOT_FOLDER + '/sample_submission.csv')

# Total number of training samples to draw, and how many are drawn per step.
TIMES_TO_TRAIN = len(df_train) * 50
BATCH_SIZE = 100

# Two-way mapping between a label and its class index (the position of that
# label in the one-hot target / model output vector). Class indices are handed
# out in order of first appearance in train.csv.
#
# index_to_label must be the exact inverse of label_to_index: it is looked up
# with positions of the model output vector, so keying it by DataFrame row
# number would translate predictions into unrelated labels.
label_to_index = {}
index_to_label = {}
for label in df_train['Id']:
    if label not in label_to_index:
        class_index = len(label_to_index)
        label_to_index[label] = class_index
        index_to_label[class_index] = label

# CNN classifier: two convolution / max-pooling stages (each followed by
# dropout and batch normalisation), then a dense layer with one unit per class.
model = Sequential()
# Keras image tensors are laid out as (height, width, channels).
model.add(Conv2D(filters=16, kernel_size=5, padding="same", activation="relu",
                 input_shape=(HEIGHT, WIDTH, 3)))
model.add(MaxPooling2D(pool_size=(5, 5), strides=(5, 5), padding="valid"))
model.add(Dropout(0.1))
model.add(BatchNormalization())

model.add(Conv2D(32, (15, 15), activation="relu"))
model.add(MaxPooling2D(pool_size=(5, 5), strides=(5, 5), padding="valid"))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Flatten())

# 4251 output units: this must equal the length of the one-hot target vectors
# fed to the model during training.
# Each image carries exactly one label, so the outputs are normalised with a
# softmax into a probability distribution over the classes. An unbounded ReLU
# output is not a valid probability for a cross-entropy loss.
model.add(Dense(4251, activation="softmax"))

# Single-label multi-class problem with one-hot targets -> categorical
# cross-entropy. With this loss 'acc' is categorical accuracy (is the arg-max
# class correct?) instead of a per-unit binary accuracy that is dominated by
# the thousands of zero entries in every target vector.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['categorical_crossentropy', 'acc'])


# data augmentation
# All dataset-level / sample-level normalisation options and both flips are
# explicitly switched off; only small random rotations, zooms and shifts are
# applied to the training images.
augmentation_options = dict(
    # --- normalisation (all disabled) ---
    featurewise_center=False,             # would zero the mean over the dataset
    samplewise_center=False,              # would zero the mean of each sample
    featurewise_std_normalization=False,  # would divide inputs by the dataset std
    samplewise_std_normalization=False,   # would divide each input by its own std
    zca_whitening=False,                  # would apply ZCA whitening
    # --- random geometric jitter ---
    rotation_range=10,                    # rotate by up to 10 degrees
    zoom_range=0.1,                       # zoom in/out by up to 10%
    width_shift_range=0.1,                # horizontal shift, fraction of total width
    height_shift_range=0.1,               # vertical shift, fraction of total height
    # --- flips (disabled) ---
    horizontal_flip=False,
    vertical_flip=False,
)
datagen = ImageDataGenerator(**augmentation_options)


# training
# Every step draws BATCH_SIZE random training images (with replacement), then
# fits the model once on an augmented version of that batch and once on the
# raw batch.
num_steps = TIMES_TO_TRAIN // BATCH_SIZE
# Width of the one-hot targets is read from the model itself so the targets
# always match the output layer instead of repeating its size as a literal.
num_classes = model.output_shape[-1]

for t in range(num_steps + 1):
    print('{}/{}'.format(t, num_steps))

    x_train = []
    y_train = np.zeros((BATCH_SIZE, num_classes))
    for i in range(BATCH_SIZE):
        # Look the sampled row up once and reuse it for both image and label.
        row = df_train.loc[randrange(len(df_train))]
        # Keras expects target_size as (height, width).
        x_train.append(img_to_array(load_img(TRAIN_FOLDER + row['Image'],
                                             target_size=(HEIGHT, WIDTH))))
        y_train[i, label_to_index[row['Id']]] = 1

    x_train = np.array(x_train)

    # Per the Keras docs, datagen.fit() is only required when featurewise
    # centering / std normalisation or ZCA whitening is enabled on the
    # generator. It is kept so that turning those options on keeps working.
    datagen.fit(x_train)
    # Pass 1: a single augmented batch produced by the generator.
    model.fit_generator(datagen.flow(x_train, y_train, batch_size=BATCH_SIZE),
                        steps_per_epoch=x_train.shape[0] // BATCH_SIZE, verbose=False)
    # Pass 2: the same batch without augmentation.
    model.fit(x_train, y_train, batch_size=BATCH_SIZE, verbose=False)


# predicting
def sort_predictions(array):
    """Return the labels of the five highest-scoring entries of *array*.

    *array* is a sequence of scores; positions are ranked by score in
    descending order (ties keep their original relative order) and the first
    five positions are translated to labels through the module-level
    ``index_to_label`` mapping. Fewer than five labels are returned when
    *array* has fewer than five entries.

    Assumes ``index_to_label`` maps a position in *array* (a class index) to
    the label of that class.
    """
    ranked = sorted(enumerate(array), key=lambda pair: pair[1], reverse=True)
    return [index_to_label[position] for position, _score in ranked[:5]]

print('----------------- PREDICTING -----------------')
# predictions[k] holds the five top-ranked labels for the k-th row of df_test.
predictions = []
num_test_images = len(df_test)
for index, row in df_test.iterrows():
    # Read the file name by column name rather than unpacking the row by
    # position, so extra or reordered columns in the CSV cannot break the loop.
    # Keras expects target_size as (height, width).
    pixels = img_to_array(load_img(TEST_FOLDER + row['Image'],
                                   target_size=(HEIGHT, WIDTH)))
    # The model takes a batch; wrap the single image and unwrap the result.
    preds = model.predict(np.array([pixels]))[0]
    predictions.append(sort_predictions(preds))

    print('{}/{}'.format(index, num_test_images))

# submission
# One row per test image: the image file name and its five predicted labels
# joined by single spaces, written to submission.csv without the index column.
submission_rows = []
for image_name, top_labels in zip(df_test['Image'], predictions):
    submission_rows.append([image_name, "{} {} {} {} {}".format(*top_labels)])

submission = pd.DataFrame(data=submission_rows, columns=['Image', 'Id'])
print(submission)
submission.to_csv('submission.csv', index=False)