In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import glob
import cv2

In [None]:
# Read the training metadata table and preview its first rows.
train_csv_path = "../input/happy-whale-and-dolphin/train.csv"
df_train = pd.read_csv(train_csv_path)
print(df_train.head())

In [None]:
def list_full_paths(directory):
    """Return one path per entry of ``directory``, each prefixed with ``directory``.

    The entries are returned in whatever order ``os.listdir`` yields them.
    """
    full_paths = []
    for entry in os.listdir(directory):
        full_paths.append(os.path.join(directory, entry))
    return full_paths

image_path = '../input/happy-whale-and-dolphin/train_images/'
# Build every row's path from that row's own file name. Assigning a raw
# directory listing as the column would pair rows with files in arbitrary
# os.listdir() order, attaching each species label to an unrelated image
# (and would raise if the number of files differed from the number of rows).
# NOTE(review): assumes train.csv stores the file name in an `image` column - confirm against the data.
df_train['filepath'] = [os.path.join(image_path, file_name) for file_name in df_train['image']]
print(df_train.head())

In [None]:
# plot a few example images (the first four rows of df_train) in a 2x2 grid
fig, ax = plt.subplots(2, 2, figsize=(10, 10))
for a in ax.ravel():
    a.axis('off')
# ravel() walks the grid row by row, so image i lands in the same cell as before.
for a, path in zip(ax.ravel(), df_train['filepath'].iloc[:4]):
    # OpenCV decodes images in BGR channel order while matplotlib expects RGB;
    # without the conversion the red and blue channels are swapped on screen.
    a.imshow(cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB))

In [None]:
# cleanup species labels: map misspelt / alternative names onto one label each
species_fixes = {
    'beluga'            : 'beluga_whale',
    'globis'            : 'short_finned_pilot_whale',
    'pilot_whale'       : 'short_finned_pilot_whale',
    'bottlenose_dolpin' : 'bottlenose_dolphin',
    'kiler_whale'       : 'killer_whale',
}
# Assign the result back to the column. Calling replace(..., inplace=True) on
# df_train['species'] operates on a selected Series, which pandas does not
# guarantee to write through to df_train (with Copy-on-Write it never does).
df_train['species'] = df_train['species'].replace(species_fixes)

species_names = df_train['species'].unique()
print(species_names)
print(f'{len(species_names)} unique values')

In [None]:
# how many rows carry each species label, drawn as a bar chart
species_counts = df_train['species'].value_counts()
species_counts.plot.bar()

In [None]:
# Distinct species labels and how many of them there are.
# The labels are left as strings here; no manual label encoding is applied.
categories = pd.unique(df_train['species'])
n_unique = len(categories)

print(categories)

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
# load images into data generator

# split train into train and validation, ensuring reproducibility
from sklearn.model_selection import train_test_split
train, validation = train_test_split(df_train, test_size=0.2, random_state=42, shuffle=True)

# Both generators get the same explicit class list, taken from the full table.
# Left to infer classes on its own, each generator only sees the species present
# in its split, so a species missing from one split would shift the label
# indices and change the width of the one-hot vectors between train and validation.
class_names = sorted(df_train['species'].unique().tolist())

# Settings shared by both generators. target_size is set explicitly because the
# flow_from_dataframe default is (256, 256), which does not match the 255x255
# input that the CNN in this notebook declares.
flow_kwargs = dict(
    x_col='filepath',
    y_col='species',
    class_mode='categorical',
    classes=class_names,
    target_size=(255, 255),
)

# Pixel values are rescaled from [0, 255] to [0, 1] for both splits.
train_gen = ImageDataGenerator(rescale=1. / 255)
train_generator = train_gen.flow_from_dataframe(dataframe=train, **flow_kwargs)

val_gen = ImageDataGenerator(rescale=1. / 255)
val_generator = val_gen.flow_from_dataframe(dataframe=validation, **flow_kwargs)

# Number of species in train/validation

In [None]:
# species frequencies within the training split
train['species'].value_counts().plot.bar()

In [None]:
# species frequencies within the validation split
validation['species'].value_counts().plot.bar()

# Simple CNN for an initial test, plus a trial of some basic data augmentation

In [None]:
# data augmentation
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import RandomFlip, RandomRotation, RandomZoom

# Take the input shape from the generator that feeds the model instead of
# hard-coding it, so the declared input always matches the image batches the
# generator actually produces (height, width, channels).
input_shape = train_generator.image_shape

# Optional augmentation stage, currently disabled:
#data_augmentation = keras.Sequential()
#data_augmentation.add(RandomFlip('horizontal', input_shape=input_shape))
#data_augmentation.add(RandomRotation(0.1))
#data_augmentation.add(RandomZoom(0.1))

# model definition: three Conv2D + MaxPooling2D stages with 32, 64 and 128
# filters, then a small dense head ending in a softmax over the species.
model = Sequential()

# The input shape is declared once, up front; later layers infer their own.
model.add(keras.Input(shape=input_shape))

#model.add(data_augmentation)

for n_filters in (32, 64, 128):
    model.add(Conv2D(filters=n_filters, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(2, 2))

model.add(Flatten())

model.add(Dense(50, activation='relu'))
# one output unit per species label
model.add(Dense(n_unique, activation='softmax'))

model.summary()

In [None]:
# Check that a GPU is attached: run the `nvidia-smi` shell command through
# IPython's `!` escape and show its output in the cell.
!nvidia-smi

In [None]:
# compile the model, then train it
from tensorflow.keras.optimizers import Adam
optimizer = Adam()
model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

# callbacks: stop once val_loss has gone 15 epochs without improving and
# roll the model back to the best weights seen
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=15,
    restore_best_weights=True,
    verbose=1,
)
#lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=1, mode='min', min_lr=0.00000001)
callbacks = [es]

#epochs = 20
epochs = 1
# run the training loop with ops placed on the first GPU
with tf.device('/device:GPU:0'):
    history = model.fit(
        train_generator,
        epochs=epochs,
        callbacks=callbacks,
        validation_data=val_generator,
    )

In [None]:
# Learning curves: loss on the left panel, accuracy on the right.
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
for panel, metric in zip(axs, ('loss', 'accuracy')):
    # history.history[metric] is recorded on the training data during fit(),
    # so that curve is labelled 'Train' (it was mislabelled 'Test');
    # the 'val_' entry comes from the validation generator.
    panel.plot(history.history[metric], label='Train')
    panel.plot(history.history[f'val_{metric}'], label='Validation')
    panel.set_xlabel('epochs')
    panel.set_ylabel(metric)
    panel.legend()

# The estimated training time for a single epoch is about 1 hour.