# Humpback Whale Identification - CNN with Keras

In [1]:
import gc
import os
import time

In [2]:
# Mutual exclusion trick :O (with a lock file) #Mutex #Beurk #DeepLearning
while os.path.isfile("lock"):
    time.sleep(30)

In [3]:
!touch lock

In [4]:
import numpy as np
import pandas as pd
import progressbar

import matplotlib.pyplot as plt
import matplotlib.image as mplimg
from matplotlib.pyplot import imshow

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from keras import layers
from keras.preprocessing import image
from keras.applications.imagenet_utils import preprocess_input
from keras.layers import Input, Dense, Activation, BatchNormalization, Flatten, Conv2D
from keras.layers import AveragePooling2D, MaxPooling2D, Dropout
from keras.models import Model

import keras.backend as K
from keras.models import Sequential

import warnings
warnings.simplefilter("ignore", category=DeprecationWarning)

Using TensorFlow backend.


In [5]:
train_df = pd.read_csv("../../input/train.csv")
train_df.head()

Unnamed: 0,Image,Id
0,0000e88ab.jpg,w_f48451c
1,0001f9222.jpg,w_c3d896a
2,00029d126.jpg,w_20df2c5
3,00050a15a.jpg,new_whale
4,0005c1ef8.jpg,new_whale


In [6]:
def prepareImages(data, m, dataset):
    #print("Preparing images")
    #print(m)
    X_train = np.zeros((m, 64, 64, 3))
    count = 0
    
    for fig in progressbar.progressbar(data['Image']):
        # load images into images of size 128x128x3
        img = image.load_img("../../input/"+dataset+"/"+fig, target_size=(64, 64, 3))
        x = image.img_to_array(img)
        x = preprocess_input(x)
        X_train[count] = x
        count += 1
    
    return X_train

In [7]:
def prepare_labels(y):
    values = np.array(y)
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(values)
    #print(integer_encoded)

    onehot_encoder = OneHotEncoder(sparse=False)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
    #print(onehot_encoded)

    y = onehot_encoded
    #print(y.shape)
    return y, label_encoder

In [8]:
X = prepareImages(train_df, train_df.shape[0], "train")
X /= 255

100% (25361 of 25361) |##################| Elapsed Time: 0:02:52 Time:  0:02:52


In [9]:
y, label_encoder = prepare_labels(train_df['Id'])

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [10]:
y.shape

(25361, 5005)

# Change model starting from here

## Train

In [18]:
model = Sequential()

model.add(Conv2D(32, (7, 7), strides = (1, 1), name = 'conv0', input_shape = (64, 64, 3)))

model.add(BatchNormalization(axis = 3, name = 'bn0'))
model.add(Activation('relu'))

model.add(AveragePooling2D((3, 3), name='avg_pool'))
model.add(Conv2D(64, (3, 3), strides = (1,1), name="conv1"))
model.add(Activation('relu'))
model.add(AveragePooling2D((3, 3), name='avg_pool2'))

model.add(Flatten())
model.add(Dense(500, activation="relu", name='rl'))
model.add(Dropout(0.8))
model.add(Dense(y.shape[1], activation='softmax', name='sm'))

model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv0 (Conv2D)               (None, 58, 58, 32)        4736      
_________________________________________________________________
bn0 (BatchNormalization)     (None, 58, 58, 32)        128       
_________________________________________________________________
activation_3 (Activation)    (None, 58, 58, 32)        0         
_________________________________________________________________
avg_pool (AveragePooling2D)  (None, 19, 19, 32)        0         
_________________________________________________________________
conv1 (Conv2D)               (None, 17, 17, 64)        18496     
_________________________________________________________________
activation_4 (Activation)    (None, 17, 17, 64)        0         
_________________________________________________________________
avg_pool2 (AveragePooling2D) (None, 5, 5, 64)          0         
__________

In [19]:
history = model.fit(X, y, epochs=100, batch_size=100, verbose=1)
gc.collect()

ValueError: Input arrays should have the same number of samples as target arrays. Found 7960 input samples and 25361 target samples.

In [20]:
plt.plot(history.history['acc'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.show()

NameError: name 'history' is not defined

## Test

In [21]:
test = os.listdir("../../input/test/")
print(len(test))

7960


In [22]:
col = ['Image']
test_df = pd.DataFrame(test, columns=col)
test_df['Id'] = ''

In [None]:
X = prepareImages(test_df, test_df.shape[0], "test")
X /= 255

In [None]:
predictions = model.predict(np.array(X), verbose=1)

In [None]:
for i, pred in enumerate(predictions):
    test_df.loc[i, 'Id'] = ' '.join(label_encoder.inverse_transform(pred.argsort()[-5:][::-1]))

In [None]:
test_df.head(10)
test_df.to_csv('keras-cnn-starter_without_average_polling_instead_of_maxpooling.csv', index=False) #> Score = 0.286

In [None]:
!kaggle competitions submit -c humpback-whale-identification -f "keras-cnn-starter_without_average_polling_instead_of_maxpooling.csv" -m "Keras CNN without average polling instead of maxpooling (64x64)"

In [23]:
!rm lock

In [None]:
%%javascript
// From https://github.com/jupyter/notebook/issues/1880
Jupyter.notebook.session.delete();

<IPython.core.display.Javascript object>