## References

    1.https://www.kaggle.com/youhanlee/small-data-many-class-data-augmentation 
    2.https://www.kaggle.com/satian/keras-mobilenet-starter
    3.https://medium.com/ymedialabs-innovation/data-augmentation-techniques-in-cnn-using-tensorflow-371ae43d5be9

## Overview

We are going to try to get a good share of our predictions right using the MobileNet architecture (trained from scratch here, without pretrained weights), but we will focus our efforts on improving the input data.
The goal of this competition is identifying individual whales in images. The train dataset includes 25k images and 5k unique whale ids. In addition, ~10k of images show unique whales ('new_whale' label).

To improve the dataset we are going to carry out different data augmentation techniques:

- Feature Standardization
- ZCA Whitening
- Random Rotations
- Random Shifts
- Random Flips

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
import matplotlib.image as mplimg
import seaborn as sns
from matplotlib.pyplot import imshow
from matplotlib import pyplot
from keras.backend import clear_session
import keras

from keras import applications
from keras import layers
from keras.layers import Flatten, Input, Add, Dense, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D, AveragePooling2D, MaxPooling2D, GlobalMaxPooling2D, Dropout  
from keras.models import Sequential, Model, load_model  
from keras.preprocessing import image
from keras.applications.imagenet_utils import preprocess_input
import keras.backend as K  
from keras.callbacks import ModelCheckpoint  
from keras.callbacks import EarlyStopping

from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split

from keras.preprocessing.image import ImageDataGenerator

clear_session()

# Any results you write to the current directory are saved as output.

## functions
   
def graph_acc_loss(model):
    """Plot the per-epoch training metrics stored in a Keras training history.

    Draws a 2x2 grid: accuracy, loss, categorical accuracy and categorical
    cross-entropy.  For accuracy and loss the matching ``val_*`` series is
    overlaid when the history contains one, and the legend only names the
    curves that were actually drawn.

    Args:
        model: despite its name, the object returned by ``model.fit`` /
            ``model.fit_generator``; only its ``history`` dict is read.

    Raises:
        KeyError: if ``'acc'``, ``'loss'``, ``'categorical_accuracy'`` or
            ``'categorical_crossentropy'`` is missing from the history.
    """
    history = model.history

    sns.set(style='darkgrid')
    plt.figure(figsize=(24, 8))

    # (history key, text used in axis label and title, colour, bottom row?)
    panels = [
        ('acc', 'ACCURACY', 'r', False),
        ('loss', 'LOSS', 'r', False),
        ('categorical_accuracy', 'CATEGORICAL ACCURACY', 'b', True),
        ('categorical_crossentropy', 'CATEGORICAL CROSSENTROPY', 'b', True),
    ]

    for position, (key, text, colour, bottom_row) in enumerate(panels, start=1):
        plt.subplot(2, 2, position)
        plt.plot(history[key], colour)

        if bottom_row:
            plt.xlabel('EPOCHS')
            legend = [key]
        else:
            legend = ['train']
            validation = history.get('val_' + key)
            if validation is not None:
                plt.plot(validation, 'g')
                legend.append('validation')

        plt.ylabel('TRAINING ' + text)
        plt.title('TRAINING ' + text + ' vs EPOCHS')
        plt.legend(legend)
    
    
def prepare_data(df,width,heigth, channel):
    """Load the training images listed in ``df`` and split them 80/20.

    Args:
        df: DataFrame with an ``'Image'`` column (file names inside the train
            directory) and an ``'Id'`` column (class labels).
        width: size of the first image axis of the returned arrays.
        heigth: size of the second image axis of the returned arrays.
        channel: accepted for signature compatibility but ignored; the image
            array is always allocated with 3 channels.

    Returns:
        ``(X_train, X_test, y_train, y_test, integer_encoded)``.  The first
        four come from ``train_test_split`` (20% held out) of the image array
        and the one-hot label matrix; ``integer_encoded`` is the ``(n, 1)``
        column of integer class codes for every row of ``df``.
    """
    # Imported locally so the ImageNet preprocessing is used no matter what the
    # global name ``preprocess_input`` has been rebound to by later cells.
    from keras.applications.imagenet_utils import preprocess_input as imagenet_preprocess

    n_of_images = df.shape[0]
    channel = 3
    X = np.zeros((n_of_images, width, heigth, channel))

    for count, file in enumerate(df['Image']):
        # load_img expects a 2-tuple; the channel count is not part of it.
        img = image.load_img('../input/humpback-whale-identification/train/%s' % file,
                             target_size=(width, heigth))
        X[count] = imagenet_preprocess(image.img_to_array(img))

        if count % 4000 == 0:
            print("ProcessingImage : " , count+1,", ",file)
    print ("Total %s load ok" % n_of_images)

    # Labels: whale ids -> integer codes -> one-hot rows.
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(df['Id'].values).reshape(-1, 1)

    # One-hot built with numpy: OneHotEncoder(sparse=False) is rejected by
    # recent scikit-learn releases, and the result here is the same dense
    # float matrix with one column per class.
    codes = integer_encoded.ravel()
    y = np.zeros((codes.size, len(label_encoder.classes_)))
    y[np.arange(codes.size), codes] = 1.0

    # split dataset in 20% validate and rest to train
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

    return X_train, X_test, y_train, y_test, integer_encoded

def prepare_labels(y):
    """One-hot encode a 1-D sequence of class labels.

    Args:
        y: labels to encode (for example the ``'Id'`` column of the train CSV).

    Returns:
        ``(onehot, label_encoder)``: a dense float array with one row per
        label and one column per distinct class, and the fitted
        ``LabelEncoder`` whose ``inverse_transform`` maps column indices back
        to the original labels.
    """
    label_encoder = LabelEncoder()
    codes = label_encoder.fit_transform(np.array(y))

    # Built with numpy instead of OneHotEncoder(sparse=False): that keyword is
    # rejected by recent scikit-learn releases, and the output is identical
    # because the codes are exactly 0..n_classes-1.
    onehot = np.zeros((codes.size, len(label_encoder.classes_)))
    onehot[np.arange(codes.size), codes] = 1.0

    return onehot, label_encoder

def prepareImages(data, m, dataset):
    """Load every image listed in ``data['Image']`` into one numpy array.

    Args:
        data: DataFrame with an ``'Image'`` column of file names.
        m: number of rows to allocate; callers pass ``data.shape[0]``.
        dataset: sub-directory of the competition input folder to read from
            (the notebook uses ``"train"`` and ``"test"``).

    Returns:
        Array of shape ``(m, 96, 96, 3)`` with the preprocessed images, in the
        order of ``data['Image']``.
    """
    # Imported locally on purpose: a later cell rebinds the global name
    # ``preprocess_input`` to the MobileNet variant, which would otherwise make
    # the test images go through a different preprocessing than the training
    # images loaded before that cell ran.
    from keras.applications.imagenet_utils import preprocess_input as imagenet_preprocess

    print("Preparing images")
    base_dir = "../input/humpback-whale-identification/" + dataset + "/"
    X_train = np.zeros((m, 96, 96, 3))

    for count, fig in enumerate(data['Image']):
        # resize to 96x96 on load; load_img expects a 2-tuple here
        img = image.load_img(base_dir + fig, target_size=(96, 96))
        X_train[count] = imagenet_preprocess(image.img_to_array(img))

        if count % 500 == 0:
            print("Processing image: ", count+1, ", ", fig)

    return X_train

## Read the train.csv and explore 

First we read the csv file; then, using describe(), we can see how many unique ids there are.

In [None]:
# Load the training index and summarise the 'Id' column.
df_ = pd.read_csv('../input/humpback-whale-identification/train.csv', encoding='utf8')
print(df_['Id'].describe())

Okay, we have a total of 25361 images covering 5005 different ids, but one of them is "new_whale", i.e. a whale that has not been catalogued. We could create a new dataframe that drops it (that filter is commented out below, so all ids are kept). We also count the ids and store the result in our variable "number_of_clases".

In [None]:
# The filter that would drop the 'new_whale' rows is left disabled, so every
# row of df_ is kept:
# df = df_.loc[df_['Id'] != 'new_whale']
df = df_
number_of_clases = df["Id"].nunique()
print ("Number of Classes: %s" % number_of_clases)

In [None]:
# Show the first rows of the training index.
df.head()

We created a graph to have a vision of the distribution of the labels.

In [None]:
train = df
# Number of images per whale id; counts above 60 are capped at 60 so the long
# tail collapses into a single bar.
counted = train.groupby("Id").count().rename(columns={"Image":"image_count"})
counted["image_count"] = counted["image_count"].clip(upper=60)

plt.figure(figsize=(25,4))
sns.countplot(x="image_count", data=counted)
plt.show()

With the function "prepare_data" we create numpy arrays with the content of the images; we also prepare the labels y from the whale ids (we apply label encoding and then one-hot encoding so that numerical values can be passed to the neural network). Finally, the same function splits the data into a training set and a validation set.

In [None]:
#X_train, X_test, y_train, y_test, integer_encoded = prepare_data(df,96,96,3)

In [None]:
# Re-read the training index and load every listed image into one array.
train_df = pd.read_csv("../input/humpback-whale-identification/train.csv")
X = prepareImages(train_df, train_df.shape[0], "train")
# Scale in place so no second copy of the image array is created.
X /= 255

In [None]:
# One-hot training targets plus the encoder needed to map class indices back to ids.
y, label_encoder = prepare_labels(train_df['Id'])

## **MobileNet**

To start, we will train MobileNet (https://arxiv.org/abs/1704.04861) on the data as it is and see what results it gives us; then we will continue training the network while expanding the dataset with the various augmentation methods discussed above.

In [None]:
from keras.metrics import categorical_accuracy, top_k_categorical_accuracy, categorical_crossentropy
from keras.optimizers import Adam
from keras.applications import MobileNet
# NOTE: this rebinds the module-level name ``preprocess_input`` (imported at the
# top of the notebook from keras.applications.imagenet_utils) to the MobileNet
# variant for every global lookup made after this cell runs.
from keras.applications.mobilenet import preprocess_input
import keras

# MobileNet architecture with randomly initialised weights (weights=None).
# The width of the softmax layer is taken from the one-hot label matrix so the
# model output can never disagree with the targets it is trained on.
model = MobileNet(input_shape=(96, 96, 3), alpha=1., weights=None, classes=y.shape[1])
model.compile(optimizer=Adam(lr=0.002), loss='categorical_crossentropy',
              metrics=['acc','mse',categorical_crossentropy, categorical_accuracy])
#model.summary()

In [None]:

# Baseline run on the un-augmented images.  No validation data is passed, so
# the returned history only contains training series.  Weights are
# checkpointed every 5 epochs.
mc = keras.callbacks.ModelCheckpoint(filepath='weights{epoch:08d}.h5',
                                     period=5, save_weights_only=True)
mobilenet = model.fit(
    x=X,
    y=y,
    batch_size=100,
    epochs=100,
    shuffle=True,
    callbacks=[mc],
    verbose=1,
)

In [None]:
# Plot the curves recorded during the baseline training run.
graph_acc_loss(mobilenet)

In [None]:
## Save the partly trained model (written before any augmented training runs)
model.save('00_mobilenet_trained.h5')

## **AUGMENTATION**
https://machinelearningmastery.com/image-augmentation-deep-learning-keras/

In [None]:
# Feature Standardization
def augmentation_feature_standardization(X_train):
    """Return a generator that standardises images feature-wise.

    The generator is fitted on ``X_train`` so it can centre and scale batches
    with statistics computed from that array.
    """
    generator = ImageDataGenerator(
        featurewise_std_normalization=True,
        featurewise_center=True,
    )
    generator.fit(X_train)
    return generator

# Random Flips
def augmentation_random_flips(X_train):
    """Return a generator that randomly flips images horizontally and vertically.

    Args:
        X_train: unused; kept so the signature matches the other
            ``augmentation_*`` helpers.

    Returns:
        The configured ``ImageDataGenerator``.
    """
    # No fit() call: fitting is only required for the feature-wise statistics
    # (centering, std normalisation, ZCA).  For flips it computes nothing and
    # only makes a full copy of the image array.
    return ImageDataGenerator(horizontal_flip=True, vertical_flip=True)

# Random Rotations
def augmentation_random_rotations(X_train):
    """Return a generator that rotates images by a random angle (range 90).

    Args:
        X_train: unused; kept so the signature matches the other
            ``augmentation_*`` helpers.

    Returns:
        The configured ``ImageDataGenerator``.
    """
    # No fit() call: fitting is only required for the feature-wise statistics
    # (centering, std normalisation, ZCA).  For rotations it computes nothing
    # and only makes a full copy of the image array.
    return ImageDataGenerator(rotation_range=90)

# Random shifts
def augmentation_random_shifts(X_train, shift=None):
    """Return a generator that randomly shifts images along both axes.

    Args:
        X_train: unused; kept so the signature matches the other
            ``augmentation_*`` helpers.
        shift: value passed as both ``width_shift_range`` and
            ``height_shift_range``.  When omitted, a module-level variable
            named ``shift`` is used if one exists (what the function relied on
            before), otherwise 0.2.

    Returns:
        The configured ``ImageDataGenerator``.
    """
    if shift is None:
        # Previously an undefined global ``shift`` raised NameError here.
        shift = globals().get('shift', 0.2)

    # No fit() call: fitting is only required for the feature-wise statistics
    # (centering, std normalisation, ZCA), not for random shifts.
    return ImageDataGenerator(width_shift_range=shift, height_shift_range=shift)

# ZCA whitening
def augmentation_random_zca(X_train):
    """Return a generator with ZCA whitening enabled, fitted on ``X_train``."""
    generator = ImageDataGenerator(zca_whitening=True)
    generator.fit(X_train)
    return generator

In [None]:
#X_train_featureStandarization = augmentation_feature_standardization(X)
X_train_randomFlips = augmentation_random_flips(X)
#X_train_randomShifts = augmentation_random_shifts(X)
X_train_randomRotations = augmentation_random_rotations(X)
#X_train_randomZca = augmentation_random_zca(X)

**Random Flips**

In [None]:
# Draw a single augmented batch of nine images and show it as a 3x3 grid.
X_batch, y_batch = next(iter(X_train_randomFlips.flow(X, y, batch_size=9)))
for i in range(9):
    pyplot.subplot(3, 3, i + 1)
    pyplot.imshow(X_batch[i], cmap=pyplot.get_cmap('gray'))
pyplot.show()

In [None]:
# Keep training the same model on randomly flipped batches; weights are
# checkpointed every 5 epochs.
batch_size = 100
mc = keras.callbacks.ModelCheckpoint(filepath='weights_Flips{epoch:08d}.h5',
                                     period=5, save_weights_only=True)

mobilenet1 = model.fit_generator(
    generator=X_train_randomFlips.flow(X, y, batch_size=batch_size),
    steps_per_epoch=len(X) // batch_size,
    epochs=20,
    callbacks=[mc],
    verbose=1,
)
# optional: use_multiprocessing=True, workers=6

In [None]:
# Plot the curves recorded during the flip-augmented run.
graph_acc_loss(mobilenet1)

**Random Shifts**

In [None]:
# Draw a single augmented batch of nine images and show it as a 3x3 grid.
X_batch, y_batch = next(iter(X_train_randomShifts.flow(X, y, batch_size=9)))
for i in range(9):
    pyplot.subplot(3, 3, i + 1)
    pyplot.imshow(X_batch[i], cmap=pyplot.get_cmap('gray'))
pyplot.show()

In [None]:
# Keep training the same model on randomly shifted batches; weights are
# checkpointed every 5 epochs.
batch_size = 100
mc = keras.callbacks.ModelCheckpoint(filepath='weights_Shifts{epoch:08d}.h5',
                                     period=5, save_weights_only=True)

mobilenet2 = model.fit_generator(
    generator=X_train_randomShifts.flow(X, y, batch_size=batch_size),
    steps_per_epoch=len(X) // batch_size,
    epochs=20,
    callbacks=[mc],
    verbose=1,
)
# optional: use_multiprocessing=True, workers=6

In [None]:
# Plot the curves recorded during the shift-augmented run.
graph_acc_loss(mobilenet2)

**Random Rotations**

In [None]:
# Draw a single augmented batch of nine images and show it as a 3x3 grid.
X_batch, y_batch = next(iter(X_train_randomRotations.flow(X, y, batch_size=9)))
for i in range(9):
    pyplot.subplot(3, 3, i + 1)
    pyplot.imshow(X_batch[i], cmap=pyplot.get_cmap('gray'))
pyplot.show()

In [None]:
# Keep training the same model on randomly rotated batches; weights are
# checkpointed every 5 epochs.
batch_size = 100
mc = keras.callbacks.ModelCheckpoint(filepath='weights_Rotations{epoch:08d}.h5',
                                     period=5, save_weights_only=True)

mobilenet3 = model.fit_generator(
    generator=X_train_randomRotations.flow(X, y, batch_size=batch_size),
    steps_per_epoch=len(X) // batch_size,
    epochs=20,
    callbacks=[mc],
    verbose=1,
)
# optional: use_multiprocessing=True, workers=6

In [None]:
# Plot the curves recorded during the rotation-augmented run.
graph_acc_loss(mobilenet3)

In [None]:
#X_train_randomZca = augmentation_random_zca(X_train)
#mobilenet5 = model.fit(x=X_train_randomZca/255, y=y_train, epochs=50, batch_size=100, verbose=1, validation_data=(X_test/255, y_test), shuffle=True)
#graph_acc_loss(mobilenet)

In [None]:
##Save partly trained model 
#model.save('augmentation_mobilenet_trained.h5') 
#del model 
##Reload model 
#model = load_model('../input/trained-v1/00_mobilenet_trained.h5') 

## **SUBMISSION**

In [None]:
# One row per file in the test directory, with an empty prediction column.
test = os.listdir("../input/humpback-whale-identification/test/")
print(len(test))
col = ['Image']
test_df = pd.DataFrame(test, columns=col)
test_df['Id'] = ''

Xs = prepareImages(test_df, test_df.shape[0], "test")
Xs /= 255  # same in-place scaling that was applied to the training array

# Reuse the targets and encoder the model was trained with.  Re-encoding
# df['Id'] would rebuild the full one-hot matrix just to obtain an encoder,
# and its class order would no longer match the model if df ever differed
# from train_df.
ys, label_encoder_s = y, label_encoder

In [None]:
# Xs is already a numpy array; wrapping it in np.array() would only duplicate
# the whole test set in memory before predicting.
prediction = model.predict(Xs, verbose=1)

In [None]:
# For every test image take the five highest-scoring class indices (best
# first), map them back to whale ids and store them space-separated.
test_df['Id'] = [
    ' '.join(label_encoder_s.inverse_transform(row))
    for row in np.argsort(prediction, axis=1)[:, -5:][:, ::-1]
]

In [None]:
# Write the submission file (columns Image, Id) without the row index.
test_df.to_csv('submission_v13.csv', index=False)

In [None]:
# Show the first ten predictions.
test_df.head(10)

In [None]:
# import the modules we'll need
# https://www.kaggle.com/rtatman/download-a-csv-file-from-a-kernel
from IPython.display import HTML
import pandas as pd
import numpy as np
import base64

# function that takes in a dataframe and creates a text link to  
# download it (will only work for files < 2MB or so)
def create_download_link(df, title = "Download CSV file", filename = "submission_v10.csv"):  
    """Return an ``HTML`` anchor that downloads ``df`` as a CSV file.

    The frame is serialised without its index, base64-encoded and embedded in
    a ``data:`` URI, so no file has to exist on disk.

    Args:
        df: DataFrame to serialise.
        title: text shown for the link.
        filename: name suggested to the browser for the downloaded file.
    """
    encoded_csv = base64.b64encode(df.to_csv(index=False).encode()).decode()
    template = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    return HTML(template.format(payload=encoded_csv, title=title, filename=filename))

# Create a link to download the submission dataframe, named like the CSV file
# written by to_csv above.  (The sample code that used to sit here replaced
# `df` with random numbers and was never used for the link.)
create_download_link(test_df, filename="submission_v13.csv")

# ↓ ↓ ↓  Yay, download link! ↓ ↓ ↓ 