# Intro
Welcome to the [Happywhale - Whale and Dolphin Identification](https://www.kaggle.com/c/happy-whale-and-dolphin/data) competition.

![](https://storage.googleapis.com/kaggle-competitions/kaggle/22962/logos/header.png)

In this competition, the task was to predict individual humpback whales from images of their flukes. Whales and dolphins in this dataset can be identified by shapes, features and markings of dorsal fins, backs, heads and flanks.

**Table of contents:**
1. [Exploratory Data Analysis](#EDA)
2. [Load Single Image](#LoadSingleImage)
3. [Plot Examples](#PlotExamples)
4. [Image Preprocessing](#ImagePreprocessing)
5. [Data Generator](#DataGenerator)
6. [Model](#Model)

<font size="4"><span style="color: royalblue;">Please vote the notebook up if it helps you. Feel free to leave a comment above the notebook. Thank you. </span></font>

# Libraries

In [None]:
import os
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from tensorflow.keras.utils import to_categorical, Sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.optimizers import RMSprop,Adam
from tensorflow.keras.applications import ResNet50

# Path

In [None]:
# Root folder of the competition data; other cells append file and folder names to it.
path = '/kaggle/input/happy-whale-and-dolphin/'
# List the entries of the data folder.
os.listdir(path)

# Load Data

In [None]:
# Read the training table and the sample submission from the data folder.
train_data = pd.read_csv(os.path.join(path, 'train.csv'))
samp_subm = pd.read_csv(os.path.join(path, 'sample_submission.csv'))

In [None]:
# Inspect the 'predictions' value of the first sample-submission row.
samp_subm.loc[0, 'predictions']

# Overview

In [None]:
# Report the number of table rows, then the number of files in each image folder.
print('Number train samples:', len(train_data))
for split in ('train', 'test'):
    n_files = len(os.listdir(path + split + '_images/'))
    print('Number ' + split + ' images:', n_files)

In [None]:
# Show the first rows of the training table.
train_data.head()

# Exploratory Data Analysis <a name="EDA"></a>

There are 30 different species collected from 28 different research organizations:

In [None]:
# Count the training rows per species label.
train_data['species'].value_counts()

There are duplicate names in species which can be merged:
* bottlenose_dolphin and bottlenose_dolhin,
* killer_whale and kiler_whale.

So in total we have 28 different species.

Individuals have been manually identified and given an individual_id by marine researchers, and our task is to correctly identify these individuals in images:

In [None]:
# Count the training rows per individual_id.
train_data['individual_id'].value_counts()

# Load Single Image <a name="LoadSingleImage"></a>
We plot the first image of the train data.

In [None]:
# Look up file name and species of the first training row, then read the image.
row = 0
first_record = train_data.loc[row]
file = first_record['image']
species = first_record['species']

img = cv2.imread(os.path.join(path, 'train_images', file))
print(f'Shape: {img.shape}')

In [None]:
# Display the image in RGB channel order, titled with its species, without tick labels.
fig, ax = plt.subplots(figsize=(7, 7))
rgb_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
ax.imshow(rgb_img)
ax.set(xticklabels=[], yticklabels=[], title=species)
plt.show()

# Plot Examples <a name="PlotExamples"></a>
We plot example images of the top 3 species.

In [None]:
def plot_examples(category = 'bottlenose_dolphin', n_examples = 5):
    """ Plot the first images of one species from the training data in a row.

    Relies on the notebook globals ``train_data`` (training table) and
    ``path`` (data root folder).

    Args:
        category: Value of the ``species`` column used to select rows.
        n_examples: Maximum number of images to show (default 5). When the
            species has fewer rows, only the available images are plotted.

    Raises:
        ValueError: If ``n_examples`` is below 1 or no training row has the
            requested species.
        FileNotFoundError: If OpenCV cannot read one of the image files.
    """
    if n_examples < 1:
        raise ValueError('n_examples must be at least 1')

    subset = train_data[train_data['species'] == category].reset_index(drop=True)
    n_shown = min(n_examples, len(subset))
    if n_shown == 0:
        raise ValueError('No training images found for species: ' + str(category))

    # 5 inches of width per image reproduces the 25x20 figure for 5 images;
    # squeeze=False returns an array of axes even when only one image is shown.
    fig, axs = plt.subplots(1, n_shown, figsize=(5 * n_shown, 20), squeeze=False)
    fig.subplots_adjust(hspace = .1, wspace=.1)
    # zip stops at the n_shown axes, so only the first rows are used.
    for ax, file, species in zip(axs.ravel(), subset['image'], subset['species']):
        img = cv2.imread(path+'train_images/'+file)
        if img is None:
            # cv2.imread signals an unreadable file by returning None.
            raise FileNotFoundError('Could not read image: ' + path+'train_images/'+file)
        # OpenCV loads BGR; matplotlib expects RGB.
        ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        ax.set_title(species)
        ax.set_xticklabels([])
        ax.set_yticklabels([])
    plt.show()

In [None]:
# Example images labelled 'bottlenose_dolphin'.
plot_examples(category = 'bottlenose_dolphin')

In [None]:
# Example images labelled 'beluga'.
plot_examples(category = 'beluga')

In [None]:
# Example images labelled 'humpback_whale'.
plot_examples(category = 'humpback_whale')

# Image Preprocessing <a name="ImagePreprocessing"></a>
As we can see, the images have different formats: landscape or portrait. The neural network needs a standard input size, so we have to prepare the data.

In [None]:
def image_preprocessing(image, image_size):
    """ Center-crop an image to a square and resize it to image_size x image_size.

    Args:
        image: Image array whose first two axes are rows and columns
            (for example the result of ``cv2.imread``).
        image_size: Edge length in pixels of the square output image.

    Returns:
        The cropped and resized image as returned by ``cv2.resize``.

    Raises:
        ValueError: If ``image`` is ``None`` (which is what ``cv2.imread``
            returns for an unreadable file) or has an empty dimension.
    """
    if image is None:
        raise ValueError('image is None - the file could not be read')

    n_rows, n_cols = image.shape[:2]
    side = min(n_rows, n_cols)
    if side == 0:
        raise ValueError('image has an empty dimension: ' + str(image.shape))

    # Crop Image: take the full shorter side and center the window on the
    # longer one, so the crop is exactly square even for odd dimensions.
    top = (n_rows - side) // 2
    left = (n_cols - side) // 2
    image_cropped = image[top:top + side, left:left + side]

    # Rescale Image
    image_rescale = cv2.resize(image_cropped,
                               dsize=(image_size, image_size))
    return image_rescale


def plot_befor_after(image):
    """ Show an image next to its preprocessed version.

    The left panel holds the original, the right panel the result of
    ``image_preprocessing`` for the notebook global ``image_size``; each
    title reports the array shape.
    """

    figure, panels = plt.subplots(1, 2, figsize=(15, 10))
    figure.subplots_adjust(hspace = .1, wspace=.1)
    left_panel, right_panel = panels.ravel()

    # Left: untouched input (converted from BGR to RGB for display)
    left_panel.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    left_panel.set_title('original shape: '+str(image.shape))

    # Right: cropped and resized image
    prepared = image_preprocessing(image, image_size)
    right_panel.imshow(cv2.cvtColor(prepared, cv2.COLOR_BGR2RGB))
    right_panel.set_title('rescaled shape: '+str(prepared.shape))

    # Hide the tick labels on both panels
    for panel in (left_panel, right_panel):
        panel.set_xticklabels([])
        panel.set_yticklabels([])
    plt.show()

In [None]:
# Edge length in pixels that plot_befor_after passes to image_preprocessing.
image_size = 128

In [None]:
# Look up file name and species of training row 2022, then read the image.
row = 2022
example_record = train_data.loc[row]
file = example_record['image']
species = example_record['species']
image = cv2.imread(os.path.join(path, 'train_images', file))
print(f'Shape: {image.shape}')

In [None]:
# Compare the loaded image with its cropped and resized version.
plot_befor_after(image)

# Split Data

In [None]:
# Hold out 33% of the training row labels for validation (fixed seed);
# the test IDs are the row labels of the sample submission.
all_train_ids = train_data.index.tolist()
list_IDs_train, list_IDs_val = train_test_split(
    all_train_ids,
    test_size=0.33,
    random_state=2022,
)
list_IDs_test = samp_subm.index.tolist()

In [None]:
# Report the size of each split.
for split_name, split_ids in (('train', list_IDs_train),
                              ('val', list_IDs_val),
                              ('test', list_IDs_test)):
    print('Number ' + split_name + ' samples:', len(split_ids))

# Data Generator <a name="DataGenerator"></a>
We define a data generator to load the data on demand.

In [None]:
# Generator settings: image edge length, colour channels and batch size.
img_size, img_channel, batch_size = 32, 3, 64
# One class per distinct species label in the training table.
num_classes = train_data['species'].nunique()

In [None]:
class DataGenerator(Sequence):
    """ Keras Sequence that reads, preprocesses and batches images on demand.

    Args:
        path: Folder containing the image files. Labels are produced only
            when this string contains 'train'.
        list_IDs: Row labels of ``data`` that this generator serves.
        data: Table with an 'image' column (file names) and, for labelled
            data, a 'species' column.
        img_size: Edge length the images are cropped and resized to.
        img_channel: Number of image channels.
        batch_size: Maximum number of samples per batch.
        num_classes: Width of the label array; must equal the number of
            distinct species in ``data`` when labels are produced.
    """

    def __init__(self, path, list_IDs, data, img_size, img_channel, batch_size, num_classes):
        # Let the Keras base class set up its own state.
        super().__init__()
        self.path = path
        self.list_IDs = list_IDs
        self.data = data
        self.img_size = img_size
        self.img_channel = img_channel
        self.batch_size = batch_size
        self.num_classes = num_classes
        self.indexes = np.arange(len(self.list_IDs))
        # Labels exist only for the training folder; the one-hot table is built
        # from the whole ``data`` table, not just from list_IDs.
        self.has_labels = 'train' in self.path
        if self.has_labels:
            self.labels = pd.get_dummies(self.data['species'])

    def __len__(self):
        """ Return the number of batches, counting a final partial batch. """
        return (len(self.list_IDs) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, index):
        """ Return the tuple (X, y) for batch number ``index``. """
        start = index * self.batch_size
        indexes = self.indexes[start:start + self.batch_size]
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        X, y = self.__data_generation(list_IDs_temp)
        return X, y

    def __data_generation(self, list_IDs_temp):
        """ Load the images (and labels, if available) for the given row labels.

        Returns:
            X: Array of shape (n, img_size, img_size, img_channel) with pixel
               values divided by 255.
            y: Integer array of shape (n, num_classes); all zeros when the
               generator has no labels.
            Here n is len(list_IDs_temp), so a final partial batch is not
            padded with empty samples.

        Raises:
            FileNotFoundError: If an image file cannot be read.
        """
        n_samples = len(list_IDs_temp)
        X = np.zeros((n_samples, self.img_size, self.img_size, self.img_channel))
        y = np.zeros((n_samples, self.num_classes), dtype=int)
        for i, ID in enumerate(list_IDs_temp):
            file = self.data.loc[ID, 'image']
            img = cv2.imread(self.path+file)
            if img is None:
                # cv2.imread signals an unreadable file by returning None.
                raise FileNotFoundError('Could not read image: ' + self.path+file)
            img_prep = image_preprocessing(img, self.img_size)
            X[i] = img_prep/255
            if self.has_labels:
                y[i] = self.labels.loc[ID]
        return X, y

In [None]:
# The three generators differ only in image folder, row IDs and table.
generator_settings = (img_size, img_channel, batch_size, num_classes)
train_generator = DataGenerator(path + 'train_images/', list_IDs_train, train_data,
                                *generator_settings)
val_generator = DataGenerator(path + 'train_images/', list_IDs_val, train_data,
                              *generator_settings)
test_generator = DataGenerator(path + 'test_images/', list_IDs_test, samp_subm,
                               *generator_settings)

Test data generator:

In [None]:
# Fetch the first training batch and check the shape of one image.
X, y = train_generator[0]
X[0].shape

# Model <a name="Model"></a>
**Coming soon**

In [None]:
# Path to a ResNet50 weights file without the classification top
# (presumably provided by an attached Kaggle dataset - verify).
weights='../input/models/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5'
# Build the convolutional base without top layers for inputs of shape
# (img_size, img_size, img_channel).
conv_base = ResNet50(weights=weights,
                     include_top=False,
                     input_shape=(img_size, img_size, img_channel))
# Keep all layers of the base trainable.
conv_base.trainable = True

# Export

In [None]:
# Assign the same fixed prediction string to every submission row.
samp_subm['predictions'] = '37c7aba965a5 114207cab555 a6e325d8e924 new_individual'
# Show the first rows of the filled submission table.
samp_subm.head()

In [None]:
# Write the submission table to 'submission.csv' without the index column.
samp_subm.to_csv('submission.csv', index=False)