In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import PIL
import PIL.Image
import cv2
import plotly.express as px

import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [None]:
# Load the competition's training metadata; later cells rely on its
# 'image', 'species' and 'individual_id' columns.
train_df = pd.read_csv('../input/happy-whale-and-dolphin/train.csv')
train_df.head()

# EDA

### Species Distribution
Let's look at the distribution of the different species in the training set.



In [None]:
# Rows per species (counts non-null individual_id entries).
species_count = (
    train_df.groupby('species')['individual_id']
    .agg(species_count='count')
    .reset_index()
)

# Rows per individual. Despite the column name, 'no_of_individuals' is the
# number of image entries recorded for that individual_id.
no_of_individuals = (
    train_df.groupby('individual_id')['image']
    .agg(no_of_individuals='count')
    .reset_index()
)

# Attach both aggregates to every training row.
train = (
    train_df
    .merge(species_count, on=['species'], how='inner')
    .merge(no_of_individuals, on=['individual_id'], how='inner')
)

train.head()

Let's look at the distribution of different species


In [None]:
# Bar chart of rows per species, largest first.
species_by_count = species_count.sort_values(by='species_count', ascending=False)
px.bar(species_by_count, x='species', y='species_count', color='species')

The same distribution shown as a pie chart

In [None]:
# Pie chart with one slice per species; with only `names` given, slice size
# reflects how many rows of `train` carry that species.
px.pie(train, names='species')

In [None]:
# Does any individual_id appear under both the 'killer_whale' label and its
# misspelt 'kiler_whale' variant?
killer_whale_ids = set(train.loc[train['species'] == 'killer_whale', 'individual_id'].unique())
kiler_whale_ids = set(train.loc[train['species'] == 'kiler_whale', 'individual_id'].unique())

killer_whale_ids.intersection(kiler_whale_ids)

In [None]:
# Same overlap check for 'bottlenose_dolphin' and its misspelt 'bottlenose_dolpin' variant.
bottlenose_dolphin_ids = set(train.loc[train['species'] == 'bottlenose_dolphin', 'individual_id'].unique())
bottlenose_dolpin_ids = set(train.loc[train['species'] == 'bottlenose_dolpin', 'individual_id'].unique())

bottlenose_dolphin_ids.intersection(bottlenose_dolpin_ids)

Looks like we do not have any overlap :)

### Individuals Distribution


In [None]:
# Number of distinct individual_id values per species, largest first.
individuals_per_species = (
    train.groupby('species')['individual_id']
    .nunique()
    .reset_index()
    .rename(columns={'individual_id': 'no_of_individuals'})
    .sort_values(by='no_of_individuals', ascending=False)
)
px.bar(individuals_per_species, x='species', y='no_of_individuals', color='species')

In [None]:
# `train` holds one row per image and px.bar stacks every row that shares an
# x value, so plotting it directly inflates each bar to (image count) squared.
# Keep one row per individual so the bar height is the actual image count.
bottlenose_counts = (
    train.loc[train.species == 'bottlenose_dolphin', ['individual_id', 'no_of_individuals']]
    .drop_duplicates('individual_id')
)
px.bar(bottlenose_counts, x='individual_id', y='no_of_individuals', title='bottlenose_dolphin distribution')

In [None]:
# The species column contains both 'killer_whale' and the misspelt
# 'kiler_whale'; include both so the chart matches its title.
# `train` holds one row per image and px.bar stacks rows that share an x
# value, so keep one row per individual to show the real image count.
killer_whale_counts = (
    train.loc[train.species.isin(['killer_whale', 'kiler_whale']), ['individual_id', 'no_of_individuals']]
    .drop_duplicates('individual_id')
)
px.bar(killer_whale_counts, x='individual_id', y='no_of_individuals', title='Killer whale distribution')

In [None]:
# Rows per species, largest first.
# NOTE(review): this call is identical to the species bar chart drawn earlier
# in the notebook — confirm whether the repeat is intentional.
px.bar(species_count.sort_values(by='species_count', ascending=False), x='species', y='species_count', color='species')

### Visualize some sample images of different species

In [None]:
# (reference:- https://www.kaggle.com/ruchi798/and-identification-eda-augmentation)

def path(group, group_type, df=None):
    """Return the full training-image paths belonging to one species or individual.

    Parameters
    ----------
    group : str
        Value to match: a species name or an individual_id.
    group_type : {'species', 'id'}
        Which column `group` is matched against: 'species' matches the
        'species' column, 'id' matches the 'individual_id' column.
    df : pandas.DataFrame, optional
        Frame with 'image', 'species' and 'individual_id' columns. Defaults to
        the notebook-level `train` frame.

    Returns
    -------
    list of str
        One path per matching row, in row order; empty when nothing matches.

    Raises
    ------
    ValueError
        If `group_type` is not 'species' or 'id'.
    """
    PATH = "../input/happy-whale-and-dolphin/train_images"
    column_for_type = {'species': 'species', 'id': 'individual_id'}

    # Reject unknown selectors up front rather than failing later on an
    # unbound local variable.
    if group_type not in column_for_type:
        raise ValueError(
            f"group_type must be one of {sorted(column_for_type)}, got {group_type!r}"
        )

    if df is None:
        df = train

    matching_images = df.loc[df[column_for_type[group_type]] == group, 'image']
    return [os.path.join(PATH, filename) for filename in matching_images]



def display_multiple_imgs(group, group_type, rows, cols):
    """Show a rows x cols grid of randomly chosen images for one group.

    Parameters
    ----------
    group : str
        Species name or individual_id to display; also used as the figure title.
    group_type : {'species', 'id'}
        Passed to `path` to select which column `group` is matched against.
    rows, cols : int
        Grid dimensions; rows * cols images are sampled.

    Images that cannot be read are left as blank cells. If the group has no
    images, a message is printed and no figure is drawn.
    """
    image_paths = path(group, group_type)
    if len(image_paths) == 0:
        print(f"No images found for {group_type} '{group}'")
        return

    n_cells = rows * cols
    # Avoid showing the same image twice when the group has enough images;
    # fall back to sampling with replacement only for small groups.
    sampled_paths = np.random.choice(
        image_paths, n_cells, replace=len(image_paths) < n_cells
    )

    # squeeze=False keeps `axes` a 2-D array even for a 1x1 grid, so .ravel()
    # is always available.
    figure, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(16, 8), squeeze=False)
    plt.suptitle(group, fontsize=20)
    for ax, image_path in zip(axes.ravel(), sampled_paths):
        ax.set_axis_off()
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals an unreadable file by returning None.
            continue
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # OpenCV loads images as BGR
        image = cv2.resize(image, (1200, 800))
        ax.imshow(image)
    plt.tight_layout()
    plt.show()

In [None]:
# Show a 2x2 grid of sample images for every species label in the training data.
for species in train['species'].unique():
    print('\n\n')  # blank lines to separate consecutive figures
    display_multiple_imgs(species, 'species', 2, 2)

# Preprocessing and Model Building

In [None]:
# Distinct individual_id values in order of first appearance; an id's position
# in this array is used as its integer class label.
label_names = train_df['individual_id'].unique()
label_names

In [None]:
# Map each individual_id to its position in `label_names`.
label_to_index = {name: index for index, name in enumerate(label_names)}
# label_to_index

In [None]:
# Integer class label for every training row, in row order.
all_image_labels = train_df['individual_id'].map(label_to_index).tolist()
all_image_labels[:20]

In [None]:
# Store the integer class label alongside each row. This adds a column to
# train_df in place.
train_df['label'] = all_image_labels
train_df.head()

# Images

In [None]:
# Full path of every training image, in train_df row order.
train_image_prefix = '../input/happy-whale-and-dolphin/train_images/'
all_image_paths = [train_image_prefix + img for img in train_df['image']]
all_image_paths[:10]

In [None]:
# Total number of training images.
image_count = len(all_image_paths)
image_count

In [None]:
def preprocess_image(image):
    """Decode JPEG bytes into a 224x224 three-channel float image in [0, 1].

    Parameters
    ----------
    image : encoded JPEG contents, as returned by tf.io.read_file.

    Returns
    -------
    Tensor of shape (224, 224, 3) with pixel values divided by 255.
    """
    decoded = tf.image.decode_jpeg(image, channels=3)
    resized = tf.image.resize(decoded, [224, 224])
    # normalize to [0,1] range
    return resized / 255.0

In [None]:
def load_and_preprocess_image(path):
    """Read the file at `path` and return it as a preprocessed image tensor.

    The raw file contents are passed to `preprocess_image`, which decodes,
    resizes and rescales them.
    """
    raw_bytes = tf.io.read_file(path)
    processed = preprocess_image(raw_bytes)
    return processed

In [None]:
# Sanity-check the loading pipeline on the first training image.
image_path = all_image_paths[0]
label = all_image_labels[0]

plt.imshow(load_and_preprocess_image(image_path))
plt.grid(False)
# individual_id values are identifiers, so show them verbatim: str.title()
# would re-case their characters and display an id that does not exist.
plt.title(label_names[label])
plt.show()

# Create a dataset

In [None]:
# Dataset of image path strings, in train_df row order.
path_ds = tf.data.Dataset.from_tensor_slices(all_image_paths)
# Load and preprocess each image, letting tf.data choose the parallelism.
image_ds = path_ds.map(load_and_preprocess_image, num_parallel_calls=AUTOTUNE)
# Matching integer class labels as int64.
label_ds = tf.data.Dataset.from_tensor_slices(tf.cast(all_image_labels, tf.int64))
# Pair every image with its label, element by element.
image_label_ds = tf.data.Dataset.zip((image_ds, label_ds))

In [None]:
BATCH_SIZE = 32

# Shuffle within a 1024-element buffer, group into batches, then prefetch so
# input preparation can overlap with model execution.
# NOTE(review): the buffer is presumably much smaller than the dataset, so the
# shuffle is only approximate — confirm that is acceptable.
ds = image_label_ds.shuffle(buffer_size=1024)
ds = ds.batch(BATCH_SIZE)
ds = ds.prefetch(buffer_size=AUTOTUNE)
ds

# Train a model

## Model 1

In [None]:
# MobileNetV2's own input-scaling function.
preprocess_input = tf.keras.applications.mobilenet_v2.preprocess_input

# ImageNet-pretrained convolutional backbone without its classification head;
# its weights are left trainable.
base_model = MobileNetV2(
    input_shape=(224, 224, 3),
    include_top=False,
    weights='imagenet',
)
base_model.trainable = True

# Output layer: one un-activated unit per distinct individual_id.
prediction_layer = tf.keras.layers.Dense(len(label_names))

In [None]:
inputs = tf.keras.Input(shape=(224, 224, 3))
# The input pipeline (preprocess_image) already divides pixels by 255, giving
# values in [0, 1]. MobileNetV2's preprocess_input expects [0, 255] and maps
# that to [-1, 1], so restore the original range first; otherwise the backbone
# only ever sees values in roughly [-1, -0.99].
x = preprocess_input(inputs * 255.0)
# training=False keeps the backbone's batch-norm layers in inference mode.
x = base_model(x, training=False)
x = tf.keras.layers.GlobalAveragePooling2D()(x)
# Four identical fully connected layers ahead of the classification layer.
for _ in range(4):
    x = tf.keras.layers.Dense(1024, activation='relu')(x)

outputs = prediction_layer(x)

model = tf.keras.Model(inputs, outputs)

In [None]:
# prediction_layer is a Dense layer with no activation, so the model outputs
# logits. The string alias 'sparse_categorical_crossentropy' assumes
# probabilities (from_logits=False) and would compute the wrong loss.
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [None]:
# Print the model's layers and parameter counts.
model.summary()

In [None]:
# model.fit(ds, epochs=10)

# Submission

In [None]:
# Sample submission file; its 'image' column is used below to list the test images.
sample_submission_df = pd.read_csv('../input/happy-whale-and-dolphin/sample_submission.csv')
sample_submission_df.head()

In [None]:
# NOTE(review): this reads the same CSV as sample_submission_df, and `test` is
# not referenced in the cells visible here — confirm it is still needed.
test = pd.read_csv('../input/happy-whale-and-dolphin/sample_submission.csv')

In [None]:
# Build the test input pipeline in submission-file order. It is not shuffled,
# so the rows of the predictions line up with sample_submission_df.
test_image_paths = ['../input/happy-whale-and-dolphin/test_images/' + img for img in sample_submission_df['image']]
test_path_ds = tf.data.Dataset.from_tensor_slices(test_image_paths)
test_image_ds = test_path_ds.map(load_and_preprocess_image, num_parallel_calls=AUTOTUNE)
# Reuse the notebook's BATCH_SIZE rather than repeating the literal 32.
test_ds = test_image_ds.batch(BATCH_SIZE).prefetch(buffer_size=AUTOTUNE)

In [None]:
%%time

# Run the model over every test batch and keep its outputs in `pred`.
# NOTE(review): the model.fit call earlier in the notebook is commented out, so
# unless training happens elsewhere the dense layers are still randomly
# initialised here — confirm before using these predictions.
pred = model.predict(test_ds)

In [None]:
# pred = pred.argsort(axis=1)[:,::-1]
# pred = pred[:,0:5]

In [None]:
# index_to_label = {v: k for k, v in label_to_index.items()}
# predictions = [None] * len(pred)

# for i in range(len(pred)):
#     row = [None] * 5
    
#     for j in range(5):
#         row[j] = index_to_label[pred[i][j]]
        
#     predictions[i] = " ".join(row)

In [None]:
# sample_submission_df['predictions'] = predictions
# sample_submission_df['predictions'].head()

In [None]:
# sample_submission_df.to_csv('submission.csv', index=False)

# Model 2

In [None]:
def rotate_values(x):
    """Swap the first and fifth whitespace-separated tokens of `x`.

    Despite the name this is a swap, not a rotation: tokens at positions 1-3
    (and any beyond the fifth) keep their place. The tokens are re-joined with
    single spaces.

    Raises IndexError when `x` contains fewer than five tokens.
    """
    tokens = x.split()
    tokens[0], tokens[4] = tokens[4], tokens[0]
    return " ".join(tokens)

In [None]:
# Use the same relative input path as every other read in this notebook
# instead of a hard-coded absolute /kaggle path.
submission_df = pd.read_csv('../input/happy-whale-and-dolphin/sample_submission.csv')

In [None]:
# Swap the first and fifth predicted ids in every row of the submission.
submission_df["predictions"] = submission_df["predictions"].apply(rotate_values)

In [None]:
# Preview the modified predictions.
submission_df.head()

In [None]:
# Write the submission file to the working directory, without the index column.
submission_df.to_csv('submission.csv', index=False)