# Introduction

This notebook explores the Happywhale whale and dolphin identification dataset and prepares a simple submission file.

# Analysis

Load the data and take a preliminary look at it.



In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import matplotlib.pyplot as plt
import seaborn as sns
import cv2

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# List the top-level entries of the competition input directory.
print(f"Files and folders: {os.listdir('/kaggle/input/happy-whale-and-dolphin')}")

Let's read and explore train.csv and sample_submission.csv first

In [None]:
# Load the train index (train.csv) and the sample submission into DataFrames.
train_df = pd.read_csv('/kaggle/input/happy-whale-and-dolphin/train.csv')
submission_df = pd.read_csv('/kaggle/input/happy-whale-and-dolphin/sample_submission.csv')

In [None]:
# Preview the first rows of the train index.
train_df.head()

In [None]:
# Preview the first rows of the sample submission.
submission_df.head()

# Data Exploration

Let's get some more insight into the train data and train and test images.

In [None]:
# Distinct values of the key columns in the train index.
n_images = train_df["image"].nunique()
n_species = train_df["species"].nunique()
n_individuals = train_df["individual_id"].nunique()
print(f"Images in train index file: {n_images}")
print(f"Species in train index file: {n_species}")
print(f"Individual IDs in train index file: {n_individuals}")

# Number of entries actually present in each image folder.
n_train_files = len(os.listdir('/kaggle/input/happy-whale-and-dolphin/train_images'))
print(f"Images in train images folder: {n_train_files}")
n_test_files = len(os.listdir('/kaggle/input/happy-whale-and-dolphin/test_images'))
print(f"Images in test images folder: {n_test_files}")

Let's look in more detail at the distribution of values in the `individual_id` column of `train_df`.



In [None]:
# The ten individual_id values that occur most often in the train index.
print("Top 10 individual_id")
images_per_individual = train_df["individual_id"].value_counts()
images_per_individual.iloc[:10]

In [None]:
# Kernel density estimate of the natural log of the number of images per
# individual_id, drawn explicitly on the axes created here.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(7, 7))
log_image_counts = np.log(train_df["individual_id"].value_counts())
sns.kdeplot(log_image_counts, ax=ax)
ax.set_title("Logaritmic distribution of individual_id frequency in images")
plt.show()

Let's also check the frequency of each species in the train dataset.

In [None]:
# Number of images available for each species; value_counts() returns the
# counts already sorted in descending order.
species_counts = train_df["species"].value_counts()
df = pd.DataFrame({'Species': species_counts.index,
                   'Images': species_counts.values
                  })
plt.figure(figsize=(12, 6))
sns.set_color_codes("pastel")
species_ax = sns.barplot(x='Species', y="Images", data=df)
species_ax.set_title('Species distribution - images per each species - train dataset')
# Rotate the species names so the long labels do not overlap.
species_ax.tick_params(axis='x', labelrotation=90)
plt.show()

Let's now see how many individual IDs there are for each species.

In [None]:
# Number of distinct individual_id values recorded for each species.
ids_per_species = train_df.groupby(["species"])["individual_id"].nunique()
df = pd.DataFrame({'Species': ids_per_species.index,
                   'Unique ID Count': ids_per_species.values
                  })
# Largest groups first so the bars read from most to fewest individuals.
df = df.sort_values(['Unique ID Count'], ascending=False)
plt.figure(figsize=(12, 6))
sns.set_color_codes("pastel")
ids_ax = sns.barplot(x='Species', y="Unique ID Count", data=df)
ids_ax.set_title('Species distribution - Individual IDs per each species - train dataset')
# Rotate the species names so the long labels do not overlap.
ids_ax.tick_params(axis='x', labelrotation=90)
plt.show()

Let's check now the image sizes in train and test images datasets.

First, let's check whether the set of images listed in `train_df` is identical to the set of images in the `train_images` folder.

In [None]:
# Compare the image names listed in the train index with the files on disk.
indexed_images = train_df["image"].unique()
folder_images = os.listdir('/kaggle/input/happy-whale-and-dolphin/train_images')
indexed_set = set(indexed_images)
folder_set = set(folder_images)
# Names present in both places, and names indexed but missing from the folder.
in_both = indexed_set.intersection(folder_set)
only_in_index = indexed_set.difference(folder_set)
print(f"Images in train dataset: {len(indexed_images)}\nImages in train folder: {len(folder_images)}\nIntersection: {len(in_both)}\nDifference: {len(only_in_index)}")


All images indexed in `train_df` are present in the images folder and vice versa.

In [None]:
def read_image_sizes(file_name):
    """Return the dimensions of one training image.

    Args:
        file_name: Name of a file inside the ``train_images`` folder.

    Returns:
        The image array shape as a list. OpenCV arrays list rows before
        columns, so the order is ``[height, width, channels]``.

    Raises:
        FileNotFoundError: If OpenCV cannot read the file. ``cv2.imread``
            returns ``None`` in that case instead of raising.
    """
    image_path = '/kaggle/input/happy-whale-and-dolphin/train_images/' + file_name
    image = cv2.imread(image_path)
    if image is None:
        # Fail with the offending path rather than an AttributeError on None.
        raise FileNotFoundError(f"Could not read image: {image_path}")
    return list(image.shape)

# Images data exploration

Because reading every image just to get its dimensions is slow, we will only process a random sample of 2500 images.

In [None]:
import time

# Measure a random sample only; reading every image would take too long.
sample_size = 2500
start_time = time.time()
train_sample_df = train_df.sample(sample_size)
m = np.stack(train_sample_df['image'].apply(read_image_sizes))
# read_image_sizes returns image.shape, which is (rows, columns, channels):
# the first value is the height and the second one the width.
df = pd.DataFrame(m, columns=['h', 'w', 'c'])
print(f"Total processing time for {sample_size} images: {round(time.time()-start_time, 2)} sec.")

In [None]:
# train_sample_df keeps the row labels it had in train_df, while df is
# numbered 0..n-1. concat(axis=1) aligns on row labels, so both frames are
# renumbered first; each image is then paired with its own measurements
# instead of producing NaN-filled, mismatched rows.
train_img_df = pd.concat(
    [train_sample_df.reset_index(drop=True), df.reset_index(drop=True)],
    axis=1,
    sort=False,
)
print(f"Number of different image size ( images samples): {train_img_df.groupby(['w','h', 'c']).count().shape[0]}")

It appears that there are many different image sizes (even though we sampled less than 5% of the total number of images).

Let's visualize the distribution of width/height and colors per species.

In [None]:
# Distribution of the sampled `w` values for each species.
plt.figure(figsize=(12, 6))
sns.set_color_codes("pastel")
width_ax = sns.boxplot(x='species', y="w", data=train_img_df)
width_ax.set_title('Species distribution - width per each species - train dataset (5% random data sample)')
# Rotate the species names so the long labels do not overlap.
width_ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
# Distribution of the sampled `h` values for each species.
plt.figure(figsize=(12, 6))
sns.set_color_codes("pastel")
height_ax = sns.boxplot(x='species', y="h", data=train_img_df)
height_ax.set_title('Species distribution - height per each species - train dataset (5% random data sample)')
# Rotate the species names so the long labels do not overlap.
height_ax.tick_params(axis='x', labelrotation=90)
plt.show()

Let's show the distribution of width and height per species using a scatterplot.

In [None]:
def plot_species_scatter(train_img_df):
    """Draw one `w` versus `h` scatter panel per species.

    Args:
        train_img_df: DataFrame with a ``species`` column and the image
            dimension columns ``w`` and ``h``.

    The grid has 5 columns and at least 5 rows. Further rows are added when
    there are more than 25 species, so every species gets its own panel.
    Panels that are not needed are hidden.
    """
    sns.set_style('whitegrid')
    species = list(train_img_df.species.unique())

    n_cols = 5
    # Ceiling division, but never fewer than the 5 rows of the base layout.
    n_rows = max(5, -(-len(species) // n_cols))
    # Keep the 15x12 size for 5 rows and grow the height with extra rows.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 12 * n_rows / 5))
    panels = list(axes.flat)

    for panel, spec in zip(panels, species):
        subset = train_img_df.loc[train_img_df.species == spec]
        panel.scatter(subset['w'], subset['h'], marker='+')
        panel.set_xlabel(spec, fontsize=9)
    # Hide the panels left over once every species has been drawn.
    for panel in panels[len(species):]:
        panel.axis('off')
    plt.show();
plot_species_scatter(train_img_df.dropna())

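The number of color channels is always 3 for the 5% random sample used. This is expected: with its default flag, `cv2.imread` always decodes an image into a 3-channel BGR array.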

Let's look at a few of the train images, grouped by species.

We create first a plotting function.

In [None]:
def plot_image_samples(species):
    """Show up to 16 training images of one species in a 4x4 grid.

    Args:
        species: Value of the ``species`` column of ``train_df`` to display.

    The first rows of ``train_df`` for that species are shown, each titled
    with its ``individual_id`` and the species name. If the species has fewer
    than 16 images, the remaining panels are left blank.
    """
    root_path = "/kaggle/input/happy-whale-and-dolphin/"
    images_folder = "train_images/"
    # Renumber the rows from 0 so they can be addressed by panel position.
    species_df = train_df[train_df['species'] == species].reset_index(drop=True)

    f, ax = plt.subplots(4, 4, figsize=(16, 16))
    # Apply the spacing to the figure created here, not to a figure left
    # over from an earlier cell.
    f.subplots_adjust(hspace=.1, wspace=.1)

    # ax.flat walks the grid row by row, matching positions 0..15.
    for i, panel in enumerate(ax.flat):
        panel.axis('off')
        if i >= len(species_df):
            continue  # no image left for this panel
        file = species_df.loc[i, 'image']
        identifier = species_df.loc[i, 'individual_id']
        img = cv2.imread(root_path + images_folder + file)
        # OpenCV loads BGR; matplotlib expects RGB.
        panel.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        panel.set_title(identifier + " (" + species + ")")

In [None]:
# 4x4 grid of train images labelled "bottlenose_dolphin".
plot_image_samples("bottlenose_dolphin")

In [None]:
# 4x4 grid of train images labelled "beluga".
plot_image_samples("beluga")

In [None]:
# 4x4 grid of train images labelled "humpback_whale".
plot_image_samples("humpback_whale")


In [None]:
# 4x4 grid of train images labelled "blue_whale".
plot_image_samples("blue_whale")

In [None]:
# 4x4 grid of train images labelled "killer_whale".
plot_image_samples("killer_whale")

In [None]:
# 4x4 grid of train images labelled "spotted_dolphin".
plot_image_samples("spotted_dolphin")

Let's also look at a sample of test images.

In [None]:
def plot_image_samples_test():
    """Show up to 16 images from the test folder in a 4x4 grid.

    The file names are sorted before the first 16 are taken, so the same
    images are shown on every run (``os.listdir`` returns entries in
    arbitrary order). If the folder holds fewer than 16 files, the remaining
    panels are left blank.
    """
    root_path = "/kaggle/input/happy-whale-and-dolphin/"
    images_folder = "test_images/"

    f, ax = plt.subplots(4, 4, figsize=(16, 16))
    # Apply the spacing to the figure created here, not to a figure left
    # over from an earlier cell.
    f.subplots_adjust(hspace=.1, wspace=.1)

    file_list = sorted(os.listdir(root_path + images_folder))
    # ax.flat walks the grid row by row, matching positions 0..15.
    for i, panel in enumerate(ax.flat):
        panel.axis('off')
        if i >= len(file_list):
            continue  # no image left for this panel
        file = file_list[i]
        img = cv2.imread(root_path + images_folder + file)
        # OpenCV loads BGR; matplotlib expects RGB.
        panel.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        panel.set_title("Test image: " + file)

In [None]:
# 4x4 grid of images from the test folder.
plot_image_samples_test()

# Submission

Let's rotate the identifiers so that `new_individual` becomes the first option.

In [None]:
def rotate_values(x):
    """Swap the first and the fifth whitespace-separated token of ``x``.

    Args:
        x: String of whitespace-separated tokens.

    Returns:
        The tokens joined by single spaces, with positions 0 and 4 exchanged.

    Raises:
        IndexError: If ``x`` contains fewer than five tokens.
    """
    tokens = x.split()
    tokens[0], tokens[4] = tokens[4], tokens[0]
    return " ".join(tokens)

In [None]:
# Rotate every prediction string. NOTE: the swap is its own inverse, so
# running this cell a second time restores the original order.
submission_df["predictions"] = submission_df["predictions"].apply(rotate_values)

In [None]:
# Preview the submission after rotating the predictions.
submission_df.head()

We output the prepared submission file.

In [None]:
# Write the submission to the current working directory without the index column.
submission_df.to_csv('submission.csv', index=False)