In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

from pathlib import Path
import cv2
import matplotlib.pyplot as plt
import matplotlib.image as img
from tqdm.notebook import tqdm

# Species Distribution
### Number of Species in this dataset

In [None]:
# Load the training metadata table. Later cells read its `image`, `species`
# and `individual_id` columns. The last expression previews the first rows.
label = pd.read_csv('/kaggle/input/happy-whale-and-dolphin/train.csv')
label.head()

In [None]:
# (number of rows, number of columns) of the metadata table.
label.shape

In [None]:
# Distinct species names, in order of first appearance in the metadata table.
species = label['species'].unique().tolist()
n_species = len(species)
print('Number of species in the dataset:', n_species)

### Species Distribution
Let's look at the species distribution:

In [None]:
# Horizontal bar chart of how many rows of the metadata table belong to each
# species. Uses an explicit Axes (rather than the implicit pyplot state) and
# labels both axes so the figure can be read on its own.
fig, ax = plt.subplots(figsize=(10, 8))
species_counts = label['species'].value_counts()
species_counts.plot.barh(ax=ax)
ax.set_title('Distribution of Species')
ax.set_xlabel('Number of training images')
ax.set_ylabel('Species')
plt.show()

Some Paths:

In [None]:
# Directory holding the training images; the file names stored in
# `label.image` are joined onto this path when images are read.
ROOT_PATH = Path('/kaggle/input/happy-whale-and-dolphin/train_images/')

# Visualize Samples
Let's look at one sample training image:

In [None]:
# Display one hand-picked training image, titled with its species.
sample = '002ac5f9ad7c10.jpg'
img0 = img.imread(ROOT_PATH / sample)

# Look up the species recorded for this file name in the metadata table.
sample_species = label.loc[label.image == sample, 'species'].values[0]

plt.imshow(img0)
plt.title(sample_species)
plt.show()

Take a look at the first 9 images:

In [None]:
# 3x3 grid showing the first nine rows of the metadata table, each panel
# titled with its species.
fig, axis = plt.subplots(3, 3, figsize=(9, 9))

# `axis.flat` walks the grid row by row, so panel k shows metadata row k.
for row_idx, ax in enumerate(axis.flat):
    image_id = label.image.iloc[row_idx]
    image = img.imread(ROOT_PATH / image_id)
    image_label = label.species.iloc[row_idx]

    ax.imshow(image)
    ax.set_title(image_label)
    # Pixel tick marks carry no information for photos; hide them.
    ax.set_xticks([])
    ax.set_yticks([])

plt.tight_layout()
plt.show()

Next, let's take a look at one example of each species:

In [None]:
# One example image per species: the first metadata row seen for each one.
# The de-duplicated frame is built once here instead of being recomputed
# twice for every panel inside the loop.
species_examples = label.drop_duplicates(subset='species')

# Size the grid from the actual number of species (6 panels per row) so the
# cell does not raise IndexError when the dataset does not contain exactly
# 30 species. With 30 species this is the same 5x6, 12x10-inch figure.
n_cols = 6
n_rows = max(1, -(-len(species_examples) // n_cols))  # ceiling division
fig, axis = plt.subplots(
    n_rows, n_cols, figsize=(12, 2 * n_rows), squeeze=False
)  # squeeze=False keeps `axis` two-dimensional even for a single row

panels = axis.ravel()  # row-major order, matching the original i/j loops
for ax, image_id, image_label in zip(
    panels, species_examples.image, species_examples.species
):
    image = img.imread(ROOT_PATH / image_id)

    ax.imshow(image)
    ax.set_title(image_label)
    ax.set_xticks([])
    ax.set_yticks([])

# Blank out any panels left over in the last row.
for ax in panels[len(species_examples):]:
    ax.axis('off')

plt.suptitle('Example of Every Species in Training Dataset')
plt.tight_layout()
plt.show()

### Look at Target
In this task, we are asked to predict the `individual_id`, so let's see how many unique `individual_id` values there are.

In [None]:
# Count the distinct values in the `individual_id` column.
n_individuals = len(label['individual_id'].unique())
print('Number of Unique individual_id:', n_individuals)

### Plot multiple pictures for the same individual_id

In [None]:
# Number of metadata rows per individual_id (value_counts sorts the result
# in descending order by default, so the most frequent IDs come first).
label.value_counts('individual_id')

We can see that individual_id **37c7aba965a5** has the most training images. Let's take a look:

In [None]:
# Show up to nine images of one individual in a 3x3 grid.
individual_id = '37c7aba965a5'

# Select this individual's rows once; the original rebuilt the same boolean
# filter for every panel and again for the figure title.
individual_rows = label[label.individual_id == individual_id]

fig, axis = plt.subplots(3, 3, figsize=(10, 10))
panels = axis.ravel()  # row-major order, matching the original i/j loops

# zip stops at whichever runs out first, so an individual with fewer than
# nine images no longer raises IndexError.
for ax, image_id in zip(panels, individual_rows.image):
    image = img.imread(ROOT_PATH / image_id)

    ax.imshow(image)
    ax.set_xticks([])
    ax.set_yticks([])

# Blank out panels that have no image to show.
for ax in panels[len(individual_rows):]:
    ax.axis('off')

plt.suptitle(
    f'Individual_ID: {individual_id}, Species:'
    + individual_rows.species.unique()[0]
)
plt.tight_layout()
plt.show()

Take another look at id **c995c043c353**

In [None]:
# Show up to nine images of one individual in a 3x3 grid.
individual_id = 'c995c043c353'

# Select this individual's rows once; they feed both the panels and the title.
individual_rows = label[label.individual_id == individual_id]

fig, axis = plt.subplots(3, 3, figsize=(10, 10))
panels = axis.ravel()  # row-major order, matching the original i/j loops

# zip stops at whichever runs out first, so an individual with fewer than
# nine images no longer raises IndexError.
for ax, image_id in zip(panels, individual_rows.image):
    image = img.imread(ROOT_PATH / image_id)

    ax.imshow(image)
    ax.set_xticks([])
    ax.set_yticks([])

# Blank out panels that have no image to show.
for ax in panels[len(individual_rows):]:
    ax.axis('off')

# Bug fix: the title previously hard-coded the ID of a different individual
# ('37c7aba965a5') while the images and species shown belong to this one.
plt.suptitle(
    f'Individual_ID: {individual_id}, Species:'
    + individual_rows.species.unique()[0]
)
plt.tight_layout()
plt.show()