# EDA for HappyWhale Image Prediction

# Load Packages

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from PIL import Image
import re
from termcolor import colored
sns.set_style("darkgrid")

# Load Data

In [None]:
# Root of the competition dataset. Every input path below is derived from it,
# so the notebook can be pointed at another copy by editing this one line.
INPUT_DIR = '/kaggle/input/happy-whale-and-dolphin'
TRAIN_IMAGE_DIR = os.path.join(INPUT_DIR, 'train_images')
TEST_IMAGE_DIR = os.path.join(INPUT_DIR, 'test_images')

# Number of directory entries in the train / test image folders.
n_train_images = len(os.listdir(TRAIN_IMAGE_DIR))
n_test_images = len(os.listdir(TEST_IMAGE_DIR))

# Training labels and the sample submission file.
train_df = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))
sample_submission = pd.read_csv(os.path.join(INPUT_DIR, 'sample_submission.csv'))

# Basic Statistics & Name Corrections

In [None]:
# Report the dataset sizes, then preview the first rows of the training table.
size_report = (
    ('# of training images = {}', n_train_images),
    ('# of testing images = {}', n_test_images),
    ('Shape of train_df ={}', train_df.shape),
    ('Shape of sample_submission = {}', sample_submission.shape),
)
for template, value in size_report:
    print(template.format(value))
train_df.head()

In [None]:
# Distinct species labels and distinct individual ids in the training table.
unique_species = train_df['species'].unique()
n_unique_species = len(unique_species)
# dropna = False counts a missing id as its own value, exactly like len(unique()).
n_unique_ids = train_df['individual_id'].nunique(dropna = False)
for template, value in (('Number of unique species = {}', n_unique_species),
                        ('Number of unique ids = {}', n_unique_ids)):
    print(template.format(value))
print('\nList of species:\n', unique_species)

Here we can see that some species names are misspelt: `kiler_whale` and `bottlenose_dolpin` should be `killer_whale` and `bottlenose_dolphin`. Also, `beluga` and `globis` carry no `_whale`/`_dolphin` suffix, so they are not categorized; we will treat both as whales. Before making these changes, let's look at the distribution of the categories.

In [None]:
# 'category' is the last '_'-separated token of the species label;
# 'only_species' is the label with every '_dolphin' / '_whale' occurrence removed.
species_labels = train_df['species']
suffix_pattern = re.compile('_dolphin|_whale')
train_df['category'] = [label.split('_')[-1] for label in species_labels]
train_df['only_species'] = [suffix_pattern.sub('', label) for label in species_labels]
train_df.head()

In [None]:
# Count the rows labelled dolphin / whale (kept for the before/after comparison later).
category_column = train_df['category']
n_train_dolphins = (category_column == 'dolphin').sum()
n_train_whales = (category_column == 'whale').sum()
for template, value in (('Number of dolphins in train = {}', n_train_dolphins),
                        ('Number of whales in train = {}', n_train_whales)):
    print(template.format(value))

# Share of images per category, one slightly separated wedge per category.
pie_plot_data = train_df.groupby('category')['image'].count()
labels = pie_plot_data.index
explode = [0.015 for _ in labels]
pie, ax = plt.subplots(figsize = [10,6])
ax.pie(pie_plot_data, labels = labels, explode = explode, autopct = "%.1f%%", pctdistance = 0.5)
plt.show()

From the pie chart we can clearly see that, apart from whale and dolphin, a few species do not have whale or dolphin as a suffix. Also, a few dolphins are misspelt as dolpin. So we will correct the spelling of dolpin (and of kiler), and treat beluga and globis as whales.


In [None]:
# Misspelt or suffix-less species labels and the label each one is replaced with.
name_mapping = {
    'kiler_whale': 'killer_whale',
    'bottlenose_dolpin': 'bottlenose_dolphin',
    'beluga': 'beluga_whale',
    'globis': 'globis_whale',
}

corrected_species = train_df['species'].replace(name_mapping)
train_df['species'] = corrected_species

# Re-derive the helper columns from the corrected labels:
# text after the last '_' is the category, the suffix-free label is the species.
train_df['category'] = [label.rpartition('_')[2] for label in corrected_species]
train_df['only_species'] = [re.sub('_dolphin|_whale', '', label) for label in corrected_species]
train_df.head()

In [None]:
# Statistics captured before the species names were corrected.
print(colored('Before preprocessing names:', 'red'))
before_report = (
    ('Number of unique species {}', n_unique_species),
    ('Number of unique ids {}', n_unique_ids),
    ('Number of dolphins in train = {}', n_train_dolphins),
    ('Number of whales in train = {}', n_train_whales),
)
for template, value in before_report:
    print(template.format(value))

# The same statistics recomputed from the corrected table, plus species counts per category.
print(colored('\nAfter preprocessing names:', 'red'))
unique_species = train_df.species.unique()
is_dolphin = train_df.category == 'dolphin'
is_whale = train_df.category == 'whale'
after_report = (
    ('Number of unique species {}', len(unique_species)),
    ('Number of unique ids {}', train_df['individual_id'].nunique(dropna = False)),
    ('Number of dolphins in train = {}', is_dolphin.sum()),
    ('Number of whales in train = {}', is_whale.sum()),
    ('Number of dolphin species = {}', train_df.loc[is_dolphin, 'species'].nunique(dropna = False)),
    ('Number of whale species = {}', train_df.loc[is_whale, 'species'].nunique(dropna = False)),
)
for template, value in after_report:
    print(template.format(value))

# Share of images per category after the correction.
pie_plot_data = train_df.groupby('category')['image'].count()
labels = pie_plot_data.index
explode = [0.015 for _ in labels]
pie, ax = plt.subplots(figsize = [10,6])
ax.pie(pie_plot_data, labels = labels, explode = explode, autopct = "%.1f%%", pctdistance = 0.5)
plt.show()

From the pie chart, we can infer that roughly two-thirds of the images are of whales. Now let's see which species are most common among whales and dolphins.

In [None]:
# Three-panel figure: image counts for all species (left, full height),
# and per-category counts for dolphins (top right) and whales (bottom right).
fig = plt.figure(figsize = (10,7), dpi = 120)
gs = gridspec.GridSpec(2,2)
colors = sns.color_palette()

# --- All species; bars reaching the fifth-largest count are highlighted ---
ax0 = fig.add_subplot(gs[0:2,0])
species_count = pd.DataFrame(train_df.groupby('species').image.count())
# nlargest() keeps the species names as its index, so the smallest of the
# top five has to be taken by position (.iloc), not with an integer key.
# Taking the last element also works when there are fewer than five species.
fifth_largest_value = species_count['image'].nlargest(5).iloc[-1]
color_plt = [colors[0] if value < fifth_largest_value else colors[3] for value in species_count.image]
bar = sns.barplot(data = species_count, y = species_count.index, x = 'image',  ax = ax0, palette = color_plt)
ax0.set_xlabel('# of images')
ax0.set_ylabel('')
ax0.set_title('All Species')

# --- Dolphins only, most frequent species first ---
ax1 = fig.add_subplot(gs[0,1])
dolphin_count = train_df[train_df.category == 'dolphin'].groupby('only_species').image.count()
dolphin_count = pd.DataFrame(dolphin_count.sort_values(ascending = False))
sns.barplot(data = dolphin_count, y = dolphin_count.index, x = 'image', ax = ax1, color = colors[0])
ax1.set_xlabel('')
ax1.set_ylabel('')
ax1.set_title('Dolphin')

# --- Whales only, most frequent species first ---
ax2 = fig.add_subplot(gs[1,1])
whale_count = train_df[train_df.category == 'whale'].groupby('only_species').image.count()
whale_count = pd.DataFrame(whale_count.sort_values(ascending = False))
sns.barplot(data = whale_count, y = whale_count.index, x = 'image', ax = ax2, color = colors[0])
ax2.set_xlabel('# of images')
ax2.set_ylabel('')
ax2.set_title('Whale')

fig.subplots_adjust(left=0,
                    bottom=0,
                    right=1,
                    top=1,
                    wspace=0.35,
                    hspace=0.2)
plt.show()

From the bar plot of all species, we can see the top 5 species by number of images (red bars). Four of them are whales and only one is a dolphin. On the right-hand side, we have separate bar plots for dolphins and whales. We have more species of whales than of dolphins. Among dolphins, bottlenose is the most common; among whales, beluga and humpback are the most common.

# Visualizing Images

## Prepare dataset and save it

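The cell below opens every training image to collect its mode, format and size, which is slow. It saves the result to `extra_data_train.csv` so that the saved data can be reused on later runs (see the commented-out cell after it) instead of being recomputed.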


In [None]:
# Open every training image once and record its mode, file format, pixel size
# and width/height ratio as new columns of train_df, then save the table.
print(colored('Image loading and information extraction started...', 'red'))
info_columns = ['image_mode', 'format', 'image_width', 'image_height', 'aspect_ratio']
n_images = len(train_df)
image_records = []
for i, train_image in enumerate(train_df.image):
    # Image.open leaves the file open until the image is closed; the context
    # manager releases the handle as soon as the header fields are read.
    with Image.open(os.path.join(TRAIN_IMAGE_DIR, train_image)) as img:
        width, height = img.size
        image_records.append((img.mode, img.format, width, height, width/height))
    if (i+1)%1000 == 0:
        print('{}/{} images processed'.format(i+1, n_images))
print(colored('Image loading and information extraction completed...', 'red'))

# Build the columns in one go instead of writing cell by cell with .loc.
# Reusing train_df's own index keeps the rows aligned whatever the index is,
# and pixel sizes stay integers rather than being upcast to float.
image_info = pd.DataFrame(image_records, columns = info_columns, index = train_df.index)
for column in info_columns:
    train_df[column] = image_info[column]

train_df.to_csv('extra_data_train.csv', index = False)
print(colored('\nTrain data saved in working directory...', 'red'))

In [None]:
# train_df = pd.read_csv('../input/visuals-of-happywhale/extra_data_train.csv')

# print('Number of unique mode = {}'.format(len(train_df.image_mode.unique())))
# print('Number of unique format = {}'.format(len(train_df.format.unique())))
# print('Number of unique width = {}'.format(len(train_df.image_width.unique())))
# print('Number of unique height = {}'.format(len(train_df.image_height.unique())))
# train_df.head()

From the above statistics, we can see that the images do not all share the same mode, width or height. Let's explore this further.

## Check Modes of Images

In [None]:
# Number of training images per PIL image mode.
train_image_modes = train_df.groupby('image_mode')['image'].count()
# A bare expression in the middle of a cell is not displayed, so print the counts.
print(train_image_modes)

# Share of images per mode, one slightly separated wedge per mode.
labels = train_image_modes.keys()
explode = [0.02]*len(labels)
pie, ax = plt.subplots(figsize = [10,6])
ax.pie(train_image_modes, autopct = "%.1f%%", labels = labels, explode = explode, pctdistance = 0.5)
plt.show()

The basic difference between L mode and RGB mode is that L has only one channel (grayscale) while RGB has three channels. Let's plot some images of each mode and compare them.

In [None]:
# Grid settings read by plot_images() and by the figure titles below.
n_images_plot = 40
n_col_plot = 8
figsize = (20,10)
fontsize = 20

def plot_images(images, cmap = None):
    """Show the first ``n_images_plot`` images of ``images`` in a grid.

    Parameters
    ----------
    images : DataFrame
        Must provide an ``image`` column (file names inside TRAIN_IMAGE_DIR)
        and a ``species`` column (used in each panel title).
    cmap : optional
        Passed unchanged to ``imshow``.

    Returns
    -------
    (fig, axs) exactly as returned by ``plt.subplots``.

    The grid has ``n_col_plot`` columns; its size and the figure size come
    from the module-level ``n_images_plot``, ``n_col_plot`` and ``figsize``.
    """
    # Ceiling division, so a count that is not a multiple of the column
    # number still gets enough rows.
    n_rows = -(-n_images_plot // n_col_plot)
    fig, axs = plt.subplots(n_rows, n_col_plot, figsize = figsize)
    flat_axes = np.atleast_1d(axs).ravel()
    # Hide every panel up front, so cells left without an image stay blank
    # instead of showing an empty framed axis.
    for ax in flat_axes:
        ax.set_axis_off()
    sample = images.iloc[:n_images_plot]
    # Walk the columns positionally; no assumption about the frame's index.
    for i, (ax, file_name, species) in enumerate(zip(flat_axes, sample['image'], sample['species'])):
        # Close each file as soon as it has been drawn.
        with Image.open(os.path.join(TRAIN_IMAGE_DIR, file_name)) as img:
            ax.imshow(img, cmap = cmap)
        ax.set_title('#{}. {}'.format(i+1, species))
    return fig, axs

# One sample grid per image mode: (mode value, wording for the title, colormap).
# L-mode images have a single channel, so they are drawn with the gray colormap.
mode_panels = [('L', 'L mode', 'gray'),
               ('RGB', 'RGB', None)]
for panel_number, (image_mode, mode_label, mode_cmap) in enumerate(mode_panels):
    if panel_number > 0:
        print('')
    images = train_df[train_df.image_mode == image_mode].reset_index()
    fig, axs = plot_images(images, cmap = mode_cmap)
    plt.suptitle('Sample examples of {} images'.format(mode_label), fontsize = fontsize)
    plt.tight_layout()
    plt.show()

We can clearly see the difference between L mode and RGB mode of images.

Now let's explore variation in sizes of images.

# Explore Various Sizes of Images

In [None]:
# Extremes of the recorded image dimensions.
width_px = train_df['image_width']
height_px = train_df['image_height']
print('Minimum width = {}, Minimum height = {}'.format(int(width_px.min()),
                                                       int(height_px.min())))
print('Maximum width = {}, Maximum height = {}'.format(int(width_px.max()),
                                                       int(height_px.max())))

# Joint distribution of width and height, coloured by category.
jp = sns.jointplot(data = train_df, x = 'image_width', y = 'image_height', hue = 'category', height = 8, ratio = 5)
jp.set_axis_labels('Image Width (pixels)', 'Image Height (pixels)')
plt.show()

# Aspect-ratio histograms: all images on top; images within [1.49, 1.51]
# and the remaining images side by side below.
fig = plt.figure(figsize = (15,5), dpi = 120)
gs = gridspec.GridSpec(2,2)
ind_1p5 = train_df['aspect_ratio'].between(1.49, 1.51)

ax0 = fig.add_subplot(gs[0, 0:2])
sns.histplot(data = train_df, x = 'aspect_ratio', ax = ax0)
ax0.set_title('All Images')

ax_near_1p5 = fig.add_subplot(gs[1, 0])
sns.histplot(data = train_df[ind_1p5], x = 'aspect_ratio', ax = ax_near_1p5)
ax_near_1p5.set_title('Aspect Ratio is 1.5')

ax1 = fig.add_subplot(gs[1, 1])
sns.histplot(data = train_df[~ind_1p5], x = 'aspect_ratio', ax = ax1)
ax1.set_title('Aspect Ratio is not 1.5')

fig.subplots_adjust(hspace=0.5)
plt.show()

From the distribution plot of image widths and heights, we can see that most of the images have a width of more than 3000 px; similarly, heights are mostly more than 2000 px. Another observation is that many images (of both whales and dolphins) have a small height, i.e. the aspect ratio of width to height is high.
    
Looking at the aspect ratio distribution plot, we can see that many images have an aspect ratio close to 1.5. I have plotted the images with an aspect ratio close to 1.5 (between 1.49 and 1.51) separately from the rest.

Now, let's see images of whales and dolphins of various aspect ratio.

In [None]:
# Grid settings read by plot_images() and by the figure titles below.
n_images_plot = 16
n_col_plot = 8
figsize = (20,5)
fontsize = 20

# NOTE: this redefines the plot_images() of the earlier "Check Modes of Images"
# cell. From here on, panels are titled with the aspect ratio instead of the
# species, and there is no cmap argument.
def plot_images(images):
    """Show the first ``n_images_plot`` images of ``images`` in a grid.

    Parameters
    ----------
    images : DataFrame
        Must provide an ``image`` column (file names inside TRAIN_IMAGE_DIR)
        and an ``aspect_ratio`` column (shown with two decimals in each title).

    Returns
    -------
    (fig, axs) exactly as returned by ``plt.subplots``.

    The grid has ``n_col_plot`` columns; its size and the figure size come
    from the module-level ``n_images_plot``, ``n_col_plot`` and ``figsize``.
    """
    # Ceiling division, so a count that is not a multiple of the column
    # number still gets enough rows.
    n_rows = -(-n_images_plot // n_col_plot)
    fig, axs = plt.subplots(n_rows, n_col_plot, figsize = figsize)
    flat_axes = np.atleast_1d(axs).ravel()
    # Hide every panel up front, so a bucket with fewer images than grid
    # cells leaves blank space instead of empty framed axes.
    for ax in flat_axes:
        ax.set_axis_off()
    sample = images.iloc[:n_images_plot]
    # Walk the columns positionally; no assumption about the frame's index.
    for i, (ax, file_name, ratio) in enumerate(zip(flat_axes, sample['image'], sample['aspect_ratio'])):
        # Close each file as soon as it has been drawn.
        with Image.open(os.path.join(TRAIN_IMAGE_DIR, file_name)) as img:
            ax.imshow(img)
        ax.set_title('#{}. Aspect Ratio = {:.2f}'.format(i+1, ratio))
    return fig, axs

# One sample grid per aspect-ratio bucket: (wording for the title, row mask).
# The "between 1 and 1.5" and "between 1.5 and 2" buckets use a strict lower
# bound, so images with a ratio of exactly 1 or 1.5 appear only in their own
# "equal to" panel and are not repeated in the following one.
aspect_ratio = train_df['aspect_ratio']
aspect_ratio_buckets = [
    ('less than 0.75',     aspect_ratio < 0.75),
    ('between 0.75 and 1', (aspect_ratio >= 0.75) & (aspect_ratio < 1)),
    ('equal to 1',         aspect_ratio == 1),
    ('between 1 and 1.5',  (aspect_ratio > 1) & (aspect_ratio < 1.5)),
    ('equal to 1.5',       aspect_ratio == 1.5),
    ('between 1.5 and 2',  (aspect_ratio > 1.5) & (aspect_ratio < 2)),
    ('between 2 and 5',    (aspect_ratio >= 2) & (aspect_ratio < 5)),
    ('between 5 and 10',   (aspect_ratio >= 5) & (aspect_ratio < 10)),
    ('greater than 10',    aspect_ratio >= 10),
]

for bucket_number, (bucket_label, bucket_mask) in enumerate(aspect_ratio_buckets):
    # Blank line between consecutive figures.
    if bucket_number > 0:
        print('')
    images = train_df[bucket_mask].reset_index()
    fig, axs = plot_images(images)
    plt.suptitle('Sample images with aspect ratio {}'.format(bucket_label), fontsize = fontsize)
    plt.tight_layout()
    plt.show()

From the above plots, we can observe that images with an aspect ratio less than 1 mostly capture fins, while images with a larger aspect ratio show the whole body.
    
For further processing, we need to convert all these images to same size. We have done it [in this notebook](https://www.kaggle.com/code/rajankumar/resize-images-128x128-using-cv2-and-tensorflow).    