In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2
from tqdm import tqdm

# Augmentations
import albumentations as A

In [None]:
# Plot styling: larger default font so labels stay readable on the large figures used below
plt.rcParams['font.size'] = 15

# Locations of the training images and of the training metadata CSV
TRAIN_IMAGE_PATH = '../input/happy-whale-and-dolphin/train_images'
TRAIN_CSV_PATH = '../input/happy-whale-and-dolphin/train.csv'

In [None]:
# Load the training metadata and attach the on-disk path of every image
df = pd.read_csv(TRAIN_CSV_PATH).assign(
    path=lambda frame: TRAIN_IMAGE_PATH + '/' + frame['image']
)

# Data Cleaning

In [None]:
# Normalise species labels: two misspellings, plus 'globis', which is mapped to
# 'short_finned_pilot_whale'
SPECIES_LABEL_FIXES = {
    'kiler_whale': 'killer_whale',
    'bottlenose_dolpin': 'bottlenose_dolphin',
    'globis': 'short_finned_pilot_whale',
}
df['species'] = df['species'].replace(SPECIES_LABEL_FIXES)

# Basic Explorations

In [None]:
# Preview the first rows of the cleaned metadata
df.head(n=5)

In [None]:
# Headline counts: images (rows), distinct species, distinct individuals
print(
    f"Number of images: {len(df)}",
    f"Number of species: {df['species'].nunique()}",
    f"Number of animals: {df['individual_id'].nunique()}",
    sep='\n',
)

# Further Explorations

## Number of images per species

In [None]:
# One row per image, so the group size is the image count; sorted ascending for the bar chart
n_images_by_species = df.groupby('species')['image'].size().sort_values()
ax = n_images_by_species.plot.barh(figsize=(40,20))

ax.set_xlabel('Number of images')

## Number of animals per species

In [None]:
# Distinct individual ids seen within each species, sorted ascending for the bar chart
n_animals_by_species = df.groupby('species')['individual_id'].nunique().sort_values()
ax = n_animals_by_species.plot.barh(figsize=(40,20))

ax.set_xlabel('Number of unique animals')

#### Interestingly, the bottlenose dolphin has the most images but ranks only 6th in number of individual animals

## Number of images per animal

In [None]:
# Image count for every individual, followed by summary statistics of those counts
n_images_per_animal = df.groupby('individual_id')['image'].size()
n_images_per_animal.describe()

In [None]:
# Step 1: image count for every (species, individual) pair
images_per_species_animal = df.groupby(['species', 'individual_id'])['image'].size()
# Step 2: average those counts within each species, sorted ascending for the bar chart
n_images_per_animal_by_species = images_per_species_animal.groupby('species').mean().sort_values()
ax = n_images_per_animal_by_species.plot.barh(figsize=(40,20))

ax.set_title('Mean number of images per animal by species')
ax.set_xlabel('Mean number of images per animal')

In [None]:
# Histogram of images-per-animal; the y axis is logarithmic (logy=True)
ax = n_images_per_animal.plot.hist(figsize=(40,20), logy=True, bins=100)

ax.set_title('Distribution of number of images per animal, log scale')
ax.set_xlabel('Number of images')
ax.set_ylabel('Count of animals with X images')

## Summary

In [None]:
# Top-5 species under each metric. Every Series above is sorted ascending,
# so each index is reversed before taking its first five entries.
rank_labels = ["1st", "2nd", "3rd", "4th", "5th"]
pd.DataFrame({
    'Number of Images': n_images_by_species.index[::-1][:5],
    'Number of Animals': n_animals_by_species.index[::-1][:5],
    'Number of Images per animal': n_images_per_animal_by_species.index[::-1][:5]
}, index = rank_labels)

# Images

In [None]:
def read_image(path):
    """Load the image at ``path`` and return it in RGB channel order.

    ``cv2.imread`` decodes to BGR; the channels are swapped to RGB before
    returning so the result can be passed directly to ``imshow``.

    Args:
        path: location of the image file.

    Returns:
        The decoded image converted with ``cv2.COLOR_BGR2RGB``.

    Raises:
        FileNotFoundError: if ``cv2.imread`` returns ``None``, which is how it
            signals a missing or unreadable file instead of raising.
    """
    bgr = cv2.imread(path)
    if bgr is None:
        # Fail here with the offending path rather than inside cvtColor with an opaque OpenCV error
        raise FileNotFoundError(f"Could not read image (missing or unreadable): {path}")
    return cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)  # BGR to RGB
def plot_images(paths, rows, cols, figsize):
    """Draw the images at ``paths`` on a ``rows`` x ``cols`` grid of axes.

    Images fill the grid in row-major order. Grid cells without an image
    (fewer paths than cells) are left blank with their axis hidden; paths
    beyond ``rows * cols`` are ignored.

    Args:
        paths: iterable of image file paths.
        rows: number of grid rows.
        cols: number of grid columns.
        figsize: (width, height) forwarded to ``plt.subplots``.
    """
    # squeeze=False always yields a 2-D array of axes, so a 1x1 grid works too
    fig, axes = plt.subplots(nrows = rows, ncols = cols, figsize=figsize, squeeze=False)
    plt.tight_layout(rect=[0, 0.03, 1, 0.97])

    grid_axes = list(axes.flat)
    # Hide every axis up front so cells that receive no image do not show an empty frame
    for ax in grid_axes:
        ax.axis('off')

    for path, ax in zip(paths, grid_axes):
        ax.imshow(read_image(path))

## Image Samples

In [None]:
# 20 randomly chosen images (fixed seed) on a 4x5 grid
random_sample_paths = df.sample(20, random_state = 0)['path']
plot_images(random_sample_paths, 4, 5, (40,20))
plt.suptitle('Random Subset of Images', fontsize=30)

## Image Resolutions
Reading every image takes too long, so the resolutions are measured on a random sample of 500 images

In [None]:
# (height, width) of 500 randomly sampled images (fixed seed); the trailing channel dimension is dropped
size_sample_paths = df['path'].sample(500, random_state=0)
image_sizes = np.array([cv2.imread(path).shape[:2] for path in tqdm(size_sample_paths)])

In [None]:
# Tabulate the sampled sizes and add the total pixel count per image
image_sizes_df = pd.DataFrame(image_sizes, columns = ['height','width']).assign(
    pixels=lambda sizes: sizes['height'] * sizes['width']
)
image_sizes_df.describe()

#### Images have quite high resolutions, on average 1500x2500

## Image Samples by Species

In [None]:
sampled_species = 'bottlenose_dolphin'
species_rows = df[df['species'] == sampled_species]
# Cap the sample at the number of rows available: sample() raises when asked for
# more rows than exist, which would happen for any species with fewer than 20 images
n_species_shown = min(len(species_rows), 20)
plot_images(species_rows.sample(n_species_shown, random_state = 0)['path'], 4, 5, (40,20))
plt.suptitle(sampled_species, fontsize=30)

## Image Samples by Id

In [None]:
sampled_id = 'abbeba14a290'
# Filter once, then show at most 20 of this individual's images
animal_rows = df[df['individual_id'] == sampled_id]
n_animal_shown = min(len(animal_rows), 20)
plot_images(animal_rows.sample(n_animal_shown, random_state = 0)['path'], 4, 5, (40,20))
plt.suptitle(sampled_id, fontsize=30)

#### Lighting makes a huge difference

# Augmentations
All augmentations are applied to the original image after resizing it to 224x224

In [None]:
# Four image paths drawn with a fixed seed, for the augmentation demonstrations
random_paths = df['path'].sample(n=4, random_state=0)
random_paths

In [None]:
def plot_augmentations(paths, aug_transform, figsize, n_augs=4):
    """Show each image next to ``n_augs`` applications of an augmentation.

    One row per path. Column 0 shows the image as loaded by ``read_image``;
    columns 1..n_augs each show a separate application of
    ``A.Resize(224, 224)`` followed by ``aug_transform``.

    Args:
        paths: sized iterable of image paths; ``len(paths)`` sets the row count.
        aug_transform: albumentations transform applied after the resize.
        figsize: (width, height) forwarded to ``plt.subplots``.
        n_augs: number of augmented samples drawn per image.
    """
    pipeline = A.Compose([
        A.Resize(224,224),
        aug_transform
    ])
    # squeeze=False keeps ``axes`` two-dimensional, so a single path (one row)
    # can still be indexed as axes[row][col]
    fig, axes = plt.subplots(len(paths), n_augs + 1, figsize=figsize, squeeze=False)
    plt.tight_layout(rect=[0, 0.03, 1, 0.94])

    # Column titles go on the first row only
    header_axes = axes[0]
    header_axes[0].set_title('Original', fontsize=30)
    for i in range(n_augs):
        header_axes[i+1].set_title(f"Augmentation {i+1}", fontsize=30)

    for row_axes, path in zip(axes, paths):
        img = read_image(path)
        row_axes[0].imshow(img)
        row_axes[0].axis('off')

        # The pipeline is called once per column, so every column is a separate application
        for ax in row_axes[1:]:
            ax.imshow(pipeline(image=img)['image'])
            ax.axis('off')

## Horizontal Flip


In [None]:
# Mirror the image left-right with probability 0.5
hflip_transform = A.HorizontalFlip(p=0.5)
plot_augmentations(paths=random_paths, aug_transform=hflip_transform, figsize=(40,20))

plt.suptitle('Horizontal Flip with probability 0.5', fontsize=35)

## Brightness Change

In [None]:
# A.RandomBrightness is deprecated and no longer present in recent albumentations releases.
# RandomBrightnessContrast with contrast_limit=0 changes brightness only, using the same 0.5 limit.
brightness_transform = A.RandomBrightnessContrast(brightness_limit=0.5, contrast_limit=0)
plot_augmentations(random_paths, brightness_transform, (40,20))

plt.suptitle('Random brightness change with brightness=0.5', fontsize=35)

## Rotation

In [None]:
# Rotation angle is drawn from the (0, 45) limit passed to A.Rotate
rot_transform = A.Rotate(limit=(0, 45))
plot_augmentations(paths=random_paths, aug_transform=rot_transform, figsize=(40,20))

plt.suptitle('Random rotation between 0 and 45 degrees', fontsize=35)

## Fog

In [None]:
# Fog overlay using the library's default RandomFog parameters
fog_transform = A.RandomFog()
plot_augmentations(random_paths, fog_transform, (40,20))

# Title previously described a perspective transform (copy-paste leftover); this cell demonstrates fog
plt.suptitle('Random fog with default parameters', fontsize=35)

# All combined

In [None]:
# Chain the four demo transforms in the order they were introduced
combined_steps = [hflip_transform, brightness_transform, rot_transform, fog_transform]
all_aug = A.Compose(combined_steps)
plot_augmentations(paths=random_paths, aug_transform=all_aug, figsize=(40,20))
plt.suptitle('All Augmentations combined', fontsize=35)