In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
'''
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
'''
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import os
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
import albumentations as A

## other packages
from termcolor import colored
from colorama import Fore, Back, Style
# colored output
y_ = Fore.YELLOW
r_ = Fore.RED
g_ = Fore.GREEN
b_ = Fore.BLUE
m_ = Fore.MAGENTA

import warnings
warnings.filterwarnings('ignore')

In [None]:
train_img_path = '/kaggle/input/happy-whale-and-dolphin/train_images'
test_img_path = '/kaggle/input/happy-whale-and-dolphin/test_images'
train_csv_path = '/kaggle/input/happy-whale-and-dolphin/train.csv'

### Load Training CSV Data

In [None]:
trainDF = pd.read_csv(train_csv_path)
trainDF.head()

Let's fix some issues with the species column:

In [None]:
# Normalise misspelled / aliased species labels.
# The result is assigned back to the column instead of calling
# ``replace(..., inplace=True)`` on ``trainDF.species``: that chained form
# mutates an intermediate Series, which recent pandas 2.x releases warn about
# and which does not update ``trainDF`` when Copy-on-Write is enabled.
species_fixes = {"globis": "short_finned_pilot_whale",
                 "pilot_whale": "short_finned_pilot_whale",
                 "kiler_whale": "killer_whale",
                 "bottlenose_dolpin": "bottlenose_dolphin"}
trainDF['species'] = trainDF['species'].replace(species_fixes)

## EDA

### Unique Species

In [None]:
print('Number of unique species: ', trainDF['species'].nunique())
print('Species Names: ', trainDF['species'].unique())

#### Number of images per species

In [None]:
trainDF['species'].value_counts()

In [None]:
# Bar chart of the number of training rows (images) per species, ordered from
# the most to the least frequent species. Explicit axes are used so the title
# and labels attach to this figure only.
fig, ax = plt.subplots()
sns.countplot(x='species', data=trainDF, order=trainDF['species'].value_counts().index, ax=ax)
ax.set(title='Number of images per species', xlabel='species', ylabel='number of images')
ax.tick_params(axis='x', labelrotation=90)

#### Number of unique animals (individual IDs) per species

In [None]:
trainDF.groupby(['species'])['individual_id'].nunique()

In [None]:
id_count = trainDF.groupby(['species'])['individual_id'].nunique().sort_values(ascending=False)
id_count.index, id_count.values

In [None]:
# Bar chart of the number of distinct ``individual_id`` values per species.
# ``id_count`` is already sorted in descending order, so bars go from the
# species with the most individuals to the one with the fewest.
fig, ax = plt.subplots()
ax.bar(x=id_count.index, height=id_count.values)
ax.set(title='Number of unique individuals per species', xlabel='species',
       ylabel='number of unique individual_id')
ax.tick_params(axis='x', labelrotation=90)

#### Number of unique IDs

In [None]:
trainDF['individual_id'].value_counts()

## Training and Test Images

In [None]:
# Function to get image paths from train and test directory
def getImagePaths(path):
    """Recursively collect the full path of every file under ``path``.

    Args:
        path: Root directory to walk.

    Returns:
        list[str]: One ``os.path.join(dirname, filename)`` entry per file,
        sorted so that the result (and any slice such as ``paths[0:25]``) does
        not depend on the order in which the filesystem lists entries.
        The list is empty when no files are found.
    """
    return sorted(
        os.path.join(dirname, filename)
        for dirname, _, filenames in os.walk(path)
        for filename in filenames
    )

# Function to display multiple images
def display_multiple_img(image_paths, rows, cols, title):
    """Show images in a ``rows`` x ``cols`` grid under a common title.

    Args:
        image_paths: Iterable of image file paths, read with ``cv2.imread``.
        rows: Number of grid rows.
        cols: Number of grid columns.
        title: Figure-level title.

    Paths beyond the ``rows * cols`` capacity of the grid are ignored without
    being read. A file that OpenCV cannot read is reported with a message and
    its cell is left blank instead of aborting the whole figure.
    """
    # squeeze=False keeps ``axes`` a 2-D array even for a 1x1 or 1xN grid,
    # so ``ravel()`` is always available.
    fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(16,8), squeeze=False)
    fig.suptitle(title, fontsize=20)
    cells = axes.ravel()
    # zip stops at the shorter of the two, so extra paths never index past the grid
    for cell, img_path in zip(cells, image_paths):
        image = cv2.imread(img_path)
        if image is None:
            # cv2.imread signals an unreadable file by returning None
            print(f'Could not read image: {img_path}')
            continue
        cell.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    # hide ticks/frames on every cell, including the ones left empty
    for cell in cells:
        cell.set_axis_off()
    plt.tight_layout()
    plt.show()

In [None]:
train_images_paths = getImagePaths(train_img_path)
test_images_paths = getImagePaths(test_img_path)

In [None]:
print(f'{y_}Number of train images: ' + f'{g_} {len(train_images_paths)}\n')
print(f'{y_}Number of test images: ' + f'{g_} {len(test_images_paths)}\n')

In [None]:
display_multiple_img(train_images_paths[0:25],5,5,'Train images')

In [None]:
display_multiple_img(test_images_paths[0:25],5,5,'Test images')

### Images of most frequent species

In [None]:
def n_most_frequent(df, col, n, most=True):
    """Return the ``n`` most (or least) frequent values of ``df[col]``.

    Args:
        df: DataFrame holding the column.
        col: Name of the column to count.
        n: Number of values to return.
        most: When True, take values from the top of ``value_counts()``
            (highest counts); otherwise take them from the bottom.

    Returns:
        list: Values in the order they appear in ``df[col].value_counts()``
        (descending count). ``n == 0`` gives an empty list in both modes.
    """
    counts = df[col].value_counts()
    # head/tail are purely positional and, unlike ``counts[-n:]``, return
    # nothing (rather than everything) when n is 0.
    selected = counts.head(n) if most else counts.tail(n)
    return selected.index.tolist()

In [None]:
trainDF['species'].value_counts()[:5].index.tolist()

In [None]:
m_freq_species = n_most_frequent(trainDF, 'species', 5, True)
l_freq_species = n_most_frequent(trainDF, 'species', 5, False)

In [None]:
trainDF[trainDF['species']==m_freq_species[0]]['image'].values

In [None]:
spec = m_freq_species[0]
z = trainDF[trainDF['species']==spec]['image'].values.tolist()
z[0:9]

In [None]:
# For each species in m_freq_species, show a 3x3 grid of its first nine
# training images, titled with the species name.
for spec in m_freq_species:
    # values of the 'image' column for every row labelled with this species
    z = trainDF[trainDF['species']==spec]['image'].values.tolist()
    # keep at most the first nine entries (one per grid cell)
    z9 = z[0:9]
    # join each 'image' value onto the training image directory
    fullpaths = [os.path.join(train_img_path, x) for x in z9]
    display_multiple_img(fullpaths,3,3,spec)

## Data Augmentation

In [None]:
def plot_augmentations(images, titles, sup_title):
    """Plot images with per-panel titles in a 4-column grid.

    Args:
        images: Iterable of images accepted by ``Axes.imshow``.
        titles: Iterable of panel titles, paired with ``images`` by position
            (extra items on either side are dropped, as with ``zip``).
        sup_title: Figure-level title.

    The grid has at least 3 rows and grows when more than 12 panels are
    supplied. Every cell that receives no image is hidden.
    """
    panels = list(zip(images, titles))
    n_cols = 4
    # ceil(len(panels) / n_cols) without importing math; never fewer than 3 rows
    n_rows = max(3, -(-len(panels) // n_cols))
    fig,axes = plt.subplots(figsize=(20,16), nrows=n_rows, ncols=n_cols, squeeze=False)
    cells = axes.ravel()

    for cell, (img, title) in zip(cells, panels):
        cell.imshow(img)
        cell.set_title(title, fontsize=15)

    plt.tight_layout()
    fig.suptitle(sup_title, fontsize=20)
    # leave room at the top for the figure title added after tight_layout
    fig.subplots_adjust(wspace=0.2, hspace=0.2, top=0.93)
    # hide whatever cells were not used instead of two hard-coded positions
    for cell in cells[len(panels):]:
        cell.set_visible(False)
    plt.show()

def augment(paths, data):
    """Plot each image next to a fixed set of albumentations transforms.

    For every path the image is read with OpenCV, converted from BGR to RGB,
    resized to 224x224, passed through each transform once, and the original
    plus all results are handed to ``plot_augmentations``.

    Args:
        paths: Iterable of image file paths.
        data: Label used in the figure title (e.g. ``'train'``).

    Files that OpenCV cannot read are reported with a message and skipped.
    """
    # Each transform is stored with its panel title so the two cannot drift apart.
    # RandomBrightnessContrast with one limit set to 0 replaces the deprecated
    # RandomBrightness / RandomContrast aliases (same brightness-only and
    # contrast-only behaviour; 0.2 is the default limit RandomBrightness used).
    # NOTE(review): RandomSunFlare and RandomSnow use p=0.02, so their panels
    # will almost always show the unmodified image - confirm whether p=1 was
    # intended for this demonstration.
    transforms = [
        ("RandomSunFlare", A.RandomSunFlare(p=0.02)),
        ("RandomFog", A.RandomFog(p=1)),
        ("RandomBrightness", A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0, p=1)),
        ("Rotate", A.Rotate(p=1, limit=9)),
        ("RGBShift", A.RGBShift(p=1)),
        ("RandomSnow", A.RandomSnow(p=0.02)),
        ("HorizontalFlip", A.HorizontalFlip(p=1)),
        ("RandomContrast", A.RandomBrightnessContrast(brightness_limit=0, contrast_limit=0.5, p=1)),
        ("HSV", A.HueSaturationValue(p=1, hue_shift_limit=20, sat_shift_limit=30, val_shift_limit=50)),
    ]
    # built fresh here rather than inserting/removing "original" on a shared list
    titles = ["original"] + [name for name, _ in transforms]

    for image_path in paths:
        # file name without extension; a fixed "/"-split index would pick a
        # directory component instead, depending on how deep the path is
        image_name = os.path.splitext(os.path.basename(image_path))[0]
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals an unreadable file by returning None
            print(f'Could not read image: {image_path}')
            continue
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # RESIZE IMAGE
        image = cv2.resize(image, (224,224))

        images = [image] + [transform(image=image)['image'] for _, transform in transforms]
        sup_title = 'Image Augmentation for '+ data+ " - " +image_name
        plot_augmentations(images, titles, sup_title)
        
augment(train_images_paths[0:2],'train')