In [None]:
import os
import pandas as pd
import numpy as np
import cv2
import plotly.express as px
import matplotlib.pyplot as plt

## Read the data

In [None]:
# Load the training metadata table; later cells use its 'image', 'species'
# and 'individual_id' columns.
train_csv_path = '../input/happy-whale-and-dolphin/train.csv'
train = pd.read_csv(train_csv_path)

## Species Distribution

Let's look at the distribution of the different species in the training set

In [None]:
# Rows per species (count of non-null individual_id values within each species).
species_count = (
    train.groupby('species')['individual_id']
    .agg(species_count='count')
    .reset_index()
)

# Rows per individual (count of non-null image values within each individual_id).
# NOTE: despite its name, this holds an image count per individual, not a
# number of individuals; the name is kept because later cells reference it.
no_of_individuals = (
    train.groupby('individual_id')['image']
    .agg(no_of_individuals='count')
    .reset_index()
)

# Remove the derived columns if they are already present so that re-running
# this cell does not create species_count_x/_y and no_of_individuals_x/_y
# duplicates (which would break every later cell that uses these columns).
train = train.drop(columns=['species_count', 'no_of_individuals'], errors='ignore')

# Attach both counts to every row of the training table.
train = pd.merge(train, species_count, on=['species'], how='inner')
train = pd.merge(train, no_of_individuals, on=['individual_id'], how='inner')

train.head()

Let's look at the distribution of different species

In [None]:
# Bar chart of row counts per species, most frequent species first.
species_by_count = species_count.sort_values(by='species_count', ascending=False)
px.bar(species_by_count, x='species', y='species_count', color='species')

Let's plot the same using a pie chart

In [None]:
# Share of training rows belonging to each species label.
px.pie(data_frame=train, names='species')

### Observations
1. Obviously there are some duplicates in the species name with different spellings
    - bottlenose_dolphin and bottlenose_dolpin
    - killer_whale and kiler_whale
2. Almost half of the data comes from the three species bottlenose_dolphin, beluga and humpback_whale
3. Blue whale, false_killer_whale and dusky_dolphin constitute almost 1/5th of the entire dataset
4. Almost 15 species have less than 1% of the data, with frasiers_dolphin having the least amount of data (just 14 images)

#### Check whether any individual_id appears under both spellings of the same species


In [None]:
# individual_ids that occur under both the 'killer_whale' and the misspelled
# 'kiler_whale' label; an empty set means the two labels share no individuals.
killer_whale_ids = set(train.loc[train['species'] == 'killer_whale', 'individual_id'].unique())
kiler_whale_ids = set(train.loc[train['species'] == 'kiler_whale', 'individual_id'].unique())
killer_whale_ids & kiler_whale_ids

In [None]:

# individual_ids that occur under both the 'bottlenose_dolphin' and the
# misspelled 'bottlenose_dolpin' label; an empty set means no shared individuals.
dolphin_ids = set(train.loc[train['species'] == 'bottlenose_dolphin', 'individual_id'].unique())
dolpin_ids = set(train.loc[train['species'] == 'bottlenose_dolpin', 'individual_id'].unique())
dolphin_ids & dolpin_ids

Looks like we do not have any overlap :)

## Individuals Distribution

In [None]:
# Number of distinct individual_ids recorded for each species, largest first.
individuals_per_species = (
    train.groupby('species')['individual_id']
    .nunique()
    .reset_index()
    .rename(columns={'individual_id': 'no_of_individuals'})
    .sort_values(by='no_of_individuals', ascending=False)
)
px.bar(individuals_per_species, x='species', y='no_of_individuals', color='species')

In [None]:
# `no_of_individuals` is the per-individual row count merged back onto every
# row of `train`, so an individual with n rows carries the value n on n rows.
# px.bar draws one mark per row and stacks marks that share an x value, which
# would make each bar n*n tall. Keep a single row per individual so the bar
# height equals the count itself.
bottlenose_individuals = (
    train[train.species == 'bottlenose_dolphin']
    .drop_duplicates(subset='individual_id')
)
px.bar(bottlenose_individuals, x='individual_id', y='no_of_individuals', title='bottlenose_dolphin distribution')

In [None]:
# The chart is titled "Killer whale", but the species column contains two
# spellings of that label ('killer_whale' and 'kiler_whale'); include both so
# the plot covers every killer whale row rather than only the misspelled ones.
# NOTE(review): restrict to a single spelling again if that was intentional.
#
# `no_of_individuals` is the per-individual row count repeated on every row of
# `train`, and px.bar stacks rows that share an x value, so keep one row per
# individual to make the bar height equal the count instead of its square.
killer_whale_individuals = (
    train[train.species.isin(['killer_whale', 'kiler_whale'])]
    .drop_duplicates(subset='individual_id')
)
px.bar(killer_whale_individuals, x='individual_id', y='no_of_individuals', title='Killer whale distribution')

In [None]:
# Species frequency bar chart (same data as the chart in the
# "Species Distribution" section), shown again here for comparison with the
# per-individual plots.
px.bar(
    data_frame=species_count.sort_values('species_count', ascending=False),
    x='species',
    y='species_count',
    color='species',
)

## Visualize some sample images of different species

In [None]:
# Functions to display multiple images, adapted from https://www.kaggle.com/ruchi798/and-identification-eda-augmentation (thanks!)

def path(group, group_type):
    """Return the full paths of the training images that belong to ``group``.

    Rows are selected from the module-level ``train`` DataFrame.

    Parameters
    ----------
    group
        Value to match: a species label when ``group_type`` is ``'species'``,
        an ``individual_id`` value when ``group_type`` is ``'id'``.
    group_type : str
        ``'species'`` to match on the ``species`` column, ``'id'`` to match on
        the ``individual_id`` column.

    Returns
    -------
    list of str
        One path per matching row, built by joining the training image
        directory with the row's ``image`` value. Empty if nothing matches.

    Raises
    ------
    ValueError
        If ``group_type`` is neither ``'species'`` nor ``'id'``.
    """
    PATH = "../input/happy-whale-and-dolphin/train_images"

    # Resolve the column first so an unsupported group_type fails with a clear
    # message instead of an unbound-variable error further down.
    if group_type == 'species':
        column = 'species'
    elif group_type == 'id':
        column = 'individual_id'
    else:
        raise ValueError(
            "group_type must be 'species' or 'id', got {!r}".format(group_type)
        )

    filenames = train['image'][train[column] == group].values
    return [os.path.join(PATH, filename) for filename in filenames]



def display_multiple_imgs(group, group_type, rows, cols):
    """Show a ``rows`` x ``cols`` grid of randomly chosen images for ``group``.

    Image paths come from ``path(group, group_type)``. Up to ``rows * cols``
    distinct images are sampled; if fewer images exist, each one is shown once
    and the remaining grid cells stay blank.

    Parameters
    ----------
    group
        Species label or individual id, forwarded to ``path`` and used as the
        figure title.
    group_type : str
        ``'species'`` or ``'id'``, forwarded to ``path``.
    rows, cols : int
        Grid dimensions.

    Raises
    ------
    ValueError
        If no images are found for ``group``.
    """
    image_paths = path(group, group_type)
    if len(image_paths) == 0:
        raise ValueError(
            "no images found for {!r} (group_type={!r})".format(group, group_type)
        )

    # Sample without replacement so the same image is never displayed twice.
    sample_size = min(rows * cols, len(image_paths))
    image_paths = np.random.choice(image_paths, sample_size, replace=False)

    # squeeze=False always yields a 2-D array of axes, so a 1x1 grid (or a
    # single row/column) can be flattened the same way as any other grid.
    figure, ax = plt.subplots(nrows=rows, ncols=cols, figsize=(16, 8), squeeze=False)
    plt.suptitle(group, fontsize=20)

    axes = ax.ravel()
    # Hide the frame and ticks on every cell up front so unused cells are blank.
    for axis in axes:
        axis.set_axis_off()

    for axis, image_path in zip(axes, image_paths):
        image = cv2.imread(str(image_path))
        if image is None:
            # cv2.imread signals a missing/unreadable file by returning None
            # rather than raising; report it and leave the cell blank.
            print('Could not read image: {}'.format(image_path))
            continue
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # OpenCV loads as BGR
        image = cv2.resize(image, (1200, 800))
        axis.imshow(image)

    plt.tight_layout()
    plt.show()

In [None]:
# Show a 3x3 grid of sample images for every species label in the training data.
species_labels = train['species'].unique()
for species in species_labels:
    print('\n\n')
    display_multiple_imgs(group=species, group_type='species', rows=3, cols=3)

### Will keep the notebook updated with new EDA