In [None]:
!pip install -qq --upgrade wandb

In [None]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

import wandb
# Log in to Weights & Biases up front; the W&B run that logs the image table
# is created further down in this notebook.
wandb.login()

In [None]:
# Dataset locations, expressed relative to the notebook's working directory.
ROOT_PATH = '../input/happy-whale-and-dolphin'
IMGS_DIR = f'{ROOT_PATH}/train_images'

# Read the training metadata and preview its first rows.
train_csv_path = f'{ROOT_PATH}/train.csv'
df = pd.read_csv(train_csv_path)
df.head()

## `species`

In [None]:
# Collect the distinct species labels, show them, and report how many there are.
unique_species = df['species'].unique()
n_species = len(unique_species)
print(unique_species)
print('Num of unique species: ', n_species)

> Most of the species have `_whale` or `_dolphin` identifiers. 

> There are a total of 30 unique species. 

> `beluga` is a whale; what's `globis`?

In [None]:
# Number of rows per species, most frequent first.
df['species'].value_counts()

> It's going to be hard to identify a few of the species (those with very few images). 

## `individual_id`s

In [None]:
# Count the distinct individual ids present in the training metadata.
individual_ids = df['individual_id'].unique()
print('Number of unique individual_ids: :', len(individual_ids))

In [None]:
# Number of images per individual id, most frequent first.
df['individual_id'].value_counts()

> There are individuals with just one image associated with them. It's going to be hard to cluster these individuals. It would be even worse if they belong to sparse species. 

## Map `individual_id` to Images

We will be using the [W&B Tables](https://docs.wandb.ai/guides/data-vis) feature to easily log images belonging to each unique `individual_id`. A W&B Table is like a 2D grid (spreadsheet) that supports rich media and interactivity. For simplicity and memory considerations, we will log 5 randomly sampled images per unique id, and only for those ids that have more than 5 images. This is our first look at the data. 

In [None]:
# Number of images logged per individual. This single constant drives the
# table columns, the eligibility threshold and the sample size so they cannot
# drift out of sync.
IMGS_PER_ID = 5

# Dedicated, seeded generator so that re-running the notebook logs the same
# sample of images and the global `random` state is left untouched.
SAMPLE_SEED = 42
rng = random.Random(SAMPLE_SEED)

# Columns: the id followed by image_1 ... image_<IMGS_PER_ID>
table_columns = ['individual_id'] + [f'image_{i}' for i in range(1, IMGS_PER_ID + 1)]

# Using the run as a context manager finishes it when the block exits, even if
# building or logging the table raises part-way through.
with wandb.init(project='happywhale_eda') as run:
    # Initialize an empty W&B Table
    data_table = wandb.Table(columns=table_columns)

    for unique_id, tmp_df in tqdm(df.groupby('individual_id')):
        # Only ids with more than IMGS_PER_ID images are logged
        if len(tmp_df) <= IMGS_PER_ID:
            continue

        # Sample IMGS_PER_ID distinct images for this id
        sample_imgs = rng.sample(tmp_df['image'].tolist(), IMGS_PER_ID)

        # Add one row: the id followed by its sampled images
        data_table.add_data(
            unique_id,
            *(wandb.Image(f'{IMGS_DIR}/{img_name}') for img_name in sample_imgs),
        )

    # Log the table
    run.log({'mapping_table': data_table})

Check out this cool Table here: http://wandb.me/happywhale-tables

[Video preview of the table](https://i.imgur.com/txSOmzH.mp4)

> Images are of varying sizes.

> Images are of whale/dolphin humpback (+dorsal fins, backs, heads and flanks).