Content:

1 Initializing and loading data

2 Routines

2.1. Function show_imgs_as_tiles

2.2. Subsample dataframe picking top n records for each group    

3 Quick analysis

4 View some samples

4.1. Sample 10 unique individuals from each species

4.2. Sample different images of the same individual

# 1. Initializing and loading data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Single root for every input artefact, so image listings and CSV reads resolve
# the same way regardless of the current working directory.
INPUT_DIR = '/kaggle/input/happy-whale-and-dolphin'


def _list_files(root):
    """Return the full path of every file found under *root*, recursively."""
    return [
        os.path.join(dirname, filename)
        for dirname, _, filenames in os.walk(root)
        for filename in filenames
    ]


# Trailing '' keeps the trailing slash on the directory paths.
train_path = os.path.join(INPUT_DIR, 'train_images', '')
train_files = _list_files(train_path)
train = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))

test_path = os.path.join(INPUT_DIR, 'test_images', '')
test_files = _list_files(test_path)
test = pd.read_csv(os.path.join(INPUT_DIR, 'sample_submission.csv'))
    
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Sanity check: compare the number of image files on disk with the CSV contents.
print("Some quick check if the input is OK:")

train_summary = (
    f"Training set {len(train_files)} images",
    f"Training data: {len(train)} records",
    f"Unique species: {train['species'].nunique()}",
    f"Unique individuals: {train['individual_id'].nunique()}",
)
print(*train_summary, sep="\n")

test_summary = (
    f"Test set {len(test_files)} images",
    f"Test data: {len(test)} in submit sample",
)
print(*test_summary, sep="\n")

In [None]:
# Break the space-separated "predictions" string into one column per ranked guess.
# Split once on every space and index into the resulting lists; passing the split
# limit positionally (str.split(" ", 1)) is rejected by recent pandas versions.
prediction_tokens = test["predictions"].str.split(" ")
for rank in range(1, 6):
    # .str[k] yields NaN when a row has fewer than k + 1 tokens
    test[f"predict{rank}"] = prediction_tokens.str[rank - 1]
test.head(5)

# 2. Routines

## 2.1 Function show_imgs_as_tiles

In [None]:
import PIL
import matplotlib.pyplot as plt
import os

def show_imgs_as_tiles(w, h, imgs, labels=None, tile_width=200, tile_height=200,
                       path='/kaggle/input/happy-whale-and-dolphin/train_images/'):
    """Display a grid of ``w`` columns by ``h`` rows of image tiles.

    Tiles are filled row by row from ``imgs``; grid cells beyond the end of
    ``imgs`` are left blank.

    Args:
        w: number of tiles per row.
        h: number of rows.
        imgs: iterable of image file names, joined onto ``path``.
        labels: optional iterable of tile titles, matched to ``imgs`` by
            position. ``'.jpg'`` is stripped and only the last 18 characters
            are shown. When ``None`` the tiles are drawn without titles.
        tile_width: width in pixels each image is resized to.
        tile_height: height in pixels each image is resized to.
        path: directory containing the image files.
    """
    # Local import: a bare ``import PIL`` does not guarantee that the
    # ``PIL.Image`` submodule has been loaded.
    from PIL import Image

    # Materialise as lists so indexing is positional even when a pandas Series
    # with a non-default index is passed in.
    imgs = list(imgs)
    labels = None if labels is None else list(labels)

    # squeeze=False keeps the axes array 2-D even when w == 1 or h == 1.
    _, axes_grid = plt.subplots(h, w, figsize=(2 * w, 2 * h), squeeze=False)

    # .flat walks the grid row by row, i.e. in the order the tiles are filled.
    for i, ax in enumerate(axes_grid.flat):
        ax.axis('off')
        if i >= len(imgs):
            continue  # unused cell: leave it blank

        # The context manager closes the file handle once the pixels are copied.
        with Image.open(os.path.join(path, imgs[i])) as img:
            ax.imshow(np.array(img.resize((tile_width, tile_height))))

        if labels is not None and i < len(labels):
            title = str(labels[i]).replace('.jpg', '')
            ax.set_title(title[-18:])  # keep titles short enough for the tile

# quick test: show the first 8 training images, titled with their file names.
# (The labels come from the same list; `imgs` is not defined yet at this point.)
sample_imgs = list(train['image'].iloc[:8])
show_imgs_as_tiles(w=4, h=2, imgs=sample_imgs, labels=sample_imgs, tile_width=200, tile_height=200)

## 2.2 Subsample dataframe picking top n records for each group

In [None]:
def sub_sample_by(df=None, by='species', groups=None, n_samples=5):
    """Subsample a dataframe by picking the top n records for each group.

    Args:
        df: dataframe to subsample; ``None`` is treated as an empty dataframe.
        by: name of the column that defines the groups.
        groups: iterable of group values to keep, in output order. When
            ``None``, every distinct value of ``df[by]`` is used, in order of
            first appearance.
        n_samples: maximum number of leading rows to keep per group.

    Returns:
        A dataframe with the first ``n_samples`` rows of each requested group,
        concatenated in the order given by ``groups``. An empty dataframe with
        the columns of ``df`` is returned when there are no groups.
    """
    if df is None:
        df = pd.DataFrame()
    if groups is None:
        # Defaults are resolved at call time so the function does not depend
        # on any module-level variable existing when it is defined.
        groups = df[by].unique() if by in df.columns else []

    parts = [df[df[by] == group].iloc[:n_samples] for group in groups]
    if not parts:
        # pd.concat raises on an empty list; return an empty frame instead.
        return df.iloc[0:0]
    return pd.concat(parts)

# Demo: first 3 rows of every species present in the training table.
sub_sample_by(
    df=train,
    by='species',
    groups=train['species'].unique(),
    n_samples=3,
)

# 3. Quick analysis

In [None]:
# Peek at the first five rows of the training table.
train.head(5)

In [None]:
# Number of images per species, most frequent first.
species = (
    train.groupby('species')['image']
    .count()
    .reset_index(name='count_specie')
    .sort_values(['count_specie'], ascending=False)
)
# Attach the per-species count to every training row. Any existing copy of the
# column is dropped first, so re-running this cell does not leave
# count_specie_x / count_specie_y behind.
train = pd.merge(
    train.drop(columns='count_specie', errors='ignore'),
    species,
    on='species',
)
species.head()

In [None]:
# Number of images per individual, most frequent first.
individuals = (
    train.groupby('individual_id')['image']
    .count()
    .reset_index(name='count_individ')
    .sort_values(['count_individ'], ascending=False)
)
# Attach the per-individual count to every training row. Any existing copy of
# the column is dropped first, so re-running this cell does not leave
# count_individ_x / count_individ_y behind.
train = pd.merge(
    train.drop(columns='count_individ', errors='ignore'),
    individuals,
    on='individual_id',
)
individuals.head()

In [None]:
# Inspect the training table again, now carrying the two count columns.
train.head(n=10)

# 4. View some samples

## 4.1. Sample 10 unique individuals from each species

In [None]:
m = 10                           # individuals shown per species (tiles per row)
n = train['species'].nunique()   # one grid row per species

# One row per individual, then up to m randomly chosen individuals per species.
# Shuffling and taking head(m) per group tolerates species with fewer than m
# individuals, whereas groupby(...).sample(n=m) raises for such groups.
# NOTE(review): if some species has fewer than m individuals the tile grid is
# no longer exactly one species per row - confirm whether padding is wanted.
top_species = (
    train.drop_duplicates(['individual_id'], keep='first')
    .sample(frac=1)
    .groupby(by='species')
    .head(m)
    # 'species' as a tie-breaker keeps species with equal counts from interleaving
    .sort_values(by=['count_specie', 'species'], ascending=[False, True])
)
top_species.head(25)

In [None]:
# File names to draw and the species name used as each tile's title.
imgs = top_species['image'].tolist()
labels = top_species['species'].tolist()
show_imgs_as_tiles(
    w=m,
    h=n,
    imgs=imgs,
    labels=labels,
    tile_width=200,
    tile_height=200,
    path=train_path,
)

## 4.2. Sample different images of the same individual

In [None]:
n, m = 30, 9  # grid of n rows x m tiles; m images are kept per species
# Order rows so that, within each species, the individuals with the most images
# come first (ties broken by individual_id); the first m rows per species are kept.
same = train.sort_values(
    by=['count_specie', 'count_individ', 'individual_id'],
    ascending=[False, False, True],
)
same = sub_sample_by(
    df=same,
    by='species',
    groups=same['species'].unique(),
    n_samples=m,
)
same.head(25)

In [None]:
imgs = same['image'].tolist()
# Tile title: first 7 characters of the species + last 7 of the individual id.
labels = [
    species_name[:7] + individual[-7:]
    for species_name, individual in zip(same['species'], same['individual_id'])
]
show_imgs_as_tiles(
    w=m,
    h=n,
    imgs=imgs,
    labels=labels,
    tile_width=200,
    tile_height=200,
    path=train_path,
)