In [None]:
import json
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from PIL import Image, UnidentifiedImageError
import seaborn as sns

In [None]:
# Locations of the training annotations file and the training image folder,
# both derived from one dataset root so they cannot drift apart.
dataset_root = '../input/iwildcam-2020-fgvc7'
train_annot_path = dataset_root + '/iwildcam2020_train_annotations.json'
train_img_path = dataset_root + '/train'

In [None]:
# Parse the annotation JSON once; later cells read the 'categories', 'images'
# and 'annotations' entries from `train_annot`.
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# locale default.
with open(train_annot_path, encoding='utf-8') as annot_file:
    train_annot = json.load(annot_file)

In [None]:
# Headline sizes: number of category entries and number of image entries.
n_categories = len(train_annot["categories"])
n_train_images = len(train_annot["images"])
print(f'Total categories in set: {n_categories}')
print(f'Total training images: {n_train_images}')

In [None]:
# Tabulate the annotation records (one row per record) and preview the top rows.
annotation_records = train_annot['annotations']
train_labels = pd.DataFrame(data=annotation_records)
train_labels.head(n=5)

In [None]:
# Number of distinct categories attached to each annotated image.
labels_per_image = train_labels.groupby('image_id')['category_id'].nunique()
# Images whose annotations all agree on exactly one category.
images_with_1_label = int((labels_per_image.values == 1).sum())
# True only when that count equals the number of image entries in the file.
total_images = len(train_annot['images'])
print('1 label for each image: {}'.format(images_with_1_label == total_images))

A model trained on single-label images like these may struggle when multiple animals are in the frame.

In [None]:
# Compare the categories listed in the annotation file with the categories
# that actually occur in the training labels.
n_listed_categories = len(train_annot['categories'])
train_classes = train_labels['category_id'].nunique()
print('All classes have training examples: {}'.format(n_listed_categories == train_classes))
print('Total classes: {}\nTrain classes: {}'.format(n_listed_categories, train_classes))

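Interesting: compare the two counts above. If they differ, some categories listed in the annotation file have no training examples.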

In [None]:
# Let's look at some images: a 3x4 grid of 12 randomly chosen training images.
# random_state pins the draw so a fresh Restart & Run All shows the same images.
sample_ids = train_labels.image_id.sample(12, random_state=0).values
fig = plt.figure(figsize=(24, 12))
for i, img_id in enumerate(sample_ids):
    img_path = os.path.join(train_img_path, img_id + '.jpg')
    # The context manager closes the file handle once the image has been
    # handed to matplotlib, instead of leaving 12 files open.
    with Image.open(img_path) as img:
        ax = fig.add_subplot(3, 4, i + 1)
        ax.imshow(img)
    # Hide ticks and frame; no grid call is needed since the axis is off.
    ax.axis('off')

In [None]:
# Tabulate the per-image metadata records and preview the top rows.
image_records = train_annot['images']
train_images_meta = pd.DataFrame(data=image_records)
train_images_meta.head(n=5)

In [None]:
# Summary statistics of the image metadata table.
train_images_meta.describe()

There can be multiple images per sequence.

Some images have a height/width of -1, which is not a valid dimension (investigated further below).

The standard deviation of image height and width is significantly high.

In [None]:
# Rows whose sequence reports exactly 10 frames.
ten_frame_mask = train_images_meta['seq_num_frames'] == 10
train_images_meta.loc[ten_frame_mask]

Image size can vary within a sequence. Let's look at one sequence.

In [None]:
# Pick one sequence that reports 10 frames (seeded, so re-runs show the same
# sequence) and collect the file names of all images that belong to it.
ten_frame_seq_ids = train_images_meta[train_images_meta.seq_num_frames == 10].seq_id
sample_sequence = ten_frame_seq_ids.sample(1, random_state=0).values[0]
seq_img = train_images_meta[train_images_meta.seq_id == sample_sequence].file_name.values

# Plot every frame of the chosen sequence on a 3-column grid.
# At least 4 rows are used; more are added if the sequence has more than 12 rows,
# so add_subplot never receives an out-of-range index.
n_cols = 3
n_rows = max(4, -(-len(seq_img) // n_cols))  # ceil division without importing math
fig = plt.figure(figsize=(30, 30))
for i, file_name in enumerate(seq_img):
    img_path = os.path.join(train_img_path, file_name)
    # Close each file handle as soon as the frame has been drawn.
    with Image.open(img_path) as img:
        ax = fig.add_subplot(n_rows, n_cols, i + 1)
        ax.imshow(img)
    ax.axis('off')


In [None]:
# Distinct image ids per (height, width) pair, most common dimensions first.
img_dims = (
    train_images_meta
    .groupby(['height', 'width'])['id']
    .nunique()
    .reset_index()
    .sort_values(by='id', ascending=False)
)
# Share of all images for each dimension pair, plus the running share.
n_images_total = img_dims['id'].sum()
img_dims = img_dims.assign(
    frac=img_dims['id'] / n_images_total,
    cum_frac=img_dims['id'].cumsum() / n_images_total,
)
img_dims.head(10)

~98% of images are one of (1024p/HD, 1080p/FHD, 1536p). That is decent for downscaling.

In [None]:
# Heatmap of log10(number of distinct images) for every (height, width) pair.
plt.figure(figsize=(16, 8))
dims_counts = train_images_meta.groupby(['height', 'width'])['id'].nunique()
img_dims = np.log10(dims_counts).reset_index()
# Height on rows, width on columns; pairs that never occur are filled with -1.
img_dims = img_dims.pivot_table(index='height', columns='width', values='id').fillna(-1)
heatmap_ax = sns.heatmap(img_dims, square=True, linecolor='#09000f', linewidths=.1)
_ = heatmap_ax.set_title('Image dimensions (counts in log scale)')

In [None]:
# Images with -1 dims are intriguing: try to open each one and show it.
bad_dim_ids = train_images_meta[train_images_meta.width == -1].id.values
n_bad = len(bad_dim_ids)

# The grid is sized from the actual number of such images; a fixed 3-column grid
# would raise ValueError as soon as a fourth image is encountered.
fig = plt.figure(figsize=(8 * max(n_bad, 1), 6))
for i, img_id in enumerate(bad_dim_ids):
    img_path = os.path.join(train_img_path, img_id + '.jpg')
    try:
        # The context manager closes the file handle once the image is drawn.
        with Image.open(img_path) as img:
            ax = fig.add_subplot(1, n_bad, i + 1)
            ax.imshow(img)
            ax.axis('off')
    except FileNotFoundError:
        print("Image {} doesn't exist".format(img_id))
    except UnidentifiedImageError:
        print("Image {} unidentified".format(img_id))

# If no image could be drawn, close the figure so the cell does not render
# an empty canvas.
if not fig.axes:
    plt.close(fig)

These images don't exist in the training dataset. They should be removed from the annotations before training.

In [None]:
# Category table built from the 'categories' entry of the annotation file;
# merged on its 'id' column (and read for 'name') in the next cell.
categories = pd.DataFrame(train_annot['categories'])

In [None]:
# Annotation count per category (most frequent first), joined to the category
# table so each row carries the category id and name.
examples_by_cat = (
    train_labels['category_id']
    .value_counts()
    .rename_axis('category')
    .reset_index(name='examples')
    .merge(categories, how='inner', left_on='category', right_on='id')
    .loc[:, ['id', 'name', 'examples']]
)
# Running share and individual share of all examples.
total_examples = examples_by_cat['examples'].sum()
examples_by_cat = examples_by_cat.assign(
    cumulative=examples_by_cat['examples'].cumsum() / total_examples,
    frac=examples_by_cat['examples'] / total_examples,
)
examples_by_cat.head(20)

~35% of examples belong to the empty class.

In [None]:
# Summary statistics over categories with a positive id
# (presumably id 0 is the empty class; confirm against the category table).
positive_id_mask = examples_by_cat['id'] > 0
examples_by_cat.loc[positive_id_mask, ['examples', 'frac']].describe()

In [None]:
# The 20 categories with the smallest share of examples.
rarest_first = examples_by_cat.sort_values(by='frac', ascending=True)
rarest_first.loc[:, ['name', 'examples']].head(20)

Some classes have only a single example

In [None]:
# Number of classes at each distinct example count.
l10_examples = examples_by_cat.groupby('examples').id.count().reset_index()
l10_examples.columns = ['examples', 'n_classes']
# Strict '<' so the reported number matches the "less than 10" wording;
# '<=' would also count classes with exactly 10 examples.
n_rare_classes = l10_examples[l10_examples.examples < 10].n_classes.sum()
print('{} classes have less than 10 examples'.format(n_rare_classes))