In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for path in (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(path)

# Any results you write to the current directory are saved as output.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
import json
from datetime import datetime

from PIL import Image

import matplotlib.pyplot as plt

**Load the Data**

In [None]:
# Locations of the competition data (kept as plain strings because later
# cells build image paths with `TRAIN_DIR + filename`).
DATA_DIR = '../input/iwildcam-2020-fgvc7/'
TRAIN_DIR = DATA_DIR + 'train/'
TEST_DIR = DATA_DIR + 'test/'


def _load_json(path):
    """Parse the JSON file at ``path`` and return the decoded object.

    The file is opened in a ``with`` block so the handle is closed as soon
    as parsing finishes instead of being left to the garbage collector.
    """
    with open(path) as json_file:
        return json.load(json_file)


# load the megadetector results
megadetector_results = _load_json(DATA_DIR + 'iwildcam2020_megadetector_results.json')

# load train images annotations
train_info = _load_json(DATA_DIR + 'iwildcam2020_train_annotations.json')
# split json into several pandas dataframes
train_annotations = pd.DataFrame(train_info['annotations'])
train_images = pd.DataFrame(train_info['images'])
train_categories = pd.DataFrame(train_info['categories'])

# load test images info
test_info = _load_json(DATA_DIR + 'iwildcam2020_test_information.json')
# split json into several pandas dataframes
test_images = pd.DataFrame(test_info['images'])
test_categories = pd.DataFrame(test_info['categories'])

In [None]:
# Preview the first rows of the train annotations table.
train_annotations.head()

In [None]:
# Preview the first rows of the train images table.
train_images.head()

In [None]:
# Preview the first rows of the train categories table.
train_categories.head()

Get the total number of images in the train and test sets:

In [None]:
# Distinct annotated image ids (train) and distinct file names (test).
n_train_images = train_annotations['image_id'].nunique()
n_test_images = test_images['file_name'].nunique()

print('Number of images in the train set is {}'.format(n_train_images))
print('Number of images in the test set is {}'.format(n_test_images))

Visualize the sizes of the train and test sets

In [None]:
# Share of images belonging to each split, drawn as a pie chart.
split_sizes = [
    train_annotations['image_id'].nunique(),
    test_images['file_name'].nunique(),
]
plt.pie(
    split_sizes,
    labels=['Train', 'Test'],
    colors=['#fa4252', '#91bd3a'],
    autopct='%1.1f%%',
    startangle=90,
)
# equal aspect ratio so the pie is drawn as a circle
plt.axis('equal')
plt.title('Number of images in train and test sets', fontsize=14)
plt.show()

In [None]:
# Let's check the number of animals found per image.
# `count` is selected *before* aggregating so that only this column is summed.
# A bare `.sum()` on the grouped frame would also aggregate every other column
# (e.g. adding up `category_id`, which is meaningless) and may warn or fail on
# non-numeric columns depending on the pandas version.
counts_per_image = (
    train_annotations
    .groupby(by=['image_id'])['count']
    .sum()
    .reset_index()
    .sort_values(by=['count'], ascending=False)
)
# output top-5
counts_per_image.head(5)

In [None]:
# Show the last five rows, i.e. the images with the smallest summed counts.
counts_per_image.tail(5)

In [None]:
# Histogram of the summed animal count of every train image.
animals_per_image = counts_per_image['count'].values
plt.hist(animals_per_image, color='#91bd3a', bins=14)
plt.title('The distribution of the count of animals per image', fontsize=14)
plt.show()

* Most of the images are empty;
* Having more than 20 animals in one image is very rare.

Let's look at the number of different categories of animals per image:

In [None]:
# Sum the animal count for every (image, category) pair. Only `count` is
# aggregated; summing the remaining annotation columns is unnecessary.
category_counts_per_image = (
    train_annotations
    .groupby(by=['image_id', 'category_id'])['count']
    .sum()
    .reset_index()
    # merge with category names
    .merge(
        train_categories[['id', 'name']].rename(columns={'id': 'category_id'}),
        on=['category_id'],
    )
    # sort *after* the merge so that the ordering does not rely on the merge
    # preserving the row order of its left operand
    .sort_values(by=['count'], ascending=False)
    .reset_index(drop=True)
)
# output top-5
category_counts_per_image.head(5)

In [None]:
# Histogram of the per-image, per-category animal counts.
per_category_counts = category_counts_per_image['count'].values
plt.hist(per_category_counts, color='#91bd3a', bins=14)
plt.title('The distribution of the animal category counts per image', fontsize=14)
plt.show()

In [None]:
# Now let's look at the number of images for each category:
# order the categories table from the largest `count` to the smallest.
num_categories = train_categories.sort_values('count', ascending=False)
# list the top-5
num_categories.head(5)

In [None]:
n_categories = 20
width = 0.8
fig, axs = plt.subplots(1, 2, figsize=(20, 7))

# categories with at least one occurrence; the tail panel is taken from these
seen_categories = num_categories[num_categories['count'] > 0]

# one entry per panel: (axis, bar heights, tick labels, bar colour, title template)
panels = (
    (
        axs[0],
        num_categories['count'].values[:n_categories],
        num_categories['name'].values[:n_categories],
        '#91bd3a',
        'TOP-{}',
    ),
    (
        axs[1],
        seen_categories['count'].values[-n_categories:],
        seen_categories['name'].values[-n_categories:],
        '#fa4252',
        'LAST-{}',
    ),
)

for ax, heights, names, colour, title in panels:
    ax.bar(x=range(n_categories), height=heights, width=width, color=colour)
    ax.set_xticks(np.array(range(n_categories)))
    ax.set_xticklabels(names, rotation=90)
    ax.set_title(title.format(n_categories))

plt.suptitle('The most and the least frequent categories', fontsize=16)
plt.show()

* Most of the images are empty;
* A lot of images captured humans;
* Some of the categories were not captured at all;
* The least frequent categories were captured by cameras just once!

Let's explore the locations related to the train set images:

In [None]:
# Number of train images recorded at each location (grouped once, reused below).
images_per_location = train_images.groupby(by=['location'])['id'].count()

print('The number of unique locations is {}'.format(train_images['location'].nunique()))
print('The average number of images per location is {}'.format(images_per_location.mean()))
print('The minimum number of images per location is {}'.format(images_per_location.min()))
print('The maximum number of images per location is {}'.format(images_per_location.max()))

In [None]:
# Histogram of how many train images each location contributed.
location_sizes = train_images.groupby(by=['location'])['id'].count()

plt.figure(figsize=(20, 5))
plt.hist(location_sizes, color='#91bd3a', bins=40)
plt.title('The distribution of the number of the images per location', fontsize=14)
plt.show()

* Most of the locations have fewer than 500 images, but for some locations the total number of images exceeds 8000.
* Some locations have only 1 captured image! This is probably why we want models that generalize to different locations.

Analyze the number of frames in sequences:

In [None]:
# Largest `frame_num` recorded within each sequence.
# NOTE(review): this equals the number of frames only if frame_num is 1-based — confirm.
frames_per_sequence = train_images.groupby(by=['seq_id'])['frame_num'].max()

print('The average number of frames is {}'.format(frames_per_sequence.mean()))
print('The minimum number of frames is {}'.format(frames_per_sequence.min()))
print('The maximum number of frames is {}'.format(frames_per_sequence.max()))

In [None]:
# Histogram of the per-sequence maximum frame number.
max_frames = frames_per_sequence.values
plt.hist(max_frames, color='#91bd3a', bins=40)
plt.title('The distribution of the number of frames')
plt.show()

Train Set Images Exploration

In [None]:
def get_first_category(img_id):
    """Return ``(category_id, category_name)`` of the first annotation of an image.

    Looks the image up in the module-level ``train_annotations`` table and
    resolves the category name through ``train_categories``. An image with
    several annotations yields only the first one; an unknown ``img_id``
    raises ``IndexError``.
    """
    matches_image = train_annotations.image_id == img_id
    category_id = train_annotations.loc[matches_image, 'category_id'].values[0]

    matches_category = train_categories.id == category_id
    category_name = train_categories.loc[matches_category, 'name'].values[0]

    return category_id, category_name

def visualize_image_grid(rows, cols):
    """Show a ``rows`` x ``cols`` grid of randomly chosen train images.

    Every panel is titled ``<category_id>:<category_name>`` using the first
    annotation found for the image (via ``get_first_category``). The global
    NumPy seed is set to 42, so the same images are drawn on every call.

    Args:
        rows: number of grid rows (>= 1).
        cols: number of grid columns (>= 1).
    """
    filenames = train_images.file_name.unique()

    np.random.seed(42)
    img_idx = np.random.randint(len(filenames), size=rows * cols)

    # squeeze=False keeps `axs` two-dimensional even for a single row or
    # column, so the grid can always be walked the same way
    fig, axs = plt.subplots(rows, cols, figsize=(15, 7), squeeze=False)

    # Pair every axis (row-major order) with its own sampled index. This is
    # correct for any rows/cols combination; computing the position as
    # `rows*r + c` repeats, skips or overruns samples whenever rows != cols.
    for ax, idx in zip(axs.ravel(), img_idx):
        filename = filenames[idx]
        # the image id is taken to be the part of the file name before the first dot
        img_id = filename.split('.')[0]
        category_id, category = get_first_category(img_id)

        # close the image file as soon as matplotlib has copied the pixels
        with Image.open(TRAIN_DIR + filename) as img:
            ax.imshow(img)
        ax.axis('off')
        ax.set_title('{}:{}'.format(category_id, category))

    plt.suptitle('Train images', fontsize=16)
    plt.show()

In [None]:
# Draw a 3 x 3 grid of random train images.
visualize_image_grid(3, 3)

Visualize images for a specific category

In [None]:
def visualize_cetagory(category_id, rows=3, cols=3, seed=42):
    """Show a ``rows`` x ``cols`` grid of random train images of one category.

    Images are sampled with replacement from the annotations whose
    ``category_id`` matches, so the same image may appear more than once.

    Args:
        category_id: id of the category to display (looked up in ``train_categories``).
        rows: number of grid rows (>= 1).
        cols: number of grid columns (>= 1).
        seed: value used to seed NumPy's global random generator.

    Raises:
        IndexError: if ``category_id`` is not present in ``train_categories``.
        ValueError: if the category has no annotated train images.
    """
    # filter by the category_id
    category_annotations = train_annotations[train_annotations.category_id == category_id]
    # get the category name
    category_name = train_categories[train_categories.id == category_id].name.values[0]

    if len(category_annotations) == 0:
        # fail with a readable message instead of numpy's "high <= 0" error
        raise ValueError(
            'No train images found for category {}:{}'.format(category_id, category_name)
        )

    # get random indices
    np.random.seed(seed)
    img_idx = np.random.randint(len(category_annotations), size=rows * cols)
    image_ids = category_annotations.image_id.values

    # squeeze=False keeps `axs` two-dimensional even for a single row or column
    fig, axs = plt.subplots(rows, cols, figsize=(15, 7), squeeze=False)

    # Pair every axis (row-major order) with its own sampled index. This is
    # correct for any rows/cols combination; computing the position as
    # `rows*r + c` repeats, skips or overruns samples whenever rows != cols.
    for ax, idx in zip(axs.ravel(), img_idx):
        # close the image file as soon as matplotlib has copied the pixels
        with Image.open(TRAIN_DIR + image_ids[idx] + '.jpg') as img:
            ax.imshow(img)
        ax.axis('off')
        ax.set_title('{}:{}'.format(category_id, category_name))

    plt.suptitle('Train images for {}:{}'.format(category_id, category_name), fontsize=16)
    plt.show()


# Correctly spelled alias; the original (misspelled) name is kept for existing callers.
visualize_category = visualize_cetagory

In [None]:
# Sample grid for category id 0.
visualize_cetagory(0)

In [None]:
# Sample grid for category id 3.
visualize_cetagory(3) 

In [None]:
# Sample grid for category id 112.
visualize_cetagory(112) 

In [None]:
# Sample grid for category id 372.
visualize_cetagory(372) 

In [None]:
def visualize_top_categories(n_categories=5, n_cols=5, seed=42, n_skip=2):
    """Plot one row of ``n_cols`` random train images per top category.

    Categories are taken from the module-level ``num_categories`` table in
    its current row order (built elsewhere by sorting on ``count``,
    descending). The first ``n_skip`` rows are skipped.

    Args:
        n_categories: number of categories (grid rows) to show (>= 1).
        n_cols: number of images (grid columns) per category (>= 1).
        seed: value used to seed NumPy's global random generator.
        n_skip: number of leading rows of ``num_categories`` to ignore. The
            default of 2 is meant to drop the "empty" and "human" rows —
            this assumes they are the two most frequent; verify on the data.
    """
    np.random.seed(seed)

    # get ids for the top n_categories, excluding the first n_skip rows
    top_categories = num_categories['id'].values[n_skip:n_skip + n_categories]

    # setup the image grid; squeeze=False keeps `axs` two-dimensional even
    # when a single row or a single column is requested
    fig, axs = plt.subplots(n_categories, n_cols, figsize=(18, 10), squeeze=False)

    for row_axes, category_id in zip(axs, top_categories):
        # get the category name
        category_name = train_categories[train_categories.id == category_id].name.values[0]

        # filter the images by category
        category_annotations = train_annotations[train_annotations.category_id == category_id]
        image_ids = category_annotations.image_id.values

        # get random indices (sampling with replacement)
        img_idx = np.random.randint(len(category_annotations), size=n_cols)

        for ax, idx in zip(row_axes, img_idx):
            # close the image file as soon as matplotlib has copied the pixels
            with Image.open(TRAIN_DIR + image_ids[idx] + '.jpg') as img:
                ax.imshow(img)
            ax.axis('off')
            ax.set_title('{}:{}'.format(category_id, category_name))

    # if fewer categories were available than requested, blank the unused rows
    for row_axes in axs[len(top_categories):]:
        for ax in row_axes:
            ax.axis('off')

    plt.suptitle('Train images for top-{} categories'.format(n_categories), fontsize=16)
    plt.show()

In [None]:
# Show sample images for five categories, one row per category.
visualize_top_categories(n_categories=5)