Original Notebook: [https://www.kaggle.com/aleksandradeis/iwildcam-eda](https://www.kaggle.com/aleksandradeis/iwildcam-eda)

In [None]:
import collections
import datetime as datetime
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from PIL import Image

In [None]:
# setup the directories
DATA_DIR = '../input/iwildcam2021-fgvc8/'
TRAIN_DIR = DATA_DIR + 'train/'
TEST_DIR = DATA_DIR + 'test/'
METADATA_DIR = DATA_DIR + 'metadata/'

# Every JSON file is read inside a `with` block so its handle is closed as
# soon as parsing finishes instead of lingering until garbage collection.

# load the megadetector results
with open(METADATA_DIR + 'iwildcam2021_megadetector_results.json') as fh:
    megadetector_results = json.load(fh)
#megadetector_results['images'][:2]

# load train images annotations
with open(METADATA_DIR + 'iwildcam2021_train_annotations.json') as fh:
    train_info = json.load(fh)
# split json into several pandas dataframes
train_annotations = pd.DataFrame(train_info['annotations'])
train_images = pd.DataFrame(train_info['images'])
train_categories = pd.DataFrame(train_info['categories'])

# load test images info
with open(METADATA_DIR + 'iwildcam2021_test_information.json') as fh:
    test_info = json.load(fh)
# split json into several pandas dataframes
test_images = pd.DataFrame(test_info['images'])
#test_categories = pd.DataFrame(test_info['categories'])

In [None]:
# top-level sections of the parsed train annotations JSON
train_info.keys()

In [None]:
# column names of the train image table
train_images.columns

In [None]:
# column names of the train annotation table
train_annotations.columns

In [None]:
# Only the last expression of a cell is rendered, so the column names are
# printed explicitly; otherwise the `.keys()` result would be discarded.
print(train_categories.keys())
train_categories

In [None]:
# top-level sections of the parsed test information JSON
test_info.keys()

In [None]:
# first five rows of the test image table
test_images.iloc[:5]

In [None]:
# test_categories.head()  # disabled: the test JSON has no category data to show; kept as a placeholder for future use

In [None]:
# distinct images on each side of the train/test split
n_train_images = train_annotations['image_id'].nunique()
n_test_images = test_images['file_name'].nunique()

print('Number of images in the train set is {}'.format(n_train_images))
print('Number of images in the test set is {}'.format(n_test_images))

In [None]:
# relative size of the train and test sets as a pie chart
split_sizes = [
    train_annotations['image_id'].nunique(),
    test_images['file_name'].nunique(),
]
plt.pie(
    split_sizes,
    labels=['Train', 'Test'],
    autopct='%1.1f%%',
    startangle=90,
    colors=['#fa4252', '#91bd3a'],
)
plt.axis('equal')  # keep the pie circular
plt.title('Number of images in train and test sets', fontsize=14, color='violet')
plt.show()

# Location Data Exploration

In [None]:
# image count for every camera location, computed once and reused below
images_per_location = train_images.groupby('location')['id'].count()

print('The number of unique locations is {}'.format(train_images['location'].nunique()))
print('The average number of images per location is {}'.format(images_per_location.mean()))
print('The minimum number of images per location is {}'.format(images_per_location.min()))
print('The maximum number of images per location is {}'.format(images_per_location.max()))

In [None]:
# histogram of how many images each location contributed
location_counts = train_images.groupby('location')['id'].count()

plt.figure(figsize=(20, 5))
plt.hist(location_counts, bins=40, color='#91bd3a')
plt.title('The distribution of the number of the images per location', fontsize=14)
plt.show()

# Timeline for Captured Images

In [None]:
# reduce datetime strings to their date component
def to_date(datetime_str):
    """Return the part of `datetime_str` that precedes its first space.

    For a value such as '2013-08-08 11:45:00.000' this is '2013-08-08'.
    A string without any space is returned unchanged.
    """
    date_part, _, _ = datetime_str.partition(' ')
    return date_part
    
# Only the `datetime` column is needed, so map over that single Series
# instead of building a row object for every image with apply(axis=1).
train_images['date'] = train_images['datetime'].map(to_date)
# group by date
img_per_date = train_images.groupby(by=['date']).id.count()

In [None]:
# summary statistics of the daily image counts
daily_mean = img_per_date.mean()
daily_max = img_per_date.max()
daily_min = img_per_date.min()

print('The average number of images per day is {}'.format(daily_mean))
print('The maximum number of images per day is {}'.format(daily_max))
print('The minimum number of images per day is {}'.format(daily_min))

# Analyze the number of sequences

In [None]:
# columns available on the train image table (sequence fields are used next)
train_images.columns

In [None]:
# group by sequence id
frames_per_sequence = train_images.groupby(by=['seq_id']).seq_frame_num.max()

print('The average number of frames is {}'.format(frames_per_sequence.mean()))
print('The minimum number of frames is {}'.format(frames_per_sequence.min()))
print('The maximum number of frames is {}'.format(frames_per_sequence.max()))

In [None]:
# histogram of the per-sequence frame values computed above
frame_values = frames_per_sequence.values
plt.hist(frame_values, bins=40, color='#91bd3a')
plt.title('The distribution of the number of frames')
plt.show()

# Image Dimensional Exploration

In [None]:
# extremes of the image dimensions listed in the train metadata
image_widths = train_images['width']
image_heights = train_images['height']

print('The minimum width of the images is {}'.format(image_widths.min()))
print('The maximum width of the images is {}'.format(image_widths.max()))
print('The minimum height of the images is {}'.format(image_heights.min()))
print('The maximum height of the images is {}'.format(image_heights.max()))

In [None]:
# plot histograms to show the distribution of width and height values
fig, axs = plt.subplots(1, 2, figsize=(15,7))
axs[0].hist(train_images.width.values, bins=20, color = '#91bd3a')
axs[0].set_title('Width distribution')
axs[0].set_xlim(1000, 3000)

axs[1].hist(train_images.width.values, bins=20, color = '#91bd3a')
axs[1].set_title('Height distribution')
axs[1].set_xlim(1000, 3000)

plt.suptitle('Image Dimensions')
plt.show()

# Train Set Images Exploration

In [None]:
def get_first_category(img_id):
    """Look up the category of the first annotation recorded for an image.

    Reads the module-level `train_annotations` and `train_categories` frames.

    Returns:
        (category_id, category_name) for the first annotation row whose
        `image_id` equals `img_id`.

    Raises:
        IndexError: if no annotation row or no category row matches.
    """
    matches_image = train_annotations['image_id'] == img_id
    category_id = train_annotations.loc[matches_image, 'category_id'].iloc[0]

    matches_category = train_categories['id'] == category_id
    category_name = train_categories.loc[matches_category, 'name'].iloc[0]

    return category_id, category_name

def visualize_image_grid(rows, cols):
    """Show a rows x cols grid of randomly chosen train images.

    Each panel is titled '<category_id>:<category_name>' using the first
    annotation found for the image. The global NumPy seed is fixed to 42, so
    the same images are drawn on every call.

    Args:
        rows: number of grid rows.
        cols: number of grid columns.
    """
    filenames = train_images.file_name.unique()

    np.random.seed(42)
    img_idx = np.random.randint(len(filenames), size=rows * cols)

    # squeeze=False keeps `axs` two-dimensional even when rows or cols is 1
    fig, axs = plt.subplots(rows, cols, figsize=(15,7), squeeze=False)

    # axs.flat walks the grid row by row, so panel (r, c) gets img_idx[cols*r + c].
    # The previous `rows*r + c` only matched this for square grids.
    for ax, idx in zip(axs.flat, img_idx):
        # get the image and image id
        filename = filenames[idx]
        img_id = filename.split('.')[0]
        # get the category
        category_id, category = get_first_category(img_id)

        # imshow converts the image to an array right away, so the file can be
        # closed as soon as the call returns
        with Image.open(TRAIN_DIR + filename) as img:
            ax.imshow(img)

        ax.axis('off')
        ax.set_title('{}:{}'.format(category_id, category))

    plt.suptitle('Train images', fontsize=16)
    plt.show()

In [None]:
# render a 3 x 3 grid of random train images
visualize_image_grid(3, 3)

# Specific Image Category Visualization

In [None]:
def visualize_cetagory(category_id, rows=3, cols=3, seed=42):
    """Show a grid of randomly chosen train images annotated with one category.

    Args:
        category_id: id of the category to display.
        rows: number of grid rows.
        cols: number of grid columns.
        seed: value passed to np.random.seed before sampling (sampling is with
            replacement, so an image may appear more than once).

    Raises:
        IndexError: if `category_id` is not present in `train_categories`.
        ValueError: if no annotation carries `category_id`.
    """
    # filter by the category_id
    category_rows = train_annotations[train_annotations.category_id == category_id]
    # get the category name
    category_name = train_categories[train_categories.id == category_id].name.values[0]

    if category_rows.empty:
        # np.random.randint(0, ...) would raise a far less helpful ValueError
        raise ValueError('no annotations found for category {}:{}'.format(category_id, category_name))

    # get random indices
    np.random.seed(seed)
    img_idx = np.random.randint(len(category_rows), size=rows * cols)

    # plot images; squeeze=False keeps `axs` two-dimensional for 1-row/1-col grids
    fig, axs = plt.subplots(rows, cols, figsize=(15,7), squeeze=False)

    # axs.flat walks the grid row by row, so panel (r, c) gets img_idx[cols*r + c].
    # The previous `rows*r + c` only matched this for square grids.
    for ax, idx in zip(axs.flat, img_idx):
        filename = category_rows.image_id.iloc[idx] + '.jpg'

        # imshow converts the image to an array right away, so the file can be
        # closed as soon as the call returns
        with Image.open(TRAIN_DIR + filename) as img:
            ax.imshow(img)

        ax.axis('off')
        ax.set_title('{}:{}'.format(category_id, category_name))

    plt.suptitle('Train images for {}:{}'.format(category_id, category_name), fontsize=16)
    plt.show()

In [None]:
# category id attached to every annotation row
train_annotations.category_id

In [None]:
visualize_cetagory(112) # any category id can be passed here

# Visualize images for top categories

In [None]:
# load train images annotations
train_info = json.load(open(METADATA_DIR + 'iwildcam2021_train_annotations.json'))
# split json into several pandas dataframes
train_annotations = pd.DataFrame(train_info['annotations'])
train_images = pd.DataFrame(train_info['images'])
train_categories = pd.DataFrame(train_info['categories'])

In [None]:
# Preparation for visualization
# (`collections` and `seaborn` are imported in the notebook's import cell)
df_categories = pd.DataFrame(train_info["categories"])
# category ids in the order they appear in the JSON; used to order the bar plot
labels_id = [item["id"] for item in train_info["categories"]]
# number of annotations per category id
cnt = collections.Counter(item["category_id"] for item in train_info["annotations"])
# build the (id, count) table directly instead of from_dict + reset_index + rename
df_categories_count = pd.DataFrame(list(cnt.items()), columns=['id', 'count'])

# attach the category metadata and put the most frequent categories first
df_categories_count = df_categories_count.merge(df_categories, on='id').sort_values(by=['count'], ascending=False)

In [None]:
# static bar chart of annotation counts, one bar per category id
fig = plt.figure(figsize=(30, 4))
ax = sns.barplot(
    data=df_categories_count,
    x="id",
    y="count",
    order=labels_id,
)
ax.set(ylabel='count', ylim=(0, 80000))
plt.title('distribution of count per id in train')

In [None]:
# interactive version of the per-category count chart
# (`plotly.express` is imported as `px` in the notebook's import cell)
fig = px.bar(
    df_categories_count,
    x="id",
    y="count",
    title='distribution of count per id in train',
    width=1400,
    height=400,
    color='id',
)
fig.show()

The annotation data appears to be imbalanced. To see the breakdown, let's look at the top 10 categories. `empty` is the most frequent label, and the counts for the animal categories also vary considerably within the top 10.

In [None]:
# the ten most frequent categories (the frame is sorted by count, descending)
df_categories_count.head(10)

On the other hand, the rarest categories have only about one sample each. We need to be careful when splitting the dataset into training and validation sets when training the model.

In [None]:
# the ten least frequent categories (the frame is sorted by count, descending)
df_categories_count.tail(10)
