# EDA on iWildCam 2019 dataset

## Imports

In [None]:
# Imports
import glob
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import cv2
from PIL import Image
from io import BytesIO

from zipfile import ZipFile


## Exploration - Training Data

In [None]:
def sample_train_images(n=10, grid_size=(5, 2), figsize=(24, 16)):
    """Plot a random selection of training images on a subplot grid.

    Images are read in memory from the archive named by the notebook-level
    ``train_zip`` variable; nothing is extracted to disk.

    Parameters
    ----------
    n : int
        Number of images to show. Capped at the number of files in the archive.
    grid_size : tuple of int
        Subplot grid dimensions, unpacked into ``fig.add_subplot``; the grid
        must provide at least ``n`` cells.
    figsize : tuple of float
        Figure size forwarded to ``plt.figure``.

    Returns
    -------
    The matplotlib figure holding the image grid.

    Raises
    ------
    ValueError
        If ``grid_size`` has fewer cells than the number of images to show.
    """
    with ZipFile(train_zip) as tz:
        # Directory entries are not images; skip them so Image.open never sees one.
        members = [name for name in tz.namelist() if not name.endswith('/')]
        n = min(n, len(members))
        if n > grid_size[0] * grid_size[1]:
            # Fail before drawing anything rather than midway through the grid.
            raise ValueError(
                'grid_size {} cannot hold {} images'.format(grid_size, n))
        fig = plt.figure(figsize=figsize)
        # replace=False: never show the same image twice in one grid.
        for i, sample in enumerate(np.random.choice(members, n, replace=False)):
            ax = fig.add_subplot(*grid_size, 1+i)
            ax.axis('off')
            img_file = tz.read(sample)
            img = Image.open(BytesIO(img_file))
            ax.imshow(img)
    return fig

In [None]:
# Archive that holds every training image
train_zip = '../input/iwildcam-2019-fgvc6/train_images.zip'
with ZipFile(train_zip) as train_archive:
    n_train_images = len(train_archive.namelist())
print('Total training images:', n_train_images)

In [None]:
# Ground-truth annotations for the training images
train_gt = pd.read_csv(
    '../input/iwildcam-2019-fgvc6/train.csv'
)
train_gt.head()

In [None]:
# Number of distinct category ids present in the training annotations
n_categories = train_gt['category_id'].nunique()
print('Total categories: ', n_categories)

In [None]:
# Class labels: the position in this list is the numeric category id
classes_wild = dict(enumerate([
    'empty',
    'deer',
    'moose',
    'squirrel',
    'rodent',
    'small_mammal',
    'elk',
    'pronghorn_antelope',
    'rabbit',
    'bighorn_sheep',
    'fox',
    'coyote',
    'black_bear',
    'raccoon',
    'skunk',
    'wolf',
    'bobcat',
    'cat',
    'dog',
    'opossum',
    'bison',
    'mountain_goat',
    'mountain_lion',
]))

In [None]:
# Attach the readable class name that corresponds to each row's category id
category_names = train_gt['category_id'].apply(classes_wild.get)
train_gt = train_gt.assign(category_label=category_names.values)

In [None]:
# Pick one random example file per category and show them side by side.
category_samples = list(
    train_gt.groupby('category_label')['file_name']
    .apply(pd.Series.sample, n=1)
    .reset_index(inplace=False)[['category_label', 'file_name']]
    .to_records(index=False)
)
# Derive the number of rows from the number of categories so the grid never
# overflows (a fixed 3x5 grid raises as soon as there are more than 15 classes).
n_cols = 5
n_rows = max(1, -(-len(category_samples) // n_cols))  # ceiling division
fig = plt.figure(figsize=(24, 16))
with ZipFile(train_zip) as tz:
    for i, sample in enumerate(category_samples):
        ax = fig.add_subplot(n_rows, n_cols, 1+i)
        ax.axis('off')
        ax.set_title(sample[0])  # category label
        img_file = tz.read(sample[1])  # file name inside the archive
        img = Image.open(BytesIO(img_file))
        ax.imshow(img)
        

In [None]:
# Number of distinct files per category label, smallest class first
category_counts = (
    train_gt
    .groupby('category_label')['file_name']
    .apply(pd.Series.nunique)
    .reset_index()
    .sort_values('file_name')
)

In [None]:
# Share of files per category, drawn as an exploded pie chart
pie, ax = plt.subplots(figsize=[10,10])
n_slices = category_counts.shape[0]
ax.pie(
    x=category_counts.file_name,
    labels=category_counts.category_label,
    autopct="%.1f%%",
    pctdistance=0.75,
    explode=[0.1]*n_slices,
)
ax.set_title("Labels distribution", fontsize=14);

In [None]:
# Ratio of distinct file names to annotation rows: below 1 means some files have several rows
train_gt['file_name'].nunique() / train_gt.shape[0]

In [None]:
# Per file: number of annotation rows and the set of category ids seen,
# restricted to files that appear on more than one row
(
    train_gt
    .groupby(by='file_name')['category_id']
    .agg(['count', lambda x: set(x)])
    .reset_index(inplace=False)
    .query('count>1')
)
# Hmm there are files with multiple labels. Let's look at some

In [None]:
# Files that occur on more than one annotation row. Only the row count is
# needed to select them, so the per-file set of labels is not computed here.
rows_per_file = train_gt.groupby(by='file_name')['category_id'].count().reset_index(name='count')
repeated_files = rows_per_file.query('count>1').file_name
# Series.sample raises when asked for more items than exist, so cap the draw.
multi_animal_imgs = repeated_files.sample(min(12, len(repeated_files)))
fig = plt.figure(figsize=(24, 16))
with ZipFile(train_zip) as tz:
    for i, fname in enumerate(multi_animal_imgs):
        ax = fig.add_subplot(4, 3, 1+i)
        ax.axis('off')
        ax.set_title(fname)
        img_file = tz.read(fname)
        img = Image.open(BytesIO(img_file))
        ax.imshow(img)

In [None]:
# Same first look, this time at the test-set metadata
test_gt = pd.read_csv(
    '../input/iwildcam-2019-fgvc6/test.csv'
)
test_gt.head()

In [None]:
# Show the first 15 members of the test image archive on a 3x5 grid
test_zip = '../input/iwildcam-2019-fgvc6/test_images.zip'
fig = plt.figure(figsize=(24, 12))
with ZipFile(test_zip) as test_archive:
    leading_members = test_archive.namelist()[:15]
    for position, member in enumerate(leading_members, start=1):
        ax = fig.add_subplot(3, 5, position)
        ax.axis('off')
        raw_bytes = test_archive.read(member)
        ax.imshow(Image.open(BytesIO(raw_bytes)))
        

## Preprocessing
### Vignette removal

In [None]:
# Let's see how bad it is: 30 random training images on a 5x6 grid.
# Reuses the sampling helper defined at the top of the notebook instead of
# repeating its read-and-plot loop here.
fig = sample_train_images(n=30, grid_size=(5, 6), figsize=(24, 16))
        

The vignetting isn't too bad in daytime images. Night-vision images need work, though.

**Future work**: Detect grayscale images and de-vignette?

### Parking for now

### CLAHE

In [None]:
# Extract a few random images to pwd
n_extract = 20
with ZipFile(train_zip) as tz:
    # Skip directory entries so only actual files are drawn.
    members = [name for name in tz.namelist() if not name.endswith('/')]
    n_extract = min(n_extract, len(members))
    # replace=False: every draw is a distinct file, so n_extract files land on disk.
    sample = np.random.choice(members, n_extract, replace=False)
    for done, s in enumerate(sample, start=1):
        tz.extract(s)
        # Single-line progress bar, redrawn in place via the carriage return.
        print('[' + '.'*done + ' '*(n_extract-done) + ']', end='\r')

In [None]:
# Side-by-side preview: original (left) vs CLAHE applied to the lightness channel (right).
clahe = cv2.createCLAHE(clipLimit=1.0, tileGridSize=(8, 8))
n_rows, n_cols = 10, 2
fig = plt.figure(figsize=(20, 36))
# Cap at the grid capacity: the working directory can hold more jpgs than the
# grid has cells (e.g. after re-running the extraction cell), and add_subplot
# raises past the last cell. Sorting makes the order reproducible.
for i, f in enumerate(sorted(glob.glob('*.jpg'))[:n_rows*n_cols]):
    bgr = cv2.imread(f)  # decode once and reuse for both panels
    if bgr is None:
        # cv2.imread returns None for unreadable files instead of raising.
        continue
    ax = fig.add_subplot(n_rows, n_cols, i+1)
    ax.axis('off')
    # Equalise only the L channel so the colour channels are left untouched.
    img = cv2.cvtColor(bgr, cv2.COLOR_BGR2LAB)
    img[:, :, 0] = clahe.apply(img[:, :, 0])

    img_raw = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    stacked_img = np.hstack([img_raw, cv2.cvtColor(img, cv2.COLOR_LAB2RGB)])
    ax.imshow(stacked_img)

Night images improved. Daytime images become high-contrast (expected). 

**Future work:** Detect day/night and apply CLAHE?

### Denoising

In [None]:
# Eyeball a handful of random training images to judge how noisy they are
_ = sample_train_images(
    n=6,
    grid_size=(3, 2),
    figsize=(24, 24),
)

Not too bad IMO

In [None]:
# Side-by-side preview: original (left) vs non-local-means denoised (right).
fig = plt.figure(figsize=(24, 48))
for i, f in enumerate(sorted(glob.glob('*.jpg'))[:6]):
    # read each file, denoise it and show
    bgr = cv2.imread(f)  # decode once and reuse for both panels
    if bgr is None:
        # cv2.imread returns None for unreadable files instead of raising.
        continue
    ax = fig.add_subplot(6, 1, i+1)
    ax.axis('off')
    # Try with a small h. h~7 blurs it too much and we lose sharpness.
    # Denoise in OpenCV's native BGR channel order and convert to RGB only
    # for display.
    denoised_bgr = cv2.fastNlMeansDenoisingColored(bgr, None, 3, 6, 7, 21)
    img = cv2.cvtColor(denoised_bgr, cv2.COLOR_BGR2RGB)

    img_raw = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    stacked_img = np.hstack([img_raw, img])
    ax.imshow(stacked_img)

The images look mostly clean. The main issue seems to be poor illumination in night images. May work without denoising. 

### White balance

In [None]:
# Quick check of the grey-world assumption: paint each sampled image's
# mean colour as a flat 32x32 patch and look at the spread of colours

fig = plt.figure(figsize=(24, 8))
with ZipFile(train_zip) as archive:
    picks = np.random.choice(archive.namelist(), 30)
    for slot, member in enumerate(picks, start=1):
        ax = fig.add_subplot(3, 10, slot)
        ax.axis('off')
        picture = Image.open(BytesIO(archive.read(member)))
        # Average over rows, then over columns, and truncate to 8-bit
        column_means = np.mean(picture, axis=0)
        means = np.mean(column_means, axis=0).astype(np.uint8)
        color_patch = np.full((32, 32, 3), means, dtype=np.uint8)
        ax.imshow(color_patch)

# Yeah not really. It's greens or greys.

Come to think of it, white balancing is essentially a linear transform on the pixel values based on some statistic of the whole image, kind of like regularization at an image level. 
Can try on/off and see if it helps, starting with simpler WB techniques and moving on to more complex ideas:
- [Improving CNN-Based Texture Classification by Color Balancing - Bianco et al.](https://www.researchgate.net/publication/318740203_Improving_CNN-Based_Texture_Classification_by_Color_Balancing)

**Future work:** Can experiment with learning-based WB techniques

In [None]:
# Cleanup pwd: delete only the jpg files sitting in the working directory
# (the same '*.jpg' pattern the preview cells read from). Removing every
# entry of '.' would also delete unrelated files and raises on directories.
for f in glob.glob('*.jpg'):
    if os.path.isfile(f):
        os.remove(f)