# EDA - Feature Detection Sandbox
Test techniques for detecting image features such as clouds and water.

### Boilerplate

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
import matplotlib.pyplot as plt
from spectral import *
import seaborn as sns
%matplotlib inline

# Current seaborn colour palette.
# NOTE(review): `pal` is not referenced by any cell visible in this notebook section.
pal = sns.color_palette()
# Apply seaborn's "whitegrid" style to subsequently created figures.
sns.set_style("whitegrid")

### Files

In [None]:
# List every entry under ../input with its size in megabytes (bytes / 1,000,000).
# A directory is reported as the summed size of its direct children, followed
# by the number of children.
print('# File sizes')
for entry in os.listdir('../input'):
    entry_path = '../input/' + entry
    if os.path.isdir(entry_path):
        child_sizes = [
            os.path.getsize(entry_path + '/' + child) / 1000000
            for child in os.listdir(entry_path)
        ]
        total_mb = round(sum(child_sizes), 2)
        print(entry.ljust(30) + str(total_mb) + 'MB' + ' ({} files)'.format(len(child_sizes)))
    else:
        size_mb = round(os.path.getsize(entry_path) / 1000000, 2)
        print(entry.ljust(30) + str(size_mb) + 'MB')

In [None]:
# Load the training labels table; later cells read its 'tags' column and
# iterate its rows as (image name, tags string) pairs.
train_df = pd.read_csv('../input/train_v2.csv')
# Show the first rows as the cell output.
train_df.head()

### View Distribution

In [None]:
from collections import Counter, defaultdict

# Each 'tags' value is a space-separated string; split it into a list of tags.
labels = train_df['tags'].apply(lambda x: x.split(' '))

# Tally how often every tag occurs across all rows.
counts = Counter(tag for tag_list in labels for tag in tag_list)

# One row per tag, most frequent first. sort_values returns a new frame, so
# re-running the cell never depends on an earlier in-place mutation.
counts_df = pd.DataFrame.from_dict(counts, orient='index')
counts_df.columns = ['count']
counts_df = counts_df.sort_values('count', ascending=False)

fig, ax = plt.subplots(figsize=(12, 4))
sns.barplot(x=counts_df.index, y=counts_df['count'], ax=ax)
# Rotate the tag names via tick parameters instead of re-setting the labels
# from the existing Text objects.
ax.tick_params(axis='x', labelrotation=-45)

### View some JPEGs

In [None]:
import cv2

# Tag to browse: an image qualifies when this is one of its space-separated tags.
test_label = 'agriculture'

# Switch the axes grid off for image display. This goes through the global rc
# settings, so it also applies to figures created afterwards.
new_style = {'grid': False}
plt.rc('axes', **new_style)
_, ax = plt.subplots(3, 3, sharex='col', sharey='row', figsize=(12, 12))

# Fill the 3x3 grid with the first nine matching images, scanning from row 100.
i = 0
for f, l in train_df[100:].values:
    # Compare against whole tags: a substring test on the raw string would let
    # e.g. 'cloudy' match 'partly_cloudy'.
    if test_label not in l.split(' '):
        continue

    print(f, l)

    img = cv2.imread('../input/train-jpg/{}.jpg'.format(f))
    if img is None:
        # cv2.imread signals an unreadable/missing file by returning None.
        print('could not read ../input/train-jpg/{}.jpg, skipped'.format(f))
        continue

    panel = ax[i // 3, i % 3]
    # OpenCV loads images as BGR; convert to RGB for matplotlib.
    panel.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    panel.set_title('{} - {}'.format(f, l))

    i += 1
    if i >= 9:
        break

# train_0 (*haze primary) 
# train_2 (*clear *primary)
# train_37 (*partly_cloudy primary)
# train_111 (*cloudy)
# train_19 (agriculture clear primary *road)
# train_21 (clear primary road *water)
# train_30 (clear primary *water)
# train_67 (clear *habitation primary road)
# 

### Test Images

In [None]:
# Scratch check: `in` on a list tests for a whole-element match.
test_list = ['testing', 'hello']
'hello' in test_list

In [None]:
import skimage
import skimage.io

# Hand-picked sample images; the comment on each line lists its tags, with a
# '*' marking the tag(s) the image was picked for.
image_list = [
    'train_0',   #(*haze primary)
    'train_2',   #(*clear *primary)
    'train_37',  #(*partly_cloudy primary)
    'train_111', #(*cloudy)
    'train_19',  #(agriculture clear primary *road)
    'train_21',  #(clear primary road *water)
    'train_30',  #(clear primary *water)
    'train_67',  #(clear *habitation primary road)
    'train_122' #(*agriculture clear primary road)
]

test_images = []    # 4 band images
test_images_g = []  # Gray images

# Tags string for every image name, built once so each selected image is a
# single dictionary lookup instead of a scan over the whole table.
tags_by_name = {name: tags for name, tags in train_df.values}

plt.rc('axes', **new_style)
# Three panels per row, with as many rows as image_list needs (3 rows for 9 images).
n_rows = max(1, int(np.ceil(len(image_list) / 3)))
_, ax = plt.subplots(n_rows, 3, sharex='col', sharey='row',
                     figsize=(12, 4 * n_rows), squeeze=False)

# Walk image_list itself rather than train_df, so that test_images[k] and
# test_images_g[k] belong to image_list[k]. Iterating train_df would store the
# images in table row order, which need not match the order written above.
i = 0
for f in image_list:
    if f not in tags_by_name:
        # A skipped name shifts the indices of all later images.
        print('{} not found in train_df, skipped'.format(f))
        continue
    l = tags_by_name[f]

    # JPEG preview (OpenCV loads BGR; matplotlib expects RGB).
    img = cv2.imread('../input/train-jpg/{}.jpg'.format(f))
    ax[i // 3, i % 3].imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    ax[i // 3, i % 3].set_title('{} - {}'.format(f, l))

    # Multi-band TIFF plus a grayscale read of the same file.
    test_img_f = '../input/train-tif-v2/{}.tif'.format(f)
    im_tiff = skimage.io.imread(test_img_f, plugin='tifffile')
    # NOTE(review): newer scikit-image releases spell this keyword `as_gray`;
    # confirm against the installed version.
    im_tiff_g = skimage.io.imread(test_img_f, as_grey=True)

    test_images.append(im_tiff)
    test_images_g.append(im_tiff_g)

    i += 1


In [None]:
def plot_bands(im):
    """Show bands 0-3 of ``im`` as four pseudo-colour panels in a 2x2 grid.

    Parameters
    ----------
    im : array indexed as ``im[:, :, band]``
        Bands 0, 1, 2 and 3 are drawn and titled 'Blue', 'Green', 'Red' and
        'NIR' respectively.

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    band_titles = ('Blue', 'Green', 'Red', 'NIR')

    # Create the panels once, with shared x/y so pan and zoom stay linked.
    # The previous adjustable='box-forced' argument is not accepted by current
    # Matplotlib releases, which allow shared image axes without it.
    fig, axes = plt.subplots(2, 2, figsize=(7, 8), sharex=True, sharey=True)

    for band, (panel, title) in enumerate(zip(axes.ravel(), band_titles)):
        panel.imshow(im[:, :, band], cmap='nipy_spectral')
        panel.set_title(title)

In [None]:
# Example: draw the four bands of the image stored at index 4 of test_images.
plot_bands(test_images[4])

### Calculate various indices

USGS Landsat spectral indices product guide: https://landsat.usgs.gov/sites/default/files/documents/si_product_guide.pdf

Cloud detection: https://weather.msfc.nasa.gov/sport/journal/pdfs/2009_GRS_Jedlovec.pdf

Water Detection: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4970121/

NDWI: https://en.wikipedia.org/wiki/Normalized_difference_water_index

In [None]:
def calc_maps(im):
    """Compute spectral-index maps and false-colour composites for one image.

    Parameters
    ----------
    im : array indexed as ``im[:, :, band]``
        Band indices 0, 1, 2, 3 are used as blue, green, red and NIR.

    Returns
    -------
    (image_maps, false_images) : tuple of dict
        ``image_maps`` maps an index name to a 2-D array:
        'NDVI_s'  - result of the spectral module's ``ndvi(im, 2, 3)``
        'Reverse' - (NIR - RED) / (NIR + RED) on the raw bands of ``im``
        'NDVI_c'  - (NIR - RED) / (NIR + RED) on the ``get_rgb`` output
        'NDWI'    - (GREEN - NIR) / (GREEN + NIR) on the ``get_rgb`` output
        'EVI'     - two-band form 2.5 * (NIR - RED) / (NIR + 2.4 * RED + 1)
        'SAVI'    - soil-adjusted index with L = 0.5
        'MSAVI'   - modified soil-adjusted vegetation index
        'NIRI'    - (mean(visible) - NIR) / (mean(visible) + NIR)
        ``false_images`` maps 'RGB', 'NRG' and 'NRB' to the corresponding
        ``get_rgb`` composites.

    Notes
    -----
    Zero denominators are not masked; such pixels come out as NaN or +/-inf.
    """
    # RGB and false-colour composites produced by the spectral module.
    im2 = get_rgb(im, [2, 1, 0])  # RGB
    im3 = get_rgb(im, [3, 2, 1])  # NIR-R-G
    im4 = get_rgb(im, [3, 2, 0])  # NIR-R-B

    # spectral module ndvi function (red band index 2, NIR band index 3)
    vi = ndvi(im, 2, 3)

    # Same ratio computed by hand on the raw bands. Convert to float first:
    # on unsigned integer input the subtraction would wrap around wherever
    # NIR < RED and the sum could overflow, giving silently wrong values.
    red_raw = np.asarray(im[:, :, 2], dtype=np.float64)
    nir_raw = np.asarray(im[:, :, 3], dtype=np.float64)
    vi1 = (nir_raw - red_raw) / (nir_raw + red_raw)

    # Bands as returned by get_rgb for the NIR-R-G composite.
    nir = im3[:, :, 0]
    red = im3[:, :, 1]
    green = im3[:, :, 2]

    # NDVI and NDWI on the get_rgb-adjusted bands
    vi2 = (nir - red) / (nir + red)      # (NIR - RED) / (NIR + RED)
    vi3 = (green - nir) / (green + nir)  # (GREEN - NIR) / (GREEN + NIR)

    # EVI (two-band form: no blue term)
    evi = 2.5 * ((nir - red) / (nir + 2.4 * red + 1))

    # SAVI with soil-brightness factor L = 0.5
    savi = ((nir - red) / (nir + red + 0.5)) * (1 + 0.5)

    # MSAVI Modified Soil Adjusted Vegetation Index:
    # (2*NIR + 1 - sqrt((2*NIR + 1)^2 - 8*(NIR - RED))) / 2
    # The trailing division by 2 is part of the formula and was missing before.
    msavi = (2 * nir + 1 - np.sqrt(np.square(2 * nir + 1) - 8 * (nir - red))) / 2

    # NIR Index: mean of the visible composite against NIR
    mean_vis = np.mean(im2, axis=2)
    niri = (mean_vis - nir) / (mean_vis + nir)

    image_maps = {
        'NDVI_s': vi,
        'Reverse': vi1,
        'NDVI_c': vi2,
        'NDWI': vi3,
        'EVI': evi,
        'SAVI': savi,
        'MSAVI': msavi,
        'NIRI': niri
    }

    false_images = {
        'RGB': im2,
        'NRG': im3,
        'NRB': im4
    }

    return image_maps, false_images

In [None]:
# Run calc_maps over every loaded test image. test_maps[k] holds the index
# maps and test_FI[k] the false-colour composites for test_images[k].
test_maps, test_FI = [], []
for im in test_images:
    maps_and_composites = calc_maps(im)
    test_maps.append(maps_and_composites[0])
    test_FI.append(maps_and_composites[1])

### Stats

In [None]:
def plot_map_dist(map_dict):
    """Plot a histogram of the pixel values of every map in ``map_dict``.

    Parameters
    ----------
    map_dict : dict
        Maps a name to an array; each array gets one panel titled by its name.

    Returns
    -------
    numpy.ndarray
        Flat array of all axes in the grid (three per row), including the
        hidden panels left over when the number of maps is not a multiple of 3.

    Notes
    -----
    Calls ``plt.style.use('fivethirtyeight')``, which changes the global
    Matplotlib style for figures created afterwards as well.
    """
    plt.style.use('fivethirtyeight')

    # At least one row so an empty dict still yields a valid (blank) grid.
    nrows = max(1, int(np.ceil(len(map_dict) / 3)))
    fig, axes = plt.subplots(nrows=nrows, ncols=3, figsize=(11, 12))
    ax = axes.ravel()

    for panel, (map_name, imap) in zip(ax, map_dict.items()):
        values = np.asarray(imap).ravel()
        # Ratio maps can contain NaN/inf where a denominator is zero; a
        # histogram range cannot be derived from such values, so drop them.
        values = values[np.isfinite(values)]
        if values.size:
            sns.distplot(values, kde=False, ax=panel)
        panel.set_title(map_name)

    # Hide grid cells that did not receive a map.
    for panel in ax[len(map_dict):]:
        panel.set_visible(False)

    return ax

In [None]:
def find_bin_center(i, bin_edges):
    """Return the midpoint of bin ``i``, i.e. of ``bin_edges[i]`` and ``bin_edges[i+1]``."""
    lower, upper = bin_edges[i], bin_edges[i + 1]
    return (lower + upper) / 2.0

def find_dist_peak(a, bins=40):
    """Return the centre of the most populated histogram bin of ``a``.

    Parameters
    ----------
    a : array-like
        Values to histogram; NaN and +/-inf entries are ignored.
    bins : int or sequence, optional
        Passed to ``np.histogram`` (default 40).

    Returns
    -------
    float
        Centre of the first bin with the highest count, or NaN when ``a``
        contains no finite value (there is no distribution to take a peak of).
    """
    values = np.asarray(a).ravel()
    # np.histogram cannot derive a bin range from NaN/inf, so filter them out.
    values = values[np.isfinite(values)]
    if values.size == 0:
        return np.nan

    hist, bin_edges = np.histogram(values, bins)
    return find_bin_center(np.argmax(hist), bin_edges)

def calc_map_stats(map_dict):
    """Summarise the pixel-value distribution of every map in ``map_dict``.

    Parameters
    ----------
    map_dict : dict
        Maps a name to an array of pixel values.

    Returns
    -------
    dict
        ``{map_name: stats}`` where ``stats`` has the keys
        'peak_dist'   - centre of the fullest of 40 histogram bins, all values
        'peak_dist_p' - same, restricted to values > 0
        'peak_dist_n' - same, restricted to values < 0
        'mean', 'median', 'std', 'ptp' - NumPy statistics of the values.
        NaN and +/-inf pixels are excluded from every statistic. A statistic
        whose selection is empty is reported as NaN.
    """
    stat_names = ('peak_dist', 'peak_dist_p', 'peak_dist_n',
                  'mean', 'median', 'std', 'ptp')

    def _peak_or_nan(values):
        # An empty selection has no histogram peak.
        return find_dist_peak(values, 40) if values.size else np.nan

    map_stats = dict()
    for map_name, imap in map_dict.items():
        img_flat = np.asarray(imap).ravel()
        # Ratio maps hold NaN/inf where a denominator was zero; those pixels
        # would break the histogram and turn every statistic into NaN.
        img_flat = img_flat[np.isfinite(img_flat)]

        if img_flat.size == 0:
            # Nothing measurable (np.ptp would raise on an empty array).
            map_stats[map_name] = dict.fromkeys(stat_names, np.nan)
            continue

        map_stats[map_name] = dict(
            peak_dist=_peak_or_nan(img_flat),
            peak_dist_p=_peak_or_nan(img_flat[img_flat > 0]),
            peak_dist_n=_peak_or_nan(img_flat[img_flat < 0]),
            mean=np.mean(img_flat),
            median=np.median(img_flat),
            std=np.std(img_flat),
            ptp=np.ptp(img_flat),
        )

    return map_stats

#calc_map_stats(test_maps[4])

### View Images

In [None]:
def plot_maps(map_dict, false_image_dict):
    """Draw the false-colour composites followed by the index maps in one grid.

    Parameters
    ----------
    map_dict : dict
        Maps an index name to a 2-D array, drawn with the 'nipy_spectral'
        colour map.
    false_image_dict : dict
        Maps a composite name to an image drawn as-is with ``imshow``.

    Returns
    -------
    numpy.ndarray
        Flat object array of all axes in the grid (three per row). The
        composites come first, then the maps, then any hidden unused panels.

    Notes
    -----
    Calls ``plt.style.use('ggplot')``, which changes the global Matplotlib
    style for figures created afterwards as well.
    """
    num_maps = len(map_dict)
    num_false = len(false_image_dict)
    num_images = num_maps + num_false
    # At least one row so that empty input still yields a valid (blank) grid.
    nrows = max(1, int(np.ceil(num_images / 3)))

    plt.style.use('ggplot')
    fig = plt.figure(figsize=(10, 3.5 * nrows))

    # Create each panel exactly once. The composite panels share x/y with the
    # first panel so they pan and zoom together; map panels stay independent.
    # (The former adjustable='box-forced' argument is not accepted by current
    # Matplotlib releases and is no longer needed for shared image axes.)
    ax = np.empty(nrows * 3, dtype=object)
    for i in range(nrows * 3):
        anchor = ax[0] if 0 < i < num_false else None
        ax[i] = fig.add_subplot(nrows, 3, i + 1, sharex=anchor, sharey=anchor)

    for panel, (FI_name, FI) in zip(ax, false_image_dict.items()):
        panel.imshow(FI)
        panel.set_title(FI_name)

    for panel, (map_name, imap) in zip(ax[num_false:], map_dict.items()):
        panel.imshow(imap, cmap='nipy_spectral')
        panel.set_title(map_name)

    # Hide grid cells that did not receive an image.
    for panel in ax[num_images:]:
        panel.set_visible(False)

    return ax

## REPORT

In [None]:
import pprint

# Choose which image you want to look at
# (index into test_images, test_images_g, test_maps and test_FI)
image_i = 4

# For reference:
# NOTE(review): this table follows the order in which image_list is written;
# it matches only if test_images was filled in that same order — confirm
# against the loading cell.
# 0    'train_0',   #(*haze primary) 
# 1    'train_2',   #(*clear *primary)
# 2    'train_37',  #(*partly_cloudy primary)
# 3    'train_111', #(*cloudy)
# 4    'train_19',  #(agriculture clear primary *road)
# 5    'train_21',  #(clear primary road *water)
# 6    'train_30',  #(clear primary *water)
# 7    'train_67',  #(clear *habitation primary road)
# 8    'train_122' #(*agriculture clear primary road)

# Pull out everything computed for the selected image.
map_dict = test_maps[image_i]          # index maps from calc_maps
false_image_dict = test_FI[image_i]    # false-colour composites from calc_maps
im = test_images[image_i]              # multi-band TIFF array
im_g = test_images_g[image_i]          # grayscale read of the same TIFF

In [None]:
# Per-band intensity panels for the selected image.
plot_bands(im);

# False-colour composites followed by the index maps.
plot_maps(map_dict, false_image_dict);

# Histogram of pixel values for each index map.
plot_map_dist(map_dict);

# Summary statistics per index map, pretty-printed as a nested dict.
test_stats = calc_map_stats(map_dict)
pprint.pprint(test_stats)