## Required Imports

In [81]:
import scipy.io as sio
import math
import sklearn.metrics
import numpy as np
import pandas as pd

from tensorflow import keras
from tensorflow.keras.callbacks import ModelCheckpoint
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import OneHotEncoder

from tensorflow.keras.layers import Input, Add, Dense, Activation, ZeroPadding3D, BatchNormalization, Flatten, Conv3D, AveragePooling3D, MaxPooling3D, GlobalMaxPooling3D
from tensorflow.keras.models import Model
from tensorflow.keras.initializers import glorot_uniform
from tensorflow.keras import regularizers


import logging
logging.getLogger('tensorflow').setLevel(logging.ERROR)

## Pick samples belonging to all classes

In [82]:
def pick_samples_from_class(Class, cube_size, data, ground_truth, cubes, output_class, overlap_ratio, channels):
    
    ## Get row and column position from ground truth image for class
    class_indices = np.where(ground_truth == Class)
    
    ## Remove border position class samples
    class_cube_positions = [[class_indices[0][i], class_indices[1][i]] for i in range(len(class_indices[0])) 
                        if len(ground_truth) - np.ceil(cube_size / 2) > class_indices[0][i] > np.ceil(cube_size / 2) 
                        and len(ground_truth[0]) - np.ceil(cube_size / 2) > class_indices[1][i] > np.ceil(cube_size / 2)]
    
    #print('Length of class positions', len(class_cube_positions))
    
    extracted_cubes = [[class_cube_positions[0][0], class_cube_positions[0][1]]]
    
    ## Form the first cube for this class
    cubes.append(np.array(data[:channels, 
                        class_cube_positions[0][0] - int(cube_size / 2):class_cube_positions[0][0] + int(cube_size / 2),
                       (class_cube_positions[0][1] - int(cube_size / 2)):class_cube_positions[0][1] + int(cube_size / 2)]))
    
    ## Output class value
    output_class.append(Class)
        
    ## Pick cube/sample if it satisfies the criteria for the overlap ratio
    for i in range(1, len(class_cube_positions)):
        
        distance_vector = [] ## Calculate distance from existing sample to the next candiddate cube sample
        
        for k in range(len(extracted_cubes)):
            
            distance = math.sqrt((class_cube_positions[i][0] - extracted_cubes[k][0]) ** 2 + 
                                 (class_cube_positions[i][1] - extracted_cubes[k][1]) ** 2)
            
            distance_vector.append(distance)
            
        if np.min(distance_vector) > int(cube_size * (1 - overlap_ratio)):
            
            cubes.append(np.array(data[:channels, 
                             class_cube_positions[i][0] - int(cube_size / 2):class_cube_positions[i][0] + int(cube_size / 2),
                            (class_cube_positions[i][1] - int(cube_size / 2)):class_cube_positions[i][1] + int(cube_size / 2)]))
            
            output_class.append(Class)
            extracted_cubes.append([class_cube_positions[i][0], class_cube_positions[i][1]])
            
    return cubes, output_class, extracted_cubes

## Collect and combine samples from all classes

In [83]:
def collect_samples_from_all_classes(classes, cube_size, data, ground_truth, cubes, output_class, overlap_ratio, channels):
    
    class_samples = []
    
    for Class in classes:
        cubes, output_class, extracted_cubes = pick_samples_from_class(Class, cube_size, data, ground_truth, cubes, 
                                                                       output_class,overlap_ratio, channels)
        class_samples.append(len(extracted_cubes))
    
    cubes = np.array(cubes)
    output_class = np.array(output_class)
    
    return cubes, output_class, class_samples

## Prepare Training & Test Data

In [86]:
def training_and_test_set(for_training_set, class_samples, cubes, output_class):
    
    X_train = []
    X_test = []

    Y_train = []
    Y_test = []

    train_and_test_from_each_class = [0]
    for_test = 0

    ## (for_train) samples from each class for the training set, rest of the samples from that class to the test set
    for i in range(len(class_samples)):

        train_and_test_from_each_class.append(for_training_set + for_test)
        train_and_test_from_each_class.append(class_samples[i] + for_test)

        for_test = class_samples[i] + for_test

    for i in range(1, len(train_and_test_from_each_class)):
        if i % 2 != 0:
            for j in range(train_and_test_from_each_class[i - 1], train_and_test_from_each_class[i]):
                X_train.append(cubes[j])
                Y_train.append(output_class[j])
        else:
            for k in range(train_and_test_from_each_class[i - 1], train_and_test_from_each_class[i]):
                X_test.append(cubes[k])
                Y_test.append(output_class[k])

    X_train = np.array(X_train)
    X_test = np.array(X_test)

    Y_train = np.array(Y_train)
    Y_test = np.array(Y_test)

    ## Shuffle Training Set
    samples_train = np.arange(X_train.shape[0])
    np.random.shuffle(samples_train)

    X_train = X_train[samples_train]
    Y_train = Y_train[samples_train]

    ## Shuffle Test Set
    samples_test = np.arange(X_test.shape[0])
    np.random.shuffle(samples_test)

    X_test = X_test[samples_test]
    Y_test = Y_test[samples_test]

    X_train = np.expand_dims(X_train, axis=4)
    X_test = np.expand_dims(X_test, axis=4)

    values, counts = np.unique(Y_test, return_counts=True)

    print("Samples per class: " + str(class_samples) + '\n'
          "Total number of samples is " + str(np.sum(class_samples)) + '.\n')

    print("unique classes in test: " + str(values) + '\n'
          "Total number of samples in test set is " + str(np.sum(counts)) + '.\n'
          "Samples per class in test set: " + str(counts) + '\n')

    ## one hot encode labels
    onehot_encoder = OneHotEncoder(sparse = False)

    Y_train = Y_train.reshape(len(Y_train), 1)
    Y_test = Y_test.reshape(len(Y_test), 1)

    Y_train = onehot_encoder.fit_transform(Y_train)
    Y_test = onehot_encoder.fit_transform(Y_test)
    
    print('Training set shape',X_train.shape)
    print('Training labels', Y_train.shape)
    print('Test set shape', X_test.shape)
    print('Test set labels', Y_test.shape)

    
    return X_train, X_test, Y_train, Y_test, counts, class_samples

In [87]:
def prepare_data_for_training(classes, cube_size, data, ground_truth, cubes, output_class, for_training_set,
                              overlap_ratio, channels):
    
    cubes, output_class, class_samples = collect_samples_from_all_classes(classes, 
                                                                      cube_size, 
                                                                      data,  
                                                                      ground_truth, 
                                                                      cubes, 
                                                                      output_class , 
                                                                      overlap_ratio, 
                                                                      channels)
    
    X_train, X_test, Y_train, Y_test, class_samples, counts = training_and_test_set(for_training_set, 
                                                                                    class_samples, 
                                                                                    cubes,
                                                                                    output_class)
    return X_train, X_test, Y_train, Y_test, class_samples, counts

## Load Hyperspectral Dataset - Pavia

In [88]:
uPavia = sio.loadmat('PaviaU.mat')
gt_uPavia = sio.loadmat('PaviaU_gt.mat')

In [89]:
data_pavia = uPavia['paviaU']
ground_truth = gt_uPavia['paviaU_gt']

## Dimensions

In [90]:
data_pavia.shape

(610, 340, 103)

In [91]:
data = np.moveaxis(data_pavia, 2, 0) # channels first

In [92]:
data.shape

(103, 610, 340)

In [93]:
ground_truth.shape

(610, 340)

## Distribution of Samples for each class

In [94]:
class_distribution = pd.DataFrame(np.unique(ground_truth, return_counts = True))
class_distribution = class_distribution.transpose()
class_distribution.columns = ['class','samples']
class_distribution

Unnamed: 0,class,samples
0,0,164624
1,1,6631
2,2,18649
3,3,2099
4,4,3064
5,5,1345
6,6,5029
7,7,1330
8,8,3682
9,9,947


In [95]:
classes , counts = np.unique(ground_truth, return_counts = True)
classes = classes[1:] ## Not considering background

In [96]:
classes

array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=uint8)

In [97]:
X_train, X_test, Y_train, Y_test, class_samples, counts = prepare_data_for_training(classes = classes, 
                                                                                cube_size = 20, 
                                                                                data = data, 
                                                                                ground_truth = ground_truth, 
                                                                                cubes = [], 
                                                                                output_class = [], 
                                                                                for_training_set = 600,
                                                                                overlap_ratio = 1, 
                                                                                channels = 64)

Samples per class: [5975, 15062, 1742, 2854, 1345, 5029, 1330, 3682, 940]
Total number of samples is 37959.

unique classes in test: [1 2 3 4 5 6 7 8 9]
Total number of samples in test set is 32559.
Samples per class in test set: [ 5375 14462  1142  2254   745  4429   730  3082   340]

Training set shape (5400, 64, 20, 20, 1)
Training labels (5400, 9)
Test set shape (32559, 64, 20, 20, 1)
Test set labels (32559, 9)


In [98]:
class_samples # samples extracted for each class

array([ 5375, 14462,  1142,  2254,   745,  4429,   730,  3082,   340])

In [99]:
class_distribution = class_distribution[1:]
class_distribution['samples_extracted'] = class_samples
class_distribution

Unnamed: 0,class,samples,samples_extracted
1,1,6631,5375
2,2,18649,14462
3,3,2099,1142
4,4,3064,2254
5,5,1345,745
6,6,5029,4429
7,7,1330,730
8,8,3682,3082
9,9,947,340


In [100]:
## Total samples extracted
len(cubes), len(output_class)

(37959, 37959)