# Part 2. Exploratory Data Analysis & Data Preparation

**Author**: Navavat Pipatsart, pnastranagant@gmail.com

**language**: python

**environment**: Jupyter Notebook

**Objective**: Explore the pre-processed dataset from **Part 1. Data Acquisition & Data Preprocessing** using *Exploratory Data Analysis (EDA)*, then prepare the dataset via *Data Preparation* and *Feature Extraction* for **Part 3. Data Training and Model Evaluation**.

**Last modified date**: 2022-06-23

**Modified issue**: 

- Improve full spectrum data

**status**: Done

## 1. Environmental Setup

### 1.1. Miscellaneous Configuration

In [None]:
# Import global libraries
import random
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# System configuration
%matplotlib inline

### 1.2. Main Configuration

**NOTE**: This project's datasets are located on an *on-premise external hard disk*. Hence, the paths below are set to where those files are stored.

In [None]:
# $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
# Define filename components
# The two values below are concatenated into every input file name in this cell,
# so they must match the files actually present under "path_to_files".
# !!! priority recheck
date_preprocessed = '2022-06-16' # Date prefix of the pre-processed files (leading part of every file name)

# !!! priority recheck
process_final = 'triangle' # Tag of the final pre-processing step; used only in the X file names

# --------------------------------------------------------------------------------------------------------
# Define paths
# NOTE: every directory path here ends with '/' because paths are later built by plain string concatenation
## On-premise paths
parent_path = '/Volumes/phdbackup/backup/processed_data/' # Parent path to pre-processed files and path to save

# !!! priority recheck
children_path_to_files = 'preprocessed_data_20220616/' # Child directory (under parent_path) holding the pre-processed files

# !!! priority recheck
children_path_to_save = 'prepared_data_20220623/' # Child directory intended for saving
# NOTE(review): "children_path_to_save" is not used to build "path_to_save" below, which is hard-coded instead - confirm which one is intended

# $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$

path_to_files = parent_path + children_path_to_files # Full path to the pre-processed input files
path_to_save = '/Users/pnastra/OneDrive - Mahidol University/phd_thesis/thesis_results/preprocessing/' # Full path to save outputs

## On-cloud path
parent_path_cloud = '/Users/pnastra/pnastranagant@gmail.com - Google Drive/My Drive/' # Parent path to files in cloud
path_to_wavelengths = parent_path_cloud + 'phd_codes/metadata/wavelengths.csv' # Full path to full spectrum wavelengths profile
path_to_wavelengths_trimmed = parent_path_cloud + 'phd_codes/metadata/wavelengths_trimmed.csv' # Full path to trimmed wavelengths from wavelength selection profile
# --------------------------------------------------------------------------------------------------------
# $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
## Dataset suffixes shared by every file family below; the order here is the order of every list.
## 'vb_d1'   -> Variety B, Treatment: Fresh
## 'vb_d8'   -> Variety B, Treatment: Deterioration
## 'vbci_d8' -> Variety B, Treatment: Chilling Injury-induced
## 'vc_*'    -> same three treatments in the same order (presumably the second variety - not annotated in the original lists)
_dataset_suffixes = ('vb_d1', 'vb_d8', 'vbci_d8', 'vc_d1', 'vc_d8', 'vcci_d8')

## Variables, a.k.a. X (the only family whose name carries the "process_final" tag)
filenames_X = [f'{date_preprocessed}_X_{process_final}_{suffix}.npy'
               for suffix in _dataset_suffixes]

## Labels, a.k.a. y
filenames_y = [f'{date_preprocessed}_y_{suffix}.npy'
               for suffix in _dataset_suffixes]

## Calibrated averaged reflectances
filenames_reflectances_averaged = [f'{date_preprocessed}_reflectances_averaged_{suffix}.npy'
                                   for suffix in _dataset_suffixes]

## PSNR of images from wavelet denoising
filenames_psnr_wavelet = [f'{date_preprocessed}_psnr_tensor_wavelet_{suffix}.npy'
                          for suffix in _dataset_suffixes]

## PSNR of images from wavelet denoising and median filtering
## (this family is named "psnr_median", without the "tensor" part used by the other PSNR families)
filenames_psnr_median = [f'{date_preprocessed}_psnr_median_{suffix}.npy'
                         for suffix in _dataset_suffixes]

## PSNR of images from wavelet denoising, median filtering, and triangle thresholding
filenames_psnr_triangle = [f'{date_preprocessed}_psnr_tensor_triangle_{suffix}.npy'
                           for suffix in _dataset_suffixes]

# $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
# --------------------------------------------------------------------------------------------------------
# Miscellaneous variables
# Data labels: maps the dataset suffix used in the file names to the label name of that dataset
labels = {
          # format <file_name>: <label_name>
          'vb_d1': 'fresh-W',
          'vb_d8': 'CS-W',
          'vbci_d8': 'CI-W',
          'vc_d1': 'fresh-S',
          'vc_d8': 'CS-S',
          'vcci_d8': 'CI-S',
         }

# Define color palette for visualization (six hex colors, the same count as entries in "labels")
color_palette = [
                 '#F16745', '#FFC65D', '#7BC8A4', 
                 '#4CC3D9', '#93648D', '#404040'
                 ]

# Head & tail wavelength indices, used to drop wavelengths with inconsistent reflectances found during EDA
head_tail = (0, 224) # Full spectrum
test_size = 0.2 # Test size for splitting dataset

# Save option
save = False # Data saving option
# save_figure = True # Image saving option
number = 10 # Wavelength amount
full = True # Full spectrum option

# --------------------------------------------------------------------------------------------------------
# Preload variables
# np.fromfile with sep=',' parses the file as comma-separated text and returns a 1-D array
wavelengths = np.fromfile(file=path_to_wavelengths, sep=',') # load list of actual wavelengths
# wavelengths_trimmed = np.fromfile(file=path_to_wavelengths_trimmed, sep=',') # load list of trimmed wavelengths
# set(labels) iterates the dict KEYS, so this counts dataset suffixes (equal to len(labels))
num_classes = len(set(labels)) # Classes number

# --------------------------------------------------------------------------------------------------------

### 1.3. Model configuration

In [None]:
# Network type indicator
is_network = False # Neural network model type option (selects the JSON + weights branch in save_model / load_model)
is_load_model = True # Loading neural network model option
is_exclude = False # Exclude some noisy head and tail wavelengths from wavelength selection
model_name = 'plsr' # Model name

# Image classification approach using CNN algorithm
verbose = 1 # Verbosity
is_slice = True # Slicing option forwarded to the dataset preparation step - presumably one wavelength per run; TODO confirm
epochs = 500 # Epoch of training network algorithm (early-stopping patience is derived from it as epochs // 10)

# Signal processing approach using PLSR
max_components = 13 # ~ min(sample_number, feature_number)

## 2. Define Functions

### 2.1. Data I/O Functions

In [None]:
# Define function to load one processed dataset file
def load_dataset(path_to_files: str,
                 dataname: str):

    '''
    Load a single processed dataset from disk.

    path_to_files: directory that contains the file
    dataname: file name inside that directory (joined with os.path.join)

    Returns whatever np.load yields for that file.
    Progress messages are printed before and after loading.
    '''

    # Import libraries
    import os
    import numpy as np

    print('begin to loading ', dataname)

    # Build the full location first, then read it
    full_location = os.path.join(path_to_files, dataname)
    loaded = np.load(file=full_location)

    print('loading ', dataname , 'done')

    return loaded

In [None]:
# Define function to load datasets from given path and filenames then merge them together
def merge_datasets(path_to_files: str,
                   filenames: list,
                   is_y: bool):

    '''
    Load every file in "filenames" from "path_to_files" and merge them into one dataset.

    path_to_files: directory that contains the files
    filenames: iterable of file names; each one is loaded with load_dataset
    is_y: True for label files, False for data files

    steps:
    1. Loop through filenames and load each file with load_dataset.
    2. Extract the data name (everything after the first underscore, word characters only) for the summary.
    3. If "is_y" is true, convert the loaded array to a list and extend the collector with its items.
       Otherwise append the loaded array to the collector as-is.
    4. Record the name and the first-axis length of each loaded dataset.
    5. If "is_y" is false, stack the collected arrays row-wise with np.vstack.
    6. Print a per-file summary.

    Returns a flat list when "is_y" is true, otherwise the array produced by np.vstack.
    '''

    # Import libraries
    import re
    import numpy as np

    # Raw string: '\w' in a normal string literal is an invalid escape sequence
    # (warns today, scheduled to become an error); compiled once outside the loop
    name_pattern = re.compile(r'(?<=_)\w+')

    # Assign global collectors
    data = []
    names = []
    data_amount = []

    print('begin to merging dataset')

    # Loop through filename
    for filename_i in filenames:

        # Execute function to loading processed data
        data_temp = load_dataset(path_to_files=path_to_files,
                                 dataname=filename_i)

        name_temp = name_pattern.search(filename_i)[0] # Extract data name using Regular Expression

        # Inside loop; y datasets checker
        if is_y:

            data_temp = data_temp.tolist() # Reformat data: array => list

            data.extend(data_temp) # Collect to global collector

        else:

            data.append(data_temp) # Collect to global collector

        # Collect to global collectors
        names.append(name_temp)
        data_amount.append(np.shape(data_temp)[0])

    # Outside loop; y datasets negative checker
    if not is_y:

        data = np.vstack(data) # Reformat data, global collector: list => array

    print('merging datasets from done')
    print('summary: ')

    # Loop through collected names and amounts for generating summary
    for name, amount in zip(names, data_amount):

        print('data:', name, ', amount:', amount)

    print('\n')

    return data

In [None]:
# Define function to save a prepared dataset to a directory
def save_dataset(data,
                 dataname: str,
                 path_to_save: str):

    '''
    Save "data" with np.save under a date-stamped name.

    The target is the plain concatenation
        path_to_save + <today's date, ISO format> + '_' + dataname
    so "path_to_save" must already end with a path separator.
    Progress messages are printed before and after saving.
    '''

    # Import libraries
    import numpy as np
    from datetime import date

    print('begin to saving data to path: ', path_to_save)
    print('begin to saving ', dataname)

    # Compose the date-stamped target, then write the array
    stamp = date.today().isoformat()
    target = path_to_save + stamp + '_' + dataname
    np.save(file=target, arr=data)

    print('saving ', dataname , 'done')

    return None

In [None]:
# Define function to write a list to a csv file through a DataFrame
def save_list_to_csv(data,
             dataname: str,
             path_to_save: str):

    """
    Wrap "data" in a pandas DataFrame and write it to
    path_to_save + dataname + '.csv' (plain string concatenation),
    using the default DataFrame.to_csv settings.
    A confirmation message is printed afterwards.
    """

    # Import library
    import pandas as pd

    # Destination is built by concatenation, so "path_to_save" must end with a separator
    destination = path_to_save + dataname + '.csv'

    # Convert and write in one go
    pd.DataFrame(data).to_csv(destination)

    print(dataname + ' is saved to directory')

    return None


In [None]:
# Define function to persist a model to disk
def save_model(model,
               is_network: bool,
               path_to_save: str,
               filename: str):

    """
    Persist "model" under the prefix path_to_save + filename.

    - "is_network" == True: the architecture from model.to_json() is written to
      '<prefix>.json' and the weights are written to '<prefix>_weight.h5'
    - otherwise: the whole object is dumped with joblib to '<prefix>.joblib'
    """

    # Import libraries
    import json
    import joblib # I/O library for scikit-learn

    print('begin to save model:', filename)

    # Network-type model checker
    if is_network == True:

        # Architecture first: serialize to JSON text and write it out
        architecture = model.to_json()

        with open("{}.json".format(path_to_save + filename), "w") as handle:

            handle.write(architecture)

        # Then the weights, in HDF5 format
        model.save_weights("{}_weight.h5".format(path_to_save + filename))

    else:

        # Non-network model: one joblib file holds everything
        joblib.dump(model, path_to_save + filename + '.joblib')

    print('save model:', filename, 'done')

    return None

In [None]:
# Define function to load model from file
def load_model(is_network: bool,
               path_to_file: str, 
               filename: str):

    """
    Function to load model from file.

    Files are located with the prefix path_to_file + filename (plain concatenation).

    - if "is_network" is true: read the architecture from '<prefix>.json', rebuild the
      model with model_from_json, then load the weights from '<prefix>_weight.h5'
    - otherwise: load the model with joblib. '<prefix>_model.joblib' is tried first
      (the name this function has always used); if it does not exist, '<prefix>.joblib'
      is used, which is the name save_model writes. If neither exists, joblib raises
      for the '<prefix>_model.joblib' name.

    Returns the loaded model.
    """

    print('begin to load model:', filename)

    # Network-type model checker
    if is_network:

        # Imported here so that loading a joblib model does not require TensorFlow
        from tensorflow.keras.models import model_from_json # Loading model in JSON format

        # Load json and create model; the context manager closes the file even on error
        with open('{}.json'.format(path_to_file + filename), 'r') as json_file:

            loaded_model_json = json_file.read() # Read file

        loaded_model = model_from_json(loaded_model_json) # Assign the loaded model

        # Load weights into new model
        loaded_model.load_weights("{}_weight.h5".format(path_to_file + filename))

    else:

        # Import libraries
        import os
        import joblib # I/O library for scikit-learn

        # Candidate file names, in priority order
        candidates = (
                      path_to_file + filename + '_model.joblib', # Historical name read by this function
                      path_to_file + filename + '.joblib' # Name written by save_model
                     )

        # Pick the first existing candidate; fall back to the historical name so the
        # error raised for a missing file is unchanged
        model_path = next((path for path in candidates if os.path.exists(path)),
                          candidates[0])

        # Load model
        loaded_model = joblib.load(model_path)

    print('load model:', filename, 'done')

    return loaded_model

### 2.2. Utility functions

In [None]:
# Define function to find the positions of all samples carrying a given label
def extract_indices(y: list,
                    label: str):

    """
    Return the positions in "y" whose entry equals "label", in ascending order.
    An empty list is returned when nothing matches.
    """

    matched_positions = []

    # Walk through the labels and remember every matching position
    for position, y_label in enumerate(y):

        if y_label == label:

            matched_positions.append(position)

    return matched_positions

In [None]:
# Define function to average data over the requested axis/axes
def calculate_average(data,
                      axis):

    '''
    Return the arithmetic mean of "data" taken over "axis" (forwarded unchanged to np.mean).
    Callers in this notebook pass the two spatial axes (width & length) to obtain averaged reflectance.
    '''

    # Import libraries
    import numpy as np

    # Delegate directly to numpy
    return np.mean(data, axis=axis)

In [None]:
# Define utility function to compute per-label averaged data and stack the results
def calculate_average_all_data(data,
                               y: list,
                               labels: list):

    '''
    For every label in "labels", select the samples of "data" carrying that label and
    average them over axes (1, 2), then stack all per-label results row-wise.

    steps:
    1. For each label, find the sample indices with extract_indices.
    2. Average the selected samples over axes (1, 2) with calculate_average
       (so "data" needs at least three dimensions; axes 1 and 2 are presumably width & length).
    3. Stack the per-label results with np.vstack, in the order of "labels".
    4. Return the stacked array.
    '''

    print('begin to calculate averaged data')

    # One averaged array per label, built in label order
    per_label_averages = [
                          calculate_average(data=data[extract_indices(y, label=label),...],
                                            axis=(1,2))
                          for label in labels
                         ]

    # Reformat; list => array
    stacked = np.vstack(per_label_averages)

    print('calculate averaged data done')

    return stacked

In [None]:
# Define function to print iterations progress (imported from open-source)
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):

    r"""
    Draw a one-line text progress bar; meant to be called once per loop iteration.
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : text printed before the bar (Str)
        suffix      - Optional  : text printed after the percentage (Str)
        decimals    - Optional  : number of decimals shown in the percentage (Int)
        length      - Optional  : bar width in characters (Int)
        fill        - Optional  : character used for the completed part (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    """

    # Percentage text with the requested number of decimals
    percent = format(100 * (iteration / float(total)), '.' + str(decimals) + 'f')

    # Completed part followed by dashes for the remainder
    done_cells = int(length * iteration // total)
    remaining_cells = length - done_cells
    bar = fill * done_cells + '-' * remaining_cells

    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)

    # Move to a fresh line once the last iteration is reached
    if iteration == total:
        print()

    return None

### 2.3. Analysis Functions

#### 2.3.1. CNN-related Analysis Functions

In [None]:
# Define function to build the early-stopping callback used for model training
def construct_callback(epochs,
                       metric,
                       verbose):

    """
    Build an EarlyStopping callback that monitors "metric" in 'min' mode.
    Patience is one tenth of "epochs" (integer division); best weights are not restored.
    """

    # Import library
    from tensorflow.keras.callbacks import EarlyStopping

    # All settings are passed inline; only patience is derived
    return EarlyStopping(
                         monitor=metric,
                         min_delta=0,
                         patience=epochs // 10,
                         verbose=verbose,
                         mode='min',
                         baseline=None,
                         restore_best_weights=False
                         )

In [None]:
# Define function to build and compile the CNN model, together with its training callback
def construct_cnn_model(X_train,
                        num_classes: int,
                        model_name: str,
                        epochs: int,
                        verbose: int):

    """
    Build the CNN architecture, compile it and create the early-stopping callback.

    steps:
    1. Stack the layers block by block (input shape is taken from X_train.shape[1:])
    2. Wrap the layers into a Model named "model_name"
    3. Compile with categorical cross-entropy loss, the adam optimizer and the accuracy metric
    4. Build the callback with construct_callback, monitoring 'val_loss'
    5. Return (model, callback)
    """

    # Import libraries
    from tensorflow.keras import layers # Layer API for neural network architechture
    from tensorflow.keras import Model # Group layers into a model object

    # Shared hyperparameters
    hidden_activation = 'relu' # Convolution and Dense activation function type
    conv_padding = 'Same' # Convolution padding kernel
    pooling_window = (2, 2) # Max pooling and Average Pooling size
    dropout_rate = 0.5 # Dropout rate
    dense_units = 200 # Dense units (Fully connected layer)

    # --------------------------------------------------------------------------------------------------------
    # Local builders: each applies one layer type with the shared hyperparameters
    def conv(tensor, name, filters, kernel_size, strides):
        return layers.Conv2D(
                             filters=filters,
                             kernel_size=kernel_size,
                             strides=strides,
                             padding=conv_padding,
                             activation=hidden_activation,
                             name=name
                             )(tensor)

    def max_pool(tensor, name):
        return layers.MaxPool2D(pool_size=pooling_window, name=name)(tensor)

    def dropout(tensor, name):
        return layers.Dropout(rate=dropout_rate, name=name)(tensor)

    def dense(tensor, name):
        return layers.Dense(units=dense_units, activation=hidden_activation, name=name)(tensor)

    # --------------------------------------------------------------------------------------------------------
    # Block 0: input layer
    inputs = layers.Input(X_train.shape[1:],
                          name='Input')

    # Block 1: convolution only
    net = conv(inputs, 'Convolution2D_1', 5, (5, 5), (1, 1))

    # Block 2: strided convolution, max pooling, dropout
    net = conv(net, 'Convolution2D_2', 96, (7, 7), (2, 2))
    net = max_pool(net, 'Max_Pooling_2')
    net = dropout(net, 'Drop_out_2')

    # Block 3: convolution, max pooling, dropout
    net = conv(net, 'Convolution2D_3', 64, (5, 5), (1, 1))
    net = max_pool(net, 'Max_Pooling_3')
    net = dropout(net, 'Drop_out_3')

    # Block 4: convolution, max pooling, dropout
    net = conv(net, 'Convolution2D_4', 64, (5, 5), (1, 1))
    net = max_pool(net, 'Max_Pooling_4')
    net = dropout(net, 'Drop_out_4')

    # Block 5: convolution, average pooling, dropout
    net = conv(net, 'Convolution2D_5', 128, (3, 3), (1, 1))
    net = layers.AveragePooling2D(pool_size=pooling_window, name='Average_Pooling')(net)
    net = dropout(net, 'Drop_out_5')

    # Block 6: flatten, dense
    net = layers.Flatten(name='Flatten')(net)
    net = dense(net, 'Dense_6')

    # Block 7: dense, dropout
    net = dense(net, 'Dense_7')
    net = dropout(net, 'Drop_out_7')

    # Block 8: output layer, one unit per class
    outputs = layers.Dense(
                           units=num_classes,
                           activation="softmax",
                           name='Output'
                           )(net)

    # --------------------------------------------------------------------------------------------------------
    # Construct and compile the model
    model = Model(inputs=inputs, outputs=outputs, name=model_name)

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    # --------------------------------------------------------------------------------------------------------
    # Execute function to define callback for model training
    callback = construct_callback(epochs=epochs,
                                  metric='val_loss',
                                  verbose=verbose)

    return model, callback

In [None]:
# Define function perform feature extraction as wavelength selection by image classification using CNN model
def extract_feature_image(X,
                          y_categorized: list,
                          num_classes: int,
                          test_size: float,
                          is_slice: bool,
                          model_name: str,
                          epochs: int,
                          verbose: int,
                          wavelengths: list):

    """
    Function perform feature extraction as wavelength selection by image classification using CNN model.

    One model is trained and evaluated per index of the last axis of "X"; the evaluation
    results are collected so the wavelengths can be compared afterwards.

    steps:
    1. Loop through wavelength index (last axis of X)
    2. Slice then split dataset with prepare_dataset_image
    3. Construct a fresh CNN model and callback with construct_cnn_model
    4. Train model, validating on the test split
    5. Evaluate the trained model on the test split
    6. Collect the evaluation (loss, accuracy) to collector
    7. Return output

    Returns (model, evaluation_score):
    - model: the model trained for the last wavelength index (None if X has no wavelengths)
    - evaluation_score: array of shape (number of wavelengths, 2) holding [loss, accuracy] per index
    """

    # Import libraries
    import time
    import numpy as np

    # Define parameters
    batch_size = max(1, X.shape[0] // 10) # Sample amount to train per step; never 0 for small datasets
    length_wavelengths = X.shape[-1] # Define spectral-axis length
    evaluation_score = np.zeros((length_wavelengths, 2)) # Define collector
    model = None # Stays None only when there is nothing to loop over

    start_time = time.time() # Memory starting time

    # Loop through wavelength index
    for index in range(length_wavelengths):

        print('index : wavelength =', index, ':', wavelengths[index], ' nm')

        # --------------------------------------------------------------------------------------------------------
        # Slice then split dataset
        # Execute function to perform data preparation by encoded y into one-hot format and split X and y follow "test_size"
        X_train, X_test, y_train, y_test = prepare_dataset_image(X=X,
                                                                 y=y_categorized,
                                                                 num_classes=num_classes,
                                                                 test_size=test_size,
                                                                 is_slice=is_slice,
                                                                 index_wavelength=index)

        # --------------------------------------------------------------------------------------------------------
        # Construct model
        # A new model per wavelength keeps each evaluation independent; the previous code
        # referenced "model" and "callback" without ever defining them in this function
        model, callback = construct_cnn_model(X_train=X_train,
                                              num_classes=num_classes,
                                              model_name=model_name,
                                              epochs=epochs,
                                              verbose=verbose)

        # --------------------------------------------------------------------------------------------------------
        # Train model
        model.fit(
                  x=X_train, 
                  y=y_train, 
                  validation_data=(X_test, y_test),
                  batch_size=batch_size, 
                  epochs=epochs,
                  shuffle=True,
                  callbacks=[callback],
                  verbose=verbose
                  )

        # Evaluate the trained model
        evaluation = model.evaluate(x=X_test,
                                    y=y_test,
                                    verbose=verbose)

        evaluation_score[index] = evaluation # Collect to collector

        print('Model Evaluation:')
        print('      Loss                    =', evaluation[0])
        print('      Classification Accuracy =', evaluation[1], '\n')

    end_time = time.time() # Memory ending time

    print('\n')
    print('total_execution time:', end_time - start_time, 'seconds \n') # Calculate time usage    

    return model, evaluation_score

#### 2.3.2. PLSR-related Analysis Functions

In [None]:
# Fefine function to perform PLS regression with specific component number
def plsr_fit_transform(X,
                       y: list,
                       n_components: int):

    """
    Function to fit a PLS regressor with a fixed component number and project X with it
    steps:
    1. Build a PLS regressor with "n_components" components
    2. Fit the regressor with X and y
    3. Project X with the fitted regressor
    4. Return the fitted regressor and the projected X

    Parameters:
    - X: predictor matrix handed to PLSRegression.fit / transform
    - y: target values handed to PLSRegression.fit
    - n_components: number of PLS components to keep

    Returns:
    - tuple (fitted PLSRegression, transformed X)
    """

    # Import library
    from sklearn.cross_decomposition import PLSRegression

    # Build and fit the regressor
    regressor = PLSRegression(n_components=n_components)
    regressor.fit(X, y)

    # Project X with the fitted regressor and hand both back
    return regressor, regressor.transform(X)

In [None]:
# Define function to perform feature extraction by component number optimization of PLS regression using MSE and cross-validation
def extract_feature_signal(X, 
                           y: list,
                           max_components: int):
    
    """
    Function to perform feature extraction by component number optimization of PLS regression using MSE and cross-validation
    steps:
    1. Define MSE array to be populated (rows: component number - 1, columns: number of discarded wavelengths)
    2. Loop over the number of PLS components, then regress with that number of components using the full spectrum
    3. Get the list of indices that sorts the PLS coefficients in ascending order of their absolute value
    4. Sort the spectra according to that order
    5. Loop through the spectral axis, discarding one more wavelength of the SORTED spectra at each step
    6. Fit a PLS regression with the specified number of components on the remaining wavelengths
    7. Calculate the cross-validated prediction (cv=10) and its MSE
    8. Calculate and print the position of the minimum (non-zero) MSE
    9. Refit with the optimal component number on the full spectrum and sort the spectra by its coefficients
    10. Select optimal parameters
    11. Return selected parameters

    Parameters:
    - X: 2-D array indexed as X[:, wavelength]
    - y: target values handed to the PLS regressor
    - max_components: largest number of PLS components to try

    Returns:
    - indices_sorted: wavelength indices sorted by ascending |PLS coefficient| of the final fit
    - X_calibrated_optimized: sorted X with the first "wavelengths_discarded" columns removed
    - n_components_optimized: component number at the MSE minimum
    - wavelengths_discarded: number of discarded wavelengths at the MSE minimum
    """
    
    # Import libraries
    import time
    from sklearn.model_selection import cross_val_predict
    from sklearn.metrics import mean_squared_error
    
    start_time = time.time() # memory starting time    
    
    # Define MSE array to be populated
    # Cells that are never evaluated stay at 0 and are ignored by the minimum search below
    mse = np.zeros((max_components, X.shape[-1]))
 
    # Loop over the number of PLS components
    for n in range(max_components):
        
        print('processing PLSR with component amount = {}/{}'.format(n + 1, max_components))
        
        # Regression with specified number of components, using full spectrum
        # Execute function to perform PLS regression with specific component number
        plsr1, _ = plsr_fit_transform(X=X,
                                      y=y,
                                      n_components=n + 1)
        
        # Get the list of indices that sorts the PLS coefficients in ascending order of the absolute value
        # NOTE(review): "coef_[:, 0]" assumes coefficients laid out as (n_features, n_targets);
        # newer scikit-learn releases appear to transpose coef_ - confirm against the installed version
        indices_sorted = np.argsort(np.abs(plsr1.coef_[:, 0])) # Ascending sorting
 
        # Sort spectra according to ascending absolute value of PLS coefficients 
        X_calibrated = X[:, indices_sorted]

        # Number of elimination steps for this component number
        n_steps = X_calibrated.shape[-1] - (n + 1)
 
        # Loop through spectral-axis
        # Discard one wavelength at a time of the sorted spectra, regress, and calculate the MSE cross-validation
        for m_wavelength in range(n_steps):

            # Drop the "m_wavelength" least important wavelengths
            # Fix: slice the SORTED spectra; slicing the unsorted X discarded wavelengths
            # in acquisition order and made the MSE grid inconsistent with the final selection
            X_reduced = X_calibrated[:, m_wavelength:]
            
            # PLS regression with specified number of components with sliced wavelengths
            # Execute function to perform PLS regression with specific component number
            plsr2, _ = plsr_fit_transform(X=X_reduced,
                                          y=y,
                                          n_components=n + 1)
            
            # Calculate cross validated label
            y_cross_validated = cross_val_predict(estimator=plsr2, 
                                                  X=X_reduced, 
                                                  y=y, 
                                                  cv=10)
 
            mse[n, m_wavelength] = mean_squared_error(y, y_cross_validated) # Calculate MSE

            # Show progress bar
            printProgressBar(m_wavelength + 1, 
                             n_steps, 
                             prefix = 'progress:', 
                             suffix = 'complete', 
                             length = 50)    
        
    # Calculate and print the position of minimum in MSE
    mse_min_x, mse_min_y = np.where(mse == np.min(mse[np.nonzero(mse)]))
 
    print('\n')
    print("Optimized number of PLS components: ", mse_min_x[0] + 1)
    print("Wavelengths to be discarded: ", mse_min_y[0])
    print('Optimized MSEP: ', mse[mse_min_x, mse_min_y][0])
    
    # Finalize
    # Calculate PLS with optimal components and export values
    # Execute function to perform PLS regression with specific component number
    plsr, _ = plsr_fit_transform(X=X,
                                 y=y,
                                 n_components=mse_min_x[0] + 1)
        
    # Get the list of indices that sorts the PLS coefficients in ascending order of the absolute value
    indices_sorted = np.argsort(np.abs(plsr.coef_[:,0]))
 
    # Sort spectra according to ascending absolute value of PLS coefficients
    X_calibrated = X[..., indices_sorted]
    
    # Select optimal parameters
    X_calibrated_optimized = X_calibrated[:,mse_min_y[0]:]
    n_components_optimized = mse_min_x[0] + 1
    wavelengths_discarded = mse_min_y[0]
    
    end_time = time.time() # Memory ending time

    print('execution time:', end_time - start_time, 'seconds \n') # Calculate time usage
 
    return indices_sorted, X_calibrated_optimized, n_components_optimized, wavelengths_discarded

### 2.4. Preparation Functions

#### 2.4.1. General Preparation Functions

In [None]:
# Define function to encode labels from names (type: string) => label integer (type: int) and from label integers to binary class matrices
def encode_label(y: list):
    
    """
    Function to turn label names into label integers and then into binary class matrices
    steps:
    1. Map every label name to an integer with a LabelEncoder
    2. Expand the label integers into a binary class matrix with to_categorical
    3. Return both encodings

    Parameters:
    - y: labels to encode

    Returns:
    - tuple (y_encoded, y_categorized): integer labels and their binary class matrix
    """
    
    # Import libraries
    from sklearn.preprocessing import LabelEncoder
    from tensorflow.keras.utils import to_categorical

    print('begin to label encoding')
    
    # Label names => label integers
    y_encoded = LabelEncoder().fit_transform(y)

    # Label integers => binary class matrix
    y_categorized = to_categorical(y_encoded)
    
    print('label encoding done \n')
    
    return y_encoded, y_categorized

#### 2.4.2. CNN-related Preparation Functions

In [None]:
# Define function to perform data preparation: optionally slice X at a single wavelength, then split X and the one-hot encoded y into train & test datasets following "test_size"
def prepare_dataset_image(X,
                          y: list,
                          num_classes: int,
                          test_size: float,
                          is_slice: bool,
                          index_wavelength: int):

    """
    Function to perform data preparation by optionally slicing X at one wavelength and splitting X and y following "test_size"
    steps:
    1. Consider slice dataset for a single wavelength option checker, if "is_slice" is true: 
        - Slice dataset for spatial-axes in specific spectral-axis index
        - Expand dimension for training Network model
        - Set sliced dataset to X
    2. Derive the class labels used for stratification from y
    3. Split the prepared X and y into train & test datasets (stratified)
    4. Return outputs

    Parameters:
    - X: array whose last axis is indexed by "index_wavelength" when slicing
    - y: labels; either a 1-D label vector or a 2-D one-hot matrix
    - num_classes: not used by this function (kept for interface compatibility)
    - test_size: forwarded to train_test_split
    - is_slice: whether to keep only the wavelength at "index_wavelength"
    - index_wavelength: index on the last axis of X, used only when slicing

    Returns:
    - X_train, X_test, y_train, y_test
    """
    
    # Import library
    from sklearn.model_selection import train_test_split
    
    print('begin to prepare dataset')
    
    # Slice dataset for a single wavelength option checker
    if is_slice:
        
        # Slice dataset for spatial-axes in specific spectral-axis index,
        # then restore a trailing axis of length 1 for training Network model
        X = np.expand_dims(X[..., index_wavelength], axis=-1)

    # Derive stratification labels from y itself
    # Fix: the split used to read a notebook-level "y_encoded" that is not a parameter,
    # which raised NameError or silently stratified on stale labels
    y_array = np.asarray(y)
    stratify_labels = y_array.argmax(axis=-1) if y_array.ndim > 1 else y_array
        
    # Split the prepared X and y into train & test datasets
    X_train, X_test, y_train, y_test = train_test_split(X, 
                                                        y,
                                                        test_size=test_size,
                                                        stratify=stratify_labels)
    
    print('prepare dataset done')
    
    return X_train, X_test, y_train, y_test

In [None]:
# Define function to sort index evaluation score and filter dataset
def filter_dataset(X,
                   evaluation_score,
                   metric: str,
                   number: int,
                   is_exclude: bool):
    
    """
    Function to sort index evaluation score and filter dataset
    steps:
    1. Validate "metric"
    2. Consider exclude head and tail wavelengths checker, if "is_exclude" is true:
       Work on a copy of the scores in which the first 5 and last 5 rows get
       loss = 99 and accuracy = 0, so they are ranked worst
    3. Consider filter metric option checker for loss or accuracy or mse,
       If 'metric' == 'loss': 
           - Select the 'number' lowest loss scores
           - Order them from the highest to the lowest loss (best entry last)
           - Build the sorted list of all indices except the single highest loss
       If 'metric' == 'accuracy':
           - Select the 'number' highest accuracy scores
           - Order them from the highest to the lowest accuracy (best entry first)
           - Build the sorted list of all indices except the single lowest accuracy
       If 'metric' == 'mse':
           - "evaluation_score" is already a ranked sequence of indices; take its first 'number' entries
       Selected indices are printed in every case
    4. Filter dataset using candidate indices on the last axis of X

    Parameters:
    - X: array filtered on its last axis
    - evaluation_score: for 'loss' / 'accuracy' a 2-D array with loss in column 0 and
      accuracy in column 1; for 'mse' a sequence of indices
    - metric: 'loss', 'accuracy' or 'mse'
    - number: how many indices to select
    - is_exclude: whether to exclude the 5 head and 5 tail rows from selection

    Returns:
    - X_filtered: X restricted to the selected indices
    - indices_list_sorted: sorted index list described above ('mse': "evaluation_score" itself)

    Raises:
    - ValueError: unknown "metric", or negative "number" for 'loss' / 'accuracy'

    NOTE(review): 'loss' results are ordered worst-to-best while 'accuracy' results are
    ordered best-to-worst. This ordering is kept as it was; confirm it is intended.
    """
    
    # Import library
    import numpy as np

    # Column of "evaluation_score" holding each score-based metric
    score_columns = {'loss': 0, 'accuracy': 1}

    # Fail early with a clear message instead of an unbound-variable error further down
    if metric != 'mse' and metric not in score_columns:
        raise ValueError("metric must be 'loss', 'accuracy' or 'mse', got {!r}".format(metric))
    
    # Exclude head and tail wavelengths checker
    if is_exclude:

        # Work on a copy so the caller's scores are not overwritten
        evaluation_score = np.array(evaluation_score)
        
        # Exclude head and tail wavelengths from wavelength selection
        evaluation_score[:5, 0] = 99 # Maximize loss
        evaluation_score[:5, 1] = 0 # Minimize accuracy
        evaluation_score[-5:, 0] = 99 # Maximize loss
        evaluation_score[-5:, 1] = 0 # Minimize accuracy

    # Filter option checker of PLS regression
    if metric == 'mse':
        
        # Slice top index corresponding to MSE
        indices = evaluation_score[:number]
        
        indices_list_sorted = evaluation_score # Assign for completion

        # Loop through selected indices for print output
        for index in indices:
    
            print('index:', index)

    # Filter metric option checker for loss / accuracy
    else:

        if number < 0:
            raise ValueError('number must be non-negative, got {}'.format(number))

        scores = np.asarray(evaluation_score)[:, score_columns[metric]]

        # Every index, ordered from the highest to the lowest score
        # A full sort also handles number == 0 and number >= number of rows
        descending = np.argsort(-scores, kind='stable')
        total = descending.shape[0]

        if metric == 'loss':

            # The 'number' lowest losses sit at the end of the descending order
            indices = descending[max(total - number, 0):]

            # All indices except the single highest loss (length kept at total - 1)
            indices_list_sorted = descending[1:]

        else:

            # The 'number' highest accuracies sit at the start of the descending order
            indices = descending[:number]

            # All indices except the single lowest accuracy (length kept at total - 1)
            indices_list_sorted = descending[:max(total - 1, 0)]

        # Loop through selected indices for print output
        for index in indices:
    
            print('index:', index, 'score:', scores[index])

    # Filter dataset using candidate indices
    X_filtered = X[..., indices]
    
    return X_filtered, indices_list_sorted

#### 2.4.3. PLSR-related Preparation Functions

In [None]:
# Define function to perform data preparation with optimized parameters for visualizing plots among spectra
def prepare_viz_spectral(X,
                         y_encoded: list,
                         n_components: int,
                         head_tail: tuple):

    """
    Function to trim X on its last axis, flatten it per sample and fit a PLS regressor on the result
    steps:
    1. Keep only the head_tail[0]:head_tail[1] window of the last axis of X
    2. Flatten every sample of the trimmed X into one row
    3. Execute function to perform PLS regression with specific component number
    4. Return the trimmed X, its flattened form and the fitted regressor

    Parameters:
    - X: array trimmed on its last axis; the first axis is kept as the sample axis
    - y_encoded: targets handed to the PLS regressor
    - n_components: number of PLS components
    - head_tail: (start, stop) bounds of the window kept on the last axis

    Returns:
    - tuple (X_trimmed, X_trimmed_reshaped, fitted PLS regressor)
    """

    # Window kept on the last axis
    first, last = head_tail[0], head_tail[1]
    X_trimmed = X[..., first:last]

    # One row per sample, all remaining axes flattened
    n_samples = X_trimmed.shape[0]
    X_trimmed_reshaped = X_trimmed.reshape(n_samples, -1)

    # Execute function to perform PLS regression with specific component number
    fitted = plsr_fit_transform(X=X_trimmed_reshaped,
                                y=y_encoded,
                                n_components=n_components)
    plsr_X = fitted[0]

    return X_trimmed, X_trimmed_reshaped, plsr_X

In [None]:
# Define function to perform data preparation by wavelength selection then split dataset into train & test datasets
def prepare_dataset_signal(X,
                           y: list,
                           indices_sorted: list,
                           wavelengths_discarded: int,
                           test_size: float):

    """
    Function to perform data preparation by wavelength selection then separate dataset into train & test datasets
    steps:
    1. Prepare indices; keep the first len(indices_sorted) - wavelengths_discarded entries of
       "indices_sorted" and put them back into ascending (original) order
    2. Prepare X by wavelength selection; keep those indices on the last axis of X and rebuild
       the result with its own 4-D shape
    3. Derive the class labels used for stratification from y
    4. Split the prepared X and y into train & test datasets (stratified)
    5. Return output

    Parameters:
    - X: 4-D array filtered on its last axis
    - y: labels; either a 1-D label vector or a 2-D one-hot matrix
    - indices_sorted: ranked wavelength indices
    - wavelengths_discarded: how many entries to drop from the END of "indices_sorted"
    - test_size: forwarded to train_test_split

    Returns:
    - X_train, X_test, y_train, y_test

    NOTE(review): entries are dropped from the end of "indices_sorted", so the list is presumably
    ranked most-important-first. extract_feature_signal returns its indices in the opposite
    (ascending) order - confirm which ranking callers pass in.
    """
    
    # Import library
    from sklearn.model_selection import train_test_split
    
    print('begin to prepare dataset')
    
    # Prepare indices
    indices_sorted_discarded = indices_sorted[:len(indices_sorted) - wavelengths_discarded] # Discards indices as follow optimization
    indices_discarded = sorted(indices_sorted_discarded) # Unsort the discarded indices to original order
    
    # Prepare X by wavelength selection
    X_prepared = X[..., indices_discarded] # Slice X with optimize wavelength selection
    X_prepared_reshaped = X_prepared.ravel().reshape(X_prepared.shape[0], X_prepared.shape[1], X_prepared.shape[2], X_prepared.shape[3]) # Rebuild selected X with its own 4-D shape

    # Derive stratification labels from y itself
    # Fix: the split used to read a notebook-level "y_encoded" that is not a parameter,
    # which raised NameError or silently stratified on stale labels
    y_array = np.asarray(y)
    stratify_labels = y_array.argmax(axis=-1) if y_array.ndim > 1 else y_array
    
    # Split the prepared X into train & test datasets
    X_train, X_test, y_train, y_test = train_test_split(X_prepared_reshaped, 
                                                        y,
                                                        test_size=test_size,
                                                        stratify=stratify_labels)
     
    print('prepare dataset done')
    
    return X_train, X_test, y_train, y_test

### 2.5. Visualization Functions

In [None]:
# Define function to visualize PSNR/reflectance plot among treatments from assigned processing method
def viz_averaged(X,
                 y: list,
                 labels: dict,
                 wavelengths: list,
                 head_tail: tuple,
                 color_palette: list,
                 is_plot_averaged: bool,
                 is_plot_filtered: bool,
                 alpha: float,
                 mask: list,
                 y_axis_name: str,
                 figurename: str,
                 save: bool,
                 path_to_save: str,
                 yticks: np.ndarray = None):
    
    '''
    Function to visualize PSNR/reflectance plot among treatments from assigned processing method
    steps:
    1. Define x-coordinate for plotting
    2. Loop through labels then execute function to extract sample indices from specified label name
    3. Loop through sample index then define y-coordinates for drawing considering sample figure
    4. Draw individual sample figure
    5. Consider plot averaged values option checker; if "is_plot_averaged" is true: draw the mean line of the label
    6. Consider plot filtered wavelengths option checker; if "is_plot_filtered" is true: add filter to figure
    7. Decorate axes
    8. Consider save figure option checker; if "save" is true: save figure

    Parameters:
    - X: array indexed as X[sample, :]
    - y: sample labels handed to extract_indices
    - labels: mapping of label name (looked up in y) to legend text
    - wavelengths: wavelength values; must expose ".shape" (e.g. a numpy array)
    - head_tail: (start, stop) bounds of the plotted window on the wavelength axis
    - color_palette: one color per label, in the iteration order of "labels"
    - is_plot_averaged: whether to draw the per-label mean line
    - is_plot_filtered: whether to highlight the positions where "mask" is False
    - alpha: opacity of the individual sample lines
    - mask: boolean mask, read only when "is_plot_filtered" is true
    - y_axis_name: y-axis label
    - figurename: file name stem used when saving
    - save: whether to save the figure as PNG
    - path_to_save: directory prefix used when saving
    - yticks: y tick positions; optional, matplotlib keeps its automatic ticks when omitted

    Returns:
    - None
    '''
    
    # Import libraries
    import numpy as np
    from datetime import date
    from matplotlib import pyplot as plt
    import matplotlib.collections as collections
    
    # Figure configuration
    fig, ax = plt.subplots(figsize=(10,8)) # Create empty figure for drawing figures of PSNR
    x_coordinates = np.arange(wavelengths.shape[0]) # Define x-coordinate for drawing
    fontsize = 18

    # Plotted window on the wavelength axis
    window = slice(head_tail[0], head_tail[1])
    
    # Loop through label
    for index, label in enumerate(labels):
        
        # Execute function to extract sample indices from specified label name
        samples = extract_indices(y,
                                  label=label)
        
        # Loop through sample index
        for sample in samples:
            
            # Define y-coordinates for drawing considering sample figure
            y_coordinates = X[sample,:]

            # Draw individual sample figure
            ax.plot(x_coordinates[window], 
                    y_coordinates[window], 
                    alpha=alpha, 
                    color=color_palette[index])
            
        # Plot averaged values option checker
        if is_plot_averaged:

            # Plot averaged line
            ax.plot(x_coordinates[window], 
                    np.mean(X[samples,:], axis=0)[window], 
                    label=labels[label], 
                    color=color_palette[index], 
                    linestyle='solid', marker='.')
                
        # Plot filtered wavelengths option checker
        # NOTE(review): this overlay is added once per label, so it is stacked len(labels) times - confirm intended
        if is_plot_filtered:

            # Define filter configuration
            # np.asarray keeps the comparison element-wise when "mask" is a plain list
            # NOTE(review): span_where is reported as deprecated in newer matplotlib releases
            # (fill_between(..., where=...) is the suggested replacement) - confirm installed version
            collection = collections.BrokenBarHCollection.span_where(
                            x_coordinates[window], 
                            ymin=0, 
                            ymax=0.2, 
                            where=np.asarray(mask) == False, 
                            facecolor='#FFFA4D', 
                            alpha=0.9)
            
            ax.add_collection(collection) # Add filter to figure
            
    # Decoration
    plt.xticks(ticks=x_coordinates[window][0:-1:20], 
               labels=wavelengths[window][0:-1:20],
               rotation=45,
               fontsize=12)
    
    # Fix: "yticks" is optional so callers that do not supply it (viz_averaged_iterated) no longer fail
    if yticks is None:

        plt.yticks(fontsize=12) # Keep automatic tick positions, only style the labels

    else:

        plt.yticks(ticks=yticks,
                   fontsize=12)
    
    plt.xlabel('Wavelength (nm)', fontsize=fontsize - 2)
    plt.ylabel(y_axis_name, fontsize=fontsize - 2)
    plt.legend(bbox_to_anchor=(1.00, 1.00), fontsize=fontsize - 4)
    
    # Save figure option checker
    if save:
    
        plt.savefig(path_to_save + figurename + '_' + str(date.today()) + '.png') # Save figure
        
        print('save figure:', figurename, 'done')
        
    plt.show()
    
    return None

In [None]:
# Define function to iteratively visualize the PSNR/reflectance plot one treatment at a time, by executing the function that visualizes the PSNR/reflectance plot among treatments from the assigned processing method
def viz_averaged_iterated(X,
                          y: list,
                          labels: dict,
                          wavelengths: list,
                          head_tail: tuple,
                          color_palette: list,
                          is_plot_averaged: bool,
                          is_plot_filtered: bool,
                          alpha: float,
                          mask: list,
                          y_axis_name: str,
                          figurename: str,
                          save: bool,
                          path_to_save: str,
                          yticks=None):

    """
    Function to iteratively visualize PSNR/reflectance plot one treatment at a time
    steps:
    1. Loop through label
    2. Execute viz_averaged with a single-entry label mapping and the matching color

    Parameters:
    - yticks: y tick positions forwarded to viz_averaged; optional, None keeps
      matplotlib's automatic tick positions
    - every other parameter is forwarded to viz_averaged unchanged, except that "labels" and
      "color_palette" are narrowed to the current label and "figurename" may get a label suffix

    Returns:
    - None
    """
    
    # Loop through label
    for index, label in enumerate(labels):

        # Fix: give every label its own file name; with several labels each figure used to be
        # saved under the same name, so only the last one survived
        # A single-label call keeps the plain "figurename"
        if len(labels) > 1:

            figurename_label = figurename + '_' + str(label)

        else:

            figurename_label = figurename

        # Execute function to visualize PSNR/reflectance plot for the current label only
        # Fix: "yticks" is now forwarded; it is a required parameter of viz_averaged and was missing
        viz_averaged(X=X,
                     y=y,
                     labels={label:labels[label]},
                     wavelengths=wavelengths,
                     head_tail=head_tail,
                     color_palette=[color_palette[index]],
                     is_plot_averaged=is_plot_averaged,
                     is_plot_filtered=is_plot_filtered,
                     alpha=alpha,
                     mask=mask,
                     y_axis_name=y_axis_name,
                     figurename=figurename_label,
                     save=save,
                     path_to_save=path_to_save,
                     yticks=yticks)
                          
    return None

In [None]:
# Define function to visualize evaluation metrics from image classification
def viz_evaluation(evaluation_score,
                   plot: str,
                   figurenames: list,
                   save: bool,
                   path_to_save: str):
    """
    Function to visualize evaluation metrics from image classification
    steps:
    1. Draw one figure for the loss score (column 0 of "evaluation_score", figurenames[0])
    2. Draw one figure for the accuracy score (column 1 of "evaluation_score", figurenames[1])
    Each figure is a scatter plot against wavelength when plot == 'scatter', a histogram with
    KDE when plot == 'histogram', and is saved as PNG when save == True

    Parameters:
    - evaluation_score: 2-D array, loss in column 0 and accuracy in column 1
    - plot: 'scatter' or 'histogram'
    - figurenames: titles / file name stems, loss first then accuracy
    - save: whether to save each figure
    - path_to_save: directory prefix used when saving

    NOTE(review): the scatter plot reads "wavelengths" from the enclosing (notebook) scope,
    it is not a parameter of this function.
    """
    
    # Import library
    import seaborn as sns
    from datetime import date
    from matplotlib import pyplot as plt
    
    fontsize = 16

    # (score column, axis name, color) of every figure, in drawing order
    panels = ((0, 'Loss', '#205375'),
              (1, 'Accuracy', '#F66B0E'))

    for column, metric_name, color in panels:

        ## Figure configuration
        fig, ax = plt.subplots(figsize=(10,8)) # Create empty figure for drawing figure

        if plot == 'scatter':

            ## Draw scatter plot of the score against wavelength
            plt.scatter(x=wavelengths, 
                        y=evaluation_score[:, column],
                        color=color)

            # Decoration
            plt.title(label=figurenames[column], fontsize=fontsize)
            plt.xlabel('Wavelength (nm)', fontsize=fontsize)
            plt.ylabel(metric_name, fontsize=fontsize)

        elif plot == 'histogram':

            ## Draw histogram of the score
            sns.histplot(data=evaluation_score, 
                         x=evaluation_score[:, column], 
                         kde=True,
                         color=color)

            # Decoration
            plt.title(label=figurenames[column], fontsize=fontsize)
            plt.xlabel(metric_name, fontsize=fontsize)
            plt.ylabel('Count', fontsize=fontsize)

        # Save figure option checker
        if save == True:

            plt.savefig(path_to_save + figurenames[column] + '_' + str(date.today()) + '.png') # Save figure

            print('save figure:', figurenames[column], 'done')

        plt.show()
    
    return None

In [None]:
# Define function to visualize PLS regression performance
def viz_pls_cv(plsr,
               X, 
               y: list, 
               figurename: str,
               save: bool,
               path_to_save: str):
    
    """
    Function to visualize PLS regression performance
    steps:
    1. Transpose the x-loadings of the regressor to facilitate further plotting
    2. Calculate the calibration prediction and the cross-validated prediction (cv=10)
    3. Calculate explained variance for calibration and cross-validation
    4. Calculate R2 scores for calibration and cross-validation
    5. Calculate mean squared error for calibration and cross-validation
    6. Print all six metrics
    7. Calculate least square polynomial fit (degree 1) between labels and cross-validated labels
    8. Draw scatter plot of data points
    9. Draw line plot of the fitted line
    10. Draw line plot of true label
    11. Consider save figure option checker; if save == True: save figure
    12. Return outputs

    Parameters:
    - plsr: fitted PLS regressor exposing x_loadings_ and predict
    - X: predictors handed to predict / cross_val_predict
    - y: true labels
    - figurename: file name stem used when saving
    - save: whether to save the figure as PNG
    - path_to_save: directory prefix used when saving

    Returns:
    - tuple (transposed x-loadings, cross-validated labels, cross-validated MSE)
    """
 
    # Import libraries
    from datetime import date
    from matplotlib import pyplot as plt
    from sklearn.model_selection import cross_val_predict
    from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
    
    # Transposed x-loadings, returned for later plotting
    coeff = plsr.x_loadings_.T
    
    # Predictions: calibration (fit data) and 10-fold cross-validation
    y_calibrated = plsr.predict(X)
    y_cross_validated = cross_val_predict(estimator=plsr, X=X, y=y, cv=10)
    
    # Metrics, calibration first then cross-validation
    explained_variance_calibrated = explained_variance_score(y_true=y, y_pred=y_calibrated)
    explained_variance_cross_validated = explained_variance_score(y_true=y, y_pred=y_cross_validated)
    score_calibrated = r2_score(y_true=y, y_pred=y_calibrated)
    score_cross_validated = r2_score(y_true=y, y_pred=y_cross_validated)
    mse_calibrated = mean_squared_error(y_true=y, y_pred=y_calibrated)
    mse_cross_validated = mean_squared_error(y_true=y, y_pred=y_cross_validated)

    # Report every metric with three decimals
    report = (('Explained Variance calibrated: %5.3f', explained_variance_calibrated),
              ('Explained Variance cross-validated: %5.3f', explained_variance_cross_validated),
              ('R2 calibrated: %5.3f', score_calibrated),
              ('R2 cross-validated: %5.3f', score_cross_validated),
              ('MSE calibrated: %5.3f', mse_calibrated),
              ('MSE cross-validated: %5.3f', mse_cross_validated))

    for template, value in report:

        print(template % value)
    
    # Least square line: cross-validated label as a function of the true label
    coef_fitted = np.polyfit(x=y, y=y_cross_validated, deg=1)
    slope = coef_fitted[0]
    intercept = coef_fitted[1]
    
    # Figure configuration
    fig, ax = plt.subplots(figsize=(10,8))
    fontsize = '14'
    
    # Data points: cross-validated label on x, true label on y
    ax.scatter(x=y_cross_validated, y=y, color='#2d6a4f', edgecolors='#14213d')

    # Fitted line
    ax.plot(slope * y + intercept, y, color='#3a86ff', label='Fit label')
    
    # Identity line of the true label
    ax.plot(y, y, color='#fb5607', label='True label')
    
    # Decoration
    plt.xlabel('Predicted label', fontsize=fontsize)
    plt.ylabel('True label', fontsize=fontsize)
    ax.legend(fontsize=fontsize)    
        
    # Save figure option checker
    if save == True:
    
        plt.savefig(path_to_save + figurename + '_' + str(date.today()) + '.png') # save figure
        
        print('save figure:', figurename, 'done')
        
    plt.show()

    return coeff, y_cross_validated, mse_cross_validated

In [None]:
# Define function to visualize confusion matrix from feature extractor
def viz_confusion_matrix(y: list,
                         y_cross_validated: list,
                         figurename: str,
                         save: bool,
                         path_to_save: str):
    
    """
    Function to visualize the normalized confusion matrix between true labels and rounded cross-validated labels
    steps:
    1. Plot the normalized confusion matrix of y against the rounded "y_cross_validated"
    2. Fit the axes to the number of distinct labels in y and label them
    3. Consider save figure option checker; if save == True: save figure

    Parameters:
    - y: true labels
    - y_cross_validated: predicted labels, rounded with np.round before plotting
    - figurename: figure title and file name stem used when saving
    - save: whether to save the figure as PNG
    - path_to_save: directory prefix used when saving

    Returns:
    - None
    """

    # Import libraries
    from datetime import date
    from matplotlib import pyplot as plt
    from scikitplot.metrics import plot_confusion_matrix    
    
    # Figure configuration
    fig, ax = plt.subplots(figsize=(8,8))
    fontsize = '16'
    
    # Plot confusion matrix
    plot_confusion_matrix(y_true=y, 
                          y_pred=np.round(y_cross_validated), 
                          normalize=True, 
                          title=figurename, 
                          ax=ax, 
                          cmap=plt.cm.PuBuGn)

    # Decoration: one cell per distinct label, half a cell of margin on every side
    upper_bound = len(set(y)) - 0.5
    plt.axis([-0.5, upper_bound, -0.5, upper_bound])
    plt.xlabel('Predicted label', fontsize=fontsize)
    plt.ylabel('True label', fontsize=fontsize)
    
    # Save figure option checker
    if save == True:
    
        plt.savefig(path_to_save + figurename + '_' + str(date.today()) + '.png') # Save figure
        
        print('save figure:', figurename, 'done')        
        
    plt.show()
    
    return None

In [None]:
# Define function to comparatively visualize images along the spectral axis
def viz_spectral(data,
                 indices_list: list,
                 wavelengths: list,
                 save: bool,
                 figurename: str, 
                 path_to_save: str):

    '''
    Function to compare slices of "data" along its last (spectral) axis in a 3 x 3 grid of images

    arguments:
    data         -- array indexed as data[..., i] to obtain the image of spectral index i
    indices_list -- spectral indices to draw, one per grid panel (the grid has 9 panels)
    wavelengths  -- sequence indexed with the same spectral indices to build the panel titles
    save         -- if True, save figure as "<path_to_save><figurename>_<today>.png"
    figurename   -- stem of the saved file name
    path_to_save -- destination prefix the file name is appended to

    steps:
    1. Draw one image per requested spectral index, rotated by 180 degrees
    2. Consider save figure option checker; if save == True: save figure
    '''

    # Import libraries
    import numpy as np
    from datetime import date
    from matplotlib import pyplot as plt # figure drawing
    from scipy import ndimage as ndimage # image rotation

    title_fontsize = '16' # Font size of the panel titles

    # Build the 3 x 3 grid and flatten it so that panels can be addressed by position
    figure, axes_grid = plt.subplots(nrows=3, ncols=3, figsize=(10, 8), sharex=True, sharey=True)
    panels = axes_grid.ravel()

    # One panel per requested spectral index
    for position, band in enumerate(indices_list):

        panel = panels[position]
        rotated_image = ndimage.rotate(input=data[..., band], angle=180)

        panel.imshow(rotated_image, aspect='auto') # draw image
        panel.set_title('\u03BB = ' + str(wavelengths[band]) + ' nm', fontsize=title_fontsize) # panel title
        panel.axis("off") # hide axes

    # Decoration
    figure.tight_layout()

    # Save figure option checker
    if save == True:

        destination = path_to_save + figurename + '_' + str(date.today()) + '.png'
        plt.savefig(destination) # Save figure

        print('save figure:', figurename, 'done')

    plt.show()

    return None

In [None]:
# Define function to visualize biplot from PLS regression results
def viz_biplot(data,
               X,
               y,
               coeff,
               labels: dict,
               y_encoded: list,
               wavelengths: list,
               color_palette: list,
               figurename: str,
               save: bool,
               loadings_label: bool,
               path_to_save: str = None):

    """
    Function to visualize biplot from PLS regression results

    arguments:
    data, y        -- kept for interface compatibility; not used by this function
    X              -- 2-D array; columns 0 and 1 are drawn as "Component 1" and "Component 2"
    coeff          -- 2-D array; columns 0 and 1 of each row are drawn as a loading arrow from the origin
    labels         -- dict whose values, in order, are the legend names of the groups
    y_encoded      -- 1-D sequence with one label per row of X, used to group the rows
    wavelengths    -- sequence indexed by the row index of coeff, used for the arrow labels
    color_palette  -- sequence with one color per group
    figurename     -- stem of the saved file name
    save           -- if True, save figure to "<path_to_save><figurename>.png"
    loadings_label -- if True, annotate each loading arrow with its wavelength
    path_to_save   -- destination prefix; when omitted, the notebook-level variable
                      "path_to_save" is used, as earlier versions of this function did

    steps:
    1. Group rows of X by their encoded label
    2. Draw scatter plot of each group
    3. Draw loading plot (one arrow per row of coeff), optionally labelled
    4. Check option to save output; if save is True, save figure to destination path
    """

    # Import libraries
    import numpy as np
    from matplotlib import pyplot as plt
    import seaborn as sns

    # Resolve the destination before drawing so that a missing path fails early
    if save:

        if path_to_save is None:
            path_to_save = globals().get('path_to_save') # backward-compatible fallback

        if path_to_save is None:
            raise ValueError('path_to_save must be given when save is True')

    # Group rows of X by label, in ascending label order
    X = np.asarray(X)
    y_encoded = np.asarray(y_encoded) # accept plain lists as the annotation promises
    X_grouped = [X[y_encoded == value] for value in np.unique(y_encoded)]

    # Extract label names from labels
    label_names = list(labels.values())

    # Figure configuration
    sns.set(font_scale=1) # option to set font scale
    fontsize = 16 # set fontsize
    scale_axes = 0.5 # scale value of axes
    fig, ax = plt.subplots(figsize=(12, 12)) # define figure configuration

    # Loop through groups
    for index_X, X_separated in enumerate(X_grouped):

        x_axis = X_separated[:, 0] # Define x-axis
        y_axis = X_separated[:, 1] # Define y-axis

        # NOTE(review): the scaling factor is derived from each group's own range, so groups
        # are not drawn on a common scale. Kept as-is to preserve existing figures; confirm
        # whether a single scale over all rows of X was intended.
        scale_x_axis = scale_axes / (x_axis.max() - x_axis.min()) # Define min-max scaler of x-axis
        scale_y_axis = scale_axes / (y_axis.max() - y_axis.min()) # Define min-max scaler of y-axis

        # Draw scatter plot
        ax.scatter(x_axis * scale_x_axis,
                   y_axis * scale_y_axis,
                   color=color_palette[index_X],
                   label=label_names[index_X])

    # Loop through rows of coeff
    for index_coeff in range(coeff.shape[0]):

        loading_x = coeff[index_coeff, 0]
        loading_y = coeff[index_coeff, 1]

        # Draw loading plot as correspond index
        ax.arrow(0, 0, loading_x, loading_y, color='#999999', alpha=0.9)

        # Check option to show loading labels
        if loadings_label:

            # Set label of loading plot slightly beyond the arrow tip
            ax.text(x=loading_x * 1.15,
                    y=loading_y * 1.15,
                    s='\u03BB = ' + str(wavelengths[index_coeff]) + ' nm', color='#777777',
                    ha='center', va='center', fontsize=fontsize)

    # Decoration
    ax.axhline(0, color='black')
    ax.axvline(0, color='black')
    ax.set_xlim(-0.5, 0.5)
    ax.set_ylim(-0.5, 0.5)
    ax.set_xlabel('Component 1', fontsize=fontsize) # Set x-axis label
    ax.set_ylabel('Component 2', fontsize=fontsize) # Set y-axis label
    ax.legend(fontsize=fontsize - 4)

    # Save figure option checker
    if save:

        file_to_save = path_to_save + figurename + '.png' # Define destination path to save file
        fig.savefig(file_to_save) # Save figure to destination path

        print('save figure:', figurename, 'done')

    return None

In [None]:
# Define function to visualize label distribution of label dataset
def viz_label_distribution(y, 
                           labels: dict,
                           name_groups: list,
                           color_palette: list,
                           figurename:str,
                           path_to_save: str,
                           save: bool):
    
    """
    Function to visualize label distribution of label dataset

    arguments:
    y             -- 2-D array of per-sample class scores; the class of a sample is np.argmax along axis 1
    labels        -- dict whose values, in order, are the tick labels of the classes
    name_groups   -- legend names of the two groups (classes 0-2, and all remaining classes)
    color_palette -- bar colors of the two groups
    figurename    -- stem of the saved file name
    path_to_save  -- destination prefix the file name is appended to
    save          -- if True, save figure as "<path_to_save><figurename>_<today>.png"

    steps:
    1. Decode class index of every sample and separate them into two groups
    2. Count labels per group
    3. Draw 1st group distribution and 2nd group distribution
    4. Consider save figure option checker; if save is True: save figure
    """

    # Import libraries
    import numpy as np
    from datetime import date
    from matplotlib import pyplot as plt

    # Extract label names from labels
    label_names = list(labels.values())

    # Decode class indices once, then separate them into the two groups
    class_indices = np.argmax(y, axis=1)
    is_group_1 = np.isin(class_indices, [0, 1, 2])

    # Count labels
    labels_1, counts_1 = np.unique(class_indices[is_group_1], return_counts=True) # 1st group
    labels_2, counts_2 = np.unique(class_indices[~is_group_1], return_counts=True) # 2nd group

    # Figure configuration
    fontsize = 18
    fig, ax = plt.subplots(figsize=(10, 8))

    # Draw 1st group distribution
    plt.bar(labels_1, counts_1, label=name_groups[0], color=color_palette[0], edgecolor='#495057')

    # Draw 2nd group distribution
    plt.bar(labels_2, counts_2, label=name_groups[1], color=color_palette[1], edgecolor='#495057')

    # Decoration
    # One tick per label name, so the function is not tied to exactly six classes
    plt.xticks(ticks=np.arange(len(label_names)),
               labels=label_names,
               rotation=90,
               fontsize=fontsize - 4)
    plt.ylabel('Count', fontsize=16)
    plt.xlabel('Label', fontsize=16)

    plt.legend(fontsize=fontsize - 4)

    # Save figure option checker
    if save:

        plt.savefig(path_to_save + figurename + '_' + str(date.today()) + '.png') # Save figure

        print('save figure:', figurename, 'done')

    plt.show()

    return None

# 3. Implementation

### 3.1. Data Acquisition

Loading datasets

In [None]:
# Load every pre-processed dataset from "path_to_files" and merge its files together.
# Only the label dataset is loaded with is_y=True.

# Dataset: variables (X) and labels (y)
X = merge_datasets(
    path_to_files=path_to_files, filenames=filenames_X, is_y=False
)
y = merge_datasets(
    path_to_files=path_to_files, filenames=filenames_y, is_y=True
)

# Reflectances: calibrated averaged reflectances
reflectances_averaged = merge_datasets(
    path_to_files=path_to_files, filenames=filenames_reflectances_averaged, is_y=False
)

# PSNR after each cumulative preprocessing stage:
# wavelet denoising -> + median filtering -> + triangle thresholding
psnr_wavelet = merge_datasets(
    path_to_files=path_to_files, filenames=filenames_psnr_wavelet, is_y=False
)
psnr_median = merge_datasets(
    path_to_files=path_to_files, filenames=filenames_psnr_median, is_y=False
)
psnr_triangle = merge_datasets(
    path_to_files=path_to_files, filenames=filenames_psnr_triangle, is_y=False
)

# --------------------------------------------------------------------------------------------------------

Explore the loaded dataset

In [None]:
# Report the dimensionality of every loaded dataset, one line per dataset
for _message, _dimensions in (
    ('X feature dataset has dimesionality of', X.shape),
    ('y label dataset has dimesionality of', np.shape(y)),
    ('averaged reflectances dataset has dimesionality of', reflectances_averaged.shape),
    ('PSNR of features from wavelet denoising has dimesionality of', psnr_wavelet.shape),
    ('PSNR of features from median filtering has dimesionality of', psnr_median.shape),
    ('PSNR of features from triangle thresholding has dimesionality of', psnr_triangle.shape),
):
    print(_message, _dimensions)

Encode label

In [None]:
# Encode the labels in two forms:
#   y_encoded     -- label names (str) converted to label integers (int)
#   y_categorized -- label integers converted to binary class matrices
y_encoded, y_categorized = encode_label(y=y)

### 3.2. EDA

Explore the data to consider whether to trim head-tail wavelengths using the spectroscopy approach

#### 3.2.1. Averaged Reflectances from Calibration

In [None]:
# Figure metadata
figurename = 'Average Reflectances from Calibration against spectral domain'
y_axis_name = 'Average Reflectance'

# Plot the calibrated averaged reflectances of all treatments; nothing is masked or filtered
viz_averaged(
    X=reflectances_averaged,
    y=y,
    labels=labels,
    wavelengths=wavelengths,
    head_tail=(0, -1),  # NOTE(review): if used as slice bounds, -1 leaves out the last wavelength - confirm in viz_averaged
    mask=[],
    is_plot_averaged=True,
    is_plot_filtered=False,
    color_palette=color_palette,
    alpha=0.1,
    y_axis_name=y_axis_name,
    yticks=np.arange(0,0.25, 0.050),
    figurename=figurename,
    save=save,
    path_to_save=path_to_save,
)

#### 3.2.2. Averaged Reflectances from Preprocessing Methods

In [None]:
# Average the pre-processed dataset X per label (utility loops over the label data)
reflectances_averaged_preprocessed = calculate_average_all_data(
    data=X, y=y, labels=labels
)

# Figure metadata
figurename = 'Average Reflectances from Preprocessing Methods against spectral domain'
y_axis_name = 'Average Reflectance'

# Plot the averaged pre-processed reflectances of all treatments; nothing is masked or filtered
viz_averaged(
    X=reflectances_averaged_preprocessed,
    y=y,
    labels=labels,
    wavelengths=wavelengths,
    head_tail=(0, -1),  # NOTE(review): if used as slice bounds, -1 leaves out the last wavelength - confirm in viz_averaged
    mask=[],
    is_plot_averaged=True,
    is_plot_filtered=False,
    color_palette=color_palette,
    alpha=0.1,
    y_axis_name=y_axis_name,
    yticks=np.arange(0,0.25, 0.05),
    figurename=figurename,
    save=save,
    path_to_save=path_to_save,
)

#### 3.2.3. PSNR Plot among Treatments from Wavelet Denoising

In [None]:
# Figure metadata
# NOTE(review): the figure name ends with a space, which also ends up in the title and file name
figurename = 'PSNR from Wavelet Denoising against spectral domain '
y_axis_name = 'PSNR'

# Plot the PSNR of the wavelet-denoised images of all treatments; nothing is masked or filtered
viz_averaged(
    X=psnr_wavelet,
    y=y,
    labels=labels,
    wavelengths=wavelengths,
    head_tail=(0, -1),  # NOTE(review): if used as slice bounds, -1 leaves out the last wavelength - confirm in viz_averaged
    mask=[],
    is_plot_averaged=True,
    is_plot_filtered=False,
    color_palette=color_palette,
    alpha=0.1,
    y_axis_name=y_axis_name,
    yticks=np.arange(50,129, 15),
    figurename=figurename,
    save=save,
    path_to_save=path_to_save,
)

#### 3.2.4. PSNR Plot among Treatments from Wavelet Denoising and Median Filtering

In [None]:
# Figure metadata
figurename = 'PSNR from Median Filtering against spectral domain'
y_axis_name = 'PSNR'

# Plot the PSNR of the wavelet-denoised and median-filtered images of all treatments;
# nothing is masked or filtered
viz_averaged(
    X=psnr_median,
    y=y,
    labels=labels,
    wavelengths=wavelengths,
    head_tail=(0, -1),  # NOTE(review): if used as slice bounds, -1 leaves out the last wavelength - confirm in viz_averaged
    mask=[],
    is_plot_averaged=True,
    is_plot_filtered=False,
    color_palette=color_palette,
    alpha=0.1,
    y_axis_name=y_axis_name,
    yticks=np.arange(15,55, 10),
    figurename=figurename,
    save=save,
    path_to_save=path_to_save,
)

#### 3.2.5. PSNR Plot among Treatments from Wavelet Denoising, Median Filtering and Triangle Thresholding

In [None]:
# # Define metadata
# y_axis_name = 'PSNR'
# figurename = 'PSNR Plot among Treatments from \nWavelet Denoising, Median Filtering and Triangle Thresholding'

# # Execute function to visualize PSNR/reflectance plot among treatments from assigned processing method
# viz_averaged(X=psnr_triangle,
#              y=y,
#              labels=labels,
#              wavelengths=wavelengths,
#              head_tail=(0, -1),
#              color_palette=color_palette,             
#              is_plot_averaged=True,
#              is_plot_filtered=False,
#              alpha=0.1,
#              mask=[],
#              y_axis_name=y_axis_name,
#              figurename=figurename,
#              save=save,
#              path_to_save=path_to_save)

#### 3.2.6. Treatment-wised Averaged Reflectances from Calibration

Averaged reflectances from calibration

In [None]:
# # Define metadata
# y_axis_name = 'Reflectances'
# figurename = 'Treatment-wised Averaged Reflectances from Calibration'

# # Execute function to iteratively visualize the PSNR/reflectance plot one treatment at a time for the assigned processing method, by repeatedly executing the function that visualizes the PSNR/reflectance plot among treatments
# viz_averaged_iterated(X=reflectances_averaged,
#                       y=y,
#                       labels=labels,
#                       wavelengths=wavelengths,
#                       head_tail=head_tail,
#                       color_palette=color_palette,             
#                       is_plot_averaged=True,
#                       is_plot_filtered=False,
#                       alpha=0.1,
#                       mask=[],
#                       y_axis_name=y_axis_name,
#                       figurename=figurename,
#                       save=False,
#                       path_to_save=path_to_save)

#### 3.2.7. Treatment-wised Averaged Reflectances from Preprocessing Methods

In [None]:
# # Define metadata
# y_axis_name = 'Reflectances'
# figurename = 'Treatment-wised Averaged Reflectances from Preprocessing Methods'

# # Execute function to iteratively visualize the PSNR/reflectance plot one treatment at a time for the assigned processing method, by repeatedly executing the function that visualizes the PSNR/reflectance plot among treatments
# viz_averaged_iterated(X=reflectances_averaged_preprocessed,
#                       y=y,
#                       labels=labels,
#                       wavelengths=wavelengths,
#                       head_tail=head_tail,
#                       color_palette=color_palette,             
#                       is_plot_averaged=True,
#                       is_plot_filtered=False,
#                       alpha=0.1,
#                       mask=[],
#                       y_axis_name=y_axis_name,
#                       figurename=figurename,
#                       save=False,
#                       path_to_save=path_to_save)

#### 3.2.8. Treatment-wised PSNR Plot among Treatments from Wavelet Denoising

In [None]:
# # Define metadata
# y_axis_name = 'PSNR'
# figurename = 'Treatment-wised PSNR Plot among Treatments from Wavelet Denoising'

# # Execute function to iteratively visualize the PSNR/reflectance plot one treatment at a time for the assigned processing method, by repeatedly executing the function that visualizes the PSNR/reflectance plot among treatments
# viz_averaged_iterated(X=psnr_wavelet,
#                       y=y,
#                       labels=labels,
#                       wavelengths=wavelengths,
#                       head_tail=head_tail,
#                       color_palette=color_palette,             
#                       is_plot_averaged=True,
#                       is_plot_filtered=False,
#                       alpha=0.1,
#                       mask=[],
#                       y_axis_name=y_axis_name,
#                       figurename=figurename,
#                       save=False,
#                       path_to_save=path_to_save)

#### 3.2.9. Treatment-wised PSNR Plot among Treatments from Wavelet Denoising and Median Filtering

In [None]:
# # Define metadata
# y_axis_name = 'PSNR'
# figurename = 'Treatment-wised PSNR Plot among Treatments from \nWavelet Denoising and Median Filtering'

# # Execute function to iteratively visualize the PSNR/reflectance plot one treatment at a time for the assigned processing method, by repeatedly executing the function that visualizes the PSNR/reflectance plot among treatments
# viz_averaged_iterated(X=psnr_median,
#                       y=y,
#                       labels=labels,
#                       wavelengths=wavelengths,
#                       head_tail=head_tail,
#                       color_palette=color_palette,             
#                       is_plot_averaged=True,
#                       is_plot_filtered=False,
#                       alpha=0.1,
#                       mask=[],
#                       y_axis_name=y_axis_name,
#                       figurename=figurename,
#                       save=False,
#                       path_to_save=path_to_save)

#### 3.2.10. Treatment-wised PSNR Plot among Treatments from Wavelet Denoising, Median Filtering and Triangle Thresholding

In [None]:
# # Define metadata
# y_axis_name = 'PSNR'
# figurename = 'Treatment-wised PSNR Plot among Treatments from \nWavelet Denoising, Median Filtering and Triangle Thresholding'

# # Execute function to iteratively visualize the PSNR/reflectance plot one treatment at a time for the assigned processing method, by repeatedly executing the function that visualizes the PSNR/reflectance plot among treatments
# viz_averaged_iterated(X=psnr_triangle,
#                       y=y,
#                       labels=labels,
#                       wavelengths=wavelengths,
#                       head_tail=head_tail,
#                       color_palette=color_palette,             
#                       is_plot_averaged=True,
#                       is_plot_filtered=False,
#                       alpha=0.1,
#                       mask=[],
#                       y_axis_name=y_axis_name,
#                       figurename=figurename,
#                       save=False,
#                       path_to_save=path_to_save)

### 3.3. Feature Extraction

#### 3.3.a. Feature Extraction using Image Classification Approach

##### 3.3.a.1. Extract Feature using Image Classification Approach

**Note**: Very high resource consumption

In [None]:
# This cell only runs for the neural-network (image classification) approach
if is_network == True:

    if is_load_model == False:

        # Train a new model: feature extraction as wavelength selection
        # by image classification using a CNN model
        model, evaluation_score = extract_feature_image(
            X=X,
            y_categorized=y_categorized,
            num_classes=num_classes,
            test_size=test_size,
            is_slice=is_slice,
            model_name=model_name,
            epochs=epochs,
            verbose=verbose,
            wavelengths=wavelengths,
        )

        # Persist the evaluation scores (list -> DataFrame -> file) for later reuse
        save_list_to_csv(
            data=evaluation_score, dataname='evaluation_score', path_to_save=path_to_save
        )

    else:

        # Reuse a pre-trained model and its stored evaluation scores
        model = load_model(
            is_network=is_network, path_to_file=path_to_save, filename=model_name + '_model'
        )

        evaluation_score_df = pd.read_csv(
            path_to_save + 'evaluation_score.csv', index_col='Unnamed: 0'
        )
        evaluation_score = evaluation_score_df.to_numpy() # DataFrame -> array

    # Print the model summary in both cases
    model.summary()

##### 3.3.a.2. Visualize Evaluation Metrics with Histogram

In [None]:
# This cell only runs for the neural-network (image classification) approach
if is_network == True:

    # Plot type and one figure name per figure
    plot = 'histogram'
    figurenames = ['Histogram of Loss from image classification',
                   'Histogram of Accuracy from image classification']

    # Visualize the evaluation metrics from image classification
    viz_evaluation(
        evaluation_score=evaluation_score,
        plot=plot,
        figurenames=figurenames,
        save=save,
        path_to_save=path_to_save,
    )

##### 3.3.a.3. Visualize Evaluation Metrics with Scatter Plot

In [None]:
# This cell only runs for the neural-network (image classification) approach
if is_network == True:

    # Plot type and one figure name per figure
    plot = 'scatter'
    figurenames = ['Scatter plot between Loss from image classification',
                   'Scatter plot between Accuracy from image classification']

    # Visualize the evaluation metrics from image classification
    viz_evaluation(
        evaluation_score=evaluation_score,
        plot=plot,
        figurenames=figurenames,
        save=save,
        path_to_save=path_to_save,
    )

#### 3.3.b. Feature Extraction using Spectroscopy Approach

##### 3.3.b.1. Extract Feature using Spectroscopy Approach

**Note**: High resource consumption

In [None]:
# This cell only runs for the non-neural-network (spectroscopy) approach
if is_network == False:

    # Feature extraction by component number optimization of PLS regression
    # using MSE and cross-validation, on the head/tail-trimmed averaged reflectances
    (indices_sorted,
     reflectances_optimized,
     n_components_optimized,
     wavelengths_discarded) = extract_feature_signal(
        X=reflectances_averaged_preprocessed[:, head_tail[0]:head_tail[1]],
        y=y_encoded,
        max_components=max_components,
    )

##### 3.3.b.2. Visualize Wavelength Selection

In [None]:
# # Neural network model checker
# if is_network == False:

#     # Define mask as follow PLSR optimization
#     mask = np.in1d(np.arange(reflectances_averaged_preprocessed[:,head_tail[0]:head_tail[1]].shape[-1]), 
#                    indices_sorted[wavelengths_discarded:])

#     # Define metadata
#     y_axis_name = 'Reflectances'
#     figurename = 'wavelength selection from spectrocopy approach'

#     # Execute function to visualize averaged reflectances plot among treatments from image preprocessing
#     # highlighted wavelengths ~ discarded
#     viz_averaged(X=reflectances_averaged_preprocessed,
#                  y=y,
#                  labels=labels,
#                  wavelengths=wavelengths,
#                  head_tail=head_tail,
#                  is_plot_averaged=True,
#                  is_plot_filtered=True,
#                  alpha=0.2,
#                  color_palette=color_palette,
#                  mask=mask,
#                  y_axis_name=y_axis_name,
#                  figurename=figurename,
#                  save=save,
#                  path_to_save=path_to_save)

##### 3.3.b.2. Visualize PLSR Curve Fitting

In [None]:
# This cell only runs for the non-neural-network (spectroscopy) approach
if is_network == False:

    # Fit and transform PLS regression with the optimized component number
    model, reflectances_optimized_transformed = plsr_fit_transform(
        X=reflectances_optimized,
        y=y_encoded,
        n_components=n_components_optimized,
    )

    # Figure metadata
    figurename = 'Cross-validated PLS regression'

    # Visualize PLS regression performance; the returned values feed the confusion matrix cell
    coeff, y_cross_validated, mse_cross_validated = viz_pls_cv(
        plsr=model,
        X=reflectances_optimized,
        y=y_encoded,
        figurename=figurename,
        save=save,
        path_to_save=path_to_save,
    )

##### 3.3.b.3. Visualize Confusion Matrix

In [None]:
# This cell only runs for the non-neural-network (spectroscopy) approach
if is_network == False:

    # Figure metadata: the title carries the cross-validated MSE on a second line
    figurename = 'Confusion Matrix of PLS Regression with \nMSE = ' + str(mse_cross_validated)

    # Visualize confusion matrix of PLS regression using the outputs of "viz_pls_cv".
    # Saving is switched off here regardless of the notebook-level "save" flag.
    viz_confusion_matrix(
        y=y_encoded,
        y_cross_validated=y_cross_validated,
        figurename=figurename,
        save=False,
        path_to_save=path_to_save,
    )

##### 3.3.b.4. Visualize PLSR Coefficients

**Note**: High resource consumption

In [None]:
# This cell only runs for the non-neural-network (spectroscopy) approach
if is_network == False:

    # Prepare data with the optimized parameters for visualizing plots among spectra
    X_trimmed, X_trimmed_reshaped, plsr_X = prepare_viz_spectral(X=X,
                                                                 y_encoded=y_encoded,
                                                                 n_components=n_components_optimized,
                                                                 head_tail=head_tail)

    # Prepare PLS regression coefficients for visualization
    # NOTE(review): the figure name announces absolute values, but the coefficients are
    # plotted as returned by the model - confirm whether np.abs() was intended.
    coeff_X_reshaped = plsr_X.coef_.reshape(X_trimmed.shape[1:]) # Reshape regression coefficients of trimmed X

    # Pick up to 9 random wavelength indices (the figure grid has 9 panels) and sort them
    # ascending so that the panels follow the spectral order
    num_bands = X_trimmed.shape[-1]
    indices_list = random.sample(range(num_bands), min(9, num_bands))
    indices_list_sorted = sorted(indices_list)

    # Define metadata
    figurename = 'Absolute value of PLS regression coefficients'

    # Execute function to comparative visualize among spectra
    # NOTE(review): the indices refer to the trimmed data, while "wavelengths" is passed
    # untrimmed; if head_tail[0] != 0 the panel titles may be shifted - confirm.
    viz_spectral(data=coeff_X_reshaped,
                 indices_list=indices_list_sorted, # previously the unsorted list was passed
                 wavelengths=wavelengths,
                 save=save,
                 figurename=figurename,
                 path_to_save=path_to_save)

### 3.4. Data Preparation

#### 3.4.1. Wavelength Selection

In [None]:
# Neural-network approach: select wavelengths with the 'accuracy' metric
if is_network == True:

    evaluation_score_trimmed = evaluation_score.copy() # copy of the original evaluation score (not used in this cell)
    metric = 'accuracy' # Metric for wavelength selection

    # Sort indices by evaluation score and filter the dataset
    X_filtered, indices_sorted = filter_dataset(
        X=X,
        evaluation_score=evaluation_score,
        metric=metric,
        number=number,
        is_exclude=is_exclude,
    )

# Non-neural-network approach: select wavelengths with the 'mse' metric
else:

    metric = 'mse' # Metric for wavelength selection

    # The existing "indices_sorted" is passed as the evaluation score;
    # the indices returned by the function are discarded
    X_filtered, _ = filter_dataset(
        X=X,
        evaluation_score=indices_sorted,
        metric=metric,
        number=number,
        is_exclude=is_exclude,
    )

# Save the indices list (list -> DataFrame -> file), named after the model
save_list_to_csv(
    data=indices_sorted,
    dataname='indices_sorted_' + model_name,
    path_to_save=path_to_save,
)

#### 3.4.2. Visualize Wavelength Selection

In [None]:
# Look up the wavelengths of a hand-picked list of indices and display them in ascending order
temp = [219, 210, 158, 223, 78, 179, 12, 216, 169, 187]
result = [wavelengths[index] for index in temp]

sorted(result)

In [None]:
# Neural network model checker
if is_network == True:

    indices_list = indices_sorted[:number] # Set selected wavelengths

    # Define figure name
    figurename = 'Wavelength Selection from Image Classification Approach'

    # Execute function to convert list to DataFrame then save to directory
    save_list_to_csv(data=evaluation_score,
                     dataname='evaluation_score',
                     path_to_save=path_to_save)

# Non-neural network model checker
else:

    # Set selected wavelengths using a fixed amount
    # (alternative kept for reference: indices_sorted[wavelengths_discarded:])
    indices_list = indices_sorted[:number]

    # Define figure name
    figurename = 'Wavelength Selection from Spectrocopy Approach'

# Define mask over the head/tail-trimmed spectral axis: True where the index is selected.
# np.isin replaces np.in1d, which is deprecated since NumPy 2.0; for 1-D input both return
# the same boolean array.
num_bands = reflectances_averaged_preprocessed[:, head_tail[0]:head_tail[1]].shape[-1]
mask = np.isin(np.arange(num_bands), indices_list)

# Define metadata
y_axis_name = 'Reflectance'

# Execute function to visualize averaged reflectances plot among treatments from image preprocessing
# highlighted wavelengths ~ discarded
viz_averaged(X=reflectances_averaged_preprocessed,
             y=y,
             labels=labels,
             wavelengths=wavelengths,
             head_tail=head_tail,
             is_plot_averaged=True,
             is_plot_filtered=True,
             alpha=0.1,
             color_palette=color_palette,
             mask=mask,
             y_axis_name=y_axis_name,
             figurename=figurename,
             save=save,
             path_to_save=path_to_save,
             yticks=np.arange(0,0.25, 0.050))

#### 3.4.3. Split dataset

In [None]:
# Display the current value of test_size before it is used to split the dataset
test_size

In [None]:
# Parameters for the split (note: this overrides any earlier value of "is_slice")
index_wavelength = 0
is_slice = False

# Split the wavelength-filtered dataset: y is one-hot encoded, X and y are split following "test_size"
X_train, X_test, y_train, y_test = prepare_dataset_image(
    X=X_filtered,
    y=y_categorized,
    num_classes=num_classes,
    test_size=test_size,
    is_slice=is_slice,
    index_wavelength=index_wavelength,
)

# Full spectrum option checker
if full == True:

    # Same preparation on the unfiltered (full spectrum) dataset
    X_full_train, X_full_test, y_full_train, y_full_test = prepare_dataset_image(
        X=X,
        y=y_categorized,
        num_classes=num_classes,
        test_size=test_size,
        is_slice=is_slice,
        index_wavelength=index_wavelength,
    )

#### 3.4.4. Visualize the split training datasets

In [None]:
# Figure name (suffixed with the upper-cased model name) and group names for the plot
figurename = f'Distribution of Training Dataset Labels from {model_name.upper()}'
name_groups = ['winter', 'summer']

# Plot the label distribution of the training labels (y_train)
viz_label_distribution(
    y=y_train,
    labels=labels,
    name_groups=name_groups,
    color_palette=['#3a86ff', '#ffbe0b'],
    figurename=figurename,
    path_to_save=path_to_save,
    save=save,
)

#### 3.4.5. Visualize the split testing datasets

In [None]:
# Figure name (suffixed with the upper-cased model name) and group names for the plot
figurename = f'Distribution of Testing Dataset Labels from {model_name.upper()}'
name_groups = ['winter', 'summer']

# Plot the label distribution of the testing labels (y_test)
viz_label_distribution(
    y=y_test,
    labels=labels,
    name_groups=name_groups,
    color_palette=['#205375', '#F66B0E'],
    figurename=figurename,
    path_to_save=path_to_save,
    save=save,
)

#### 3.4.6. Visualize Biplot (inactivated because it sets the plotting style to seaborn)

In [None]:
# # Neural network model checker
# if is_network == False:

#     # Define metadata
#     figurename = 'biplot of transformed dataset using PLS regression'

#     # Show loadings labels
#     # Execute function to visualize biplot from PLS regression results
#     viz_biplot(data=reflectances_optimized,
#                X=reflectances_optimized_transformed,
#                y=y,
#                y_encoded=y_encoded,
#                coeff=coeff,
#                labels=labels,
#                wavelengths=wavelengths,
#                color_palette=color_palette,
#                figurename=figurename,
#                save=save,
#                loadings_label=True)

## 4. Data Saving

### 4.1. Dataset Saving

In [None]:
# Save the model-specific splits in a fixed order:
# training data, testing data, training labels, testing labels.
# Each dataname is the prefix below followed by model_name.
for _prefix, _array in (('X_train_', X_train),
                        ('X_test_', X_test),
                        ('y_train_', y_train),
                        ('y_test_', y_test)):
    save_dataset(data=_array,
                 dataname=_prefix + model_name,
                 path_to_save=path_to_save)

# Full spectrum option checker
if full == True:

    # Save the full-spectrum splits in the same order, under fixed "*_full" datanames
    for _dataname, _array in (('X_train_full', X_full_train),
                              ('X_test_full', X_full_test),
                              ('y_train_full', y_full_train),
                              ('y_test_full', y_full_test)):
        save_dataset(data=_array,
                     dataname=_dataname,
                     path_to_save=path_to_save)

### 4.2. Neural Network Model Saving

In [None]:
# Save the model through save_model; the filename is model_name followed by "_model"
save_model(
    model=model,
    is_network=is_network,
    path_to_save=path_to_save,
    filename=f'{model_name}_model',
)