This is a notebook to explain the preprocessing steps to prepare the CRISM data for training the N2N4M model.

In [3]:
# Standard imports
import pandas as pd
import numpy as np
import os

# Internal imports
import n2n4m.preprocessing as preprocessing


In [4]:
# Seed NumPy's legacy global random number generator so that any code drawing
# from np.random gives the same results each time the notebook is run from the top.
np.random.seed(42)

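For this tutorial, the data needs to be collated into two JSON files - one for the mineral pixel data, and one for the bland pixel data. Details on how these are produced can be found in the bland_dataset_collation.py and mineral_dataset_collation.py files. The next cell expects them in the `data` directory one level above the notebook's working directory, at `data/extracted_mineral_pixel_data/mineral_pixel_dataset.json` and `data/extracted_bland_pixel_data/bland_pixel_dataset.json`.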

In [5]:
# Locate the two collated JSON datasets relative to the directory above the
# current working directory.
PARENT_DIR = os.path.dirname(os.getcwd())
_data_dir = os.path.join(PARENT_DIR, "data")
MINERAL_DATASET_PATH = os.path.join(_data_dir, "extracted_mineral_pixel_data", "mineral_pixel_dataset.json")
BLAND_DATASET_PATH = os.path.join(_data_dir, "extracted_bland_pixel_data", "bland_pixel_dataset.json")

# Load the mineral dataset first, then the bland dataset.
mineral_dataset, bland_dataset = (
    preprocessing.load_dataset(dataset_path)
    for dataset_path in (MINERAL_DATASET_PATH, BLAND_DATASET_PATH)
)

Both datasets are pandas DataFrames with the same columns. The columns are as follows:
- "Coordinates": The coordinates of the pixel in the image
- "Pixel_Class": The mineral class of the pixel, as in the Machine Learning Toolkit for CRISM Analysis: https://github.com/Banus/crism_ml/tree/master
- "Image_Name": The unique 5 digit hex code for the image the pixel was extracted from
- "Spectrum": The spectrum of the pixel, as a list of floats

In [6]:
# Preview the mineral dataset: as the last expression in the cell, the DataFrame is rendered below.
mineral_dataset

Unnamed: 0,Coordinates,Pixel_Class,Image_Name,Spectrum
0,"[366, 2]",[39],0B438,"[65535.0, 65535.0, 0.1425750405, 0.1422003508,..."
1,"[367, 2]",[39],0B438,"[65535.0, 65535.0, 0.1399567425, 0.1396148801,..."
2,"[368, 2]",[39],0B438,"[65535.0, 65535.0, 0.1409857273, 0.1410183907,..."
3,"[369, 2]",[39],0B438,"[65535.0, 65535.0, 0.1417645663, 0.14170213040..."
4,"[367, 3]",[39],0B438,"[65535.0, 65535.0, 0.138021946, 0.1387139708, ..."
...,...,...,...,...
592408,"[275, 448]",[39],0AA03,"[65535.0, 65535.0, 0.21175374090000001, 0.2093..."
592409,"[276, 448]",[39],0AA03,"[65535.0, 65535.0, 0.2083025873, 0.20598560570..."
592410,"[277, 448]",[39],0AA03,"[65535.0, 65535.0, 0.20943352580000002, 0.2086..."
592411,"[276, 449]",[39],0AA03,"[65535.0, 65535.0, 0.2013488561, 0.19851198790..."


In [7]:
# The bland pixel dataset has > 300_000 pixels, but the best configuration used
# (up to) 150_000 of them, spread evenly across the bland images.
num_bland_pixels = 150_000
num_bland_images = bland_dataset["Image_Name"].nunique()
samples_per_image = num_bland_pixels // num_bland_images


def _sample_image_pixels(image_pixels):
    """Draw at most `samples_per_image` rows from one image's pixels."""
    # Images with fewer pixels than the per-image quota contribute all of them.
    n_rows = min(len(image_pixels), samples_per_image)
    return image_pixels.sample(n_rows, random_state=42)


bland_dataset_sample = (
    bland_dataset.groupby("Image_Name")
    .apply(_sample_image_pixels)
    .reset_index(drop=True)
)

In [8]:
# Apply preprocessing
# Concatenate the mineral and bland datasets. ignore_index=True already gives the
# result a fresh 0..n-1 index, so no additional reset_index(drop=True) is needed.
dataset = pd.concat([mineral_dataset, bland_dataset_sample], ignore_index=True)
# Expand the dataframe to have a column for each band instead of a single column for the entire spectrum.
dataset = preprocessing.expand_dataset(dataset)

In [9]:
# Preview the combined dataset after expansion: the spectrum is now spread over one column per band.
dataset

Unnamed: 0,Coordinates,Pixel_Class,Image_Name,1.00135,1.0079,1.01445,1.021,1.02755,1.0341,1.04065,...,3.88341,3.89008,3.89676,3.90344,3.91011,3.91679,3.92347,3.93015,3.93682,4.0
0,"[366, 2]",[39],0B438,65535.0,65535.0,0.142575,0.142200,0.138645,0.139015,0.138611,...,0.252324,0.259721,0.267016,0.269625,0.267429,0.257965,65535.000000,65535.0,65535.0,0.0
1,"[367, 2]",[39],0B438,65535.0,65535.0,0.139957,0.139615,0.138536,0.138762,0.138253,...,0.262671,0.270499,0.276709,0.279427,0.279138,0.271215,65535.000000,65535.0,65535.0,0.0
2,"[368, 2]",[39],0B438,65535.0,65535.0,0.140986,0.141018,0.140626,0.141171,0.141132,...,0.258144,0.265003,0.271166,0.274582,0.274235,0.269105,65535.000000,65535.0,65535.0,0.0
3,"[369, 2]",[39],0B438,65535.0,65535.0,0.141765,0.141702,0.139316,0.140736,0.141076,...,0.257740,0.264838,0.271900,0.277569,0.278337,0.276143,65535.000000,65535.0,65535.0,0.0
4,"[367, 3]",[39],0B438,65535.0,65535.0,0.138022,0.138714,0.137603,0.138308,0.137398,...,0.268688,0.281385,0.290638,0.294547,0.291726,0.281942,65535.000000,65535.0,65535.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
742280,"[47, 355]",[39],9971,65535.0,65535.0,0.154603,0.151454,0.151575,0.151995,0.152522,...,0.200148,0.199211,0.200659,0.205101,0.208127,65535.000000,65535.000000,65535.0,65535.0,0.0
742281,"[50, 262]",[39],9971,65535.0,65535.0,0.133415,0.131712,0.130265,0.129898,0.130026,...,0.246075,0.245365,0.248376,0.251376,0.251646,65535.000000,65535.000000,65535.0,65535.0,0.0
742282,"[628, 125]",[39],9971,65535.0,65535.0,0.184361,0.183356,0.181248,0.181915,0.181142,...,0.279377,0.284904,0.283163,0.272682,0.270120,0.278719,0.000003,65535.0,65535.0,0.0
742283,"[377, 330]",[39],9971,65535.0,65535.0,0.171503,0.169282,0.165710,0.167715,0.167417,...,0.248972,0.247756,0.253012,0.259251,0.260744,0.254920,65535.000000,65535.0,65535.0,0.0


In [10]:
# Remove known bad bands, and reduce the dataset to only the bands that are present in
# preprocessing.PLEBANI_WAVELENGTHS - the wavelengths provided in the original Toolkit.
# NOTE: `dataset` is reassigned here, so re-running this cell operates on the already-reduced frame.
dataset = preprocessing.drop_bad_bands(dataset, preprocessing.PLEBANI_WAVELENGTHS)

In [11]:
# Some pixels still contain bad values in the remaining bands. These are imputed in a hierarchical manner:
#   1. For a pixel with a bad value, search for pixels from the same image with the same mineral class.
#      If there are any, the bad value is imputed with the mean of the good values in the same band.
#   2. If there are no good pixels from the same image and mineral class, the search is expanded to
#      the same image, but any mineral class.
#   3. If there are still no good pixels, the search is expanded to any mineral class from any image.
dataset = preprocessing.impute_bad_values(dataset)

In [None]:
# The "Volcano-Scan" correction used as part of the CAT corrections does not perfectly remove the atmospheric artefact between 1900-2100nm in every image.
# To correct for this, we assume no absorption features should exist in that range, and flag any pixels with a significant deviation from the continuum of that section.
# We take the "good" pixels, and remove this continuum from each. The mean of the residual is then calculated.
# Any "bad" pixels have this mean residual added to their continuum, resulting in a corrected spectrum.
dataset = preprocessing.impute_atmospheric_artefacts(dataset)

In [9]:
# Generate noisy pixels from the band columns. The first three columns are skipped;
# in the preview above these are Coordinates, Pixel_Class and Image_Name.
band_columns = dataset.iloc[:, 3:]
noise_dataset = preprocessing.generate_noisy_pixels(band_columns, random_seed=42)

# Place the generated columns side by side with the original dataset.
input_target_dataset = pd.concat([dataset, noise_dataset], axis=1)

# Hold out the test set first, then split the remainder into train and validation sets.
train_validation_set, test_set = preprocessing.train_test_split(
    input_target_dataset, bland_pixels=True
)
train_set, validation_set = preprocessing.train_validation_split(
    train_validation_set, bland_pixels=True
)

In [10]:
# Break each partition into its features (X), targets (y) and ancillary data.
_split_partition = preprocessing.split_features_targets_anciliary

X_train, y_train, ancillary_train = _split_partition(train_set)
X_test, y_test, ancillary_test = _split_partition(test_set)
X_validation, y_validation, ancillary_validation = _split_partition(validation_set)


In [11]:
# Standardise the features. The scaler returned by the training call is passed
# to the test and validation calls so all three partitions share one scaler.
_scaling_method = "RobustScaler"

X_train, feature_scaler = preprocessing.standardise(X_train, method=_scaling_method)
X_test, _ = preprocessing.standardise(
    X_test, scaler=feature_scaler, method=_scaling_method
)
X_validation, _ = preprocessing.standardise(
    X_validation, scaler=feature_scaler, method=_scaling_method
)

At this point the training, validation, 