<a href="https://colab.research.google.com/github/shelleyg-bit/kp-labs-seeing-beyond-visible-challenge/blob/main/data_prep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction
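This notebook shows how to open and understand the dataset: each sample cube is loaded and reduced to a per-band spectral curve, the curves are exported to CSV together with the training labels, and a mean-value baseline is fitted and scored.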

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Generating baseline solution

In [5]:
class BaselineRegressor:
    """
    Baseline regressor, which calculates the mean value of each target from the
    training data and returns it for each testing sample.

    Attributes:
        mean: Per-target mean of the training labels (0 until ``fit`` is called).
        classes_count: Number of target columns seen by ``fit`` (None until then).
    """
    def __init__(self):
        self.mean = 0
        # Declared up front so an unfitted instance has every attribute defined.
        self.classes_count = None

    def fit(self, X_train: np.ndarray, y_train: np.ndarray):
        """Store the column-wise mean of ``y_train``.

        Args:
            X_train: Training features. Not used by this baseline; accepted so
                the call looks like any other regressor's ``fit``.
            y_train: 2D array-like of targets, one column per target.
        Returns:
            BaselineRegressor: ``self``, to allow call chaining.
        """
        # asanyarray lets nested lists / DataFrames through while leaving
        # ndarrays (and subclasses such as masked arrays) untouched.
        y_train = np.asanyarray(y_train)
        self.mean = np.mean(y_train, axis=0)
        self.classes_count = y_train.shape[1]
        return self

    def predict(self, X_test: np.ndarray):
        """Return the stored mean for every sample in ``X_test``.

        Args:
            X_test: Any sized container of samples; only its length is used.
        Returns:
            np.ndarray: Array of shape (len(X_test), classes_count) in which
            every row equals the training mean.
        Raises:
            AttributeError: If called before ``fit``. This is the same
                exception type an unfitted call has always raised, now with
                an explicit message.
        """
        if self.classes_count is None:
            raise AttributeError(
                "BaselineRegressor must be fitted before calling predict()"
            )
        return np.full((len(X_test), self.classes_count), self.mean)


class SpectralCurveFiltering():
    """
    Collapse a 3D cube into a spectral curve: each entry along the first axis
    (one band) is aggregated over the two remaining axes with merge_function,
    so the returned array has the shape of [CHANNELS_COUNT].
    """

    def __init__(self, merge_function = np.mean):
        # Any callable that accepts an ``axis`` keyword can serve as the reducer.
        self.merge_function = merge_function

    def __call__(self, sample: np.ndarray):
        aggregate = self.merge_function
        # Axes 1 and 2 are reduced together; axis 0 is kept.
        per_band = aggregate(sample, axis=(1, 2))
        return per_band


## Load the data

In [6]:
# Mount Google Drive at /content/drive so the dataset paths used below resolve.
# The google.colab package is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# Root folder of the challenge data on the mounted Drive. Change this single
# value to point the notebook at a different copy of the dataset.
DATA_DIR = '/content/drive/MyDrive/AI4EO'

# Folders holding one .npz cube per sample.
train_data_path = f'{DATA_DIR}/train_data'
test_data_path = f'{DATA_DIR}/test_data'
# Ground-truth labels for the train split and the per-band wavelength table.
gt_data_path = f'{DATA_DIR}/train_gt.csv'
wavelength_data_path = f'{DATA_DIR}/wavelengths.csv'


In [27]:
import os
from glob import glob

def load_data(directory: str, filtering=None):
    """Load each cube, reduce its dimensionality and collect the results.

    Files are processed in ascending numeric order of their file name
    (``0.npz``, ``1.npz``, ..., ``10.npz``), so row ``i`` of the result belongs
    to the ``i``-th smallest sample id found in ``directory``.

    Args:
        directory (str): Directory of either the train or the test set. Every
            ``*.npz`` file in it must have an integer file name, and its
            stored arrays are passed as keyword arguments to
            ``np.ma.MaskedArray``.
        filtering (callable, optional): Function applied to each masked cube
            to reduce it. Defaults to ``SpectralCurveFiltering()``.
    Returns:
        np.ndarray: The reduced samples stacked with ``np.array``, one row per
        file.
    Raises:
        ValueError: If a ``*.npz`` file name is not an integer.
    """
    if filtering is None:
        filtering = SpectralCurveFiltering()

    # Sort numerically: a plain string sort would put "10.npz" before "2.npz".
    cube_paths = sorted(
        glob(os.path.join(directory, "*.npz")),
        key=lambda path: int(os.path.splitext(os.path.basename(path))[0]),
    )

    curves = []
    for cube_path in cube_paths:
        # Build the masked cube while the archive is still open; the arrays
        # are read from disk at this point, so the file can be closed after.
        with np.load(cube_path) as npz:
            cube = np.ma.MaskedArray(**npz)
        curves.append(filtering(cube))
    return np.array(curves)


def load_gt(file_path: str):
    """Read the training ground truth and return its target columns.

    Args:
        file_path (str): Path to the ground truth .csv file.
    Returns:
        numpy.ndarray: 2D array with one row per line of the file and the
        columns P, K, Mg and pH, in that order.
    """
    target_columns = ["P", "K", "Mg", "pH"]
    ground_truth = pd.read_csv(file_path)
    return ground_truth.loc[:, target_columns].to_numpy()


# Reduce every train/test cube to one row via load_data and read the
# train labels (P, K, Mg, pH) from the ground-truth CSV.
X_train = load_data(train_data_path)
y_train = load_gt(gt_data_path)
X_test = load_data(test_data_path)

# Sanity check: report the shape of both feature arrays.
print(f"Train data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")


Train data shape: (1732, 150)
Test data shape: (1154, 150)


In [28]:
# Band table used later to build the CSV column names (band_no, wavelength).
# NOTE(review): the preview shows an extra "Unnamed: 0" column, presumably an
# index saved into the file — consider index_col=0 after confirming.
wavelengths_df = pd.read_csv(wavelength_data_path)

In [29]:
# Preview the first rows of the band table.
wavelengths_df.head()

Unnamed: 0,band_no,wavelength
0,1,462.08
1,2,465.27
2,3,468.47
3,4,471.67
4,5,474.86


In [80]:
def write_to_csv(raw_data, out_path, labels_path=None, wavelengths=None):
    """Write per-band sample data to a CSV file with descriptive column names.

    Each column is named ``b<band_no, 3 digits>_<wavelength, rounded>``
    (for example ``b001_462``) and the index is named ``sample_index``.

    Args:
        raw_data: 2D array-like with one row per sample and one column per
            row of the wavelength table.
        out_path: Destination path of the CSV file.
        labels_path: Optional path to a labels CSV that has a
            ``sample_index`` column. When given, its columns are appended to
            the right of the band columns, aligned on ``sample_index``.
        wavelengths: Optional DataFrame with ``band_no`` and ``wavelength``
            columns. Defaults to the notebook-level ``wavelengths_df``.
    Returns:
        pandas.DataFrame: The frame that was written to ``out_path``.
    """
    if wavelengths is None:
        # Fall back to the table loaded earlier in the notebook.
        wavelengths = wavelengths_df

    column_names = [
        f"b{band_no:03d}_{wavelength:3.0f}"
        for band_no, wavelength in zip(wavelengths['band_no'], wavelengths['wavelength'])
    ]
    data_df = pd.DataFrame(data=raw_data, columns=column_names)
    data_df.index.name = 'sample_index'

    if labels_path:
        labels_df = pd.read_csv(labels_path, index_col='sample_index')
        # Column-wise concat aligns rows on the shared sample_index.
        data_df = pd.concat([data_df, labels_df], axis=1)

    data_df.to_csv(out_path)
    return data_df

In [81]:
# Export the per-band data to CSV on Drive. The train split also gets the
# label columns; they are read from gt_data_path so the ground-truth location
# is defined in exactly one place.
train_data = write_to_csv(X_train,
                          '/content/drive/MyDrive/AI4EO/train_data.csv',
                          labels_path=gt_data_path)
test_data = write_to_csv(X_test, '/content/drive/MyDrive/AI4EO/test_data.csv')

In [82]:
# Preview the exported train frame: band columns followed by the label columns.
train_data.head()

Unnamed: 0_level_0,b001_462,b002_465,b003_468,b004_472,b005_475,b006_478,b007_481,b008_484,b009_488,b010_491,...,b145_922,b146_926,b147_929,b148_932,b149_935,b150_938,P,K,Mg,pH
sample_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,450.139241,450.822785,446.810127,454.455696,466.075949,487.316456,499.620253,501.696203,505.164557,509.797468,...,1524.164557,1528.734177,1533.493671,1538.012658,1542.35443,1546.924051,45.1,188.0,179.0,7.2
1,553.833333,552.025641,545.397436,553.897436,563.628205,586.025641,597.179487,595.923077,595.589744,597.166667,...,2676.705128,2678.384615,2680.24359,2681.769231,2682.833333,2684.025641,44.8,205.0,188.0,7.0
2,422.551282,419.525641,414.871795,423.5,433.717949,453.358974,463.910256,464.666667,466.307692,468.064103,...,1177.307692,1179.051282,1181.166667,1183.089744,1184.692308,1186.5,44.4,207.0,145.0,6.8
3,609.87013,610.558442,604.584416,615.090909,628.675325,655.025974,670.623377,674.74026,681.519481,689.272727,...,1782.519481,1786.649351,1791.272727,1795.467532,1799.766234,1804.090909,46.5,204.0,143.0,6.8
4,403.910256,401.974359,396.294872,403.75641,412.397436,432.051282,443.653846,445.423077,448.487179,452.487179,...,1453.679487,1455.282051,1457.320513,1458.935897,1460.410256,1462.153846,52.0,212.0,167.0,6.7


## Make predictions and generate submission file

In [13]:
# Fit the mean-value baseline on the full training set and predict.
baseline_reg = BaselineRegressor()
baseline_reg = baseline_reg.fit(X_train, y_train)
# NOTE(review): only the first 100 test samples are predicted here, so
# `submission` has 100 rows rather than one per test sample — confirm whether
# this truncation is intentional before writing a real submission file.
predictions = baseline_reg.predict(X_test[:100])

# One column per target; the row position doubles as the sample index.
submission = pd.DataFrame(data = predictions, columns=["P", "K", "Mg", "pH"])
submission.index.name='sample_index'
# Writing the file is disabled; uncomment the next line to save it.
#submission.to_csv("submission.csv", index_label="sample_index")


In [14]:
# Preview the submission frame; every row holds the same baseline means.
submission.head()

Unnamed: 0_level_0,P,K,Mg,pH
sample_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,70.302656,227.98851,159.281236,6.782719
1,70.302656,227.98851,159.281236,6.782719
2,70.302656,227.98851,159.281236,6.782719
3,70.302656,227.98851,159.281236,6.782719
4,70.302656,227.98851,159.281236,6.782719


## Calculating the metric

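To demonstrate how the final metric is calculated, we hold out the last part of the training set (samples from index 1500 onward) as a small _test set_ and fit the baseline on the remaining samples. For each class, the score is the MSE of the predictions divided by the MSE of the baseline; the final score is the mean of the four class scores.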

In [15]:
# Row index at which the labelled data is split: rows before it are used for
# fitting, rows from it onward form the held-out evaluation set.
HOLDOUT_START = 1500
# Target names, in the column order of y_train.
TARGET_NAMES = ["P", "K", "Mg", "pH"]
# Half-width of the random perturbation around each class mean (5 %).
NOISE_FRACTION = 0.05

# Dedicated names for the held-out split: the real test set loaded earlier
# (X_test) is left untouched, so earlier cells can be re-run safely.
X_holdout = X_train[HOLDOUT_START:]
y_holdout = y_train[HOLDOUT_START:]

X_train_new = X_train[:HOLDOUT_START]
y_train_new = y_train[:HOLDOUT_START]

# Fit the baseline regressor once again on new training set
baseline_reg = baseline_reg.fit(X_train_new, y_train_new)
baseline_predictions = baseline_reg.predict(X_holdout)

# Generate baseline values to be used in score computation
baselines = np.mean((y_holdout - baseline_predictions) ** 2, axis=0)


# Generate random predictions, different from baseline predictions.
# The columns are drawn one after another with a fixed seed so the numbers
# are reproducible; float dtype is forced so the draws are never truncated
# even if the labels were loaded as integers.
np.random.seed(0)
predictions = np.zeros_like(y_holdout, dtype=float)
for column_index in range(predictions.shape[1]):
    class_mean_value = baseline_reg.mean[column_index]
    predictions[:, column_index] = np.random.uniform(low=class_mean_value - class_mean_value * NOISE_FRACTION,
                                                     high=class_mean_value + class_mean_value * NOISE_FRACTION,
                                                     size=len(predictions))

# Calculate MSE for each class
mse = np.mean((y_holdout - predictions) ** 2, axis=0)

# Calculate the score for each class individually (1.0 == as good as baseline)
scores = mse / baselines

# Calculate the final score
final_score = np.mean(scores)

for score, class_name in zip(scores, TARGET_NAMES):
    print(f"Class {class_name} score: {score}")

print(f"Final score: {final_score}")

Class P score: 0.9896068600445717
Class K score: 1.004900913045855
Class Mg score: 1.0228518828521695
Class pH score: 1.6431314552511207
Final score: 1.1651227777984292
