In [1]:
# GENERAL UTILITIES
import os
from glob import glob
import pandas as pd
from  tqdm.notebook import tqdm
import cv2
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline

# MODEL DEVELOPMENT DEPENDENCIES
import numpy as np
import pywt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
def load_gt(file_path: str):
    """Read the soil-parameter targets from the ground-truth CSV.

    Args:
        file_path (str): Location of the ground-truth ``.csv`` file. It must
            contain the columns ``P``, ``K``, ``Mg`` and ``pH``.

    Returns:
        numpy.ndarray: 2D array with one row per CSV row and the four target
        columns in the order ``P``, ``K``, ``Mg``, ``pH``.
    """
    target_columns = ["P", "K", "Mg", "pH"]
    return pd.read_csv(file_path).loc[:, target_columns].values

In [3]:
def load_data_1d(directory, gt_file_path):
    """Load per-pixel spectra and split the pixels of every patch into train/test.

    Each ``*.npz`` file in ``directory`` (sorted by the integer in its file
    name) must provide a ``data`` array indexed as ``data[:, x, y]`` and a
    ``mask`` array whose first slice ``mask[0]`` is falsy for usable pixels.
    Every usable pixel yields one sample, the vector ``data[:, x, y]``, and
    inherits row ``idx`` of the ground-truth table, where ``idx`` is the
    position of its file in the sorted file list.

    Within each file, pixels are visited in row-major order and pixel ``i`` of
    ``n`` goes to the training lists when ``i / n < 0.8``; the remaining
    pixels go to the test lists. Train and test samples therefore come from
    the same patches.

    Args:
        directory (str): Folder containing the ``<integer>.npz`` patch files.
        gt_file_path (str): Path of the ground-truth CSV read by ``load_gt``.

    Returns:
        tuple: ``(x_train, y_train, x_test, y_test)``, four Python lists
        holding one 1D numpy array per sample.
    """
    labels = load_gt(gt_file_path)

    all_files = sorted(
        glob(os.path.join(directory, "*.npz")),
        key=lambda x: int(os.path.basename(x).replace(".npz", "")),
    )

    train_size = 0.8
    x_train, y_train, x_test, y_test = [], [], [], []

    for idx, file_name in tqdm(enumerate(all_files), total=len(all_files),
                               desc="Loading training data .."):
        # We load the data into memory as provided in the example notebook of the challenge
        with np.load(file_name) as npz:
            mask = npz["mask"]
            data = npz["data"]
            # Boolean selection keeps the row-major (x outer, y inner) pixel
            # order of the former nested loops; the result is (n_pixels, n_bands).
            pixels = data[:, np.logical_not(mask[0])].T

        n_pixels = len(pixels)
        # Same rule as "i / n < train_size", evaluated for all pixels at once.
        in_train = np.arange(n_pixels) / n_pixels < train_size

        train_pixels = pixels[in_train]
        test_pixels = pixels[~in_train]

        x_train.extend(train_pixels)
        y_train.extend([labels[idx]] * len(train_pixels))
        x_test.extend(test_pixels)
        y_test.extend([labels[idx]] * len(test_pixels))

    return x_train, y_train, x_test, y_test

In [12]:
def load_data_1d_kf(directory, gt_file_path):
    """Load every usable pixel spectrum with its label, without any split.

    Intended for cross-validation: the caller decides how to split. Each
    ``*.npz`` file in ``directory`` (sorted by the integer in its file name)
    must provide a ``data`` array indexed as ``data[:, x, y]`` and a ``mask``
    array whose first slice ``mask[0]`` is falsy for usable pixels. Every
    usable pixel yields one sample, the vector ``data[:, x, y]``, labelled
    with row ``idx`` of the ground-truth table, where ``idx`` is the position
    of its file in the sorted file list. Samples are appended file by file,
    so all pixels of one patch are contiguous in the output.

    Args:
        directory (str): Folder containing the ``<integer>.npz`` patch files.
        gt_file_path (str): Path of the ground-truth CSV read by ``load_gt``.

    Returns:
        tuple: ``(x_train, y_train)``, two Python lists holding one 1D numpy
        array per sample.
    """
    labels = load_gt(gt_file_path)

    all_files = sorted(
        glob(os.path.join(directory, "*.npz")),
        key=lambda x: int(os.path.basename(x).replace(".npz", "")),
    )

    x_train, y_train = [], []

    for idx, file_name in tqdm(enumerate(all_files), total=len(all_files),
                               desc="Loading training data .."):
        # We load the data into memory as provided in the example notebook of the challenge
        with np.load(file_name) as npz:
            mask = npz["mask"]
            data = npz["data"]
            # Boolean selection keeps the row-major (x outer, y inner) pixel
            # order of the former nested loops; the result is (n_pixels, n_bands).
            pixels = data[:, np.logical_not(mask[0])].T

        x_train.extend(pixels)
        y_train.extend([labels[idx]] * len(pixels))

    return x_train, y_train

In [4]:
def load_data_testing(directory, model=None, scaler=None):
    """Predict one target vector per test patch by averaging pixel predictions.

    For every ``*.npz`` file in ``directory`` (sorted by the integer in its
    file name) the usable pixels (``mask[0]`` falsy) are collected as rows,
    passed through ``scaler.transform`` and ``model.predict``, and the
    per-pixel predictions are averaged over axis 0.

    Args:
        directory (str): Folder containing the ``<integer>.npz`` patch files.
        model: Fitted estimator exposing ``predict``. When omitted, the
            notebook-level variable ``model`` is used, which was the only
            behaviour before this parameter existed.
        scaler: Fitted transformer exposing ``transform``. When omitted, the
            notebook-level variable ``scaler`` is used.

    Returns:
        numpy.ndarray: One row of averaged predictions per file, in sorted
        file order.

    Raises:
        NameError: If ``model`` / ``scaler`` is neither passed nor defined at
            notebook level.
        ValueError: If a file contains no usable pixel.
    """
    # Backward compatible fallback to the globals the original code relied on.
    notebook_scope = globals()
    if model is None:
        if "model" not in notebook_scope:
            raise NameError("name 'model' is not defined; train a model first or pass model=...")
        model = notebook_scope["model"]
    if scaler is None:
        if "scaler" not in notebook_scope:
            raise NameError("name 'scaler' is not defined; fit the scaler first or pass scaler=...")
        scaler = notebook_scope["scaler"]

    all_files = sorted(
        glob(os.path.join(directory, "*.npz")),
        key=lambda x: int(os.path.basename(x).replace(".npz", "")),
    )

    predictions = []
    for file_name in tqdm(all_files, total=len(all_files), desc="Loading test data .."):
        # We load the data into memory as provided in the example notebook of the challenge
        with np.load(file_name) as npz:
            mask = npz["mask"]
            data = npz["data"]
            # (n_bands, n_pixels) -> (n_pixels, n_bands), row-major pixel order
            pixels = data[:, np.logical_not(mask[0])].T

        if len(pixels) == 0:
            # Averaging zero predictions is undefined; fail with the file name
            # instead of an opaque shape error from the scaler.
            raise ValueError("No unmasked pixels in {}".format(file_name))

        pixel_predictions = model.predict(scaler.transform(pixels))
        predictions.append(np.mean(pixel_predictions, axis=0))

    return np.array(predictions)

In [13]:
# Dataset locations, relative to the notebook's working directory; adjust them for your own system
train_data_dir = "train_data/train_data/train_data"
test_data_dir = "test_data"
gt_data_path = "train_data/train_data/train_gt.csv"

# Load every unmasked training pixel together with its label (no hold-out split here).
# Alternative loader that also returns a per-patch 80/20 hold-out split:
# X_train, Y_train, X_test, Y_test = load_data_1d(train_data_dir, gt_data_path)
# NOTE(review): later cells reference X_test / Y_test, which only the alternative above provides.
X_train, Y_train = load_data_1d_kf(train_data_dir, gt_data_path)


Loading training data ..:   0%|          | 0/1732 [00:00<?, ?it/s]

In [14]:
# Convert the Python lists returned by the loader into numpy arrays
X_train = np.array(X_train)
Y_train = np.array(Y_train)
# X_val = np.array(X_val)
# Y_val = np.array(Y_val)
# NOTE(review): X_test / Y_test are only produced by load_data_1d. The loading cell of this
# notebook calls load_data_1d_kf, which returns no test split, so on a fresh kernel the two
# lines below raise NameError. Confirm which loader is intended.
X_test = np.array(X_test)
Y_test = np.array(Y_test)

In [15]:
# Sanity check: one row of four targets (P, K, Mg, pH) per loaded pixel
Y_train.shape

(5798178, 4)

In [17]:
# Sanity check: the row count must match Y_train; each row is one pixel spectrum
X_train.shape

(5798178, 150)

In [18]:
# Scale target column 0 (P) of the training labels by a fixed constant.
# NOTE(review): the constant was described as "the maximum value" (to map values into 0 - 1),
# but this notebook does not show where it comes from. A divisor of ~6.78 for pH in the next
# cell suggests these may be column means rather than maxima. Confirm.
# NOTE(review): the division is applied in place, so re-running this cell scales the column again.
Y_train[:, 0] = Y_train[:,0]  /  70.3026558891455

In [19]:
# Fixed per-column divisors for the four targets, in label-column order.
target_divisors = (
    70.3026558891455,    # column 0: P
    227.9885103926097,   # column 1: K
    159.28123556581986,  # column 2: Mg
    6.782719399538106,   # column 3: pH
)

# Column 0 of Y_train is scaled in the previous cell, so only Y_test needs it here.
Y_test[:, 0] = Y_test[:, 0] / target_divisors[0]

# Columns 1-3 are scaled for both label arrays. The division is written back in
# place, so running this cell twice scales the labels twice.
for col in (1, 2, 3):
    Y_train[:, col] = Y_train[:, col] / target_divisors[col]
    Y_test[:, col] = Y_test[:, col] / target_divisors[col]

In [20]:
from tensorflow.keras.layers import (Concatenate, Conv1D, Dense, Flatten,
                                     Input, MaxPooling1D, Reshape)
from tensorflow.keras.models import Model


def getKerasModel(model_name):
    """Build one of the 1D-CNN architectures defined in this notebook.

    Parameters
    ----------
    model_name : str
        One of "LucasCNN", "HuEtAl", "LiuEtAl", "LucasResNet" or
        "LucasCoordConv".

    Returns
    -------
    keras Model or None
        A newly constructed model that has not been compiled, or ``None``
        (after printing an error message) when ``model_name`` is not one of
        the names above.

    """
    # The lambdas defer the name lookup, so only the requested builder is touched.
    builders = (
        ("LucasCNN", lambda: LucasCNN()),
        ("HuEtAl", lambda: HuEtAl()),
        ("LiuEtAl", lambda: LiuEtAl()),
        ("LucasResNet", lambda: LucasResNet()),
        ("LucasCoordConv", lambda: LucasCoordConv()),
    )
    for known_name, build in builders:
        if model_name == known_name:
            return build()

    print("Error: Model {0} not implemented.".format(model_name))
    return None


def HuEtAl():
    """Return 1D-CNN by Wei Hu et al 2014, ending in a 4-unit linear output.

    Takes inputs of shape (150, 1) and returns an uncompiled functional keras
    Model: one tanh convolution, max pooling, a 100-unit tanh dense layer and
    a 4-unit linear regression head.
    """
    n_bands = 150

    # k1 and k2 as defined by Hu et al: 150 // 9 = 16 and int(135 / 35) = 3
    conv_width = n_bands // 9
    pool_width = int((n_bands - conv_width + 1) / 35)

    spectrum = Input(shape=(n_bands, 1))

    # conv -> pool -> flatten -> fully connected -> linear head (no softmax)
    pipeline = [
        Conv1D(filters=20, kernel_size=conv_width, activation="tanh"),
        MaxPooling1D(pool_width),
        Flatten(),
        Dense(units=100, activation="tanh"),
        Dense(4, activation="linear"),
    ]
    tensor = spectrum
    for layer in pipeline:
        tensor = layer(tensor)

    return Model(inputs=spectrum, outputs=tensor)


def LiuEtAl():
    """Return 1D-CNN by Lanfa Liu et al 2018, ending in a 4-unit linear output.

    Takes inputs of shape (150, 1) and returns an uncompiled functional keras
    Model: four relu convolution + pool-by-2 stages (32, 32, 64, 64 filters)
    followed directly by a 4-unit linear regression head.
    """
    n_bands = 150
    conv_width = 3

    spectrum = Input(shape=(n_bands, 1))

    # CONV1 .. CONV4, each followed by max pooling
    tensor = spectrum
    for n_filters in (32, 32, 64, 64):
        tensor = Conv1D(filters=n_filters, kernel_size=conv_width, activation="relu")(tensor)
        tensor = MaxPooling1D(2)(tensor)

    # Flatten & linear head (no softmax: this is a regression model)
    tensor = Flatten()(tensor)
    prediction = Dense(4, activation="linear")(tensor)

    return Model(inputs=spectrum, outputs=prediction)


def LucasCNN():
    """Return LucasCNN implementation.

    Four "valid"-padded relu convolution + pool-by-2 stages (32, 32, 64, 64
    filters, kernel size 3), then dense layers of 120 and 160 relu units and
    a 4-unit linear output. Input shape is (150, 1).

    Returns
    -------
    keras Model
        Uncompiled functional model.

    """
    n_bands = 150
    conv_options = dict(kernel_size=3, activation="relu", padding="valid")

    spectrum = Input(shape=(n_bands, 1))

    # CONV1 .. CONV4, each followed by max pooling
    tensor = spectrum
    for n_filters in (32, 32, 64, 64):
        tensor = Conv1D(filters=n_filters, **conv_options)(tensor)
        tensor = MaxPooling1D(2)(tensor)

    # Flatten, FC1, FC2, linear head (no softmax: this is a regression model)
    tensor = Flatten()(tensor)
    for n_units in (120, 160):
        tensor = Dense(n_units, activation="relu")(tensor)
    prediction = Dense(4, activation="linear")(tensor)

    return Model(inputs=spectrum, outputs=prediction)


def LucasResNet():
    """Return LucasResNet implementation.

    Four "same"-padded relu convolution + pool-by-2 stages (32, 32, 64, 64
    filters, kernel size 3) whose flattened output is concatenated with the
    flattened raw input (the skip connection), followed by dense layers of
    150 and 100 relu units and a 4-unit linear output. Input shape is
    (150, 1).

    The skip connection used to reshape the input to (10, 15) and concatenate
    it with the convolution output along the channel axis. With a sequence
    length of 150 the four pool-by-2 stages leave 9 steps (150 -> 75 -> 37 ->
    18 -> 9), so the tensors (9, 64) and (10, 15) could not be concatenated
    and building the model failed. Both branches are now flattened before
    concatenation, which feeds the same values to the dense layers and works
    for any sequence length.

    Returns
    -------
    keras Model
        Uncompiled functional model.

    """
    seq_length = 150
    kernel_size = 3
    activation = "relu"
    padding = "same"

    inp = Input(shape=(seq_length, 1))

    # CONV1 .. CONV4, each followed by max pooling
    x = inp
    for filters in (32, 32, 64, 64):
        x = Conv1D(filters=filters,
                   kernel_size=kernel_size,
                   activation=activation,
                   padding=padding)(x)
        x = MaxPooling1D(2)(x)

    # Residual block: join conv features and raw spectrum as flat vectors
    x = Concatenate(axis=-1)([Flatten()(x), Flatten()(inp)])

    # FC1, FC2, linear head (no softmax: this is a regression model)
    x = Dense(150, activation=activation)(x)
    x = Dense(100, activation=activation)(x)
    x = Dense(4, activation="linear")(x)

    return Model(inputs=inp, outputs=x)


def LucasCoordConv():
    """Return LucasCoordConv implementation.

    Four "valid"-padded relu convolution + pool-by-2 stages (32, 64, 64, 128
    filters, kernel size 3), then dense layers of 256 and 128 relu units and
    a 4-unit linear output. Input shape is (150, 1).

    Note that, despite the name, the first stage is an ordinary Conv1D applied
    to the raw input; no coordinate channel is added in this function.

    Returns
    -------
    keras Model
        Uncompiled functional model.

    """
    n_bands = 150
    conv_options = dict(kernel_size=3, activation="relu", padding="valid")

    spectrum = Input(shape=(n_bands, 1))

    # CONV1 .. CONV4, each followed by max pooling
    tensor = spectrum
    for n_filters in (32, 64, 64, 128):
        tensor = Conv1D(filters=n_filters, **conv_options)(tensor)
        tensor = MaxPooling1D(2)(tensor)

    # Flatten, FC1, FC2, linear head (no softmax: this is a regression model)
    tensor = Flatten()(tensor)
    for n_units in (256, 128):
        tensor = Dense(n_units, activation="relu")(tensor)
    prediction = Dense(4, activation="linear")(tensor)

    return Model(inputs=spectrum, outputs=prediction)

2024-03-22 08:27:00.983441: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-22 08:27:01.030170: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-22 08:27:01.276551: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-22 08:27:01.277860: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [21]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaler for the spectral features; `scaler` is also read by load_data_testing
scaler = MinMaxScaler()

# Fit on the training features and overwrite X_train with the scaled version.
# NOTE(review): the scaler is fitted on all rows before the K-fold split below, so the
# validation folds contribute to the scaling statistics. Confirm this is acceptable.
# NOTE(review): X_val / X_test are not scaled here (lines left commented out); any later use
# of X_test with a model trained on the scaled X_train would be inconsistent.
X_train = scaler.fit_transform(X_train)
# X_val = scaler.transform(X_val)
# X_test = scaler.transform(X_test)

In [30]:
from sklearn.metrics import mean_squared_error
from matplotlib import pyplot as plt
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Expects X_train and Y_train (numpy arrays) from the cells above

# NOTE(review): k_folds is not referenced anywhere in this cell; the number of splits
# actually used is n_splits, defined next to the KFold object.
k_folds = 5

def train_regression_model(model, X_train, Y_train, X_test, y_test, epochs=15, batch_size=256):
    """Compile, fit and evaluate a keras regression model on one data split.

    The model is compiled with the Adam optimizer, MSE loss and MAE metric,
    trained on ``(X_train, Y_train)`` with ``(X_test, y_test)`` as validation
    data, and finally evaluated on ``(X_test, y_test)``. Loss and MAE of that
    evaluation are printed.

    Returns:
        tuple: ``(model, loss)``, the trained model and the MSE loss reported
        by ``model.evaluate`` on the held-out data.
    """
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])

    held_out = (X_test, y_test)
    model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_data=held_out)

    test_loss, test_mae = model.evaluate(*held_out)
    for caption, value in (("Test Loss:", test_loss), ("Test MAE:", test_mae)):
        print(caption, value)

    return model, test_loss

# K-fold cross-validation of the "LucasCoordConv" architecture.
# best_model / best_rmse track the fold with the lowest validation RMSE;
# results collects the RMSE of every fold.
best_model = None
best_rmse = float("inf")  # any real score beats the initial value

# Initialize a KFold with the number of splits
n_splits = 5  # You can adjust this as needed
# No shuffling: folds are contiguous index ranges of X_train.
kfold = KFold(n_splits=n_splits)
results = []
for train_idx, test_idx in kfold.split(X_train):
    X_train1, y_train1 = X_train[train_idx], Y_train[train_idx]
    X_test1, y_test1 = X_train[test_idx], Y_train[test_idx]

    # A fresh, untrained model for every fold
    model = getKerasModel("LucasCoordConv")

    # train_regression_model reports the MSE loss; take the square root so the
    # value stored under the name "rmse" really is a root mean squared error.
    model, mse = train_regression_model(model, X_train1, y_train1, X_test1, y_test1)
    rmse = mse ** 0.5
    results.append(rmse)

    # Check if this model has a lower RMSE than the best model so far
    if rmse < best_rmse:
        best_model = model
        best_rmse = rmse

        

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test Loss: 0.08052553981542587
Test MAE: 0.18282243609428406
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test Loss: 0.08317136019468307
Test MAE: 0.1871417611837387
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test Loss: 0.05930153653025627
Test MAE: 0.15759696066379547
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test Loss: 0.05635342374444008
Test MAE: 0.1698915660381317
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/

In [None]:
# Train on the whole training set, validating on X_test / Y_test.
# NOTE(review): `model` is whatever object the most recently executed cell bound to that name,
# and X_test / Y_test are not produced by the loader this notebook calls. Confirm both.
model.fit(X_train, Y_train, epochs=15, batch_size=256, validation_data=(X_test, Y_test))

Epoch 1/15
Epoch 2/15
Epoch 3/15

In [21]:
def train_regression_model(model, X_train, Y_train, X_test, y_test, epochs=15, batch_size=256):
    """Compile, fit and evaluate a keras regression model, also reporting RMSE.

    The model is compiled with the Adam optimizer, MSE loss and MAE metric,
    trained on ``(X_train, Y_train)`` with ``(X_test, y_test)`` as validation
    data, and evaluated on ``(X_test, y_test)``. Loss, MAE and the root mean
    squared error of the predictions on ``X_test`` are printed.

    Returns:
        tuple: ``(model, loss)``, the trained model and the MSE loss from
        ``model.evaluate``. The other definition of this function in the
        notebook returns the same pair and the K-fold loop unpacks it; this
        version previously returned ``None``, which broke that unpacking
        whenever this cell had been run last.
    """
    # Compile the model
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])

    # Train the model
    model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test))

    # Evaluate the model
    loss, mae = model.evaluate(X_test, y_test)
    print("Test Loss:", loss)
    print("Test MAE:", mae)

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate Mean Squared Error and report its square root
    mse = mean_squared_error(y_test, y_pred)
    print("root Mean Squared Error:", mse ** 0.5)

    return model, loss

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense

# Each sample is a sequence of 150 values with a single channel
input_shape = (150, 1)

# Small 1D-CNN: two conv + pool stages, one hidden dense layer, 4 linear outputs
model = Sequential([
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=input_shape),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(4),  # regression output, so no activation function
])

# MSE loss with MAE reported as an additional metric
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Print the model summary
model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_2 (Conv1D)           (None, 148, 64)           256       
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 74, 64)           0         
 1D)                                                             
                                                                 
 conv1d_3 (Conv1D)           (None, 72, 64)            12352     
                                                                 
 max_pooling1d_3 (MaxPooling  (None, 36, 64)           0         
 1D)                                                             
                                                                 
 flatten_1 (Flatten)         (None, 2304)              0         
                                                                 
 dense_2 (Dense)             (None, 64)               

In [18]:
# Train and evaluate the model defined above on the train / hold-out arrays.
# NOTE(review): the recorded output shows 5 epochs although the visible default is epochs=15,
# so this output appears to come from an earlier version of the function. Confirm.
train_regression_model(model, X_train, Y_train, X_test, Y_test)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Loss: 0.00723271956667304
Test MAE: 0.062470924109220505
root Mean Squared Error: 0.0850453066638279


In [None]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error

# Random forest regression baseline (multi-output: one prediction per target column).
# Note that this rebinds the notebook-level name `model`.
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model to the training data
model.fit(X_train, Y_train)

# Make predictions on the hold-out data
y_pred = model.predict(X_test)

# Evaluate against the hold-out targets. The predictions are for X_test, so they
# must be compared with Y_test; comparing them with Y_train was wrong (and fails
# outright whenever the two sets differ in size).
mse = mean_squared_error(Y_test, y_pred)
print("Mean Squared Error:", mse)


In [None]:
# First target column as a flat vector. -1 lets numpy infer the length, so the
# cell no longer breaks when the number of loaded samples changes.
y = np.reshape(Y_train[:, 0], -1)

In [20]:
# Sanity check: y is 1D with one entry per training sample
y.shape

(4639319,)

In [11]:
import tensorflow as tf
from transformers import TFBertModel, BertTokenizer
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import numpy as np

# Load pre-trained BERT model and tokenizer
bert_model = TFBertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Expects X_train (features) and Y_train (regression targets) from the cells above

# Differences between neighbouring values of every row (one value fewer than the row length)
X_train_diff = np.diff(X_train)

# Clamp negative differences to zero
X_train_diff = np.maximum(0, X_train_diff)

# Turn every row of numbers into one space-separated string, e.g. [1, 2, 3] -> '1 2 3'
X_train_text = [' '.join(map(str, vec)) for vec in X_train_diff]

# Tokenize input: one (input_ids, attention_mask) pair of tensors per row.
# NOTE(review): this encodes the rows one at a time in a Python loop, which is very slow
# for millions of rows. Confirm this is intended.
input_ids = []
attention_masks = []

for text in X_train_text:
    encoded_dict = tokenizer.encode_plus(
                        text,
                        add_special_tokens = True,
                        max_length = 149,  # NOTE(review): this limit counts tokens (special tokens included), not numbers; long rows are presumably truncated. Confirm.
                        padding = 'max_length',
                        truncation = True,
                        return_attention_mask = True,
                        return_tensors = 'tf'
                   )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])
    

2024-03-07 02:21:39.281954: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-07 02:21:39.333691: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-07 02:21:39.581026: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-07 02:21:39.582428: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-07 02:21:44.656025: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen som

In [13]:
# Stack the per-row tensors into single (n_samples, 149) tensors
input_ids = tf.concat(input_ids, axis=0)
attention_masks = tf.concat(attention_masks, axis=0)

# Define model architecture: token ids and attention mask in, 4 regression values out
input_layer = Input(shape=(149,), dtype=tf.int32)
mask_layer = Input(shape=(149,), dtype=tf.int32)

# Element 0 of the BERT output is the per-token hidden state; position 0 of the
# sequence is the first token, whose vector feeds the regression head.
bert_output = bert_model(input_layer, attention_mask=mask_layer)[0]
output = Dense(4)(bert_output[:, 0, :])  # No activation function for regression

model = Model(inputs=[input_layer, mask_layer], outputs=output)

# Compile the model. `learning_rate` is the supported argument name; the old
# `lr` alias is deprecated and rejected by newer Keras releases.
optimizer = Adam(learning_rate=2e-5)
model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mean_absolute_error'])

# Train the model, holding out the last 20 % of the samples for validation
model.fit([input_ids, attention_masks], Y_train, epochs=5, batch_size=256, validation_split=0.2)




Epoch 1/5
















 2621/14498 [====>.........................] - ETA: 79:54:50 - loss: 2138.3713 - mean_absolute_error: 26.5699

KeyboardInterrupt: 

In [37]:
# Inspect one sample (the second row of X_train)
X_train[1]

array([ 498,  500,  504,  508,  517,  542,  552,  552,  554,  562,  573,
        587,  593,  612,  633,  641,  650,  670,  681,  682,  688,  694,
        722,  749,  764,  777,  796,  808,  826,  844,  866,  882,  893,
        902,  911,  915,  927,  940,  957,  972,  987,  994, 1000, 1005,
       1018, 1032, 1044, 1046, 1050, 1056, 1065, 1069, 1072, 1072, 1077,
       1084, 1091, 1112, 1127, 1127, 1128, 1129, 1130, 1131, 1132, 1133,
       1134, 1141, 1148, 1188, 1233, 1273, 1302, 1319, 1320, 1323, 1327,
       1334, 1341, 1348, 1356, 1363, 1370, 1376, 1384, 1392, 1400, 1407,
       1412, 1416, 1418, 1419, 1422, 1424, 1425, 1429, 1433, 1455, 1479,
       1498, 1507, 1508, 1509, 1510, 1511, 1512, 1513, 1514, 1514, 1515,
       1521, 1526, 1531, 1536, 1543, 1546, 1548, 1554, 1562, 1585, 1607,
       1614, 1615, 1616, 1617, 1618, 1619, 1620, 1621, 1622, 1623, 1627,
       1631, 1638, 1639, 1641, 1643, 1646, 1651, 1652, 1652, 1652, 1655,
       1657, 1659, 1662, 1665, 1668, 1671, 1674], d

In [8]:
def preprocess_array(arr):
    """Remove every drop between neighbouring values of each row, in place.

    Rows are scanned left to right. Whenever a value is smaller than its left
    neighbour it is replaced:

    * by the mean of the left neighbour and the right neighbour, if a right
      neighbour exists and is not smaller than the left neighbour;
    * otherwise by the left neighbour plus 1.

    Later comparisons see the values already replaced earlier in the scan.

    Args:
        arr: 2D numpy array; it is modified in place.

    Returns:
        The same array object that was passed in.
    """
    n_rows, n_cols = arr.shape
    for r in range(n_rows):
        row = arr[r]  # view: writes go straight into arr
        for pos in range(1, n_cols):
            left = row[pos - 1]
            if row[pos] < left:
                after = pos + 1
                if after < n_cols and row[after] >= left:
                    row[pos] = (left + row[after]) / 2
                else:
                    row[pos] = left + 1
    return arr

In [None]:
# Observation: results are better at the ends, which suggests that most of the error comes from the 11 x 11 patches. Investigate those patches further.

In [10]:
# Differencing one row removes one element from its length
np.diff(X_train[0]).shape

(149,)

In [15]:
# Despite its name, X_TEST holds predictions: load_data_testing scales each test patch,
# runs the notebook-level `model` on its pixels and averages the predictions per patch.
X_TEST = load_data_testing(test_data_dir)
# load_data_testing already returns a numpy array, so this conversion is effectively a copy
X_TEST = np.array(X_TEST)

Loading training data ..:   0%|          | 0/1154 [00:00<?, ?it/s]









In [16]:
# One row per test patch, one column per predicted target
submission_df = pd.DataFrame(data=X_TEST, columns=["P", "K", "Mg", "pH"])
data = submission_df

# Long format for the submission file: label "<row index>_<target name>" -> value
labels = []
values = []

# Iterate through each row of the original data and expand it.
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for i, row in data.iterrows():
    for j, col in row.items():
        labels.append(f"{i}_{j}")
        values.append(col)

# Create a new DataFrame with the expanded data
new_data = pd.DataFrame({"Label": labels, "Value": values})

new_data.to_csv("SampleSubmission.csv", index=False)