In [None]:
# Mount Google Drive at /content/drive (Colab-only); the parquet file read
# further down lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from datetime import datetime
from tqdm import tqdm

In [None]:
# Read the cleaned 2D table from the mounted Drive folder.
cleaned_2d_path = "/content/drive/MyDrive/LSE Capstone G Drive/Train Val Test Sets/2D Dataframes/2d_data_cleaned.parquet"
data = pd.read_parquet(cleaned_2d_path)

# Quick look at what was loaded: dimensions, column names, first rows.
loaded_shape = data.shape
loaded_columns = list(data.columns)
print("Shape:", loaded_shape)
print("Columns:", loaded_columns)
print(data.head())

Shape: (14146902, 13)
Columns: ['lat_5km', 'lon_5km', 'KD490', 'ZSD', 'RRS490', 'RRS443', 'CHL', 'MICRO', 'BBP', 'CDM', 'time', 'flags', 'date']
   lat_5km  lon_5km     KD490        ZSD  RRS490  RRS443       CHL  MICRO  \
0    49.15   -10.75  0.035727  26.936846    -2.0    -2.0  0.124980   -2.0   
1    49.15   -10.70  0.035727  26.936848    -2.0    -2.0  0.124980   -2.0   
2    49.15   -10.65  0.035787  26.878199    -2.0    -2.0  0.125559   -2.0   
3    49.15   -10.60  0.035876  26.790228    -2.0    -2.0  0.126427   -2.0   
4    49.15   -10.55  0.035876  26.790228    -2.0    -2.0  0.126427   -2.0   

   BBP  CDM       time  flags        date  
0 -2.0 -2.0 2025-01-01    0.0  2025-01-01  
1 -2.0 -2.0 2025-01-01    0.0  2025-01-01  
2 -2.0 -2.0 2025-01-01    0.0  2025-01-01  
3 -2.0 -2.0 2025-01-01    0.0  2025-01-01  
4 -2.0 -2.0 2025-01-01    0.0  2025-01-01  


## Creation of Train / Validation / Test sets

### Identify date ranges of the data & check for missing dates

In [None]:
# Make sure the 'date' column is datetime-typed before comparing days.
data['date'] = pd.to_datetime(data['date'])

# Distinct observation days, in ascending order.
observed_days = sorted(data['date'].unique())
unique_dates = pd.to_datetime(observed_days)
first_day, last_day = unique_dates.min(), unique_dates.max()
print(f"Number of unique dates: {len(unique_dates)}")
print(f"Date range: {first_day.date()} to {last_day.date()}")

# Every calendar day between the first and the last observation.
expected_dates = pd.date_range(start=first_day, end=last_day, freq='D')

# Calendar days inside that span for which the table has no rows.
missing_dates = set(expected_dates).difference(unique_dates)
missing_sorted = sorted(missing_dates)
if not missing_dates:
    print("\n All dates are continuous.")
else:
    print(f"\nMissing dates: {len(missing_dates)}")
    print("Example missing dates:", missing_sorted[:10])

print(missing_sorted)

Number of unique dates: 1067
Date range: 2022-01-27 to 2025-07-21

Missing dates: 205
Example missing dates: [Timestamp('2022-11-21 00:00:00'), Timestamp('2022-11-22 00:00:00'), Timestamp('2022-11-23 00:00:00'), Timestamp('2022-11-24 00:00:00'), Timestamp('2022-11-25 00:00:00'), Timestamp('2022-11-26 00:00:00'), Timestamp('2022-11-27 00:00:00'), Timestamp('2022-11-28 00:00:00'), Timestamp('2022-11-29 00:00:00'), Timestamp('2022-11-30 00:00:00')]
[Timestamp('2022-11-21 00:00:00'), Timestamp('2022-11-22 00:00:00'), Timestamp('2022-11-23 00:00:00'), Timestamp('2022-11-24 00:00:00'), Timestamp('2022-11-25 00:00:00'), Timestamp('2022-11-26 00:00:00'), Timestamp('2022-11-27 00:00:00'), Timestamp('2022-11-28 00:00:00'), Timestamp('2022-11-29 00:00:00'), Timestamp('2022-11-30 00:00:00'), Timestamp('2022-12-01 00:00:00'), Timestamp('2022-12-02 00:00:00'), Timestamp('2022-12-03 00:00:00'), Timestamp('2022-12-04 00:00:00'), Timestamp('2022-12-05 00:00:00'), Timestamp('2022-12-06 00:00:00'), Times

### Create and save the sets in tabular 2D arrays

In [None]:
# Inclusive first / last calendar day of each split.
train_start, train_end = pd.to_datetime("2022-01-27"), pd.to_datetime("2023-11-17")
val_start, val_end = pd.to_datetime("2024-01-26"), pd.to_datetime("2024-07-31")
test_start, test_end = pd.to_datetime("2024-08-01"), pd.to_datetime("2025-07-21")

# Select the rows of each split; both boundary days are included.
row_dates = data["date"]
train_set = data[(train_start <= row_dates) & (row_dates <= train_end)]
val_set = data[(val_start <= row_dates) & (row_dates <= val_end)]
test_set = data[(test_start <= row_dates) & (row_dates <= test_end)]

# Report the date span actually covered by each split.
train_first, train_last = train_set['date'].min().date(), train_set['date'].max().date()
val_first, val_last = val_set['date'].min().date(), val_set['date'].max().date()
test_first, test_last = test_set['date'].min().date(), test_set['date'].max().date()
print(f"Train set dates: {train_first} to {train_last}")
print(f"Validation set dates: {val_first} to {val_last}")
print(f"Test set dates: {test_first} to {test_last}")

# Distinct days per split (sorted), used to check that the splits together
# account for every observed day.
unique_train_days = pd.to_datetime(sorted(train_set["date"].unique()))
unique_val_days = pd.to_datetime(sorted(val_set["date"].unique()))
unique_test_days = pd.to_datetime(sorted(test_set["date"].unique()))

n_train_days = len(unique_train_days)
n_val_days = len(unique_val_days)
n_test_days = len(unique_test_days)

print(f"\nTotal number of unique days: {len(unique_dates)}")
print(f"\nTrain days: {n_train_days}")
print(f"Val days: {n_val_days}")
print(f"Test days: {n_test_days}")
print(f"Sum of days: {n_train_days + n_val_days + n_test_days}")

# Write each split to a parquet file in the current working directory.
train_set.to_parquet("train_set_2D.parquet")
val_set.to_parquet("val_set_2D.parquet")
test_set.to_parquet("test_set_2D.parquet")

Train set dates: 2022-01-27 to 2023-11-17
Validation set dates: 2024-01-26 to 2024-07-31
Test set dates: 2024-08-01 to 2025-07-21

Total number of unique days: 1067

Train days: 593
Val days: 188
Test days: 286
Sum of days: 1067


### Find the continuous blocks of data (winter gaps in between) in train-val-test sets

In [None]:
def find_blocks(df):
    """Group the distinct days in ``df['date']`` into runs of consecutive days.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'date'`` column that ``pd.to_datetime`` can parse.
        Rows may be unsorted and dates may repeat; missing dates (NaT/None)
        are ignored.

    Returns
    -------
    list of (datetime.date, datetime.date)
        One ``(first_day, last_day)`` pair per run, in chronological order.
        A new run starts whenever two successive distinct dates are more than
        one day apart. An empty list is returned when ``df`` has no dates.
    """
    # Missing dates are dropped first: they cannot be ordered and do not
    # belong to any run.
    dates = pd.to_datetime(sorted(df['date'].dropna().unique()))
    if len(dates) == 0:
        return []

    blocks = []
    start_date = prev_date = dates[0]

    for current_date in dates[1:]:
        # A jump of more than one day closes the current run.
        if (current_date - prev_date).days > 1:
            blocks.append((start_date.date(), prev_date.date()))
            start_date = current_date
        prev_date = current_date

    # The last run is still open when the loop finishes.
    blocks.append((start_date.date(), prev_date.date()))
    return blocks

# Consecutive-day runs for each split.
train_blocks = find_blocks(train_set)
val_blocks = find_blocks(val_set)
test_blocks = find_blocks(test_set)

# Print every run as "first day → last day", one section per split.
sections = (
    ("Train Set Blocks:", train_blocks),
    ("\nValidation Set Blocks:", val_blocks),
    ("\nTest Set Blocks:", test_blocks),
)
for heading, blocks in sections:
    print(heading)
    for first_day, last_day in blocks:
        print(f"{first_day}  →  {last_day}")

Train Set Blocks:
2022-01-27  →  2022-11-20
2023-01-27  →  2023-11-17

Validation Set Blocks:
2024-01-26  →  2024-07-31

Test Set Blocks:
2024-08-01  →  2024-11-17
2025-01-26  →  2025-07-21


### Sanity check for missing periods of data overall (not used in modelling)
This check confirms that the only dates missing from the train set fall within the winter months, and that the validation set has no missing dates at all.

In [None]:
# --- Train set: calendar span vs. days actually present ---
train_first_day = train_set['date'].min()
train_last_day = train_set['date'].max()
expected_train_dates = pd.date_range(start=train_first_day, end=train_last_day, freq='D')
actual_train_dates = pd.to_datetime(train_set['date'].unique())
missing_train_dates = expected_train_dates.difference(actual_train_dates)

print(f"\nTrain set expected {len(expected_train_dates)} days, actual {len(actual_train_dates)} days.")
if len(missing_train_dates) > 0:
    print(f"Missing train dates: {missing_train_dates.tolist()}")
else:
    print("No missing dates in train set.")

# --- Validation set: same comparison ---
val_first_day = val_set['date'].min()
val_last_day = val_set['date'].max()
expected_val_dates = pd.date_range(start=val_first_day, end=val_last_day, freq='D')
actual_val_dates = pd.to_datetime(val_set['date'].unique())
missing_val_dates = expected_val_dates.difference(actual_val_dates)

print(f"\nValidation set expected {len(expected_val_dates)} days, actual {len(actual_val_dates)} days.")
if len(missing_val_dates) > 0:
    print(f"Missing validation dates: {missing_val_dates.tolist()}")
else:
    print("No missing dates in validation set.")


Train set expected 660 days, actual 593 days.
Missing train dates: [Timestamp('2022-11-21 00:00:00'), Timestamp('2022-11-22 00:00:00'), Timestamp('2022-11-23 00:00:00'), Timestamp('2022-11-24 00:00:00'), Timestamp('2022-11-25 00:00:00'), Timestamp('2022-11-26 00:00:00'), Timestamp('2022-11-27 00:00:00'), Timestamp('2022-11-28 00:00:00'), Timestamp('2022-11-29 00:00:00'), Timestamp('2022-11-30 00:00:00'), Timestamp('2022-12-01 00:00:00'), Timestamp('2022-12-02 00:00:00'), Timestamp('2022-12-03 00:00:00'), Timestamp('2022-12-04 00:00:00'), Timestamp('2022-12-05 00:00:00'), Timestamp('2022-12-06 00:00:00'), Timestamp('2022-12-07 00:00:00'), Timestamp('2022-12-08 00:00:00'), Timestamp('2022-12-09 00:00:00'), Timestamp('2022-12-10 00:00:00'), Timestamp('2022-12-11 00:00:00'), Timestamp('2022-12-12 00:00:00'), Timestamp('2022-12-13 00:00:00'), Timestamp('2022-12-14 00:00:00'), Timestamp('2022-12-15 00:00:00'), Timestamp('2022-12-16 00:00:00'), Timestamp('2022-12-17 00:00:00'), Timestamp('20

# Tensor creation: winter dates are left out (not imputed); land cells are filled with -2

In [None]:
# Variables stacked along the tensor's second axis, in this order.
parameter_columns = ['KD490', 'ZSD', 'RRS490', 'RRS443', 'CHL', 'MICRO', 'BBP', 'CDM']


def frame_to_tensor(df, variables, fill_value=-2.0):
    """Scatter a long-format frame into a dense (date, variable, lat, lon) array.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'date', 'lat_5km', 'lon_5km' and every name
        in ``variables``.
    variables : list of str
        Columns to place along axis 1, in the given order.
    fill_value : float, default -2.0
        Value kept in every cell that no row fills (no row for that
        date/lat/lon, or the row's value is NaN).

    Returns
    -------
    tensor : numpy.ndarray
        Shape ``(len(dates), len(variables), len(lats), len(lons))``.
    dates, lats, lons : numpy.ndarray
        Sorted unique values of the three key columns; position ``n`` on an
        axis of ``tensor`` corresponds to element ``n`` of the matching array.
    """
    dates = np.sort(df['date'].unique())
    lats = np.sort(df['lat_5km'].unique())
    lons = np.sort(df['lon_5km'].unique())

    tensor = np.full((len(dates), len(variables), len(lats), len(lons)), fill_value)

    # Axis position of every row, looked up for all rows at once instead of
    # once per row inside a Python loop.
    t_idx = pd.Index(dates).get_indexer(df['date'])
    i_idx = pd.Index(lats).get_indexer(df['lat_5km'])
    j_idx = pd.Index(lons).get_indexer(df['lon_5km'])

    # One vectorised scatter per variable; NaN entries are skipped so those
    # cells keep ``fill_value``.
    # NOTE(review): assumes at most one row per (date, lat, lon). With
    # duplicate keys NumPy does not guarantee which row's value is kept.
    for k, var in enumerate(variables):
        column = df[var].to_numpy(dtype=float, na_value=np.nan)
        present = ~np.isnan(column)
        tensor[t_idx[present], k, i_idx[present], j_idx[present]] = column[present]

    return tensor, dates, lats, lons


##### Test set to tensor
# Dates are turned into 'YYYY-MM-DD' strings so the date axis is labelled by day.
# To build the train or validation tensor, apply the same two steps to
# train_set / val_set and pass the result to frame_to_tensor.
test_set = test_set.copy()
test_set['date'] = pd.to_datetime(test_set['date']).dt.strftime('%Y-%m-%d')

# Dense tensor plus its axis labels; per the original note, -2.0 marks
# land / missing cells.
tensor, dates, lats, lons = frame_to_tensor(test_set, parameter_columns, fill_value=-2.0)

# Label -> axis-position lookups, kept for any later cell that needs them.
date_to_index = {date: i for i, date in enumerate(dates)}
lat_to_index = {lat: i for i, lat in enumerate(lats)}
lon_to_index = {lon: i for i, lon in enumerate(lons)}
var_to_index = {var: i for i, var in enumerate(parameter_columns)}

Filling tensor: 100%|██████████| 2443584/2443584 [02:23<00:00, 17041.47it/s]


In [None]:
# Write the current `tensor` to 'test_tensor.npy' in the working directory.
# NOTE(review): the file name is hard-coded to the test split; if `tensor` was
# built from another split, change the name before saving.
np.save('test_tensor.npy', tensor)

### Train tensor

In [None]:
# Inspect the layout of `tensor`: whole array, one time step, one variable map.
# NOTE(review): `tensor` is a single shared name. The saved output below has
# 593 time steps, which matches the train-day count, so it presumably comes
# from a run in which the tensor was built from the train split — confirm.
first_step = tensor[0]
first_variable_map = first_step[0]
print("Full tensor shape:", tensor.shape)
print("A time step shape (tensor[0]):", first_step.shape)
print("A variable slice (tensor[0, 0]):", first_variable_map.shape)
print(tensor)

Full tensor shape: (593, 8, 63, 173)
A time step shape (tensor[0]): (8, 63, 173)
A variable slice (tensor[0, 0]): (63, 173)
[[[[ 4.51284237e-02  4.62857448e-02  4.68164720e-02 ...  8.39128494e-02
     8.63078237e-02  8.69329721e-02]
   [ 4.56127524e-02  4.69169766e-02  4.74153534e-02 ...  1.02339953e-01
    -2.00000000e+00 -2.00000000e+00]
   [ 4.42601852e-02  4.56468314e-02  4.76057790e-02 ...  8.67161602e-02
    -2.00000000e+00 -2.00000000e+00]
   ...
   [ 6.25731871e-02  6.37616590e-02  6.70026615e-02 ... -2.00000000e+00
    -2.00000000e+00 -2.00000000e+00]
   [ 6.22849502e-02  6.37925789e-02  6.68000951e-02 ... -2.00000000e+00
    -2.00000000e+00 -2.00000000e+00]
   [ 6.26005903e-02  6.42931461e-02  6.66101351e-02 ... -2.00000000e+00
    -2.00000000e+00 -2.00000000e+00]]

  [[ 2.00677719e+01  1.94583168e+01  1.91909256e+01 ...  9.69777107e+00
     9.39030743e+00  9.30843544e+00]
   [ 1.98150902e+01  1.91499844e+01  1.89132748e+01 ...  7.84172773e+00
    -2.00000000e+00 -2.00000000e

### Validation tensor

In [None]:
# Inspect the layout of `tensor`: whole array, one time step, one variable map.
# NOTE(review): `tensor` is a single shared name. The saved output below has
# 188 time steps, which matches the validation-day count, so it presumably
# comes from a run in which the tensor was built from the validation split — confirm.
first_step = tensor[0]
first_variable_map = first_step[0]
print("Full tensor shape:", tensor.shape)
print("A time step shape (tensor[0]):", first_step.shape)
print("A variable slice (tensor[0, 0]):", first_variable_map.shape)
print(tensor)

Full tensor shape: (188, 8, 63, 173)
A time step shape (tensor[0]): (8, 63, 173)
A variable slice (tensor[0, 0]): (63, 173)
[[[[ 4.63621467e-02  4.66969237e-02  4.69633937e-02 ...  1.27710924e-01
     1.29416823e-01  1.31838709e-01]
   [ 4.71739732e-02  4.74598035e-02  4.80784364e-02 ...  1.23484194e-01
    -2.00000000e+00 -2.00000000e+00]
   [ 4.77817021e-02  4.79629971e-02  4.99766804e-02 ...  1.14502639e-01
    -2.00000000e+00 -2.00000000e+00]
   ...
   [ 7.01626837e-02  7.40659311e-02  7.92190656e-02 ... -2.00000000e+00
    -2.00000000e+00 -2.00000000e+00]
   [ 6.98432103e-02  7.19099119e-02  7.88725391e-02 ... -2.00000000e+00
    -2.00000000e+00 -2.00000000e+00]
   [ 7.11984411e-02  7.40316287e-02  7.94310346e-02 ... -2.00000000e+00
    -2.00000000e+00 -2.00000000e+00]]

  [[ 1.94187946e+01  1.92499466e+01  1.91213112e+01 ...  5.93077993e+00
     5.83935547e+00  5.71486950e+00]
   [ 1.90152664e+01  1.88779984e+01  1.85866833e+01 ...  6.17079115e+00
    -2.00000000e+00 -2.00000000e

### Test tensor

In [None]:
# Inspect the layout of `tensor`: whole array, one time step, one variable map.
# The saved output below has 286 time steps, matching the test-day count.
first_step = tensor[0]
first_variable_map = first_step[0]
print("Full tensor shape:", tensor.shape)
print("A time step shape (tensor[0]):", first_step.shape)
print("A variable slice (tensor[0, 0]):", first_variable_map.shape)
print(tensor)

Full tensor shape: (286, 8, 63, 173)
A time step shape (tensor[0]): (8, 63, 173)
A variable slice (tensor[0, 0]): (63, 173)
[[[[ 4.85968031e-02  5.45631871e-02  5.55635765e-02 ...  8.68904367e-02
     1.08969942e-01  1.12511486e-01]
   [ 4.59344834e-02  4.53761294e-02  4.55255806e-02 ...  9.84660462e-02
    -2.00000000e+00 -2.00000000e+00]
   [ 4.29607481e-02  4.35780995e-02  4.39737365e-02 ...  1.00679323e-01
    -2.00000000e+00 -2.00000000e+00]
   ...
   [ 6.15996793e-02  5.77808581e-02  5.83372600e-02 ... -2.00000000e+00
    -2.00000000e+00 -2.00000000e+00]
   [ 5.83535098e-02  5.69395833e-02  5.24931066e-02 ... -2.00000000e+00
    -2.00000000e+00 -2.00000000e+00]
   [ 5.71831316e-02  5.50670139e-02  5.36378920e-02 ... -2.00000000e+00
    -2.00000000e+00 -2.00000000e+00]]

  [[ 1.83528214e+01  1.62178688e+01  1.56712904e+01 ...  9.37384701e+00
     7.17007923e+00  6.91560841e+00]
   [ 1.96535072e+01  1.99397659e+01  1.98839836e+01 ...  8.05392838e+00
    -2.00000000e+00 -2.00000000e