# 0. Data loading and setup

In [1]:
import pandas as pd 
import sklearn as sk
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models, Sequential # type: ignore
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten # type: ignore
import math
import os
from datetime import datetime
import time
from tqdm import tqdm  # For progress bar

In [2]:
# Data preparation
#
# All four raw parquet exports live in the same directory, so that directory is
# named once here; point RAW_DATA_DIR somewhere else to run on another machine.
RAW_DATA_DIR = '/Users/imarcolic/Desktop/1 ACADEMIA/2.0 MSc LSE/1 MSc Data Science/ST498 Capstone/1 Data/2 years (0 uncleaned)'

# One DataFrame per ocean dataset.
optics_df = pd.read_parquet(os.path.join(RAW_DATA_DIR, 'optics_merged_ocean_data.parquet'))
plankton_df = pd.read_parquet(os.path.join(RAW_DATA_DIR, 'plankton_merged_ocean_data.parquet'))
reflectance_df = pd.read_parquet(os.path.join(RAW_DATA_DIR, 'reflectance_merged_ocean_data.parquet'))
transparency_df = pd.read_parquet(os.path.join(RAW_DATA_DIR, 'transparency_merged_ocean_data.parquet'))

In [3]:
# Preview the transparency frame; pandas abbreviates the display to the first and last rows.
transparency_df

Unnamed: 0,time,latitude,longitude,KD490,ZSD,flags
0,2023-01-01,49.140621,-10.755208,,,0.0
1,2023-01-01,49.140621,-10.744791,,,0.0
2,2023-01-01,49.140621,-10.734375,,,0.0
3,2023-01-01,49.140621,-10.723958,,,0.0
4,2023-01-01,49.140621,-10.713541,,,0.0
...,...,...,...,...,...,...
180152021,2024-12-31,52.234371,-2.192708,,,1.0
180152022,2024-12-31,52.234371,-2.182291,,,1.0
180152023,2024-12-31,52.234371,-2.171874,,,1.0
180152024,2024-12-31,52.234371,-2.161458,,,1.0


In [4]:
# Count the distinct coordinate values along each spatial axis of the optics frame.
# The same two columns determine the latitude/longitude extents of the tensors built further down.
number_of_unique_latitudes = optics_df['latitude'].nunique()
number_of_unique_longitudes = optics_df['longitude'].nunique()

# Report both counts.
print(f"There are {number_of_unique_latitudes} unique latitude values")
print(f"There are {number_of_unique_longitudes} unique longitude values")

There are 298 unique latitude values
There are 827 unique longitude values


---
To determine the dimensions of the tensor we want to set up, we can do a few simple calculations:

- **Dimension 1: 731 = days/samples**

Since the dataset spans two years, we have 2 × 365 = 730 days. However, 2024 was a leap year (February had 29 days), so we add one day, giving 731 samples in total.

- **Dimension 2: 298 = latitude**
  
As seen above, we counted the number of unique latitude values, which gives us the first spatial axis (rows).

- **Dimension 3: 827 = longitude**

As seen above, we counted the number of unique longitude values, which gives us the second spatial axis (columns).
  
- **Dimension 4: 8 = ocean variables**

We analyze 4 datasets in total: (1) Optics, (2) Plankton, (3) Reflectance and (4) Transparency. Each dataset contributes 2 sub-variables, which gives 4 × 2 = 8 channels.

---

*This leaves us with the following Tensor dimensions:*

(731, 298, 827, 8)
  
---

In [5]:
# Distinct values along each tensor axis, taken from the optics frame.
# NOTE(review): pandas `unique()` returns values in order of first appearance (not sorted),
# so these lists follow the row order of optics_df; confirm that is the intended grid order.
timestamps = optics_df['time'].unique().tolist() # 731
lats = optics_df['latitude'].unique().tolist() # 298 
longs = optics_df['longitude'].unique().tolist() # 827

In [6]:
# Pull the two transparency variables out as flat Python lists, one entry per DataFrame row.
# NOTE(review): the frame has roughly 180 million rows (see the index in the preview above),
# so each list is very large; `.to_numpy()` would avoid boxing every value. Confirm nothing
# downstream relies on these being lists before changing it.
kd490 = transparency_df['KD490'].tolist()
zsd = transparency_df['ZSD'].tolist()

In [7]:
# Allocate one zero-initialised (time, latitude, longitude, 2) array per dataset.
# The generator builds a separate array on every iteration, so the four names
# never share memory.
optics_tensor, plankton_tensor, reflectance_tensor, transparency_tensor = (
    np.zeros([len(timestamps), len(lats), len(longs), 2]) for _ in range(4)
)

In [8]:
def insert_channel_values(tensor, values, channel_idx):
    """Fill one channel of a 4-D grid tensor from a flat sequence of values.

    ``values`` is consumed in row-major order over the first three axes of
    ``tensor`` (the third axis varies fastest): ``values[k]`` is written to
    ``tensor[s, r, c, channel_idx]`` where ``(s, r, c)`` is the k-th position
    of that traversal. The grid extent is read from ``tensor.shape`` itself,
    so the function does not depend on any notebook-level variables, and the
    copy is done with a single vectorised assignment instead of a Python loop
    over every grid cell.

    Parameters
    ----------
    tensor : numpy.ndarray
        4-D array indexed as ``[sample, row, column, channel]``. It is
        modified in place.
    values : sequence or numpy.ndarray
        Flat (1-D) values to insert. Passing a NumPy array avoids an extra
        list-to-array conversion.
    channel_idx : int
        Index along the last axis that receives the values.

    Returns
    -------
    numpy.ndarray
        The same ``tensor`` object that was passed in.

    Notes
    -----
    If ``values`` is shorter than the grid, only the leading positions are
    overwritten, a warning is printed and the rest of the channel keeps its
    previous contents. If ``values`` is longer than the grid, the surplus is
    ignored and a warning is printed.
    """
    n_samples, n_rows, n_cols = tensor.shape[:3]
    capacity = n_samples * n_rows * n_cols

    flat_values = np.asarray(values).reshape(-1)
    n_values = flat_values.size
    n_filled = min(capacity, n_values)

    if n_filled < capacity:
        # Partial fill: edit a flat copy of the channel so the untouched tail
        # keeps its existing contents, then write the channel back.
        channel = tensor[:, :, :, channel_idx].flatten()
        channel[:n_filled] = flat_values
        tensor[:, :, :, channel_idx] = channel.reshape(n_samples, n_rows, n_cols)
        print(f"Warning: Not enough values to fill the entire tensor. Filled {n_filled} positions.")
        return tensor

    # Full fill: reshape the leading `capacity` values to the grid and assign
    # the whole channel at once.
    tensor[:, :, :, channel_idx] = flat_values[:capacity].reshape(n_samples, n_rows, n_cols)

    # Check if we used all values
    if n_filled < n_values:
        print(f"Warning: Not all values were used. Used {n_filled} out of {n_values} values.")

    return tensor

In [9]:
# Populate KD490 values into channel 0 of the transparency tensor
transparency_tensor = insert_channel_values(transparency_tensor, kd490, 0)

In [10]:
# Populate ZSD values into channel 1 of the transparency tensor
transparency_tensor = insert_channel_values(transparency_tensor, zsd, 1)

In [11]:
# Inspect the filled tensor (NumPy abbreviates the repr of large arrays).
transparency_tensor

array([[[[        nan,         nan],
         [        nan,         nan],
         [        nan,         nan],
         ...,
         [        nan,         nan],
         [        nan,         nan],
         [        nan,         nan]],

        [[        nan,         nan],
         [        nan,         nan],
         [        nan,         nan],
         ...,
         [        nan,         nan],
         [        nan,         nan],
         [        nan,         nan]],

        [[        nan,         nan],
         [        nan,         nan],
         [        nan,         nan],
         ...,
         [        nan,         nan],
         [        nan,         nan],
         [        nan,         nan]],

        ...,

        [[ 0.07731601, 10.6525383 ],
         [ 0.07766177, 10.59768486],
         [ 0.07825939, 10.50449085],
         ...,
         [        nan,         nan],
         [        nan,         nan],
         [        nan,         nan]],

        [[ 0.07861214, 10.44942284

In [12]:
# 1. Save the tensor (already a NumPy array, so no conversion is needed) to disk as .npy
np.save('/Users/imarcolic/Desktop/1 ACADEMIA/2.0 MSc LSE/1 MSc Data Science/ST498 Capstone/1 Data/2 years (1 tensor)/transparency_tensor.npy', transparency_tensor)

In [13]:
# Reload the saved tensor from disk.
# NOTE(review): `transparency_array` is not referenced again in the cells visible here; the
# next cell converts the in-memory `transparency_tensor` instead. Confirm which one is intended.
transparency_array = np.load('/Users/imarcolic/Desktop/1 ACADEMIA/2.0 MSc LSE/1 MSc Data Science/ST498 Capstone/1 Data/2 years (1 tensor)/transparency_tensor.npy')

In [14]:
# Wrap the NumPy array as a TensorFlow tensor (the next cell's output reports dtype=float64).
transparency_real_tensor = tf.convert_to_tensor(transparency_tensor)

In [15]:
# Inspect the TensorFlow tensor; the repr header reports its shape and dtype.
transparency_real_tensor

<tf.Tensor: shape=(731, 298, 827, 2), dtype=float64, numpy=
array([[[[        nan,         nan],
         [        nan,         nan],
         [        nan,         nan],
         ...,
         [        nan,         nan],
         [        nan,         nan],
         [        nan,         nan]],

        [[        nan,         nan],
         [        nan,         nan],
         [        nan,         nan],
         ...,
         [        nan,         nan],
         [        nan,         nan],
         [        nan,         nan]],

        [[        nan,         nan],
         [        nan,         nan],
         [        nan,         nan],
         ...,
         [        nan,         nan],
         [        nan,         nan],
         [        nan,         nan]],

        ...,

        [[ 0.07731601, 10.6525383 ],
         [ 0.07766177, 10.59768486],
         [ 0.07825939, 10.50449085],
         ...,
         [        nan,         nan],
         [        nan,         nan],
         [   

In [16]:
# Stack the four 2-channel tensors along the last axis to get the 8-channel tensor
# described in the setup notes: (731, 298, 827, 8).
# NOTE(review): in the cells visible here only `transparency_tensor` has been populated;
# `optics_tensor`, `plankton_tensor` and `reflectance_tensor` are still the all-zero arrays
# allocated earlier, so 6 of the 8 channels would be zeros. Confirm they are filled before
# this cell runs.
merged_tensor = tf.concat([optics_tensor, plankton_tensor, reflectance_tensor, transparency_tensor], axis=3)