# 0. Data loading and setup

In [8]:
import pandas as pd  # type: ignore
import sklearn as sk # type: ignore
import numpy as np # type: ignore
import tensorflow as tf # type: ignore
from tensorflow.keras import layers, models, Sequential # type: ignore
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten # type: ignore
import math
import os
from datetime import datetime
import time
from tqdm import tqdm  # For progress bar

In [9]:
all_data_df = pd.read_parquet('/Users/imarcolic/Desktop/1 ACADEMIA/2.0 MSc LSE/1 MSc Data Science/ST498 Capstone/1 Data/04 2 years (filled Data)/spatially_filled_data_final.parquet')

In [10]:
all_data_df

Unnamed: 0,time,latitude,longitude,KD490,ZSD,RRS490,RRS443,CHL,flags,MICRO,BBP,CDM,date
0,2023-01-27,49.140621,-10.755208,0.036846,25.871191,0.006751,0.007874,0.136023,0.0,0.267015,0.007805,0.019359,2023-01-27
1,2023-01-27,49.140621,-10.744791,0.036823,25.892086,0.006663,0.007783,0.135793,0.0,0.267015,0.007805,0.019359,2023-01-27
2,2023-01-27,49.140621,-10.734375,0.036774,25.937103,0.006431,0.007562,0.135306,0.0,0.267015,0.007805,0.019359,2023-01-27
3,2023-01-27,49.140621,-10.723958,0.036709,25.997976,0.006109,0.007268,0.134649,0.0,0.267015,0.007805,0.019359,2023-01-27
4,2023-01-27,49.140621,-10.713541,0.036507,26.185064,0.006138,0.007382,0.132640,0.0,0.267015,0.007805,0.019359,2023-01-27
...,...,...,...,...,...,...,...,...,...,...,...,...,...
121044305,2024-11-17,52.234371,-4.328125,0.146952,5.027317,0.005816,0.006983,2.177474,0.0,0.266766,0.003325,0.015229,2024-11-17
121044306,2024-11-17,52.234371,-4.317708,0.147682,4.998570,0.006174,0.007728,2.195725,0.0,0.266766,0.003325,0.015229,2024-11-17
121044307,2024-11-17,52.234371,-4.307291,0.149990,4.907660,0.006174,0.007728,2.253445,0.0,0.266766,0.003325,0.015229,2024-11-17
121044308,2024-11-17,52.234371,-4.296875,0.157728,4.626949,0.006174,0.007728,2.451085,0.0,0.266766,0.003325,0.015229,2024-11-17


In [12]:
optics_df = all_data_df.iloc[:, [0, 1, 2, 10, 11, 8]]

In [13]:
optics_df

Unnamed: 0,time,latitude,longitude,BBP,CDM,flags
0,2023-01-27,49.140621,-10.755208,0.007805,0.019359,0.0
1,2023-01-27,49.140621,-10.744791,0.007805,0.019359,0.0
2,2023-01-27,49.140621,-10.734375,0.007805,0.019359,0.0
3,2023-01-27,49.140621,-10.723958,0.007805,0.019359,0.0
4,2023-01-27,49.140621,-10.713541,0.007805,0.019359,0.0
...,...,...,...,...,...,...
121044305,2024-11-17,52.234371,-4.328125,0.003325,0.015229,0.0
121044306,2024-11-17,52.234371,-4.317708,0.003325,0.015229,0.0
121044307,2024-11-17,52.234371,-4.307291,0.003325,0.015229,0.0
121044308,2024-11-17,52.234371,-4.296875,0.003325,0.015229,0.0


---

In [2]:
# Data preparation

optics_df = pd.read_parquet('/Users/imarcolic/Desktop/1 ACADEMIA/2.0 MSc LSE/1 MSc Data Science/ST498 Capstone/1 Data/2 years (0 uncleaned)/optics_merged_ocean_data.parquet')
plankton_df = pd.read_parquet('/Users/imarcolic/Desktop/1 ACADEMIA/2.0 MSc LSE/1 MSc Data Science/ST498 Capstone/1 Data/2 years (0 uncleaned)/plankton_merged_ocean_data.parquet')
reflectance_df = pd.read_parquet('/Users/imarcolic/Desktop/1 ACADEMIA/2.0 MSc LSE/1 MSc Data Science/ST498 Capstone/1 Data/2 years (0 uncleaned)/reflectance_merged_ocean_data.parquet')
transparency_df = pd.read_parquet('/Users/imarcolic/Desktop/1 ACADEMIA/2.0 MSc LSE/1 MSc Data Science/ST498 Capstone/1 Data/2 years (0 uncleaned)/transparency_merged_ocean_data.parquet')

In [3]:
optics_df

Unnamed: 0,time,latitude,longitude,BBP,CDM,flags
0,2023-01-01,49.140621,-10.755208,,,0.0
1,2023-01-01,49.140621,-10.744791,,,0.0
2,2023-01-01,49.140621,-10.734375,,,0.0
3,2023-01-01,49.140621,-10.723958,,,0.0
4,2023-01-01,49.140621,-10.713541,,,0.0
...,...,...,...,...,...,...
187545401,2024-12-31,52.234371,-2.192708,,,1.0
187545402,2024-12-31,52.234371,-2.182291,,,1.0
187545403,2024-12-31,52.234371,-2.171874,,,1.0
187545404,2024-12-31,52.234371,-2.161458,,,1.0


In [14]:
# 1. Check how many total rows exist
total_rows = len(optics_df)
print(f"Total rows in optics_df: {total_rows}")  # Should be 187,545,406

# 2. Count unique coordinate combinations
unique_coords = optics_df[['time', 'latitude', 'longitude', 'BBP', 'CDM', "flags"]].drop_duplicates()
unique_count = len(unique_coords)
print(f"Unique coordinate combinations: {unique_count}")

# 3. Calculate duplicate coordinates
duplicate_count = total_rows - unique_count
print(f"Number of duplicate coordinates: {duplicate_count}")

Total rows in optics_df: 121044310
Unique coordinate combinations: 115206160
Number of duplicate coordinates: 5838150


In [15]:
optics_df = unique_coords
optics_df

Unnamed: 0,time,latitude,longitude,BBP,CDM,flags
0,2023-01-27,49.140621,-10.755208,0.007805,0.019359,0.0
1,2023-01-27,49.140621,-10.744791,0.007805,0.019359,0.0
2,2023-01-27,49.140621,-10.734375,0.007805,0.019359,0.0
3,2023-01-27,49.140621,-10.723958,0.007805,0.019359,0.0
4,2023-01-27,49.140621,-10.713541,0.007805,0.019359,0.0
...,...,...,...,...,...,...
121044305,2024-11-17,52.234371,-4.328125,0.003325,0.015229,0.0
121044306,2024-11-17,52.234371,-4.317708,0.003325,0.015229,0.0
121044307,2024-11-17,52.234371,-4.307291,0.003325,0.015229,0.0
121044308,2024-11-17,52.234371,-4.296875,0.003325,0.015229,0.0


In [22]:
number_of_unique_timestamps = optics_df["time"].nunique()
number_of_unique_latitudes = optics_df['latitude'].nunique()
number_of_unique_longitudes = optics_df['longitude'].nunique()

print(f"There are {number_of_unique_timestamps} unique timestamp values")
print(f"There are {number_of_unique_latitudes} unique latitude values")
print(f"There are {number_of_unique_longitudes} unique longitude values")

There are 592 unique timestamp values
There are 298 unique latitude values
There are 827 unique longitude values


In [21]:
coords_df = pd.DataFrame({'coords': list(zip(optics_df['latitude'], optics_df['longitude']))})

In [25]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import time

# Extract unique values from the current dataframe
timestamps = sorted(optics_df['time'].unique().tolist())  # 592 unique timestamps
latitudes = sorted(optics_df['latitude'].unique().tolist())  # 298 unique latitudes
longitudes = sorted(optics_df['longitude'].unique().tolist())  # 827 unique longitudes

print(f"Creating a tensor with dimensions:")
print(f"- {len(timestamps)} timestamps")
print(f"- {len(latitudes)} latitudes")
print(f"- {len(longitudes)} longitudes")
print(f"- 2 channels (BBP and CDM)")
print(f"Total tensor elements: {len(timestamps) * len(latitudes) * len(longitudes) * 2:,}")

# Create mappings for faster indexing
time_idx = {t: i for i, t in enumerate(timestamps)}
lat_idx = {lat: i for i, lat in enumerate(latitudes)}
lon_idx = {lon: i for i, lon in enumerate(longitudes)}

# Initialize the tensor with -20 values
print("Initializing tensor with -20 values...")
start_time = time.time()
optics_tensor = np.full((len(timestamps), len(latitudes), len(longitudes), 2), -20.0)
print(f"Initialized tensor in {time.time() - start_time:.2f} seconds")

# Fill the tensor with actual values from the original dataframe
print("Filling tensor with actual values...")
start_time = time.time()
filled_count = 0

# Process in batches to show progress
batch_size = 1000000  # Process 1 million rows at a time
total_batches = (len(optics_df) + batch_size - 1) // batch_size

for batch_idx in tqdm(range(total_batches), desc="Processing batches"):
    start_idx = batch_idx * batch_size
    end_idx = min(start_idx + batch_size, len(optics_df))
    batch = optics_df.iloc[start_idx:end_idx]
    
    for _, row in batch.iterrows():
        t_idx = time_idx[row['time']]
        lat_idx_val = lat_idx[row['latitude']]
        lon_idx_val = lon_idx[row['longitude']]
        
        # Set BBP value (channel 0)
        optics_tensor[t_idx, lat_idx_val, lon_idx_val, 0] = row['BBP']
        
        # Set CDM value (channel 1)
        optics_tensor[t_idx, lat_idx_val, lon_idx_val, 1] = row['CDM']
        
        filled_count += 1

print(f"Filled tensor in {time.time() - start_time:.2f} seconds")
print(f"Tensor shape: {optics_tensor.shape}")
print(f"Filled tensor entries: {filled_count:,} ({filled_count/(optics_tensor.size/2)*100:.2f}%)")

# Check tensor for missing values
missing_count = np.sum(optics_tensor == -20.0) / 2  # Divide by 2 because we count per location, not per channel
total_locations = len(timestamps) * len(latitudes) * len(longitudes)
print(f"Missing values: {missing_count:,} ({missing_count/total_locations*100:.2f}%)")

# Inspect a small slice of the tensor to verify
print("\nInspecting a small slice of the tensor:")
slice_t = 0  # First timestamp
slice_lat = 0  # First latitude
slice_lon_range = slice(0, 5)  # First 5 longitudes
print(f"Values at timestamp {timestamps[slice_t]}, latitude {latitudes[slice_lat]}, first 5 longitudes:")
print(optics_tensor[slice_t, slice_lat, slice_lon_range, :])

Creating a tensor with dimensions:
- 592 timestamps
- 298 latitudes
- 827 longitudes
- 2 channels (BBP and CDM)
Total tensor elements: 291,792,064
Initializing tensor with -20 values...
Initialized tensor in 0.43 seconds
Filling tensor with actual values...


Processing batches: 100%|██████████| 116/116 [32:23<00:00, 16.75s/it]


Filled tensor in 1943.63 seconds
Tensor shape: (592, 298, 827, 2)
Filled tensor entries: 115,206,160 (78.96%)
Missing values: 30,689,872.0 (21.04%)

Inspecting a small slice of the tensor:
Values at timestamp 2023-01-27 00:00:00, latitude 49.140621185302734, first 5 longitudes:
[[0.00780507 0.0193592 ]
 [0.00780507 0.0193592 ]
 [0.00780507 0.0193592 ]
 [0.00780507 0.0193592 ]
 [0.00780507 0.0193592 ]]


In [26]:
# Optionally save the tensor
np.save('complete_optics_tensor.npy', optics_tensor)

: 

In [None]:
# Convert to TensorFlow tensor if needed
# import tensorflow as tf
tf_optics_tensor = tf.convert_to_tensor(optics_tensor)

---

In [30]:
# 1. Count values per unique timestamp
timestamp_counts = optics_df.groupby('time').size().reset_index(name='count')
print("Values per timestamp:")
print(timestamp_counts.describe())
print("\nTop 5 timestamps by count:")
print(timestamp_counts.sort_values('count', ascending=False).head())
print("\nBottom 5 timestamps by count:")
print(timestamp_counts.sort_values('count').head())

Values per timestamp:
                                time     count
count                            592     592.0
mean   2023-12-23 02:47:50.270270464  194605.0
min              2023-01-27 00:00:00  194605.0
25%              2023-06-23 18:00:00  194605.0
50%              2024-01-26 12:00:00  194605.0
75%              2024-06-22 06:00:00  194605.0
max              2024-11-17 00:00:00  194605.0
std                              NaN       0.0

Top 5 timestamps by count:
          time   count
0   2023-01-27  194605
389 2024-04-29  194605
391 2024-05-01  194605
392 2024-05-02  194605
393 2024-05-03  194605

Bottom 5 timestamps by count:
          time   count
0   2023-01-27  194605
391 2024-05-01  194605
392 2024-05-02  194605
393 2024-05-03  194605
394 2024-05-04  194605


In [31]:
# 2. Count values per unique latitude
latitude_counts = optics_df.groupby('latitude').size().reset_index(name='count')
print("\n\nValues per latitude:")
print(latitude_counts.describe())
print("\nTop 5 latitudes by count:")
print(latitude_counts.sort_values('count', ascending=False).head())
print("\nBottom 5 latitudes by count:")
print(latitude_counts.sort_values('count').head())



Values per latitude:
         latitude          count
count  298.000000     298.000000
mean    50.687500  386597.852349
std      0.897598  104311.745695
min     49.140621  142080.000000
25%     49.914061  352240.000000
50%     50.687498  406704.000000
75%     51.460935  484848.000000
max     52.234371  489584.000000

Top 5 latitudes by count:
     latitude   count
0   49.140621  489584
36  49.515621  489584
53  49.692707  489584
52  49.682289  489584
51  49.671871  489584

Bottom 5 latitudes by count:
      latitude   count
296  52.223957  142080
295  52.213539  146816
297  52.234371  147408
293  52.192707  149776
294  52.203121  149776


In [32]:

# 3. Count values per unique longitude
longitude_counts = optics_df.groupby('longitude').size().reset_index(name='count')
print("\n\nValues per longitude:")
print(longitude_counts.describe())
print("\nTop 5 longitudes by count:")
print(longitude_counts.sort_values('count', ascending=False).head())
print("\nBottom 5 longitudes by count:")
print(longitude_counts.sort_values('count').head())



Values per longitude:
        longitude          count
count  827.000000     827.000000
mean    -6.453125  139306.118501
std      2.488317   32388.019955
min    -10.755208   74592.000000
25%     -8.604167  108040.000000
50%     -6.453125  145040.000000
75%     -4.302083  170496.000000
max     -2.151041  176416.000000

Top 5 longitudes by count:
     longitude   count
0   -10.755208  176416
468  -5.880208  176416
447  -6.098958  176416
448  -6.088541  176416
449  -6.078125  176416

Bottom 5 longitudes by count:
     longitude  count
675  -3.723958  74592
677  -3.703125  75184
681  -3.661458  75776
674  -3.734375  75776
676  -3.713541  75776


---
In order to understand the dimensions we want to setup for the Tensor we can do easy calculations:

- **Dimension 1: 731 = days/samples**

Since we have a 2 years dataset at hand, we have in total 2*365=730 days, however since 2024 was a year with 29 days in february we will need to add one day, so we will have in total 731 samples.

- **Dimension 2: 298 = latitude**
  
As seen above, we have counted the amount of unique values for each latitude coordinate which enables us to build the first axis/rows.

- **Dimension 3: 827 = longitude**

As seen above, we have counted the amount of unique values for each longitude coordinate which enables us to build the second axis/columns.
  
- **Dimension 4: 8 = ocean variables**

We have 4 datasets in total we are analyzing, 1.) Optics 2.) Plankton 3.) Reflectance 4.) Transparency, and each of those variables includes in total 2 subvariables to depict the given variable.

---

*This leaves us with the following Tensor dimensions:*

(731, 298, 827, 8)
  
---

In [23]:
timestamps = optics_df['time'].unique().tolist() # 731
lats = optics_df['latitude'].unique().tolist() # 298 
longs = optics_df['longitude'].unique().tolist() # 827

In [21]:
bbp = optics_df['BBP'].tolist()
cdm = optics_df['CDM'].tolist()

In [22]:
optics_tensor = np.zeros([len(timestamps), len(lats), len(longs), 2])

In [27]:
optics_tensor.shape

(592, 298, 827, 2)

In [23]:
def insert_channel_values(tensor, values, channel_idx):
    # Create a counter for position in the values list
    value_idx = 0
    
    # Loop through all samples (time points)
    for s in range(len(timestamps)):
        # Loop through all rows (latitude)
        for r in range(len(lats)):
            # Loop through all columns (longitude)
            for c in range(len(longs)):
                # Check if we still have values to assign
                if value_idx < len(values):
                    # Assign the value to the specified channel
                    tensor[s, r, c, channel_idx] = values[value_idx]
                    # Increment counter
                    value_idx += 1
                else:
                    print(f"Warning: Not enough values to fill the entire tensor. Filled {value_idx} positions.")
                    return tensor
    
    # Check if we used all values
    if value_idx < len(values):
        print(f"Warning: Not all values were used. Used {value_idx} out of {len(values)} values.")
        
    return tensor

In [24]:
# Populate BBP values into channel 0
optics_tensor = insert_channel_values(optics_tensor, bbp, 0)



In [25]:
# Populate CDM values into channel 1 
optics_tensor = insert_channel_values(optics_tensor, cdm, 1)



In [26]:
optics_tensor

array([[[[0.00780507, 0.0193592 ],
         [0.00780507, 0.0193592 ],
         [0.00780507, 0.0193592 ],
         ...,
         [0.09829997, 0.08624734],
         [0.09829997, 0.08650506],
         [0.09122612, 0.08596087]],

        [[0.00780507, 0.0193592 ],
         [0.00780507, 0.0193592 ],
         [0.00780507, 0.0193592 ],
         ...,
         [0.08835776, 0.08514669],
         [0.08731999, 0.08608334],
         [0.08514529, 0.08617996]],

        [[0.00780507, 0.0193592 ],
         [0.00780507, 0.0193592 ],
         [0.00780507, 0.0193592 ],
         ...,
         [0.07318334, 0.08269233],
         [0.07121663, 0.08350196],
         [0.06927498, 0.08482695]],

        ...,

        [[0.00738205, 0.03708844],
         [0.00738205, 0.03708844],
         [0.00738205, 0.03708844],
         ...,
         [0.00738205, 0.03708844],
         [0.00738205, 0.03708844],
         [0.00738205, 0.03708844]],

        [[0.00738205, 0.03708844],
         [0.00738205, 0.03708844],
         [0.

In [14]:
# 1. Convert to NumPy and save
np.save('/Users/imarcolic/Desktop/1 ACADEMIA/2.0 MSc LSE/1 MSc Data Science/ST498 Capstone/1 Data/2 years (1 tensor)/optics_tensor.npy', optics_tensor)

In [15]:
optics_array = np.load('/Users/imarcolic/Desktop/1 ACADEMIA/2.0 MSc LSE/1 MSc Data Science/ST498 Capstone/1 Data/2 years (1 tensor)/optics_tensor.npy')

In [16]:
optics_real_tensor = tf.convert_to_tensor(optics_array)

In [17]:
optics_real_tensor

<tf.Tensor: shape=(731, 298, 827, 2), dtype=float64, numpy=
array([[[[nan, nan],
         [nan, nan],
         [nan, nan],
         ...,
         [nan, nan],
         [nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan],
         [nan, nan],
         ...,
         [nan, nan],
         [nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan],
         [nan, nan],
         ...,
         [nan, nan],
         [nan, nan],
         [nan, nan]],

        ...,

        [[nan, nan],
         [nan, nan],
         [nan, nan],
         ...,
         [nan, nan],
         [nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan],
         [nan, nan],
         ...,
         [nan, nan],
         [nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan],
         [nan, nan],
         ...,
         [nan, nan],
         [nan, nan],
         [nan, nan]]],


       [[[nan, nan],
         [nan, nan],
         [nan, nan],
         

In [None]:
merged_tensor = tf.concat([optics_tensor, plankton_tensor, reflectance_tensor, transparency_tensor], axis=3)