# 0. Data loading and setup

In [1]:
import pandas as pd 
import sklearn as sk
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models, Sequential # type: ignore
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten # type: ignore
import math
import os
from datetime import datetime
import time
from tqdm import tqdm  # For progress bar

In [2]:
# Data preparation

optics_df = pd.read_parquet('/Users/imarcolic/Desktop/1 ACADEMIA/2.0 MSc LSE/1 MSc Data Science/ST498 Capstone/1 Data/2 years (0 uncleaned)/optics_merged_ocean_data.parquet')
plankton_df = pd.read_parquet('/Users/imarcolic/Desktop/1 ACADEMIA/2.0 MSc LSE/1 MSc Data Science/ST498 Capstone/1 Data/2 years (0 uncleaned)/plankton_merged_ocean_data.parquet')
reflectance_df = pd.read_parquet('/Users/imarcolic/Desktop/1 ACADEMIA/2.0 MSc LSE/1 MSc Data Science/ST498 Capstone/1 Data/2 years (0 uncleaned)/reflectance_merged_ocean_data.parquet')
transparency_df = pd.read_parquet('/Users/imarcolic/Desktop/1 ACADEMIA/2.0 MSc LSE/1 MSc Data Science/ST498 Capstone/1 Data/2 years (0 uncleaned)/transparency_merged_ocean_data.parquet')

In [3]:
optics_df

Unnamed: 0,time,latitude,longitude,BBP,CDM,flags
0,2023-01-01,49.140621,-10.755208,,,0.0
1,2023-01-01,49.140621,-10.744791,,,0.0
2,2023-01-01,49.140621,-10.734375,,,0.0
3,2023-01-01,49.140621,-10.723958,,,0.0
4,2023-01-01,49.140621,-10.713541,,,0.0
...,...,...,...,...,...,...
187545401,2024-12-31,52.234371,-2.192708,,,1.0
187545402,2024-12-31,52.234371,-2.182291,,,1.0
187545403,2024-12-31,52.234371,-2.171874,,,1.0
187545404,2024-12-31,52.234371,-2.161458,,,1.0


In [4]:
# 1. Check how many total rows exist
total_rows = len(optics_df)
print(f"Total rows in optics_df: {total_rows}")  # Should be 187,545,406

# 2. Count unique coordinate combinations
unique_coords = optics_df[['time', 'latitude', 'longitude', 'BBP', 'CDM', "flags"]].drop_duplicates()
unique_count = len(unique_coords)
print(f"Unique coordinate combinations: {unique_count}")

# 3. Calculate duplicate coordinates
duplicate_count = total_rows - unique_count
print(f"Number of duplicate coordinates: {duplicate_count}")

Total rows in optics_df: 187545406
Unique coordinate combinations: 180152026
Number of duplicate coordinates: 7393380


In [5]:
optics_df = unique_coords
optics_df

Unnamed: 0,time,latitude,longitude,BBP,CDM,flags
0,2023-01-01,49.140621,-10.755208,,,0.0
1,2023-01-01,49.140621,-10.744791,,,0.0
2,2023-01-01,49.140621,-10.734375,,,0.0
3,2023-01-01,49.140621,-10.723958,,,0.0
4,2023-01-01,49.140621,-10.713541,,,0.0
...,...,...,...,...,...,...
187545401,2024-12-31,52.234371,-2.192708,,,1.0
187545402,2024-12-31,52.234371,-2.182291,,,1.0
187545403,2024-12-31,52.234371,-2.171874,,,1.0
187545404,2024-12-31,52.234371,-2.161458,,,1.0


In [6]:
reflectance_df

Unnamed: 0,time,latitude,longitude,RRS490,RRS443,flags
0,2023-01-01,49.140621,-10.755208,,,0.0
1,2023-01-01,49.140621,-10.744791,,,0.0
2,2023-01-01,49.140621,-10.734375,,,0.0
3,2023-01-01,49.140621,-10.723958,,,0.0
4,2023-01-01,49.140621,-10.713541,,,0.0
...,...,...,...,...,...,...
180152021,2024-12-31,52.234371,-2.192708,,,1.0
180152022,2024-12-31,52.234371,-2.182291,,,1.0
180152023,2024-12-31,52.234371,-2.171874,,,1.0
180152024,2024-12-31,52.234371,-2.161458,,,1.0


In [7]:
number_of_unique_latitudes = optics_df['latitude'].nunique()
number_of_unique_longitudes = optics_df['longitude'].nunique()

print(f"There are {number_of_unique_latitudes} unique latitude values")
print(f"There are {number_of_unique_longitudes} unique longitude values")

There are 298 unique latitude values
There are 827 unique longitude values


---
In order to understand the dimensions we want to setup for the Tensor we can do easy calculations:

- **Dimension 1: 731 = days/samples**

Since we have a 2 years dataset at hand, we have in total 2*365=730 days, however since 2024 was a year with 29 days in february we will need to add one day, so we will have in total 731 samples.

- **Dimension 2: 298 = latitude**
  
As seen above, we have counted the amount of unique values for each latitude coordinate which enables us to build the first axis/rows.

- **Dimension 3: 827 = longitude**

As seen above, we have counted the amount of unique values for each longitude coordinate which enables us to build the second axis/columns.
  
- **Dimension 4: 8 = ocean variables**

We have 4 datasets in total we are analyzing, 1.) Optics 2.) Plankton 3.) Reflectance 4.) Transparency, and each of those variables includes in total 2 subvariables to depict the given variable.

---

*This leaves us with the following Tensor dimensions:*

(731, 298, 827, 8)
  
---

In [8]:
timestamps = optics_df['time'].unique().tolist() # 731
lats = optics_df['latitude'].unique().tolist() # 298 
longs = optics_df['longitude'].unique().tolist() # 827

In [9]:
rrs490 = reflectance_df['RRS490'].tolist()
rrs443 = reflectance_df['RRS443'].tolist()

In [14]:
import pandas as pd
import numpy as np
from collections import Counter

def analyze_float_values(float_list):
    # Convert to numpy array for easier NaN handling
    values = np.array(float_list)
    
    # Create a dictionary to store counts
    value_counts = {}
    
    # Count NaN values
    nan_mask = np.isnan(values)
    nan_count = np.sum(nan_mask)
    if nan_count > 0:
        value_counts['NaN'] = nan_count
    
    # Count non-NaN values
    non_nan_values = values[~nan_mask]
    for value, count in Counter(non_nan_values).items():
        value_counts[value] = count
    
    # Create DataFrame and sort by count
    df = pd.DataFrame({'Value': list(value_counts.keys()), 
                      'Count': list(value_counts.values())})
    
    # Sort by count in descending order
    df = df.sort_values('Count', ascending=False).reset_index(drop=True)
    
    # Add percentage column
    total = len(float_list)
    df['Percentage'] = (df['Count'] / total * 100).round(2)
    
    return df

In [15]:
# Example usage:
# Let's create an example list with some repeating values and NaNs
result = analyze_float_values(rrs490)
print(result)

             Value      Count  Percentage
0              NaN  130870013       72.64
1         0.004256        405        0.00
2         0.003966        404        0.00
3         0.004362        381        0.00
4         0.004718        377        0.00
...            ...        ...         ...
21533648  0.002524          1        0.00
21533649  0.006523          1        0.00
21533650  0.011353          1        0.00
21533651  0.006047          1        0.00
21533652   0.00317          1        0.00

[21533653 rows x 3 columns]


In [16]:
optics_tensor = np.zeros([len(timestamps), len(lats), len(longs), 2])
plankton_tensor = np.zeros([len(timestamps), len(lats), len(longs), 2])
reflectance_tensor = np.zeros([len(timestamps), len(lats), len(longs), 2])
transparency_tensor = np.zeros([len(timestamps), len(lats), len(longs), 2])

In [17]:
def insert_channel_values(tensor, values, channel_idx):
    # Create a counter for position in the values list
    value_idx = 0
    
    # Loop through all samples (time points)
    for s in range(len(timestamps)):
        # Loop through all rows (latitude)
        for r in range(len(lats)):
            # Loop through all columns (longitude)
            for c in range(len(longs)):
                # Check if we still have values to assign
                if value_idx < len(values):
                    # Assign the value to the specified channel
                    tensor[s, r, c, channel_idx] = values[value_idx]
                    # Increment counter
                    value_idx += 1
                else:
                    print(f"Warning: Not enough values to fill the entire tensor. Filled {value_idx} positions.")
                    return tensor
    
    # Check if we used all values
    if value_idx < len(values):
        print(f"Warning: Not all values were used. Used {value_idx} out of {len(values)} values.")
        
    return tensor

In [18]:
# Populate BBP values into channel 0
reflectance_tensor = insert_channel_values(reflectance_tensor, rrs490, 0)

In [19]:
# Populate CDM values into channel 1 
reflectance_tensor = insert_channel_values(reflectance_tensor, rrs443, 1)

In [20]:
reflectance_tensor

array([[[[nan, nan],
         [nan, nan],
         [nan, nan],
         ...,
         [nan, nan],
         [nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan],
         [nan, nan],
         ...,
         [nan, nan],
         [nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan],
         [nan, nan],
         ...,
         [nan, nan],
         [nan, nan],
         [nan, nan]],

        ...,

        [[nan, nan],
         [nan, nan],
         [nan, nan],
         ...,
         [nan, nan],
         [nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan],
         [nan, nan],
         ...,
         [nan, nan],
         [nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan],
         [nan, nan],
         ...,
         [nan, nan],
         [nan, nan],
         [nan, nan]]],


       [[[nan, nan],
         [nan, nan],
         [nan, nan],
         ...,
         [nan, nan],
         [nan, nan],
         [nan

In [21]:
result2 = analyze_float_values(reflectance_tensor)
print(result2)

             Value      Count   Percentage
0              NaN  261740026  35805749.11
1         0.003966        642        87.82
2         0.003992        640        87.55
3         0.004256        617        84.40
4         0.004344        609        83.31
...            ...        ...          ...
27872432  0.001132          1         0.14
27872433 -0.000036          1         0.14
27872434  0.001226          1         0.14
27872435  0.000023          1         0.14
27872436  0.001843          1         0.14

[27872437 rows x 3 columns]


In [15]:
# 1. Convert to NumPy and save
np.save('/Users/imarcolic/Desktop/1 ACADEMIA/2.0 MSc LSE/1 MSc Data Science/ST498 Capstone/1 Data/2 years (1 tensor)/reflectance_tensor.npy', reflectance_tensor)

In [16]:
reflectance_array = np.load('/Users/imarcolic/Desktop/1 ACADEMIA/2.0 MSc LSE/1 MSc Data Science/ST498 Capstone/1 Data/2 years (1 tensor)/reflectance_tensor.npy')

In [17]:
reflectance_real_tensor = tf.convert_to_tensor(reflectance_tensor)

In [18]:
reflectance_real_tensor

<tf.Tensor: shape=(731, 298, 827, 2), dtype=float64, numpy=
array([[[[nan, nan],
         [nan, nan],
         [nan, nan],
         ...,
         [nan, nan],
         [nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan],
         [nan, nan],
         ...,
         [nan, nan],
         [nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan],
         [nan, nan],
         ...,
         [nan, nan],
         [nan, nan],
         [nan, nan]],

        ...,

        [[nan, nan],
         [nan, nan],
         [nan, nan],
         ...,
         [nan, nan],
         [nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan],
         [nan, nan],
         ...,
         [nan, nan],
         [nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan],
         [nan, nan],
         ...,
         [nan, nan],
         [nan, nan],
         [nan, nan]]],


       [[[nan, nan],
         [nan, nan],
         [nan, nan],
         

: 

In [None]:
merged_tensor = tf.concat([optics_tensor, plankton_tensor, reflectance_tensor, transparency_tensor], axis=3)