____

# Preparation

## Import libraries and set directories

In [1]:
import os
import numpy as np
import pandas as pd
import xarray as xr
import geopandas as gpd

# Add the path to the designated folder containing custom modules.
import sys
sys.path.append('../src')

# Import the custom module for benthic habitat mapping.
# This module contains functions and utilities for tasks such as
# correction and classification of benthic habitats.
import benthic_mapping as bm

import matplotlib.pyplot as plt
# plt.style.use('dark_background')

from datetime import datetime

In [2]:
def construct_file_path(out_dir, user_year):
    """Return the path of the first 'L2R' file stored for a given year.

    The function looks inside ``<out_dir>/atmospheric_correction/<user_year>``
    and returns the first entry (in alphabetical order) whose name contains
    the substring ``'L2R'``.

    Parameters
    ----------
    out_dir : str
        Root output directory that contains the ``atmospheric_correction`` folder.
    user_year : str or int
        Name of the year sub-folder; it is converted to ``str`` before use.

    Returns
    -------
    str or None
        Full path of the matching file, or ``None`` when the folder holds no
        entry whose name contains ``'L2R'``.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` (e.g. ``FileNotFoundError``) when the
        year folder does not exist or cannot be read.
    """
    year_folder = os.path.join(out_dir, 'atmospheric_correction', str(user_year))

    # os.listdir() returns entries in arbitrary order; sorting guarantees that
    # the same file is selected on every run when several L2R files exist.
    for filename in sorted(os.listdir(year_folder)):
        if 'L2R' in filename:
            return os.path.join(year_folder, filename)

    # No matching file was found
    return None

# Define folder paths; base_dir is resolved two levels above the current
# working directory, so this cell depends on where the kernel was started.
base_dir = os.path.abspath(os.path.join(os.getcwd(), '../..'))
data_dir = os.path.join(base_dir, 'data')
out_dir = os.path.join(base_dir, 'out')
raw_data_dir = os.path.join(data_dir, 'raw')

# Define file paths
# NOTE(review): a later cell converts user_year with int(); the placeholder
# "default" cannot be converted, so set a numeric year before running all cells.
user_year = "default" #input(int)
file_data_path = construct_file_path(out_dir, user_year)

# construct_file_path() returns None when no 'L2R' file exists; stop here with
# a clear message instead of failing later inside os.path.basename(None).
if file_data_path is None:
    raise FileNotFoundError(
        "No file containing 'L2R' was found in "
        f"{os.path.join(out_dir, 'atmospheric_correction', str(user_year))}"
    )

shapefile_path_deepWater = os.path.join(out_dir, 'geom_def', 'geom_deepWater.shp')
shapefile_path_sandObject = os.path.join(out_dir, 'geom_def', 'geom_sandObject.shp')

# Extract the base filename: drop the extension, then drop the last
# underscore-separated token of the name.
base_filename = os.path.splitext(os.path.basename(file_data_path))[0]
base_filename = base_filename.rsplit('_', 1)[0]

## Pre-processing dataset

### Open and prepare the dataset

In [3]:
# Numeric suffix of the rhot_<suffix> / rhos_<suffix> variables for each band.
# Only the green, swir1 and swir2 suffixes differ between the two sensors.
band_suffixes = {
    'S2A': {'blue': 492, 'green': 560, 'red': 665, 'red_edge': 704,
            'nir': 833, 'swir1': 1614, 'swir2': 2202},
    'S2B': {'blue': 492, 'green': 559, 'red': 665, 'red_edge': 704,
            'nir': 833, 'swir1': 1610, 'swir2': 2186},
}
sensor_labels = {'S2A': 'Sentinel-2A', 'S2B': 'Sentinel-2B'}

# Detect the sensor from the file name only: file_data_path is absolute, so a
# parent directory containing 'S2A' or 'S2B' must not influence the choice.
source_name = os.path.basename(file_data_path)
sensor = next((key for key in band_suffixes if key in source_name), None)
if sensor is None:
    raise ValueError("The dataset file path does not indicate whether it is S2A or S2B.")
print(f'Detected dataset from {sensor_labels[sensor]}')

# Map the source variable names to the names used in the rest of the notebook:
# rhot_* becomes <band>_l1r and rhos_* becomes <band>_l2r.
variables_to_keep = {
    'transverse_mercator': 'transverse_mercator',
    'lat': 'lat',
    'lon': 'lon',
}
for band, suffix in band_suffixes[sensor].items():
    variables_to_keep[f'rhot_{suffix}'] = f'{band}_l1r'
    variables_to_keep[f'rhos_{suffix}'] = f'{band}_l2r'

# Global attributes copied from the source file to the new dataset
attributes_to_keep = [
    'generated_by', 'generated_on', 'contact', 'product_type', 'metadata_profile', 'Conventions',
    'sensor', 'isodate', 'global_dims', 'sza', 'vza', 'raa', 'scene_xrange', 'scene_yrange',
    'scene_dims', 'scene_pixel_size', 'data_dimensions', 'data_elements', 'acolite_version',
    'acolite_file_type', 'tile_code', 'proj4_string', 'pixel_size', 'uoz', 'uwv', 'wind',
    'pressure', 'oname'
]

# Open the dataset in a context manager so the file is closed even when one of
# the steps below raises.
with xr.open_dataset(file_data_path) as data:
    # Keep only the variables that exist in the file, under their new names
    new_vars = {
        new_name: data[old_name]
        for old_name, new_name in variables_to_keep.items()
        if old_name in data
    }

    # Construct the new dataset
    ds = xr.Dataset(new_vars)

    # Preserve the selected attributes; absent ones are skipped and reported
    # (mirrors the tolerant handling of absent variables above).
    missing_attrs = [key for key in attributes_to_keep if key not in data.attrs]
    if missing_attrs:
        print('Attributes not found in the source file (skipped):', missing_attrs)
    ds.attrs = {key: data.attrs[key] for key in attributes_to_keep if key in data.attrs}

Detected dataset from Sentinel-2B


### Reset encoding and define projection

In [4]:
# Start from a dataset without encoding information
ds = ds.drop_encoding()

# Write the CRS using the projection string stored in the global attributes
wkt = ds.attrs['proj4_string']
ds = ds.rio.write_crs(wkt, inplace=True)

# Remove the 'grid_mapping' attribute from every data variable that carries it
for var in ds.data_vars:
    ds[var].attrs.pop('grid_mapping', None)

# Report the CRS now attached to the dataset
print("Current CRS:", ds.rio.crs)

Current CRS: EPSG:32748


# Image Processing

## Sun Glint Correction (Hedley et al., 2005)

In [5]:
# Subsetting sample area for the Sun Glint Correction
# Bands passed to the masking step, and the deep-water polygons used as samples
var_select = ['blue_l2r', 'green_l2r', 'red_l2r', 'red_edge_l2r', 'nir_l2r']  # Variables to select from the dataset
gdf = gpd.read_file(shapefile_path_deepWater)  # Load shapefile containing the region of interest

# The year is passed on as an integer; fail with an actionable message when
# user_year is still a non-numeric placeholder such as "default".
try:
    desired_year = int(user_year)
except ValueError as exc:
    raise ValueError(
        f"user_year must be convertible to an integer year, got {user_year!r}"
    ) from exc

# Mask the dataset based on the shapefile and desired year
samples = bm.mask_dataset(
    ds[var_select], gdf, desired_year
)

# Compute sun glint correction using the 'sunglint_correction' function from the module,
# with 'nir_l2r' as the reference band.
# Note: The 'vars_ignore' parameter excludes 'lat' and 'lon' from the correction process. Default set to None
sg_ds = bm.sunglint_correction(ds, samples, 'nir_l2r', vars_ignore=['lat', 'lon'])

Minimum NIR brightness (MinNir): -0.00034847320057451725
Regression results for blue_l2r: slope=0.7068467459698609, r_value=0.49478330537082554, p_value=0.0
Regression results for green_l2r: slope=0.789761224416469, r_value=0.579725037945171, p_value=0.0
Regression results for red_l2r: slope=0.7772012852314969, r_value=0.6827873468967652, p_value=0.0
Regression results for red_edge_l2r: slope=0.6594397377757661, r_value=0.673968606695982, p_value=0.0
Slope information not found for variable 'blue_l1r'. Skipping correction.
Slope information not found for variable 'green_l1r'. Skipping correction.
Slope information not found for variable 'red_l1r'. Skipping correction.
Slope information not found for variable 'red_edge_l1r'. Skipping correction.
Slope information not found for variable 'nir_l1r'. Skipping correction.
Slope information not found for variable 'swir1_l1r'. Skipping correction.
Slope information not found for variable 'swir1_l2r'. Skipping correction.
Slope information not 

## Depth Invariant Index (Green et al., 2000)

In [6]:
# Sample area for the DII calculation: polygons read from the sand-object shapefile
gdf = gpd.read_file(shapefile_path_sandObject)

# Restrict the sun-glint-corrected dataset to those polygons for the desired year
samples = bm.mask_dataset(sg_ds, gdf, desired_year)

# Every unordered pair of the sun-glint-corrected bands, in band order:
# (blue, green), (blue, red), (blue, red_edge), (green, red), (green, red_edge), (red, red_edge)
sg_bands = ['blue_l2r_sg', 'green_l2r_sg', 'red_l2r_sg', 'red_edge_l2r_sg']
band_pairs = [
    (first, second)
    for position, first in enumerate(sg_bands)
    for second in sg_bands[position + 1:]
]

# Water column correction (k-ratio and Depth Invariant Index) for each band pair
wc_ds = bm.water_column_correction(sg_ds, samples, band_pairs)

Calculating DII for bands blue_l2r_sg and green_l2r_sg with k-ratio: 0.721259206831931
Calculating DII for bands blue_l2r_sg and red_l2r_sg with k-ratio: 0.6542580278231951
Calculating DII for bands blue_l2r_sg and red_edge_l2r_sg with k-ratio: 0.46927000614747094
Calculating DII for bands green_l2r_sg and red_l2r_sg with k-ratio: 0.9935577984441735
Calculating DII for bands green_l2r_sg and red_edge_l2r_sg with k-ratio: 1.053037410596423
Calculating DII for bands red_l2r_sg and red_edge_l2r_sg with k-ratio: 1.0408077657224628


## Spectral Indices

### Normalized Difference

In [7]:
# Each normalized difference index, keyed by its output name, with the pair of
# variables it is computed from
index_definitions = {
    'ndvi': ('nir_l2r', 'red_l2r'),
    'mndwi': ('green_l2r', 'swir1_l2r'),
    'gndvi': ('nir_l2r', 'green_l2r'),
    'ngrdi_red': ('green_l2r', 'red_l2r'),
    'ngrdi_red_edge': ('green_l2r', 'red_edge_l2r'),
}

# Parallel lists expected by the helper: variable pairs and resulting index names
variable_pairs = list(index_definitions.values())
var_names = list(index_definitions.keys())

# Compute the normalized difference indices from the base dataset
si_ds = bm.normalized_difference(ds, variable_pairs, var_names)

### Non-Normalized Difference

In [8]:
# Timestamp shared by both indices so that their metadata agree.
# TODO: datetime.utcnow() is deprecated from Python 3.12; switch to a
# timezone-aware timestamp once the import cell provides `timezone`.
created_on = datetime.utcnow().isoformat()

# Calculate EVI, including the standard gain factor G = 2.5
# (coefficients C1 = 6, C2 = 7.5 and L = 1).
evi = 2.5 * (ds['nir_l2r'] - ds['red_l2r']) / (
    ds['nir_l2r'] + 6 * ds['red_l2r'] - 7.5 * ds['blue_l2r'] + 1
)

# Calculate AWEI (no-shadow form): SWIR1 is subtracted from GREEN and SWIR2
# carries the 2.75 weight.
awei = 4 * (ds['green_l2r'] - ds['swir1_l2r']) - (0.25 * ds['nir_l2r'] + 2.75 * ds['swir2_l2r'])

# Create DataArray for EVI with attributes
si_ds['evi'] = xr.DataArray(
    data=evi,
    dims=ds['nir_l2r'].dims,
    coords=ds['nir_l2r'].coords,
    name='evi',
    attrs={
        'long_name': 'Enhanced Vegetation Index (EVI)',
        'formula': '2.5 * (NIR - RED) / (NIR + 6 * RED - 7.5 * BLUE + 1)',
        'units': '1',
        'date_created': created_on,
    }
)

# Create DataArray for AWEI with attributes
si_ds['awei'] = xr.DataArray(
    data=awei,
    dims=ds['green_l2r'].dims,
    coords=ds['green_l2r'].coords,
    name='awei',
    attrs={
        'long_name': 'Automated Water Extraction Index (AWEI)',
        'formula': '4 * (GREEN - SWIR1) - (0.25 * NIR + 2.75 * SWIR2)',
        'units': '1',
        'date_created': created_on,
    }
)

## Merge Processed Dataset

In [9]:
# Combine the base dataset with the sun glint, water column and spectral index results
processed_datasets = [ds, sg_ds, wc_ds, si_ds]
clf_ds = xr.merge(processed_datasets)

In [10]:
# Show the merged dataset (the bare expression renders as the cell output)
clf_ds

# Statistical Summary

In [11]:
# Per-variable summary statistics, keyed by variable name
summary_stats = {}

for var in clf_ds.data_vars:
    values = clf_ds[var].values  # raw array behind the variable

    # Min and max are computed once and reused for the range
    lowest = np.nanmin(values)
    highest = np.nanmax(values)

    summary_stats[var] = {
        'Mean': np.nanmean(values),
        'Median': np.nanmedian(values),
        'Std Dev': np.nanstd(values),
        'Min': lowest,
        'Max': highest,
        'Range': highest - lowest,
        'Count': np.count_nonzero(~np.isnan(values))  # number of non-NaN values
    }

# One row per variable; the variable names move from the index into the
# first column, which is then labelled 'Variables'
summary_df = pd.DataFrame.from_dict(summary_stats, orient='index').reset_index()
summary_df.columns = ['Variables', *summary_df.columns[1:]]

# Display the table
summary_df

Unnamed: 0,Variables,Mean,Median,Std Dev,Min,Max,Range,Count
0,lat,-5.788169,-5.78817,0.061164,-5.894375,-5.681961,0.212414,5927790
1,lon,106.556274,106.556198,0.065944,106.441757,106.670685,0.228928,5927790
2,blue_l1r,0.11684,0.1141,0.011224,0.1063,0.3483,0.242,5927790
3,blue_l2r,0.04411,0.040507,0.01401,0.031006,0.321216,0.29021,5927790
4,green_l1r,0.080468,0.0762,0.016833,0.068,0.3712,0.3032,5927790
5,green_l2r,0.03354,0.028312,0.02074,0.018635,0.378724,0.360089,5927790
6,red_l1r,0.051196,0.0485,0.011345,0.042,0.3864,0.3444,5927790
7,red_l2r,0.016775,0.013634,0.012999,0.006867,0.388807,0.38194,5927790
8,red_edge_l1r,0.042531,0.04,0.010629,0.0296,0.2966,0.267,5927790
9,red_edge_l2r,0.011469,0.008552,0.012171,-0.00236,0.295854,0.298214,5927790


In [12]:
# Boolean mask of the variables whose name contains "blue"
is_blue = summary_df['Variables'].str.contains("blue", na=False)
blue_rows = summary_df.loc[is_blue]

# Show the matching rows
blue_rows

Unnamed: 0,Variables,Mean,Median,Std Dev,Min,Max,Range,Count
2,blue_l1r,0.11684,0.1141,0.011224,0.1063,0.3483,0.242,5927790
3,blue_l2r,0.04411,0.040507,0.01401,0.031006,0.321216,0.29021,5927790
16,blue_l2r_sg,0.040171,0.037709,0.015644,-0.24316,0.17752,0.42068,5927790
20,dii_blue_l2r_sg_green_l2r_sg,8.352251,8.361502,0.074194,0.0,11.279569,11.279569,5897468
21,dii_blue_l2r_sg_red_l2r_sg,8.287834,8.292946,0.0954,0.0,13.031171,13.031171,5895973
22,dii_blue_l2r_sg_red_edge_l2r_sg,8.575824,8.576065,0.108466,0.0,11.79463,11.79463,5896088


In [13]:
# Boolean mask of the variables whose name contains "green"
is_green = summary_df['Variables'].str.contains("green", na=False)
green_rows = summary_df.loc[is_green]

# Show the matching rows
green_rows

Unnamed: 0,Variables,Mean,Median,Std Dev,Min,Max,Range,Count
4,green_l1r,0.080468,0.0762,0.016833,0.068,0.3712,0.3032,5927790
5,green_l2r,0.03354,0.028312,0.02074,0.018635,0.378724,0.360089,5927790
17,green_l2r_sg,0.029139,0.025201,0.02089,-0.252151,0.213204,0.465355,5927790
20,dii_blue_l2r_sg_green_l2r_sg,8.352251,8.361502,0.074194,0.0,11.279569,11.279569,5897468
23,dii_green_l2r_sg_red_l2r_sg,9.614991,9.614626,0.117382,0.0,18.442089,18.442089,5896336
24,dii_green_l2r_sg_red_edge_l2r_sg,11.5888,11.611284,0.228617,0.0,18.897377,18.897377,5896720


In [16]:
# Filter the rows whose variable name contains a standalone "red" band, i.e. a
# "red" that is not the beginning of "red_edge". The negative lookahead keeps
# names such as "red_l2r" or "dii_red_l2r_sg_red_edge_l2r_sg" and drops names
# that only mention "red_edge", regardless of the order in which the bands
# appear in the name. na=False treats missing names as non-matching.
red_rows = summary_df[
    summary_df['Variables'].str.contains(r"red(?!_edge)", na=False)
]

# Display the filtered DataFrame
red_rows

Unnamed: 0,Variables,Mean,Median,Std Dev,Min,Max,Range,Count
6,red_l1r,0.051196,0.0485,0.011345,0.042,0.3864,0.3444,5927790
7,red_l2r,0.016775,0.013634,0.012999,0.006867,0.388807,0.38194,5927790
18,red_l2r_sg,0.012444,0.010635,0.013709,-0.287087,0.150743,0.43783,5927790
21,dii_blue_l2r_sg_red_l2r_sg,8.287834,8.292946,0.0954,0.0,13.031171,13.031171,5895973
23,dii_green_l2r_sg_red_l2r_sg,9.614991,9.614626,0.117382,0.0,18.442089,18.442089,5896336
25,dii_red_l2r_sg_red_edge_l2r_sg,8.831077,8.842816,0.174862,0.0,16.040051,16.040051,5896446
29,ngrdi_red,0.344897,0.34707,0.050208,-0.345435,0.671164,1.0166,5927790


In [15]:
# Boolean mask of the variables whose name contains "red_edge"
is_red_edge = summary_df['Variables'].str.contains("red_edge", na=False)
red_edge_rows = summary_df.loc[is_red_edge]

# Show the matching rows
red_edge_rows

Unnamed: 0,Variables,Mean,Median,Std Dev,Min,Max,Range,Count
8,red_edge_l1r,0.042531,0.04,0.010629,0.0296,0.2966,0.267,5927790
9,red_edge_l2r,0.011469,0.008552,0.012171,-0.00236,0.295854,0.298214,5927790
19,red_edge_l2r_sg,0.007794,0.006016,0.010555,-0.191496,0.156562,0.348058,5927790
22,dii_blue_l2r_sg_red_edge_l2r_sg,8.575824,8.576065,0.108466,0.0,11.79463,11.79463,5896088
24,dii_green_l2r_sg_red_edge_l2r_sg,11.5888,11.611284,0.228617,0.0,18.897377,18.897377,5896720
25,dii_red_l2r_sg_red_edge_l2r_sg,8.831077,8.842816,0.174862,0.0,16.040051,16.040051,5896446
30,ngrdi_red_edge,0.523769,0.535248,0.079212,-0.407978,1.112962,1.52094,5927790
