-> Since our NOAA SST data has 11376 missing values (NaN) out of 34560 total SST values (23184 valid values), we are going to mask them in order to perform visualizations.
-> The original data is kept as it is, so that the NaN values can be interpolated later on if needed for analysis operations.

In [2]:
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import seaborn as sns
import pandas as pd
from matplotlib.colors import LinearSegmentedColormap
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import os

In [3]:
def mask_sst_data(input_file, output_file, var_name='sst', fill_value=np.nan):
    """
    Mask missing values in the NOAA SST dataset and save the result to a new file.

    No data values are altered by this function: cells that are NaN in the
    input stay NaN.  The function reports how many values are missing, takes
    an in-memory deep copy of the dataset, and writes that copy to
    ``output_file`` with compression and an explicit ``_FillValue`` for
    ``var_name``.

    Parameters:
    -----------
    input_file : str
        Path to the input NetCDF file containing the NOAA SST data
    output_file : str
        Path to save the output NetCDF file with masked data
    var_name : str, optional
        Name of the SST variable in the dataset (default: 'sst')
    fill_value : float, optional
        Fill value for missing data in the output file (default: np.nan)

    Returns:
    --------
    ds_masked : xarray.Dataset
        The masked dataset that was saved to the output file.  It is fully
        loaded in memory and does not keep ``input_file`` open.

    Raises:
    -------
    KeyError
        If ``var_name`` is not a variable of the dataset.
    """
    # Open the SST dataset.  The context manager guarantees the file handle
    # is released even if something below raises; an open handle would keep
    # the input file locked (notably on Windows).
    print(f"Opening dataset: {input_file}")
    with xr.open_dataset(input_file) as ds:
        # Print basic information about the dataset.  `sizes` is the
        # name -> length mapping; using `dims` as a mapping is deprecated.
        print(f"Dataset dimensions: {ds.sizes}")
        print(f"Data variables: {list(ds.data_vars)}")

        if var_name not in ds.variables:
            raise KeyError(
                f"Variable {var_name!r} not found in {input_file}; "
                f"available data variables: {list(ds.data_vars)}"
            )

        # Check for missing values.  isnull() also works for non-float
        # dtypes, where np.isnan would raise a TypeError.
        total_values = ds[var_name].size
        nan_values = int(ds[var_name].isnull().sum())
        # Guard against an empty variable to avoid dividing by zero.
        percent_missing = (nan_values / total_values) * 100 if total_values else 0.0
        print(f"Total values: {total_values}")
        print(f"Missing values (NaN): {nan_values}")
        print(f"Percentage missing: {percent_missing:.2f}%")

        # Pull everything into memory before the file is closed, then take a
        # deep copy so the returned dataset is independent of the source.
        ds.load()
        ds_masked = ds.copy(deep=True)

    # Confirm that the copy carries the same missing-value mask
    masked_nan_values = int(ds_masked[var_name].isnull().sum())
    print(f"NaN values in masked dataset: {masked_nan_values}")

    # Set encoding for the output file: zlib compression plus the fill value
    # that marks missing cells on disk.
    encoding = {var_name: {'zlib': True, 'complevel': 5, '_FillValue': fill_value}}

    # Save the masked dataset to a new NetCDF file.  The input is already
    # closed at this point, so it cannot block the write.
    print(f"Saving masked dataset to: {output_file}")
    ds_masked.to_netcdf(output_file, encoding=encoding)
    print("Masked dataset saved successfully!")

    return ds_masked

if __name__ == "__main__":
    # Directory holding the NOAA SST files.  It defaults to the original
    # location but can be overridden with the SST_DATA_DIR environment
    # variable, so the notebook also runs on machines with a different layout.
    data_dir = os.environ.get("SST_DATA_DIR", r"C:\Users\Admin\RIYA PROJECT\DATASETS")

    # Define input and output file paths
    input_file = os.path.join(data_dir, "noaa_sst.nc")
    output_file = os.path.join(data_dir, "noaa_sst_masked.nc")

    # Process the dataset
    ds_masked = mask_sst_data(input_file, output_file)


Opening dataset: C:\Users\Admin\RIYA PROJECT\DATASETS\noaa_sst.nc
Dataset dimensions: Frozen({'time': 72, 'lat': 20, 'lon': 24})
Data variables: ['sst']
Total values: 34560
Missing values (NaN): 11376
Percentage missing: 32.92%
NaN values in masked dataset: 11376
Saving masked dataset to: C:\Users\Admin\RIYA PROJECT\DATASETS\noaa_sst_masked.nc
Masked dataset saved successfully!


In [7]:
# Re-read the file written by mask_sst_data to check it round-trips.
# load_dataset reads everything into memory and closes the file right away;
# open_dataset would keep noaa_sst_masked.nc open and could block the next
# attempt to overwrite it.  `output_file` is the path defined in the cell
# that calls mask_sst_data.
a = xr.load_dataset(output_file)
a['sst'].values

array([[[26.843548, 26.89    , 27.02129 , ...,       nan,       nan,
               nan],
        [26.777418, 26.906128, 27.090322, ...,       nan,       nan,
               nan],
        [26.764837, 26.950321, 27.186451, ...,       nan,       nan,
               nan],
        ...,
        [26.158064, 26.125807, 25.92    , ...,       nan,       nan,
               nan],
        [25.983547, 25.911934, 25.662258, ...,       nan,       nan,
               nan],
        [25.83387 , 25.653547, 25.334192, ...,       nan,       nan,
               nan]],

       [[26.981785, 27.294641, 27.467857, ...,       nan,       nan,
               nan],
        [27.023214, 27.314999, 27.4425  , ...,       nan,       nan,
               nan],
        [27.043571, 27.302856, 27.395357, ...,       nan,       nan,
               nan],
        ...,
        [25.348213, 25.353928, 25.17107 , ...,       nan,       nan,
               nan],
        [25.242857, 25.221071, 24.981428, ...,       nan,       nan,
   