In [None]:
import pandas as pd
import numpy as np
from scipy.spatial import cKDTree
import netCDF4
from noaa_coops import Station
from sklearn.metrics import mean_squared_error
from math import sqrt

# Function to calculate RMSE
def calculate_rmse(true_values, predicted_values):
    """Return the RMSE between two aligned series, ignoring the first 48 samples.

    Args:
        true_values: 1-D sequence of reference values; NaN marks missing data.
        predicted_values: 1-D sequence of values to compare, same length as
            ``true_values``.

    Returns:
        The root-mean-square error as a float, computed over the positions
        (after the first 48) where both inputs are non-NaN. ``np.nan`` is
        returned when either input has 48 or fewer elements, or when no
        position has valid data in both inputs.

    Raises:
        ValueError: If both inputs have more than 48 elements but their
            lengths differ, since the samples can no longer be paired.
    """
    # Not enough data points to exclude the first 48 values
    if len(true_values) <= 48 or len(predicted_values) <= 48:
        return np.nan

    # Exclude the first 48 values
    true_values = np.asarray(true_values, dtype=float)[48:]
    predicted_values = np.asarray(predicted_values, dtype=float)[48:]

    # An element-wise comparison is only meaningful for equally long inputs
    if true_values.shape != predicted_values.shape:
        raise ValueError(
            "true_values and predicted_values must have the same length, got "
            f"{true_values.shape} and {predicted_values.shape}"
        )

    # Keep only the positions where both series have data
    valid_mask = ~(np.isnan(true_values) | np.isnan(predicted_values))
    if not valid_mask.any():
        # Nothing left to compare; report "no result" instead of raising
        return np.nan

    # Calculate and return RMSE
    errors = true_values[valid_mask] - predicted_values[valid_mask]
    return float(np.sqrt(np.mean(errors ** 2)))

# Read the station details CSV file
station_details = pd.read_csv('station_details.csv')

# Load the NetCDF file. Everything needed is read into memory inside the
# context manager, so the file is closed even if a later step raises.
with netCDF4.Dataset('/work2/07174/soelem/validation/sentinel_flag/fort.63.nc', 'r') as nc_file:
    # Extract the latitude and longitude from the NetCDF file
    latitudes = nc_file.variables['y'][:]  # Replace with your actual variable name
    longitudes = nc_file.variables['x'][:]  # Replace with your actual variable name
    zeta = nc_file.variables['zeta'][:]

# Normalize longitudes from 0-360 to -180 to +180 if needed
longitudes = np.where(longitudes > 180, longitudes - 360, longitudes)

# Create an (n_points, 2) array of (latitude, longitude) pairs
points = np.column_stack((latitudes, longitudes))

# Create a KDTree for quick nearest-neighbor lookup
kdtree = cKDTree(points)

# Prepare a list to hold RMSE values (one entry per station row, NaN on failure)
rmse_values = []

# Loop through each station, find the closest point, and retrieve water level data
for index, row in station_details.iterrows():
    # Read the ID before the try block so the error messages below always
    # refer to the station currently being processed.
    station_id = row['Station ID']
    try:
        # Find the closest point for the current station's coordinates.
        # The returned value is already a 0-based position into `points`,
        # so it is used as-is to index the model output.
        _, closest_point_idx = kdtree.query([row['Latitude'], row['Longitude']])

        # Retrieve 'zeta' variable data for the closest point
        # (assumes zeta is laid out as (time, node) -- TODO confirm)
        nc_zeta = zeta[:, closest_point_idx]

        # Drop the first 4 values. Slicing keeps the mask when the variable was
        # read as a masked array; np.ma.filled then turns masked entries into
        # NaN (and leaves a plain array unchanged) so they are skipped later.
        nc_zeta_adjusted = np.ma.filled(nc_zeta[4:].astype(float), np.nan)

        # Initialize the station object
        station = Station(station_id)

        # Try to retrieve water level data for the station
        # NOTE(review): observations are requested in local standard time
        # ("lst") while the model series below uses naive timestamps --
        # confirm both share the same time reference.
        try:
            station_data = station.get_data(
                begin_date="20230716",
                end_date="20231031",
                product="hourly_height",
                datum="MSL",
                units="metric",
                time_zone="lst"
            )
        except Exception as e:
            print(f"Error retrieving data for station {station_id}: {e}")
            rmse_values.append(np.nan)  # Append a NaN value for stations with errors
            continue  # Skip to the next iteration of the loop

        # The index 't' is already in datetime format, so no conversion is necessary
        station_df = pd.DataFrame(station_data)

        # Put the model values on an hourly time axis starting at the first
        # retained record (assumes that record is 2023-07-16 00:00 -- TODO confirm).
        # A Timedelta step is used instead of the deprecated 'H' alias.
        nc_zeta_series = pd.Series(
            nc_zeta_adjusted.flatten(),
            index=pd.date_range(
                start="2023-07-16 00:00:00",
                periods=len(nc_zeta_adjusted),
                freq=pd.Timedelta(hours=1),
            ),
        )

        # Align the observations to the model time index (nearest timestamp)
        station_df_resampled = station_df['v'].reindex(nc_zeta_series.index, method='nearest')

        # Calculate RMSE
        rmse = calculate_rmse(station_df_resampled.values, nc_zeta_series.values)
        rmse_values.append(rmse)

    except Exception as e:
        # Catch any other exception that might occur
        print(f"An unexpected error occurred for station {station_id}: {e}")
        rmse_values.append(np.nan)  # Append a NaN value for stations with errors
        continue  # Skip to the next iteration of the loop

# Add the RMSE values to the station_details DataFrame
station_details['RMSE_with'] = rmse_values

# Save the updated dataframe as "rmse_with_sentinel_with_flag.csv"
station_details.to_csv('rmse_with_sentinel_with_flag.csv', index=False)

In [None]:
import pandas as pd
import numpy as np
from scipy.spatial import cKDTree
import netCDF4
from noaa_coops import Station
from sklearn.metrics import mean_squared_error
from math import sqrt

# Function to calculate RMSE
def calculate_rmse(true_values, predicted_values):
    """Return the RMSE between two aligned series, skipping missing samples.

    Args:
        true_values: 1-D sequence of reference values; NaN marks missing data.
        predicted_values: 1-D sequence of values to compare, same length as
            ``true_values``.

    Returns:
        The root-mean-square error as a float, computed over the positions
        where both inputs are non-NaN, or ``np.nan`` when no such position
        exists (including empty inputs).

    Raises:
        ValueError: If the two inputs do not have the same length, since the
            samples can no longer be paired.
    """
    true_values = np.asarray(true_values, dtype=float)
    predicted_values = np.asarray(predicted_values, dtype=float)

    # An element-wise comparison is only meaningful for equally long inputs
    if true_values.shape != predicted_values.shape:
        raise ValueError(
            "true_values and predicted_values must have the same length, got "
            f"{true_values.shape} and {predicted_values.shape}"
        )

    # Always drop positions where either series is missing; the mask must be
    # applied to equal-length inputs, not only when the lengths differ.
    valid_mask = ~(np.isnan(true_values) | np.isnan(predicted_values))
    if not valid_mask.any():
        # Nothing left to compare; report "no result" instead of raising
        return np.nan

    errors = true_values[valid_mask] - predicted_values[valid_mask]
    return float(np.sqrt(np.mean(errors ** 2)))

# Read the station details CSV file
station_details = pd.read_csv('station_details.csv')

# Load the NetCDF file. Everything needed is read into memory inside the
# context manager, so the file is closed even if a later step raises.
with netCDF4.Dataset('diff/swotfort.63.nc', 'r') as nc_file:
    # Extract the latitude and longitude from the NetCDF file
    latitudes = nc_file.variables['y'][:]  # Replace with your actual variable name
    longitudes = nc_file.variables['x'][:]  # Replace with your actual variable name
    zeta = nc_file.variables['zeta'][:]

# Normalize longitudes from 0-360 to -180 to +180 if needed
longitudes = np.where(longitudes > 180, longitudes - 360, longitudes)

# Create an (n_points, 2) array of (latitude, longitude) pairs
points = np.column_stack((latitudes, longitudes))

# Create a KDTree for quick nearest-neighbor lookup
kdtree = cKDTree(points)

# Prepare a list to hold RMSE values (one entry per station row, NaN on failure)
rmse_values = []

# Define the desired date range
desired_start = pd.Timestamp("2023-09-22 00:00:00")
desired_end = pd.Timestamp("2023-09-23 23:00:00")

# Loop through each station, find the closest point, and retrieve water level data
for index, row in station_details.iterrows():
    # Read the ID before the try block so the error message below always
    # refers to the station currently being processed.
    station_id = row['Station ID']
    try:
        # Find the closest point for the current station's coordinates.
        # The returned value is already a 0-based position into `points`,
        # so it is used as-is to index the model output.
        _, closest_point_idx = kdtree.query([row['Latitude'], row['Longitude']])

        # Retrieve 'zeta' variable data for the closest point
        # (assumes zeta is laid out as (time, node) -- TODO confirm)
        nc_zeta = zeta[:, closest_point_idx]

        # Drop the first 8 values. Slicing keeps the mask when the variable was
        # read as a masked array; np.ma.filled then turns masked entries into
        # NaN (and leaves a plain array unchanged) so they are skipped later.
        nc_zeta_adjusted = np.ma.filled(nc_zeta[8:].astype(float), np.nan)

        # Put the model values on an hourly time axis starting at the first
        # retained record (assumes that record is 2023-09-11 04:00 -- TODO
        # confirm), then keep only the desired time range.
        # A Timedelta step is used instead of the deprecated 'H' alias.
        nc_zeta_series = pd.Series(
            nc_zeta_adjusted.flatten(),
            index=pd.date_range(
                start="2023-09-11 04:00:00",
                periods=len(nc_zeta_adjusted),
                freq=pd.Timedelta(hours=1),
            ),
        )
        nc_zeta_filtered = nc_zeta_series[desired_start:desired_end]

        # Initialize the station object
        station = Station(station_id)

        # Retrieve water level data for the station for the specified date range
        # NOTE(review): observations are requested in local standard time
        # ("lst") while the model series above uses naive timestamps --
        # confirm both share the same time reference.
        station_data = station.get_data(
            begin_date=desired_start.strftime("%Y%m%d"),
            end_date=desired_end.strftime("%Y%m%d"),
            product="hourly_height",
            datum="NAVD",
            units="metric",
            time_zone="lst"
        )

        # The index 't' is already in datetime format, so no conversion is necessary
        station_df = pd.DataFrame(station_data)

        # Align the observations to the filtered model time index (nearest
        # timestamp). The result already covers exactly the desired period,
        # so no further date filtering is needed.
        station_df_resampled = station_df['v'].reindex(nc_zeta_filtered.index, method='nearest')

        # Calculate RMSE for the filtered period
        rmse = calculate_rmse(station_df_resampled.values, nc_zeta_filtered.values)
        rmse_values.append(rmse)

    except Exception as e:
        print(f"An unexpected error occurred for station {station_id}: {e}")
        rmse_values.append(np.nan)  # Append a NaN value for stations with errors
        continue  # Skip to the next iteration of the loop

# Add the RMSE values to the station_details DataFrame
station_details['RMSE'] = rmse_values

# Save the updated dataframe as "rmse_swot_surge_with.csv"
station_details.to_csv('rmse_swot_surge_with.csv', index=False)