In [1]:
import pandas as pd
import numpy as np
from scipy.spatial import cKDTree
import netCDF4
from noaa_coops import Station
from sklearn.metrics import mean_squared_error
from math import sqrt
import time

# Function to calculate RMSE
def calculate_rmse(true_values, predicted_values, skip=384):
    """Return the root-mean-square error between two aligned series.

    The first ``skip`` samples of both inputs are discarded, then every pair
    in which either value is NaN is dropped before the error is computed.

    Parameters
    ----------
    true_values, predicted_values : array-like of float
        Aligned samples of equal length. Lists, NumPy arrays (including
        masked arrays, whose masked entries are treated as NaN) and pandas
        ``.values`` are all accepted.
    skip : int, optional
        Number of leading samples to discard from both inputs. Defaults to
        384, the value that was previously hard-coded here.

    Returns
    -------
    float
        The RMSE over the remaining valid pairs, or NaN when either input has
        ``skip`` or fewer samples, or when no pair survives the NaN filter.
    """
    # Normalise to plain float ndarrays so boolean-mask indexing below works
    # for lists too, and so masked entries become NaN instead of leaking
    # their fill value into the error.
    true_values = np.ma.filled(np.ma.asarray(true_values, dtype=float), np.nan)
    predicted_values = np.ma.filled(np.ma.asarray(predicted_values, dtype=float), np.nan)

    # Not enough data to have anything left after the lead-in is removed.
    if len(true_values) <= skip or len(predicted_values) <= skip:
        return np.nan

    true_values = true_values[skip:]
    predicted_values = predicted_values[skip:]

    valid_mask = ~np.isnan(true_values) & ~np.isnan(predicted_values)
    if not valid_mask.any():
        # Every pair had a missing value; an RMSE is undefined rather than an error.
        return np.nan

    residuals = true_values[valid_mask] - predicted_values[valid_mask]
    return float(np.sqrt(np.mean(residuals ** 2)))

# ---------------------------------------------------------------------------
# Configuration: everything a reader might need to change for another run.
# ---------------------------------------------------------------------------
STATION_CSV = 'station_details.csv'
NC_PATH = '/scratch/07174/soelem/test_jason_highres/fort.63.nc'
OUTPUT_CSV = 'rmse_jason_highres_with.csv'
SKIP_STEPS = 384                       # leading model records dropped (384 hourly steps = 16 days)
SERIES_START = "2023-08-01 00:00:00"   # timestamp given to the first retained model record
OBS_BEGIN_DATE = "20230716"
OBS_END_DATE = "20231031"
# Largest gap allowed when matching an observation to a model timestamp.
# Anything further away is treated as missing (NaN) instead of being filled
# with an arbitrarily distant reading.
MATCH_TOLERANCE = pd.Timedelta(minutes=30)

# Read the station details CSV file
start_time = time.time()
station_details = pd.read_csv(STATION_CSV)
print(f"Reading station details took {time.time() - start_time:.2f} seconds")

# Load the NetCDF file
start_time = time.time()
nc_file = netCDF4.Dataset(NC_PATH, 'r')
print(f"Loading NetCDF file took {time.time() - start_time:.2f} seconds")

# Everything that touches the dataset lives inside try/finally so the file
# handle is released even when a later step raises.
try:
    latitudes = nc_file.variables['y'][:]
    longitudes = nc_file.variables['x'][:]

    # Normalize longitudes from 0-360 to -180 to +180 if needed
    longitudes = np.where(longitudes > 180, longitudes - 360, longitudes)

    # One (latitude, longitude) row per mesh node
    points = np.column_stack((latitudes, longitudes))

    # Create a KDTree for quick nearest-neighbor lookup.
    # NOTE(review): distances are Euclidean in degree space, not great-circle.
    start_time = time.time()
    kdtree = cKDTree(points)
    print(f"Building KDTree took {time.time() - start_time:.2f} seconds")

    # Find the closest mesh node for every station in a single vectorised query
    start_time = time.time()
    station_coords = station_details[['Latitude', 'Longitude']].to_numpy(dtype=float)
    _, nearest_node_idx = kdtree.query(station_coords)
    closest_points = nearest_node_idx.tolist()
    print(f"Finding closest points took {time.time() - start_time:.2f} seconds")

    # Print the array of closest points
    print("Closest points to stations:", closest_points)

    # Add the closest point indices to the station details
    station_details['node'] = closest_points

    # One RMSE (or NaN on failure) per station, in station_details row order
    rmse_values = []

    start_time = time.time()
    # Iterate the ID column directly, paired positionally with the node list:
    # this does not depend on the DataFrame's index labels, and avoids
    # iterrows() upcasting integer station IDs to float.
    for station_id, closest_point_idx in zip(station_details['Station ID'], closest_points):
        try:
            # Retrieve 'zeta' for the closest node, skipping the first SKIP_STEPS records
            nc_zeta = nc_file.variables['zeta'][SKIP_STEPS:, closest_point_idx]

            # Initialize the station object
            station = Station(station_id)

            try:
                # NOTE(review): observations are requested in local standard
                # time ("lst"); confirm the model timestamps use the same
                # reference, otherwise the two series are shifted in time.
                station_data = station.get_data(
                    begin_date=OBS_BEGIN_DATE,
                    end_date=OBS_END_DATE,
                    product="hourly_height",
                    datum="MSL",
                    units="metric",
                    time_zone="lst"
                )
            except Exception as e:
                print(f"Error retrieving data for station {station_id}: {e}")
                rmse_values.append(np.nan)
                continue

            station_df = pd.DataFrame(station_data)

            # Clean the observed series: numeric values only, unique and
            # sorted timestamps (both are required by a 'nearest' reindex).
            observed = pd.to_numeric(station_df['v'], errors='coerce')
            observed = observed[~observed.index.duplicated(keep='first')].sort_index()

            # Hourly timestamps for the retained model records. A Timedelta is
            # used for the step instead of the deprecated 'H' alias.
            model_times = pd.date_range(
                start=SERIES_START, periods=len(nc_zeta), freq=pd.Timedelta(hours=1)
            )
            # Masked model values become NaN so the RMSE mask drops them.
            nc_zeta_series = pd.Series(np.ma.filled(nc_zeta, np.nan).ravel(), index=model_times)

            # Align observations to the model timestamps; gaps larger than
            # MATCH_TOLERANCE stay NaN and are excluded from the RMSE.
            station_df_resampled = observed.reindex(
                model_times, method='nearest', tolerance=MATCH_TOLERANCE
            )

            # NOTE(review): calculate_rmse drops a further 384 leading samples
            # by default, on top of the SKIP_STEPS already removed above;
            # confirm that this second skip is intended.
            rmse = calculate_rmse(station_df_resampled.values, nc_zeta_series.values)
            rmse_values.append(rmse)

        except Exception as e:
            # Best effort: a failure for one station must not abort the run.
            print(f"An unexpected error occurred for station {station_id}: {e}")
            rmse_values.append(np.nan)

    print(f"Processing all stations took {time.time() - start_time:.2f} seconds")

    # Add the RMSE values to the station_details DataFrame
    station_details['RMSE_with'] = rmse_values

    # Save the updated DataFrame to a CSV file
    station_details.to_csv(OUTPUT_CSV, index=False)
finally:
    # Close the NetCDF file
    nc_file.close()

Reading station details took 0.00 seconds
Loading NetCDF file took 0.39 seconds
Building KDTree took 0.14 seconds
Finding closest points took 0.02 seconds
Closest points to stations: [489095, 489095, 4457, 4736, 204929, 347491, 330160, 335279, 325276, 252827, 125697, 427326, 307001, 297740, 343140, 337390, 135609, 72454, 269387, 281092, 154309, 129087, 343481, 40915, 238947, 4378, 185620, 368888, 469586, 392448, 339632, 383, 11773, 1098, 854, 164, 71929, 90047, 456947, 419695, 476565, 528705, 414106, 358497, 19904, 204491, 467916, 521361, 552255, 351805, 514415, 545930, 539782, 527682, 502705, 494247, 520057, 217120, 488072, 440574, 318024, 357301, 19411, 430400, 405996, 312580, 317443, 282066, 344737, 390473, 390474, 411202, 397386, 370689, 370548, 344387, 317267, 322053, 307523, 196663, 36397, 183794, 134305, 118164, 80211, 241000, 207786, 117377, 173394, 213008, 96687, 205794, 96506, 113884, 83854, 68602, 55112, 123160, 129623, 107760, 72661, 77870, 209078, 27888, 183594, 9026, 8449