In [1]:
import pandas as pd
import numpy as np
from scipy.spatial import cKDTree
import netCDF4
from noaa_coops import Station
from sklearn.metrics import mean_squared_error
from math import sqrt

# Function to calculate RMSE
def calculate_rmse(true_values, predicted_values, n_skip=48):
    """Return the root-mean-square error between two equally long 1-D series.

    The first ``n_skip`` samples of both series are discarded, then every
    position where either series is NaN (or masked) is dropped before the
    error is computed.

    Parameters
    ----------
    true_values, predicted_values : array-like of float
        Reference and predicted values. Lists, NumPy arrays and NumPy masked
        arrays are accepted; masked entries are treated as missing.
    n_skip : int, optional
        Number of leading samples to ignore (default 48, the value that was
        previously hard-coded).

    Returns
    -------
    float
        The RMSE over the remaining valid pairs, or ``nan`` when either
        series has ``n_skip`` samples or fewer, or when no valid pair is left.

    Raises
    ------
    ValueError
        If both series are longer than ``n_skip`` but differ in shape.
    """
    # Coerce to plain float arrays; masked entries become NaN so that the
    # NaN filter below removes them instead of letting fill values leak in.
    true_arr = np.ma.filled(np.ma.asarray(true_values, dtype=float), np.nan)
    pred_arr = np.ma.filled(np.ma.asarray(predicted_values, dtype=float), np.nan)

    if len(true_arr) <= n_skip or len(pred_arr) <= n_skip:
        return np.nan

    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            f"Shape mismatch: true_values {true_arr.shape} vs "
            f"predicted_values {pred_arr.shape}"
        )

    true_arr = true_arr[n_skip:]
    pred_arr = pred_arr[n_skip:]

    valid_mask = ~(np.isnan(true_arr) | np.isnan(pred_arr))
    if not valid_mask.any():
        # Nothing left to compare; an RMSE over zero samples is undefined.
        return np.nan

    residuals = true_arr[valid_mask] - pred_arr[valid_mask]
    return float(np.sqrt(np.mean(residuals ** 2)))

# Read the station details CSV file. The loop below reads the 'Station ID',
# 'Latitude' and 'Longitude' columns from it.
station_details = pd.read_csv('station_details.csv')

rmse_values = []

# Chunk size (number of time steps) for reading the 'zeta' variable
chunk_size = 1000

# Number of leading model time steps dropped before the remaining values are
# placed on the hourly index that starts at 2023-07-16 00:00:00.
leading_steps_to_drop = 4

# Maximum time distance allowed when matching an observation to a model hour.
# Without a limit, gaps in the observations (and model hours outside the
# observed period) would be filled with arbitrarily distant readings.
match_tolerance = pd.Timedelta(minutes=30)

# Load the NetCDF file. Everything that uses it runs inside try/finally so the
# handle is always released, even if the loop or the CSV export fails.
nc_file = netCDF4.Dataset('/work2/07174/soelem/hopper/fort.63.nc', 'r')
try:
    latitudes = np.asarray(nc_file.variables['y'][:], dtype=float)
    longitudes = np.asarray(nc_file.variables['x'][:], dtype=float)

    # Normalize longitudes from 0-360 to -180 to +180 if needed
    longitudes = np.where(longitudes > 180, longitudes - 360, longitudes)

    # Create an array of (latitude, longitude) pairs
    points = np.column_stack((latitudes, longitudes))

    # Create a KDTree for quick nearest-neighbor lookup.
    # NOTE(review): distances are Euclidean in degrees, not great-circle.
    kdtree = cKDTree(points)

    # Look the variable up once; its first dimension is indexed as time below.
    zeta_var = nc_file.variables['zeta']
    n_times = zeta_var.shape[0]  # length from metadata, no data is read here

    for index, row in station_details.iterrows():
        # Read the ID before the try block so the error message below can
        # always refer to it.
        station_id = row['Station ID']
        try:
            _, closest_point_idx = kdtree.query([row['Latitude'], row['Longitude']])

            # Initialize the station object
            station = Station(station_id)

            # NOTE(review): observations are requested in local standard time
            # ("lst") while the model index below is built from naive
            # timestamps - confirm both use the same time reference.
            try:
                station_data = station.get_data(
                    begin_date="20230716",
                    end_date="20231031",
                    product="hourly_height",
                    datum="MSL",
                    units="metric",
                    time_zone="lst"
                )
            except Exception as e:
                print(f"Error retrieving data for station {station_id}: {e}")
                rmse_values.append(np.nan)
                continue

            station_df = pd.DataFrame(station_data)

            # Reading 'zeta' data in chunks
            zeta_chunks = []
            for start in range(0, n_times, chunk_size):
                chunk = zeta_var[start:start + chunk_size, closest_point_idx]
                # netCDF4 may return masked arrays; turn masked entries into
                # NaN so fill values never enter the error computation.
                chunk = np.ma.filled(np.ma.asarray(chunk).astype(float), np.nan)
                if start == 0:
                    # Drop the leading values only from the first chunk
                    chunk = chunk[leading_steps_to_drop:]
                zeta_chunks.append(chunk)

            nc_zeta_adjusted = np.concatenate(zeta_chunks)

            nc_zeta_series = pd.Series(
                nc_zeta_adjusted.ravel(),
                index=pd.date_range(
                    start="2023-07-16 00:00:00",
                    periods=len(nc_zeta_adjusted),
                    freq=pd.Timedelta(hours=1),
                ),
            )

            # Nearest-neighbour reindexing needs a unique, sorted index and
            # numeric values.
            observed = pd.to_numeric(station_df['v'], errors='coerce')
            observed = observed[~observed.index.duplicated(keep='first')].sort_index()
            station_df_resampled = observed.reindex(
                nc_zeta_series.index,
                method='nearest',
                tolerance=match_tolerance,
            )

            rmse = calculate_rmse(station_df_resampled.values, nc_zeta_series.values)
            rmse_values.append(rmse)

        except Exception as e:
            # Best effort: record NaN for this station and keep going.
            print(f"An unexpected error occurred for station {station_id}: {e}")
            rmse_values.append(np.nan)
            continue

    # One RMSE entry was appended per station row, in order.
    station_details['RMSE_without'] = rmse_values
    station_details.to_csv('rmse_jason_highres_without.csv', index=False)
finally:
    nc_file.close()

Error retrieving data for station 8311030: CO-OPS API returned an error. Status Code: 400. Reason: Bad Request

Error retrieving data for station 8311062: CO-OPS API returned an error. Status Code: 400. Reason: Bad Request

