# In [None]:
import xarray as xr
import requests
from scipy.spatial import KDTree
import numpy as np

# Earth's radius in meters (sphere radius used by the haversine formula below)
EARTH_RADIUS_METERS = 6371000

def haversine(lon1, lat1, lon2, lat2):
    """Return the great-circle distance in meters between two points.

    Uses the haversine formula on a sphere of radius EARTH_RADIUS_METERS.
    Inputs may be scalars or array-likes of matching/broadcastable shape;
    the result has the broadcast shape of the inputs.

    Args:
        lon1: Longitude of the first point(s), in decimal degrees.
        lat1: Latitude of the first point(s), in decimal degrees.
        lon2: Longitude of the second point(s), in decimal degrees.
        lat2: Latitude of the second point(s), in decimal degrees.

    Returns:
        Distance(s) in meters.
    """
    # Convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2

    # Floating-point rounding can leave `a` a hair above 1 for (near-)antipodal
    # points, which would make arcsin return NaN; clamp to the valid domain.
    a = np.clip(a, 0.0, 1.0)

    c = 2 * np.arcsin(np.sqrt(a))
    return EARTH_RADIUS_METERS * c

# Step 1: Open the NetCDF file and pull out the mesh node positions
netcdf_file = '/scratch/07174/soelem/global_1.5-20/fort.63.nc'
ds = xr.open_dataset(netcdf_file)

# 'x' holds the node longitudes and 'y' the node latitudes; pair them up
# so that each row of `coordinates` is one node's (lon, lat).
lon, lat = ds['x'].values, ds['y'].values
coordinates = np.column_stack((lon, lat))

# Step 2: Query the NOAA metadata API for tide gauge stations (station id and coordinates)
def get_noaa_stations(timeout=30):
    """Retrieve all NOAA tide gauge stations from the metadata API.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout, `requests.get` can block indefinitely on a
            stalled connection.

    Returns:
        A list of dicts with keys 'station_id', 'lat' and 'lon', where
        'lat' and 'lon' are floats taken from each record's 'lat' and 'lng'
        fields. Records that lack a required field, or whose coordinates
        cannot be converted to float, are skipped.

    Raises:
        Exception: If the API responds with a status code other than 200.
        requests.exceptions.RequestException: On connection failure or
            timeout (raised by `requests.get`).
    """
    base_url = 'https://api.tidesandcurrents.noaa.gov/mdapi/prod/webapi/stations.json'
    response = requests.get(base_url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Error retrieving NOAA station data, status code {response.status_code}")

    station_list = []
    for station in response.json()['stations']:
        try:
            record = {
                'station_id': station['id'],
                'lat': float(station['lat']),
                'lon': float(station['lng']),
            }
        except (KeyError, TypeError, ValueError):
            # Missing field, or a null / non-numeric coordinate: skip this
            # record rather than aborting the whole download.
            continue
        station_list.append(record)
    return station_list

# Step 3: Find the closest station-node pairs and keep the top N sorted by distance
def _lonlat_to_unit_xyz(lon_deg, lat_deg):
    """Map arrays of (lon, lat) in degrees to points on the 3-D unit sphere."""
    lon_rad = np.radians(lon_deg)
    lat_rad = np.radians(lat_deg)
    cos_lat = np.cos(lat_rad)
    return np.column_stack((
        cos_lat * np.cos(lon_rad),
        cos_lat * np.sin(lon_rad),
        np.sin(lat_rad),
    ))


def find_closest_station_node_pairs(coordinates, stations, top_n=10):
    """Pair each station with its nearest node and return the top N closest pairs.

    The nearest-neighbour search runs in a KDTree built on 3-D unit-sphere
    coordinates rather than on raw (lon, lat) degrees. Euclidean distance in
    degree space is not a valid proxy for distance on the sphere: it
    over-weights longitude differences away from the equator and treats
    +180 and -180 degrees as far apart. Straight-line (chord) distance
    between points on the unit sphere grows monotonically with great-circle
    distance, so the chord-nearest node is the true nearest node.

    Args:
        coordinates: Array of shape (n_nodes, 2) with one (lon, lat) row per
            node, in decimal degrees.
        stations: Iterable of dicts with keys 'station_id', 'lat' and 'lon'
            (degrees).
        top_n: Maximum number of pairs to return.

    Returns:
        A list of at most `top_n` dicts with keys 'station_id', 'node_index'
        (row index into `coordinates`) and 'distance' (great-circle distance
        in meters, from `haversine`), sorted by ascending distance. Empty if
        `stations` is empty.
    """
    stations = list(stations)
    if not stations:
        return []

    coordinates = np.asarray(coordinates, dtype=float)
    node_lon = coordinates[:, 0]
    node_lat = coordinates[:, 1]
    kdtree = KDTree(_lonlat_to_unit_xyz(node_lon, node_lat))

    station_lon = np.array([station['lon'] for station in stations], dtype=float)
    station_lat = np.array([station['lat'] for station in stations], dtype=float)

    # One batched query for all stations; the chord distances are discarded
    # because the reported distance is the great-circle one computed below.
    _, node_indices = kdtree.query(_lonlat_to_unit_xyz(station_lon, station_lat))

    # Great-circle distance in meters between each station and its nearest node
    distances_meters = haversine(
        station_lon, station_lat, node_lon[node_indices], node_lat[node_indices]
    )

    # Store station ID, node index, and distance in meters
    distances = [
        {
            'station_id': station['station_id'],
            'node_index': int(node_index),
            'distance': float(distance_meters),
        }
        for station, node_index, distance_meters
        in zip(stations, node_indices, distances_meters)
    ]

    # Sort by distance and return the top N closest pairs
    distances.sort(key=lambda pair: pair['distance'])
    return distances[:top_n]

# Fetch the NOAA tide gauge station list
stations = get_noaa_stations()

# Step 4: Match every station to its nearest node and keep the closest pairs.
# NOTE: despite the variable name, top_n=20 is requested here, so up to 20
# pairs are kept and printed.
top_10_pairs = find_closest_station_node_pairs(coordinates, stations, top_n=20)

# Print a fixed-width table: header row first, then one row per pair
header = f"{'Station ID':<12} {'Node Index':<12} {'Distance (meters)':<20}"
print(header)
for pair in top_10_pairs:
    row = f"{pair['station_id']:<12} {pair['node_index']:<12} {pair['distance']:<20.2f}"
    print(row)