# In [None]:
import pandas as pd
import numpy as np
import netCDF4
from noaa_coops import Station
from sklearn.metrics import mean_absolute_error
import time
import logging

# Configure the root logger: INFO and above, formatted as "time - LEVEL - message"
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

def calculate_mae(true_values, predicted_values):
    """Calculate MAE between true and predicted values, ignoring the first 384 values.

    Args:
        true_values: 1-D sequence of observed values; NaN marks a missing sample.
        predicted_values: 1-D sequence of predicted values, aligned
            element-for-element with ``true_values``.

    Returns:
        The mean absolute error over the positions past the first 384 where
        both inputs are non-NaN. ``np.nan`` is returned when either input has
        384 or fewer values, or when no position has both values present.
    """
    # Coerce to float arrays so plain lists and object-dtype input support the
    # NaN test and the boolean-mask indexing below.
    true_values = np.asarray(true_values, dtype=float)
    predicted_values = np.asarray(predicted_values, dtype=float)

    if len(true_values) <= 384 or len(predicted_values) <= 384:
        return np.nan

    true_values = true_values[384:]
    predicted_values = predicted_values[384:]

    # Keep only the positions where both series have a value.
    valid_mask = ~np.isnan(true_values) & ~np.isnan(predicted_values)
    if not valid_mask.any():
        # mean_absolute_error raises ValueError on empty input; report
        # "no data" the same way as the too-short case instead.
        return np.nan

    return mean_absolute_error(true_values[valid_mask], predicted_values[valid_mask])

# Read the station details CSV file
start_time = time.time()
station_details = pd.read_csv('/work2/07174/soelem/validation/result_data/station_detailsv2.csv')
logging.info(f"Reading station details took {time.time() - start_time:.2f} seconds")

# One MAE entry per station row, in row order; NaN marks a station that failed.
mae_values = []

# Load the NetCDF file. The context manager closes it even if a later step raises.
start_time = time.time()
with netCDF4.Dataset('/work2/07174/soelem/validation/swot/fort.63.nc', 'r') as nc_file:
    logging.info(f"Loading NetCDF file took {time.time() - start_time:.2f} seconds")

    # The variable handle is the same for every station; look it up once.
    zeta_var = nc_file.variables['zeta']

    # Load the zeta values corresponding to the closest points only
    start_time = time.time()
    # Iterate the two columns directly: iterrows() upcasts each row to a common
    # dtype, which can turn integer ids/indices into floats.
    for station_id, closest_point in zip(
        station_details['Station ID'], station_details['Closest Point ID']
    ):
        mae = np.nan  # recorded unchanged if anything below fails
        try:
            # int() guards against a float-typed column (e.g. 1234.0); a NaN
            # entry raises here and is reported by the handler below.
            # NOTE(review): the value is used directly as a 0-based array index —
            # confirm the CSV does not store 1-based node numbers.
            closest_point_idx = int(closest_point)

            # Retrieve 'zeta' variable data for the closest point, skipping the first 384 values
            nc_zeta = zeta_var[384:, closest_point_idx]

            # Initialize the station object
            station = Station(station_id)

            # Try to retrieve water level data for the station
            try:
                station_data = station.get_data(
                    begin_date="20230716",
                    end_date="20231031",
                    product="hourly_height",
                    datum="MSL",
                    units="metric",
                    time_zone="lst"
                )
            except Exception as e:
                logging.error(f"Error retrieving data for station {station_id}: {e}")
            else:
                station_df = pd.DataFrame(station_data)

                # Adjusted starting date after skipping 384 hours (16 days)
                # NOTE(review): assumes hourly model output beginning
                # 2023-07-16 00:00 — confirm against the file's time variable.
                # NOTE(review): observations are requested in "lst"; confirm the
                # model time axis uses the same time base.
                nc_zeta_series = pd.Series(
                    nc_zeta.flatten(),
                    index=pd.date_range(start="2023-08-01 00:00:00", periods=len(nc_zeta), freq='H'),
                )
                # No tolerance is given, so a gap in the observations is filled
                # from the nearest available reading, however far away it is.
                station_df_resampled = station_df['v'].reindex(nc_zeta_series.index, method='nearest')

                # Calculate MAE
                # NOTE(review): nc_zeta is already sliced from index 384 and
                # calculate_mae ignores a further 384 values, so the first 768
                # model hours are excluded in total — confirm this is intended.
                mae = calculate_mae(station_df_resampled.values, nc_zeta_series.values)
        except Exception as e:
            # logging.exception keeps the traceback so the failing step can be found.
            logging.exception(f"An unexpected error occurred for station {station_id}: {e}")

        mae_values.append(mae)

    logging.info(f"Processing all stations took {time.time() - start_time:.2f} seconds")

# Add the MAE values to the station_details DataFrame
station_details['MAE_with'] = mae_values

# Save the updated DataFrame to a CSV file
station_details.to_csv('mae_swot_with.csv', index=False)