In [5]:
import pandas as pd
import datetime
import re
import xarray as xr
import numpy as np 
import xroms
import pyresample 
import metpy
import matplotlib.pyplot as plt
import os, glob

#FUNCTIONS

# TEXTFILE TO PANDAS
def txt_to_pd(txtfile,LAT,LON):
    """Read a tab-separated mooring text file into a pandas DataFrame.

    Parameters
    ----------
    txtfile : str
        Path of the text file. It must contain the columns 'T_degC', 'T_qf',
        'S', 'S_qf', 'Date' and 'Time'. The instrument depth is taken from the
        file name by get_depth().
    LAT, LON : float
        Position stored in the 'LAT' and 'LON' columns of every row.

    Returns
    -------
    pandas.DataFrame
        Columns, in this order: 'TEMP', 'TEMP_QC', 'PSAL', 'PSAL_QC', 'TIME',
        'DEPTH', 'LAT', 'LON'.
    """
    # Source column -> output column. Renaming by name (instead of assigning a
    # list to df.columns) matters because read_csv(usecols=...) returns the
    # columns in FILE order, not in the order they are listed here.
    column_map = {
        'T_degC': 'TEMP',
        'T_qf': 'TEMP_QC',
        'S': 'PSAL',
        'S_qf': 'PSAL_QC',
        'Date': 'Date',
        'Time': 'Time',
    }

    # The file uses Norwegian month abbreviations. These three are replaced by
    # their English spelling so that strptime's '%b' can parse them.
    # Meaning of the % codes:
    # https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
    norwegian_to_english = (('Des', 'Dec'), ('Mai', 'May'), ('Okt', 'Oct'))

    def parse(stamp):
        for norwegian, english in norwegian_to_english:
            stamp = stamp.replace(norwegian, english)
        return datetime.datetime.strptime(stamp, '%d %b %Y %H:%M:%S')

    df = pd.read_csv(txtfile, delimiter='\t', usecols=list(column_map))
    # Enforce a known column order first, then rename by name.
    df = df[list(column_map)].rename(columns=column_map)

    # Combine Date and Time, strip surrounding whitespace and parse.
    df['TIME'] = (df['Date'] + ' ' + df['Time']).str.strip().apply(parse)
    df = df.drop(columns=['Date', 'Time'])

    # Depth comes from the file name; get_depth() returns None when the name
    # carries no '_<digits>m' token, in which case DEPTH is empty.
    df['DEPTH'] = get_depth(txtfile)
    df['LAT'] = LAT
    df['LON'] = LON
    return(df)

#NETCDF FILE TO PANDAS:
def netcdf_to_pd(txt):
    """Read TEMP/PSAL and their QC flags from a NetCDF file into one DataFrame.

    Parameters
    ----------
    txt : str
        Path of the NetCDF file. It must provide the variables 'TEMP',
        'TEMP_QC', 'PSAL', 'PSAL_QC' on the dimensions 'DEPTH' and 'TIME',
        plus 'LONGITUDE' and 'LATITUDE'.

    Returns
    -------
    pandas.DataFrame
        One row per (DEPTH, TIME) pair, outer-merged over the four variables,
        with 'LON' and 'LAT' set to the first LONGITUDE/LATITUDE value.
    """
    variables = ['TEMP', 'TEMP_QC', 'PSAL', 'PSAL_QC']

    # Start empty; the first variable's frame replaces it, later ones are merged in.
    df_combined = pd.DataFrame()

    # The context manager makes sure the file handle is released, also when
    # one of the steps below raises.
    with xr.open_dataset(txt) as ds:
        for var in variables:
            # Stack depth and time into a single MultiIndex dimension.
            stacked_data = ds[var].stack(points=('DEPTH', 'TIME'))

            df = stacked_data.to_dataframe()

            # Temporarily rename columns so reset_index() can re-create
            # DEPTH and TIME from the index without a name clash.
            df = df.rename(columns={'DEPTH': 'Depth_col', 'TIME': 'Time_col', 'LONGITUDE': 'LON_col'})
            df = df.reset_index()
            df = df.drop(columns=['Depth_col', 'Time_col'])

            if df_combined.empty:
                df_combined = df
            else:
                df_combined = pd.merge(df_combined, df, on=['DEPTH', 'TIME'], how='outer')

        # Read the position while the file is still open.
        df_combined['LON'] = ds['LONGITUDE'].values[0]
        df_combined['LAT'] = ds['LATITUDE'].values[0]
    return(df_combined)

# GET DEPTH FROM FILENAME
def get_depth(filename):
    """Return the depth encoded in a file name as '_<digits>m', or None.

    The first '_<digits>m' occurrence anywhere in ``filename`` is used and
    converted to int. None is returned when there is no such token.
    """
    found = re.search(r'_(\d+)m', filename)
    return int(found.group(1)) if found else None

# FIND DEPTH INDEX IN ROMS (weighted mean)
def get_Zindices(ds,X,Y,target_depth):
    """Find the two vertical levels closest to a depth, with blending weights.

    Parameters
    ----------
    ds : dataset-like
        Object with ``isel(xi_rho=..., eta_rho=...)`` whose result exposes
        ``z_rho.values`` (the level heights at the selected point).
    X, Y : int
        Indices along 'xi_rho' and 'eta_rho'.
    target_depth : float
        Depth in the same sign convention as ``z_rho``; the caller in this
        file passes ``depth * -1``.

    Returns
    -------
    (weights, closest_indices) : tuple of numpy.ndarray
        ``closest_indices`` holds the positions of the (up to) two levels with
        the smallest |z_rho - target_depth|, nearest first. ``weights`` are
        the inverse distances to those levels, normalised to sum to 1.
    """
    column = ds.isel(xi_rho = X, eta_rho = Y)

    # Absolute distance of every level to the target; squeeze() drops any
    # length-1 dimensions left over after the point selection.
    distance = np.abs(column.z_rho.values - target_depth).squeeze()

    # Indices of the two smallest distances.
    closest_indices = distance.argsort()[:2]

    # Inverse-distance weights; epsilon avoids division by zero when the
    # target coincides with a level.
    epsilon = 1e-10
    weights = 1 / (distance[closest_indices] + epsilon)

    # Normalise so the weights sum to 1.
    weights /= weights.sum()

    return(weights,closest_indices)

# FIND TIME-INDEX IN ROMS RELATIVE TO OBSERVATION TIME
def find_time_index(r_time,ocean_time):
    """Return the position in ``ocean_time`` that is closest to ``r_time``.

    Both the observation time and the model times are expressed as seconds
    since 1970-01-01 00:00:00 and the index of the smallest absolute
    difference is returned.
    """
    epoch = datetime.datetime(1970,1,1,0,0,0)
    # Observation time in seconds since the epoch.
    target_seconds = (r_time - epoch).total_seconds()
    # Model times in seconds since the epoch.
    model_seconds = (pd.to_datetime(ocean_time) - epoch).total_seconds()
    offsets = np.abs(model_seconds - target_seconds)
    return offsets.argmin()

# FIND THE ROMS-MODEL's X AND Y POSITIONS
def get_XYpositions(filename, lons, lats, radius_of_influence=2400):
    """Map lon/lat positions to the nearest wet grid cell of a ROMS grid file.

    Parameters
    ----------
    filename : str
        Grid file providing 'lon_rho', 'lat_rho' and 'mask_rho'.
    lons, lats : array-like
        Positions to look up.
    radius_of_influence : float, optional
        Maximum distance (in meters) between a position and the wet grid cell
        assigned to it. Defaults to 2400, the value used previously.

    Returns
    -------
    (x, y) : tuple of numpy.ndarray of int
        Index along the second (x) and first (y) axis of 'lon_rho' for every
        input position. Positions without a wet cell inside the radius get
        index 0 in both arrays, as before, but a warning is now emitted
        because (0, 0) is a real grid cell and not a "missing" marker.
    """
    import warnings  # local import: keeps this function self-contained

    # Read what is needed and release the file handle right away.
    with xr.open_dataset(filename) as fh:
        lon_rho = fh.lon_rho.values
        lat_rho = fh.lat_rho.values
        wet = fh.mask_rho.values != 0  # wet points: non-zero mask_rho

    # xi[j, i] == i and yi[j, i] == j, both with the shape of lon_rho.
    n_eta, n_xi = lon_rho.shape
    xi, yi = np.meshgrid(np.arange(n_xi, dtype=float), np.arange(n_eta, dtype=float))

    # Source: wet grid points only. Target: the requested positions.
    sea_def = pyresample.geometry.SwathDefinition(lons=lon_rho[wet], lats=lat_rho[wet])
    orig_def = pyresample.geometry.SwathDefinition(lons=lons, lats=lats)

    def _nearest(index_field):
        # fill_value=None makes pyresample return a masked array, so that
        # positions with no neighbour inside the radius can be detected.
        return pyresample.kd_tree.resample_nearest(
            sea_def, index_field[wet], orig_def,
            radius_of_influence=radius_of_influence, fill_value=None)

    ypos = _nearest(yi)
    xpos = _nearest(xi)

    unmatched = np.ma.getmaskarray(xpos) | np.ma.getmaskarray(ypos)
    if unmatched.any():
        warnings.warn(
            f'{int(unmatched.sum())} position(s) have no wet grid point within '
            f'{radius_of_influence} m; they are assigned grid index 0.')

    # Keep the previous fill behaviour (0) and return integer indices.
    xpos = np.ma.filled(xpos, 0)
    ypos = np.ma.filled(ypos, 0)
    return np.asarray(xpos).astype(int), np.asarray(ypos).astype(int)


# MAIN FUNCTION THAT EXTRACTS DATA FROM THE ROMS MODEL, AS LONG AS THE PANDAS DATAFRAME IS GROUPED INTO DAYS
def extract_data_for_group(group,dsG,var='temp'):
    """Extract model values of ``var`` for all observations of one day.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows sharing the same 'DAY'. Must contain the columns 'DAY', 'TIME',
        'DEPTH', 'X' and 'Y'.
    dsG : dataset-like
        Grid dataset handed to get_Zindices() to locate the vertical levels.
    var : str, optional
        Name of the model variable to read (default 'temp').

    Returns
    -------
    list
        One value per row of ``group``, in row order: the model value at the
        nearest model time, blended between the two nearest vertical levels.
        If anything fails (file missing, variable missing, bad row, ...) the
        whole group is returned as NaN and a warning describes the cause.
    """
    import warnings  # local import: keeps this function self-contained

    date = group['DAY'].iloc[0]  # All rows in the group have the same date
    year = date.strftime('%Y')
    month = date.strftime('%m')
    day = date.strftime('%d')

    file_path = f'https://thredds.met.no/thredds/dodsC/sea_norshelf_files/{year}/{month}/norshelf_qck_an_{year}{month}{day}T00Z.nc'

    try:
        with xr.open_dataset(file_path) as ds:
            field = ds[var]
            # The model time axis is the same for every row: read it once.
            ocean_time = ds.ocean_time.values

            extracted_data = []
            for _, row in group.iterrows():
                x, y = row['X'], row['Y']

                # Model time step closest to the observation time.
                index = find_time_index(row['TIME'], ocean_time)

                # Two nearest vertical levels; observation depth is positive,
                # so its sign is flipped before the lookup.
                weights, indices = get_Zindices(dsG, x, y, row['DEPTH'] * -1)

                upper = field.isel(ocean_time=index, s_rho=indices[0], xi_rho=x, eta_rho=y).values
                lower = field.isel(ocean_time=index, s_rho=indices[1], xi_rho=x, eta_rho=y).values

                # Weighted average of the two levels.
                extracted_data.append(upper * weights[0] + lower * weights[1])
            return extracted_data
    except Exception as exc:
        # Best effort: a failing day must not stop the whole run, but the
        # reason is reported instead of being silently discarded.
        # KeyboardInterrupt/SystemExit are deliberately not caught.
        warnings.warn(f"Could not extract '{var}' from {file_path}: {exc!r}")
        return [np.nan] * len(group)

In [6]:
# FILES
# NorShelf grid file (absolute path on the project file system):
gridfile = '/lustre/storeB/project/fou/hi/oper/norshelf/static_inputfiles/norshelf_2.4_vert_grd.nc'

# Prepare the grid dataset that get_Zindices() uses to find depth indices.

# Open the grid file. It stays open until dsG.close() is called.
dsG = xr.open_dataset(gridfile)
# Let xroms set up the ROMS dataset and create the grid object.
# NOTE(review): presumably this is what adds the z_rho variable read by
# get_Zindices() - confirm against the xroms documentation.
dsG, xgrid = xroms.roms_dataset(dsG, include_cell_volume=True, include_Z0=True)
# Associate the dataset with the grid
dsG.xroms.set_grid(xgrid)


# Observation input file (text). The active path points at the SouthCape
# mooring. Its name has no '_<digits>m' token, so get_depth() returns None.
INfile = "/lustre/storeB/project/fou/hi/projects/NorEmso/Observations/moorings/SouthCape/T_S_SouthCape.csv"
# Alternative text input (Station M, Deployment 3):
#'/lustre/storeB/project/fou/hi/projects/NorEmso/Observations/moorings/StationM/Deployment3/StaM_SBE_20221125_2000m.txt'

# Alternative NetCDF input (Station M), to be read with netcdf_to_pd():
#INfile = '/lustre/storeB/project/fou/hi/projects/NorEmso/Observations/moorings/StationM/Deployment1/StationM_2021_hydrography.nc'

In [8]:
# FIND X and Y:

# READING TEXT FILES
# NOTE(review): the position (66.015, 1.983) is hard-coded here; verify that
# it belongs to the mooring that INfile currently points at.
ds = txt_to_pd(INfile,66.015,1.983)

# READING NETCDF FILES:
#ds = netcdf_to_pd(INfile)

# TODO: move the steps below into a helper and reuse it from txt_to_pd and
# netcdf_to_pd.

# Grid indices of the nearest wet model point for every row.
x,y = get_XYpositions(gridfile, ds.LON.values, ds.LAT.values)
ds['X'] = x
ds['Y'] = y


# Time to datetime:
ds['TIME'] = pd.to_datetime(ds['TIME'])

# Round the TIME column to the nearest hour ('h' replaces the deprecated
# alias 'H'):
ds['TIME'] = ds['TIME'].dt.round('h')

# Calendar date of the (rounded) time; this selects the daily model file.
# Note that .dt.date truncates to the date, it does not round to the nearest day.
ds['DAY'] = ds['TIME'].dt.date

# Year-month period of every row, used to process and save one month at a time.
ds['year_month'] = ds['TIME'].dt.to_period('M')

# Test-data
#ds=ds.head(50)


  result = getattr(ufunc, method)(*inputs, **kwargs)


AttributeError: 'DataFrame' object has no attribute 'lon'

In [None]:
#GET MODELRESULTS:

# Find unique Year-Month combinations
unique_months = ds['year_month'].unique()

# Output files are named after the input file (directory and extension removed).
output_stem = os.path.splitext(os.path.basename(INfile))[0]

# Model variable -> name of the column that receives the extracted values.
model_columns = {'temp': 'TEMP_MOD', 'salt': 'SALT_MOD'}

for month in unique_months:
    # Fresh positional index, so that group.index identifies rows uniquely.
    # The index is not written to the CSV file.
    monthly_data = ds.loc[ds['year_month'] == month].copy().reset_index(drop=True)
    print(str(month))

    for var, column in model_columns.items():
        monthly_data[column] = np.nan
        # One model file per day. Results are written back through the group's
        # own index, so the outcome does not depend on the rows being sorted
        # by DAY, and every group still carries its 'DAY' column.
        for _, group in monthly_data.groupby('DAY'):
            values = extract_data_for_group(group, dsG, var)
            monthly_data.loc[group.index, column] = np.asarray(values, dtype=float)

    #save_file:
    monthly_data.to_csv(f'{output_stem}_{str(month)}.csv', float_format='%.3f', index=False)

dsG.close()