In [None]:
"""
Code for preparing data for CMIP6 HighResMIP, NPD_eORCA025 and Obs.-based Products

Processing in this notebook includes:
-------------------------------------
1. Computing conservative temperature data and interpolating/remapping to a regular lat-lon grid 
over the Pacific (30˚S-30˚N; 120˚E-290˚E), using bilinear interpolation. The grid description
was manually configured as a 0.25˚ regular grid and saved as ./data/griddes_025.grd
2. Calculating anomalies using centred moving 30yr baselines, for which the bounds were
manually configured and saved as .csv files in ./data/ (with prefix Centred_BS_bounds),
separately for the CMIP6 HighResMIP total period (1950-2050) and for the observational datasets
and NPD_eORCA025 runs (1976-2023).
3. Calculate depth of 20˚ isotherm from computed con. temp data, and saving the file.
4. Compute zonal wind anomalies at 850hPa for CMIP6 HighResMIP and ERA5 Reanalysis.
-------------------------------------------------------------------------------------------
Author: Sreevathsa G. (sg13n23@soton.ac.uk; ORCID ID: 0000-0003-4084-9677)
Last updated: 11 December 2025
"""

In [11]:
import gc
import sys
import gsw
import glob
import xarray as xr
import os
from itertools import product
from utils import *
from cdo import *
cdo = Cdo()
from tqdm.notebook import tqdm


# Dask cluster
import dask
from dask.distributed import Client, LocalCluster

  _pyproj_global_context_initialize()


In [None]:
# =============================================================================
# DASK CLUSTER SETUP (ON ANEMONE HPC @ NOCS, UK)
# =============================================================================
def setup_dask_cluster(n_workers=6, threads_per_worker=3, memory_limit="48GB"):
    """Start a local Dask cluster and connect a client to it.

    Before the cluster is created, the Dask configuration entries
    ``temporary_directory`` and ``local_directory`` are both pointed at the
    scratch path hard-coded below.

    Parameters
    ----------
    n_workers : int
        Forwarded to ``LocalCluster`` as ``n_workers``.
    threads_per_worker : int
        Forwarded to ``LocalCluster`` as ``threads_per_worker``.
    memory_limit : str
        Forwarded to ``LocalCluster`` as ``memory_limit``.

    Returns
    -------
    tuple
        ``(cluster, client)`` where ``client`` is connected to ``cluster``.
    """
    scratch_dir = "/dssgfs01/scratch/sg13n23/temp/"
    dask.config.set({
        "temporary_directory": scratch_dir,
        "local_directory": scratch_dir,
    })

    local_cluster = LocalCluster(
        n_workers=n_workers,
        threads_per_worker=threads_per_worker,
        memory_limit=memory_limit,
    )
    return local_cluster, Client(local_cluster)

In [None]:
# Dataset name lists

# Root directory under which the CMIP6 HighResMIP files are globbed.
PARENT_DIR = '/badc/cmip6/data/CMIP6/HighResMIP/'

# Model directory names to process. Only the first entry is active; the
# remaining models are kept as comments so they can be re-enabled.
HIGHRESMIP_MODEL_PATH_SUFFIX = [
    'MPI-ESM1-2-HR',
    # 'MPI-ESM1-2-XR', 'HadGEM3-GC31-HH',
    # 'EC-Earth3P-HR', 'CMCC-CM2-VHR4', 'CNRM-CM6-1-HR',
]

# Experiment group label -> experiment ids whose files are combined for that group.
EXP_TYPES = {
    'CTRL': ['control-1950'],
    'HIST-FUT': ['hist-1950', 'highres-future'],
}

In [None]:
# Setting up the Dask client for this notebook.
# setup_dask_cluster() is called with its default arguments; both returned
# objects are kept because the final cell of the notebook shuts them down.
cluster, client = setup_dask_cluster()
# Bare last expression so the notebook displays the client.
client

In [17]:
# Helper functions
def cdo_remapbil(filename_list, var_name):
    """Bilinearly remap one variable from each file and join the pieces in time.

    For every path in ``filename_list`` the CDO chain ``-selvar,<var_name>``
    followed by ``remapbil`` is run against the grid description
    ``./data/griddes_025.grd``. Each result is requested as an xarray dataset
    and passed through ``renamer``; all pieces are then concatenated along the
    ``time`` dimension.

    Parameters
    ----------
    filename_list : iterable of str
        Paths of the input files, in the order they should be concatenated.
    var_name : str
        Name of the variable to select from each file.

    Returns
    -------
    The concatenation (along ``time``) of the remapped, renamed datasets.
    """
    remapped_parts = [
        renamer(
            cdo.remapbil(
                './data/griddes_025.grd',
                input=f'-selvar,{var_name} ' + path,
                returnXDataset=True,
            )
        )
        for path in filename_list
    ]
    return xr.concat(remapped_parts, dim='time')

# Calculate Z20
def z20_calculator(ds):
    """Return the depth of the level whose temperature is closest to 20.

    The result is the ``depth`` coordinate value of the level that minimises
    ``|thetao_con - 20|`` along ``depth`` (nearest level; no interpolation
    between levels). Columns with no valid value at any depth are masked out.

    Parameters
    ----------
    ds : dataset with a ``thetao_con`` variable and a ``depth`` coordinate.

    Returns
    -------
    Data array named ``z20``.
    """
    distance_to_20 = abs(ds['thetao_con'] - 20)

    # True where the column has at least one non-missing level.
    has_data = distance_to_20.notnull().any(dim='depth')
    distance_to_20 = distance_to_20.where(has_data)

    # Missing levels become +inf so they can never win the argmin.
    nearest_level = distance_to_20.fillna(np.inf).argmin(dim='depth')

    depth_at_nearest = ds['depth'].isel(depth=nearest_level)
    return depth_at_nearest.where(has_data).rename('z20')

In [None]:
# =============================================================================
# CALCULATING CON. TEMPERATURE DATA OVER A PACIFIC SUBSET USING A GSW FUNCTION
# =============================================================================

# CMIP6 HighResMIP
# For every model / experiment group and every year 1950-2050: remap potential
# temperature and salinity onto the 0.25deg grid, convert to conservative
# temperature with gsw, and write one NetCDF file per year.
for model, exp_type in tqdm(list(product(HIGHRESMIP_MODEL_PATH_SUFFIX, EXP_TYPES))):
    # Creating the {model}_{exp_type} directory (and any missing parents) if it doesn't exist
    os.makedirs(f'./data/thetao_con/{model}_{exp_type}/', exist_ok = True)
    for year in tqdm(np.arange(1950, 2050+1), leave = False):
        # Filenames corresponding to each year's data: pt = potential temperature (var_name : thetao) and so = salinity (var_name : so)
        # NOTE(review): CT_from_pt takes Absolute Salinity (SA). Confirm that the archived `so` is
        # absolute rather than practical salinity, otherwise a gsw.SA_from_SP step is needed first.
        fnames_pt = list(flatten([sorted(glob.glob(PARENT_DIR + f'*/{model}/{exp_id}/r1*/Omon/thetao/*/latest/*{year}??.nc')) for exp_id in EXP_TYPES[exp_type]]))
        fnames_so = list(flatten([sorted(glob.glob(PARENT_DIR + f'*/{model}/{exp_id}/r1*/Omon/so/*/latest/*{year}??.nc')) for exp_id in EXP_TYPES[exp_type]]))
        # Skipping years for which either variable has no files
        if not (fnames_pt and fnames_so):
            continue
        # Reading and bilinearly interpolating the data into a 0.25deg regular lat-lon grid
        ds_thetao = cdo_remapbil(filename_list = fnames_pt, var_name = 'thetao')
        ds_so = cdo_remapbil(filename_list = fnames_so, var_name = 'so')
        # Computing conservative temperature from potential temperature and salinity using gsw.conversions.CT_from_pt function
        ds = gsw.conversions.CT_from_pt(SA = ds_so['so'], pt = ds_thetao['thetao']).rename('thetao_con').to_dataset()
        # Saving the file as a single-year file
        ds.to_netcdf(f'./data/thetao_con/{model}_{exp_type}/BILINTERP_PAC_THETAO_CON_CMIP6_HIGHRESMIP_{model}_{exp_type}_{year}.nc')
        # Cleaning-up CDO temporary files once the year is written, so they do not
        # accumulate over the whole 1950-2050 loop (same pattern as the monthly loops)
        cdo.cleanTempDir()
    
# Observations-based Products (OBS)
# For every product and every month 1976-01 .. 2023-12: remap temperature and
# salinity onto the 0.25deg grid, convert to conservative temperature with gsw,
# and write one NetCDF file per month.
for obs_dataset in ['ORAS5', 'EN4']:
    # Creating the output directory once per dataset (including missing parents)
    os.makedirs(f'./data/thetao_con/{obs_dataset}/', exist_ok = True)

    for ts in tqdm(pd.date_range(start = '1976-01-01', end = '2023-12-31', freq = '1MS')):
        if obs_dataset == 'ORAS5':
            # sorted() makes the choice deterministic if the pattern matches more than one file
            matches_so = sorted(glob.glob(f'/gws/nopw/j04/nemo_vol1/golla/OBS_DATA/ORAS5/orig_data/vosaline*{ts.year}{ts.month:02d}*'))
            matches_pt = sorted(glob.glob(f'/gws/nopw/j04/nemo_vol1/golla/OBS_DATA/ORAS5/orig_data/votemper*{ts.year}{ts.month:02d}*'))
            if not matches_so or not matches_pt:
                raise FileNotFoundError(f'No ORAS5 vosaline/votemper file found for {ts.year}{ts.month:02d}')
            f_so, f_pt = matches_so[0], matches_pt[0]
            varname_pt = 'votemper'
            varname_so = 'vosaline'
        elif obs_dataset == 'EN4':
            # EN4 keeps both variables in the same monthly file
            f_so = f_pt = f'/gws/nopw/j04/nemo_vol1/golla/OBS_DATA/EN4/orig_data/EN.4.2.2.f.analysis.g10.{ts.year}{ts.month:02d}.nc'
            varname_pt = 'temperature'
            varname_so = 'salinity'

        # Reading and bilinearly interpolating the data into a 0.25deg regular lat-lon grid
        ds_thetao = cdo_remapbil(filename_list = [f_pt], var_name = varname_pt)
        ds_so = cdo_remapbil(filename_list = [f_so], var_name = varname_so)
        # Computing conservative temperature from potential temperature and salinity using gsw.conversions.CT_from_pt function
        # NOTE(review): CT_from_pt takes Absolute Salinity and potential temperature in degC. Confirm the
        # salinity type of both products, and that the EN4 `temperature` variable is not in kelvin
        # (or that `renamer` already converts it).
        ds = gsw.conversions.CT_from_pt(SA = ds_so[varname_so], pt = ds_thetao[varname_pt]).rename('thetao_con').to_dataset()
        # Setting time axis appropriately: every timestamp is moved to the first day of its month
        ds['time'] = pd.to_datetime([str(t)[0:8]+'01' for t in ds['time'].values])
        # Saving the file as a single-month file
        ds.to_netcdf(f'./data/thetao_con/{obs_dataset}/BILINTERP_PAC_THETAO_CON_{obs_dataset}_{ts.year}{ts.month:02d}.nc')
        # Cleaning-up
        cdo.cleanTempDir()

# NPD_eORCA025 exps. (NOTE: thetao_con is pre-calculated online when the model is run, so just bilinearly interpolating and saving data here).
for npd_exp in ['NPD_eORCA025_ERA5', 'NPD_eORCA025_JRA55']:
    # Creating the output directory (including missing parents) if it doesn't exist;
    # without this the to_netcdf call below fails on a fresh checkout
    os.makedirs(f'./data/thetao_con/{npd_exp}/', exist_ok = True)

    for ts in tqdm(pd.date_range(start = '1976-01-01', end = '2023-12-31', freq = '1MS')):
        # npd_exp[4:] drops the leading 'NPD_' to get the simulation directory name
        fname = f'/dssgfs01/scratch/npd/simulations/{npd_exp[4:]}/{ts.year}/eORCA025_1m_grid_T_{ts.year}{ts.month:02d}-{ts.year}{ts.month:02d}.nc'
        ds = cdo_remapbil(filename_list = [fname], var_name = 'thetao_con')['thetao_con'].to_dataset()
        # Setting time axis appropriately: every timestamp is moved to the first day of its month
        ds['time'] = pd.to_datetime([str(t)[0:8]+'01' for t in ds['time'].values])
        # Saving the file as a single-month file
        ds.to_netcdf(f'./data/thetao_con/{npd_exp}/BILINTERP_PAC_THETAO_CON_{npd_exp}_{ts.year}{ts.month:02d}.nc')
        # Cleaning-up
        cdo.cleanTempDir()

In [None]:
# =============================================================================
# CALCULATING MOVING-BASELINE ANOMALIES FOR EACH DATASET OVER THE PACIFIC
# =============================================================================

# CMIP6 HighResMIP
# For every model / experiment group and every year 1950-2050: subtract the
# monthly climatology of that year's baseline window from the year's monthly
# conservative temperature and write one anomaly file per year.
bounds_df = pd.read_csv('./data/Centred_BS_bounds_CMIP6_HIGHRESMIP_1950-2050.csv')
for model, exp_type in tqdm(list(product(HIGHRESMIP_MODEL_PATH_SUFFIX, EXP_TYPES))):
    # Creating the {model}_{exp_type} directory (and any missing parents) if it doesn't exist
    os.makedirs(f'./data/anomalies/{model}_{exp_type}/', exist_ok = True)
    ds = xr.open_mfdataset(f'./data/thetao_con/{model}_{exp_type}/BILINTERP_PAC_THETAO_CON_CMIP6_HIGHRESMIP_{model}_{exp_type}_????.nc')
    # Replacing the time axis with month-start timestamps counted from January 1950.
    # NOTE(review): this assumes the yearly files are contiguous from 1950 with no missing months - confirm.
    ds['time'] = pd.date_range(start = '1950-01-01', periods = ds.time.shape[0], freq = '1MS')

    for year in tqdm(np.arange(1950, 2050+1), leave = False):
        # Row of the bounds table whose [START_Y, END_Y] range contains this year;
        # .item() raises unless exactly one row matches
        bs_df = bounds_df[(year>=bounds_df['START_Y']) & (year<=bounds_df['END_Y'])]
        bs_start, bs_end = str(bs_df['BS_START'].item()), str(bs_df['BS_END'].item())

        # Monthly climatology over the baseline window, then per-month difference for this year
        baseline = ds.sel(time = slice(bs_start, bs_end)).groupby('time.month').mean()
        anomalies = ds.sel(time = str(year)).groupby('time.month') - baseline
        anomalies = anomalies.rename({'thetao_con':'anomaly'}).drop_vars('month')

        anomalies.attrs['standard_name'] = 'anomaly_sea_water_conservative_temperature'
        anomalies.attrs['long_name'] = f'Anomaly of Sea Water Conservative Temperature with baseline {bs_start}-{bs_end}'
        anomalies.attrs['units'] = 'degC'

        anomalies.to_netcdf(f'./data/anomalies/{model}_{exp_type}/BILINTERP_PAC_ANOM_BS{bs_start}-{bs_end}_CMIP6_HIGHRESMIP_{model}_{exp_type}_{year}.nc')

    # Releasing the input files before moving on to the next model / experiment group
    ds.close()

# Observations-based Products and NPD_eORCA025 exps. 
# For every dataset and every month 1976-01 .. 2023-12: average the same calendar
# month over the baseline years, subtract it from the month's conservative
# temperature, and write one anomaly file per month.
bounds_df = pd.read_csv('./data/Centred_BS_bounds_NPD_eORCA025_OBS_1976-2023.csv')
for dataset in ['ORAS5', 'EN4', 'NPD_eORCA025_ERA5', 'NPD_eORCA025_JRA55']:
    # Creating the output directory (and any missing parents) if it doesn't exist
    os.makedirs(f'./data/anomalies/{dataset}/', exist_ok = True)
    for ts in tqdm(pd.date_range(start = '1976-01-01', end = '2023-12-31', freq = '1MS')):
        # Row of the bounds table whose [START_Y, END_Y] range contains this year;
        # .item() raises unless exactly one row matches
        bs_df = bounds_df[(ts.year>=bounds_df['START_Y']) & (ts.year<=bounds_df['END_Y'])]
        bs_start, bs_end = bs_df['BS_START'].item(), bs_df['BS_END'].item()

        # Files for this calendar month in every baseline year (bounds inclusive)
        bs_fnames = [
            f'./data/thetao_con/{dataset}/BILINTERP_PAC_THETAO_CON_{dataset}_{bs_year}{ts.month:02d}.nc'
            for bs_year in range(bs_start, bs_end+1)
        ]

        # Context managers close the month file and all baseline files at the end of
        # each iteration; the anomaly must therefore be written inside the block
        with xr.open_dataset(f'./data/thetao_con/{dataset}/BILINTERP_PAC_THETAO_CON_{dataset}_{ts.year}{ts.month:02d}.nc') as ds, \
             xr.open_mfdataset(bs_fnames) as baseline_files:
            baseline = baseline_files.groupby('time.month').mean()
            anomalies = ds.groupby('time.month') - baseline

            anomalies = anomalies.rename({'thetao_con':'anomaly'}).drop_vars('month')

            anomalies.attrs['standard_name'] = 'anomaly_sea_water_conservative_temperature'
            anomalies.attrs['long_name'] = f'Anomaly of Sea Water Conservative Temperature with baseline {bs_start}-{bs_end}'
            anomalies.attrs['units'] = 'degC'

            anomalies.to_netcdf(f'./data/anomalies/{dataset}/BILINTERP_PAC_ANOM_BS{bs_start}-{bs_end}_{dataset}_{ts.year}{ts.month:02d}.nc')

In [None]:
# =============================================================================
# CALCULATING DEPTH OF 20°C (Z20) FOR EACH MODEL/OBSERVATIONAL DATASET
# =============================================================================

# CMIP6 HighResMIP
# For every model / experiment group: compute Z20 for each year over
# lat -5..5, lon 140..280, merge the years, interpolate onto a 0.1deg longitude
# axis, average over latitude and write a single file.
# Creating the output directory (and any missing parents) if it doesn't exist
os.makedirs('./data/z20_depths/', exist_ok = True)
for model, exp_type in tqdm(list(product(HIGHRESMIP_MODEL_PATH_SUFFIX, EXP_TYPES))):
    ds_z20 = []
    for year in tqdm(np.arange(1950, 2050+1), leave = False):
        # NOTE(review): a yearly file is expected for every year 1950-2050; open_dataset
        # raises if one is missing - confirm every year was produced upstream.
        ds = xr.open_dataset(f'./data/thetao_con/{model}_{exp_type}/BILINTERP_PAC_THETAO_CON_CMIP6_HIGHRESMIP_{model}_{exp_type}_{year}.nc')
        # Replacing the time axis with month-start timestamps for this year
        ds['time'] = pd.date_range(start = f'{year}-01-01', periods = ds.time.shape[0], freq = '1MS')
        ds = ds.sel(lat = slice(-5,5), lon = slice(140,280))
        ds_z20.append(z20_calculator(ds))
    ds_z20 = xr.merge(ds_z20)
    # The new leading `dataset` dimension is labelled with the model name
    ds_z20 = ds_z20.interp(lon = np.arange(140,280.01, 0.1)).mean(dim = ['lat']).expand_dims(dataset = [model])
    ds_z20.to_netcdf(f'./data/z20_depths/Z20_DEPTHS_{model}_{exp_type}_1950-2050.nc')

# Observations-based Products and NPD_eORCA025 exps. 
# For every dataset: compute Z20 for each month 1976-01 .. 2023-12 over
# lat -5..5, lon 140..280, merge the months, interpolate onto a 0.1deg longitude
# axis, average over latitude and write a single file.
# Creating the output directory (and any missing parents) if it doesn't exist
os.makedirs('./data/z20_depths/', exist_ok = True)
for dataset in ['ORAS5', 'EN4', 'NPD_eORCA025_ERA5', 'NPD_eORCA025_JRA55']:
    ds_z20 = []
    for ts in tqdm(pd.date_range(start = '1976-01-01', end = '2023-12-31', freq = '1MS')):
        ds = xr.open_dataset(f'./data/thetao_con/{dataset}/BILINTERP_PAC_THETAO_CON_{dataset}_{ts.year}{ts.month:02d}.nc')
        # Time axis starts at the month being processed (`ts`), not at a `year` left over
        # from an earlier loop, so each monthly file keeps its own timestamp
        ds['time'] = pd.date_range(start = ts, periods = ds.time.shape[0], freq = '1MS')
        ds = ds.sel(lat = slice(-5,5), lon = slice(140,280))
        ds_z20.append(z20_calculator(ds))
    ds_z20 = xr.merge(ds_z20)
    # The new leading `dataset` dimension is labelled with this loop's dataset name
    ds_z20 = ds_z20.interp(lon = np.arange(140,280.01, 0.1)).mean(dim = ['lat']).expand_dims(dataset = [dataset])
    # NOTE(review): the filename says 1950-2050 although this loop covers 1976-2023; it is left
    # unchanged here because other notebooks may read this exact path - confirm before renaming.
    ds_z20.to_netcdf(f'./data/z20_depths/Z20_DEPTHS_{dataset}_1950-2050.nc')

In [None]:
# =============================================================================
# CALCULATING ZONAL WIND ANOMALIES FOR CMIP6 HIGHRESMIP AND ERA5 REANALYSIS
# =============================================================================



In [None]:
# Tearing down the Dask client and cluster created earlier by setup_dask_cluster().
client.shutdown()
cluster.close()