In [1]:
import os
import xarray as xr
from tqdm import tqdm
import matplotlib.pyplot as plt

In [3]:
import xarray as xr
import os
from tqdm import tqdm

# Function to split and save datasets along the 'time' dimension from the closest time_start
def split_and_save(path_dataset, output_dir, num_splits=100, time_start=None):
    # Load the dataset from the given path
    dataset = xr.open_dataset(path_dataset)
    
    # Get the 'time' dimension
    time = dataset.coords['time']
    
    # If a specific time_start is provided, find the closest time in the dataset
    if time_start is not None:
        # Find the index of the closest time
        start_idx = abs(time - time_start).argmin().item()  # Get the index of the closest time
        closest_time = time[start_idx].values
        print(f"Closest time to {time_start} is {closest_time}")
    else:
        # Default to the first time step if no time_start is provided
        start_idx = 0

    # Calculate the split size (remaining data from the start_idx onwards)
    split_size = (len(time) - start_idx) // num_splits
    
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Split the dataset along the 'time' dimension starting from the closest time_start
    for i in tqdm(range(num_splits)):
        if i>2:
            break
        else:
            end_idx = start_idx + split_size if i < num_splits - 1 else len(time)
            
            # Slice the dataset along the 'time' dimension
            if i==2:
                time_slice = dataset.sel(time=slice(time[start_idx], time[end_idx - 1]))  # Adjust end time to avoid overflow
                # Save the split dataset to a new NetCDF file
                split_filename = os.path.join(output_dir, f"split_{i + 1}.nc")
                time_slice.to_netcdf(split_filename)
                print(f"Saved: {split_filename}")
            else:
                print('pass')

            
            # Update start_idx for the next split
            start_idx = end_idx


In [3]:
def interpolate_dataset_on_new_time_grid(ds, new_time_array, output_file):
    """
    Interpolate all variables in the input dataset over a new time array.
    
    Parameters:
    ds (xr.Dataset): Input dataset containing a 'time' coordinate.
    new_time_array (array-like): The new time points for interpolation.
    output_file (str): Path to save the interpolated dataset as a .nc file.
    
    Returns:
    xr.Dataset: Interpolated dataset.
    """
    
    # Create a new xarray DataArray for the new time coordinate
    new_time = xr.DataArray(new_time_array, dims='time', name='time')
    
    # Interpolate all variables over the new time coordinate
    ds_interpolated = ds.interp(time=new_time)
    
    # Save the interpolated dataset to a .nc file
    ds_interpolated.to_netcdf(output_file)
    
    return ds_interpolated

In [4]:
ds_1 = xr.open_dataset('/Volumes/LaCie/000_POSTDOC_2025/long_high_res/RCE_T300_U8.0_SAM1MOM_B1_128x128x64.nc')
ds_2 = xr.open_dataset('/Volumes/LaCie/000_POSTDOC_2025/long_high_res/dataset_2d_sub.nc')
# Define your new time array for interpolation (replace with your desired times)
new_time_array = new_time_array = ds_2.time.values  # Example new time array

# Interpolate the dataset and save it to a .nc file
output_file = '/Volumes/LaCie/000_POSTDOC_2025/long_high_res/dataset_1d.nc'
interpolated_ds = interpolate_dataset_on_new_time_grid(ds_1, new_time_array, output_file)

# Print the interpolated dataset
print(interpolated_ds)


<xarray.Dataset> Size: 2GB
Dimensions:     (z: 64, time: 25968)
Coordinates:
  * z           (z) float32 256B 25.0 75.0 127.8 ... 2.589e+04 2.698e+04
  * time        (time) float32 104kB 30.25 30.25 30.25 ... 66.31 66.31 66.32
Data variables: (12/330)
    p           (z) float32 256B 1.004e+03 998.4 992.4 ... 23.87 20.53 17.38
    SST         (time) float32 104kB 300.0 300.0 300.0 ... 300.0 300.0 300.0
    Ps          (time) float32 104kB 1.007e+03 1.007e+03 ... 1.007e+03 1.007e+03
    CLDSHD      (time) float32 104kB 0.1942 0.193 0.1917 ... 0.137 0.1372 0.1375
    AREAPREC    (time) float32 104kB 0.04137 0.04127 0.04116 ... 0.05318 0.05363
    CLD245      (time) float32 104kB 0.103 0.1013 0.09959 ... 0.06158 0.06176
    ...          ...
    QG          (time, z) float32 7MB 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
    QCCLD       (time, z) float32 7MB -9.999e+03 -9.999e+03 ... -9.999e+03
    QICLD       (time, z) float32 7MB -9.999e+03 -9.999e+03 ... -9.999e+03
    QRCLD       (time, z

In [5]:
ds_1d = xr.open_dataset('/Volumes/LaCie/000_POSTDOC_2025/long_high_res/dataset_1d.nc')

In [6]:
ds_1d.dims, ds_2.dims



In [7]:
path_dataset = '/Volumes/LaCie/000_POSTDOC_2025/long_high_res/dataset_1d.nc'
output_dir = '/Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D'

# Call the function to split and save the dataset
split_and_save(path_dataset, output_dir, num_splits=48)

  2%|▏         | 1/48 [00:00<00:26,  1.75it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_1.nc


  4%|▍         | 2/48 [00:01<00:28,  1.64it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_2.nc


  6%|▋         | 3/48 [00:01<00:26,  1.67it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_3.nc


  8%|▊         | 4/48 [00:02<00:26,  1.65it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_4.nc


 10%|█         | 5/48 [00:02<00:25,  1.68it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_5.nc


 12%|█▎        | 6/48 [00:03<00:24,  1.69it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_6.nc


 15%|█▍        | 7/48 [00:04<00:23,  1.71it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_7.nc


 17%|█▋        | 8/48 [00:04<00:22,  1.74it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_8.nc


 19%|█▉        | 9/48 [00:05<00:22,  1.70it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_9.nc


 21%|██        | 10/48 [00:05<00:22,  1.68it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_10.nc


 23%|██▎       | 11/48 [00:06<00:21,  1.70it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_11.nc


 25%|██▌       | 12/48 [00:07<00:21,  1.71it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_12.nc


 27%|██▋       | 13/48 [00:07<00:20,  1.71it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_13.nc


 29%|██▉       | 14/48 [00:08<00:19,  1.74it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_14.nc


 31%|███▏      | 15/48 [00:08<00:18,  1.74it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_15.nc


 33%|███▎      | 16/48 [00:09<00:18,  1.76it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_16.nc


 35%|███▌      | 17/48 [00:10<00:19,  1.59it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_17.nc


 38%|███▊      | 18/48 [00:10<00:18,  1.58it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_18.nc


 40%|███▉      | 19/48 [00:11<00:17,  1.66it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_19.nc


 42%|████▏     | 20/48 [00:11<00:17,  1.59it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_20.nc


 44%|████▍     | 21/48 [00:12<00:16,  1.63it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_21.nc


 46%|████▌     | 22/48 [00:13<00:16,  1.54it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_22.nc


 48%|████▊     | 23/48 [00:13<00:16,  1.56it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_23.nc


 50%|█████     | 24/48 [00:14<00:14,  1.60it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_24.nc


 52%|█████▏    | 25/48 [00:15<00:14,  1.61it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_25.nc


 54%|█████▍    | 26/48 [00:15<00:13,  1.64it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_26.nc


 56%|█████▋    | 27/48 [00:16<00:12,  1.62it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_27.nc


 58%|█████▊    | 28/48 [00:17<00:13,  1.53it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_28.nc


 60%|██████    | 29/48 [00:17<00:12,  1.56it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_29.nc


 62%|██████▎   | 30/48 [00:18<00:11,  1.61it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_30.nc


 65%|██████▍   | 31/48 [00:18<00:10,  1.64it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_31.nc


 67%|██████▋   | 32/48 [00:19<00:09,  1.68it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_32.nc


 69%|██████▉   | 33/48 [00:20<00:09,  1.58it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_33.nc


 71%|███████   | 34/48 [00:20<00:08,  1.64it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_34.nc


 73%|███████▎  | 35/48 [00:21<00:08,  1.53it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_35.nc


 75%|███████▌  | 36/48 [00:22<00:07,  1.53it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_36.nc


 77%|███████▋  | 37/48 [00:22<00:06,  1.59it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_37.nc


 79%|███████▉  | 38/48 [00:23<00:06,  1.58it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_38.nc


 81%|████████▏ | 39/48 [00:23<00:05,  1.58it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_39.nc


 83%|████████▎ | 40/48 [00:24<00:04,  1.63it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_40.nc


 85%|████████▌ | 41/48 [00:25<00:04,  1.51it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_41.nc


 88%|████████▊ | 42/48 [00:25<00:03,  1.52it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_42.nc


 90%|████████▉ | 43/48 [00:26<00:03,  1.59it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_43.nc


 92%|█████████▏| 44/48 [00:27<00:02,  1.46it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_44.nc


 94%|█████████▍| 45/48 [00:27<00:01,  1.50it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_45.nc


 96%|█████████▌| 46/48 [00:28<00:01,  1.52it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_46.nc


 98%|█████████▊| 47/48 [00:29<00:00,  1.56it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_47.nc


100%|██████████| 48/48 [00:29<00:00,  1.62it/s]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/1D/split_48.nc





In [5]:
path_dataset = '/Volumes/LaCie/000_POSTDOC_2025/long_high_res/dataset_2d_sub.nc'
output_dir = '/Volumes/LaCie/000_POSTDOC_2025/long_high_res/2D'

# Call the function to split and save the dataset
split_and_save(path_dataset, output_dir, num_splits=48)

  2%|▏         | 1/48 [00:26<20:33, 26.25s/it]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/2D/split_1.nc


  4%|▍         | 2/48 [00:49<18:44, 24.45s/it]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/2D/split_2.nc


  6%|▋         | 3/48 [01:14<18:42, 24.94s/it]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/2D/split_3.nc


  8%|▊         | 4/48 [01:40<18:23, 25.08s/it]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/2D/split_4.nc





In [4]:
path_dataset = '/Volumes/LaCie/000_POSTDOC_2025/long_high_res/dataset_3d.nc'
output_dir = '/Volumes/LaCie/000_POSTDOC_2025/long_high_res/3D_test'

# Call the function to split and save the dataset
split_and_save(path_dataset, output_dir, num_splits=48)

  0%|          | 0/48 [00:00<?, ?it/s]

pass
pass


  6%|▋         | 3/48 [08:15<2:03:52, 165.17s/it]

Saved: /Volumes/LaCie/000_POSTDOC_2025/long_high_res/3D_test/split_3.nc





In [12]:
dataset = xr.open_dataset('/Users/sophieabramian/Documents/DeepCloudLab/data/dataset_3d.nc')
output_dir = '/Users/sophieabramian/Documents/DeepCloudLab/data/3D'


time = dataset.coords['time']

start_idx = 0
num_splits=10
# Calculate the split size (remaining data from the start_idx onwards)
split_size = (len(time) - start_idx) // num_splits

In [13]:
split_size

480

In [15]:
""""
dataset = xr.open_dataset('/Users/sophieabramian/Documents/DeepCloudLab/data/dataset_3d.nc')
output_dir = '/Users/sophieabramian/Documents/DeepCloudLab/data/3D'

time = dataset.coords['time']

start_idx = 0
num_splits=10
# Calculate the split size (remaining data from the start_idx onwards)
split_size = (len(time) - start_idx) // num_splits

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

end_idx = len(time)
    
# Slice the dataset along the 'time' dimension
start_idx = int(9*split_size)
time_slice = dataset.sel(time=slice(time[start_idx], time[end_idx-1]))  # Adjust end time to avoid overflow
print(time[start_idx], time[end_idx-1])

# Save the split dataset to a new NetCDF file
split_filename = os.path.join(output_dir, f"split_{10}.nc")
time_slice.to_netcdf(split_filename)
print(f"Saved: {split_filename}")

# Update start_idx for the next split
start_idx = end_idx
"""

'"\ndataset = xr.open_dataset(\'/Users/sophieabramian/Documents/DeepCloudLab/data/dataset_3d.nc\')\noutput_dir = \'/Users/sophieabramian/Documents/DeepCloudLab/data/3D\'\n\ntime = dataset.coords[\'time\']\n\nstart_idx = 0\nnum_splits=10\n# Calculate the split size (remaining data from the start_idx onwards)\nsplit_size = (len(time) - start_idx) // num_splits\n\n# Create output directory if it doesn\'t exist\nos.makedirs(output_dir, exist_ok=True)\n\nend_idx = len(time)\n    \n# Slice the dataset along the \'time\' dimension\nstart_idx = int(9*split_size)\ntime_slice = dataset.sel(time=slice(time[start_idx], time[end_idx-1]))  # Adjust end time to avoid overflow\nprint(time[start_idx], time[end_idx-1])\n\n# Save the split dataset to a new NetCDF file\nsplit_filename = os.path.join(output_dir, f"split_{10}.nc")\ntime_slice.to_netcdf(split_filename)\nprint(f"Saved: {split_filename}")\n\n# Update start_idx for the next split\nstart_idx = end_idx\n'

In [16]:
for i in range(1,11):
    path_1d = '/Users/sophieabramian/Documents/DeepCloudLab/data/1D/split_2.nc'
    path_2d = '/Users/sophieabramian/Documents/DeepCloudLab/data/2D/split_2.nc'
    path_3d = '/Users/sophieabramian/Documents/DeepCloudLab/data/3D/split_2.nc'


    ds_1d = xr.open_dataset(path_1d)
    ds_2d = xr.open_dataset(path_2d)
    ds_3d = xr.open_dataset(path_3d)
    print(len(ds_1d.time), len(ds_2d.time), len(ds_3d.time))

480 480 480
480 480 480
480 480 480
480 480 480
480 480 480
480 480 480
480 480 480
480 480 480
480 480 480
480 480 480


In [17]:
print(len(ds_1d.time), ds_2d.dims, ds_3d.dims)

