# ML-Ready Data

In this tutorial, we will go over some of the basics to create dataloaders.

![](../assets/ml_ready_data.png)

In [1]:
import autoroot
import os
import xarray as xr
import matplotlib.pyplot as plt
from xrpatcher import XRDAPatcher
from torch.utils.data import Dataset, DataLoader, ConcatDataset
import numpy as np
import itertools
from dotenv import load_dotenv
from rs_tools._src.utils.io import get_list_filenames


xr.set_options(
    keep_attrs=True, 
    display_expand_data=False, 
    display_expand_coords=False, 
    display_expand_data_vars=False, 
    display_expand_indexes=False
)
np.set_printoptions(threshold=10, edgeitems=2)


save_dir = os.getenv("ITI_DATA_SAVEDIR")

  within=pd.to_timedelta(config["nearesttime"].get("within", "1H")),
  within=pd.to_timedelta(config["nearesttime"].get("within", "1H")),


## ML-Ready Datasets


In [2]:
list_of_files = get_list_filenames(f"{save_dir}/goes16/analysis", ".nc")
len(list_of_files)

0

In [3]:
ds = xr.open_dataset(list_of_files[0], engine="netcdf4")
ds

IndexError: list index out of range

***

### PyTorch Integration

In [6]:
from rs_tools._src.utils.io import get_list_filenames
from rs_tools._src.datamodule.utils import load_nc_file
from rs_tools._src.datamodule.editor import StackDictEditor, CoordNormEditor
from toolz import compose_left

We will create a very simple demo dataloader

In [7]:
from torch.utils.data import Dataset, DataLoader
from typing import Optional, Callable

class NCDataReader(Dataset):
    def __init__(self, data_dir: str, ext: str=".nc", transforms: Optional[Callable]=None):
        self.data_dir = data_dir
        self.data_filenames = get_list_filenames(data_dir, ext)
        self.transforms = transforms

    def __getitem__(self, ind) -> np.ndarray:
        nc_path = self.data_filenames[ind]
        x = load_nc_file(nc_path)
        if self.transforms is not None:
            x = self.transforms(x)
        return x

    def __len__(self):
        return len(self.data_filenames)

In [8]:
ds = NCDataReader(f"{save_dir}/goes16/analysis")
dl = DataLoader(ds, batch_size=1)

In [9]:
out = next(iter(dl))

In [10]:
list(out.keys())

['data', 'wavelengths', 'coords', 'cloud_mask']

In [11]:
out["data"].shape, out["coords"].shape

(torch.Size([1, 16, 8, 8]), torch.Size([1, 2, 8, 8]))

### Transforms/Editors

We can also use custom transformations within the dataset (just like standard PyTorch) to transform our dataset

In [12]:
transforms = compose_left(
    CoordNormEditor(), 
    StackDictEditor(),
)

In [13]:
# initialize dataset with transforms
ds = NCDataReader(f"{save_dir}/aqua/analysis", transforms=transforms)

# initialize dataloader
dl = DataLoader(ds, batch_size=1)

# do one iteration
out = next(iter(dl))

# inspect a batch
out.shape

torch.Size([1, 41, 32, 32])