### Developing the GEO Dataset/Dataloader/Editors

In [6]:
from typing import Optional, List, Union, Tuple
from omegaconf import DictConfig
from datetime import datetime
import pandas as pd
from glob import glob
import xarray as xr
import numpy as np
from rs_tools._src.datamodule.utils import split_train_val

#### Splitting the dataset based on train/val characteristics

In [2]:
# Build one midnight timestamp string (formatted YYYYMMDDHHMMSS) and one matching
# dummy filename for every calendar day of 2019, 2020 and 2021.
# pd.date_range only yields dates that exist, so there is no need to probe
# impossible days (e.g. 30 February) and swallow the resulting ValueError.
list_of_dates = []
for year in [2019, 2020, 2021]:
    days_in_year = pd.date_range(start=f'{year}-01-01', end=f'{year}-12-31', freq='D')
    list_of_dates.extend(days_in_year.strftime('%Y%m%d%H%M%S').tolist())

# Filenames follow the '<timestamp>_test.nc' pattern, in the same order as the dates.
list_of_files = [f'{date}_test.nc' for date in list_of_dates]


In [3]:
# Alternate calendar months between the two splits: odd-numbered months go to
# "train", even-numbered months to "val". Years and days are left as None.
split_dict = {
    "train": {"years": None, "months": list(range(1, 13, 2)), "days": None},
    "val": {"years": None, "months": list(range(2, 13, 2)), "days": None},
}

In [5]:
# Partition the generated filenames into a training list and a validation list
# according to the year/month/day criteria in split_dict.
# NOTE(review): presumably the date is parsed from each filename's timestamp
# prefix — confirm against split_train_val.
train_files, val_files = split_train_val(list_of_files, split_dict)

{'years': [2019, 2020, 2021], 'months': [1, 3, 5, 7, 9, 11], 'days': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]}
{'years': [2019, 2020, 2021], 'months': [2, 4, 6, 8, 10, 12], 'days': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]}


In [7]:
# Show how many files landed in each split next to the total number generated.
tuple(map(len, (train_files, val_files, list_of_files)))

(552, 544, 1096)

#### Loading the data

In [7]:
from rs_tools._src.utils.io import get_list_filenames

In [8]:
# Collect the files under the local GOES-16 analysis directory
# (presumably those with the 'nc' extension — confirm against get_list_filenames).
# NOTE(review): this is an absolute path on one developer's machine and is repeated
# in the GeoDataset cell further down; point it at your own data directory before running.
filenames = get_list_filenames('/Users/anna.jungbluth/Desktop/git/rs_tools/data/goes16/analysis', 'nc')

In [52]:
# Open the first file in `filenames` as an xarray Dataset using the netcdf4 engine.
# This raises IndexError if no files were found in the data directory.
ds: xr.Dataset = xr.load_dataset(filenames[0], engine="netcdf4")

In [53]:
# Display the dataset's summary representation as the cell output.
ds

In [48]:
# Pull the `Rad` variable out of the dataset as a plain NumPy array.
# NOTE(review): presumably radiance with one entry per band — confirm shape and units.
data = ds.Rad.to_numpy()

In [49]:
# Pull the `band_wavelength` values out of the dataset as a plain NumPy array.
# NOTE(review): `ds` was read with xr.load_dataset, so the .compute() call is
# presumably a no-op here — confirm before removing it.
band_wavelengths = ds.band_wavelength.compute().to_numpy()

##### Testing the GeoDataset

In [1]:
from rs_tools._src.datamodule.datasets import GeoDataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Split criteria for the GeoDataset test: the "train" entry leaves every filter
# as None, while the "val" entry selects the even-numbered months.
splits_dict = {
    "train": dict.fromkeys(("years", "months", "days")),
    "val": {"years": None, "months": list(range(2, 13, 2)), "days": None},
}

In [8]:
# Build a GeoDataset over the local GOES-16 analysis directory, using the
# all-None "train" criteria from splits_dict and no editors.
# NOTE(review): data_dir is an absolute path on one developer's machine; change it
# to your own data directory before running.
geo = GeoDataset(
    data_dir='/Users/anna.jungbluth/Desktop/git/rs_tools/data/goes16/analysis',
    editors=None,
    splits_dict=splits_dict['train'],
    load_coords=True,  # presumably also loads coordinate data — confirm in GeoDataset
    load_cloudmask=True,  # presumably also loads the cloud mask — confirm in GeoDataset
)

In [17]:
# Fetch the first two samples and print the shape of each sample's 'data' entry
# as a quick sanity check of the dataset's indexing.
for sample_idx in (0, 1):
    sample = geo[sample_idx]
    print(sample['data'].shape)

(16, 256, 256)
(16, 256, 256)
