# Subsetting

The subset operation makes use of `clisops.core.subset` to process the datasets and to set the output type and the output file names.

In [None]:
import os

from clisops.utils import get_file

# Resolve the example data; files are fetched locally or from github.
tas_files = get_file(
    [
        "cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_200512-203011.nc",
        "cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_203012-205511.nc",
        "cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_205512-208011.nc",
        "cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_208012-209912.nc",
        "cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_209912-212411.nc",
        "cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_212412-214911.nc",
        "cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_214912-217411.nc",
        "cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_217412-219911.nc",
        "cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_219912-222411.nc",
        "cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_222412-224911.nc",
        "cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_224912-227411.nc",
        "cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_227412-229911.nc",
        "cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_229912-229912.nc",
    ]
)

o3_file = get_file(
    "cmip6/o3_Amon_GFDL-ESM4_historical_r1i1p1f1_gr1_185001-194912.nc"
)

# Start from a clean state: delete the example file left behind by an
# earlier run of this notebook, if there is one.
if os.path.exists("./output_001.nc"):
    os.remove("./output_001.nc")

In [None]:
from clisops.ops.subset import subset
import xarray as xr

The `subset` process takes several parameters:

## Subsetting Parameters

    ds: Union[xr.Dataset, str, Path]
    time: Optional[Union[str, TimeParameter]]
    area: Optional[
        Union[
            str,
            Tuple[
                Union[int, float, str],
                Union[int, float, str],
                Union[int, float, str],
                Union[int, float, str],
            ],
            AreaParameter,
        ]
    ]
    level: Optional[
        Union[
            str, LevelParameter
        ]
    ]
    time_components: Optional[Union[str, Dict, TimeComponentsParameter]]
    output_dir: Optional[Union[str, Path]]
    output_type: {"netcdf", "nc", "zarr", "xarray"}
    split_method: {"time:auto"}
    file_namer: {"standard", "simple"}
    
    
The output is a list containing the results in the selected output format.

In [None]:
# Open all the tas files together as one dataset, combined by their coordinates.
ds = xr.open_mfdataset(
    tas_files,
    combine="by_coords",
    use_cftime=True,
)

### Output to xarray

There will only be one output for this example.

In [None]:
# Subset by time interval and by area, keeping the result in memory as xarray.
outputs = subset(
    ds=ds,
    time="2007-01-01T00:00:00/2200-12-30T00:00:00",
    area=(0.0, 10.0, 175.0, 90.0),
    output_type="xarray",
)

print(f"There is only {len(outputs)} output.")
outputs[0]

### Output to netCDF with simple namer

There is only one output because the file size is under the memory limit, so it does not need to be split.
This example uses the simple namer, which numbers the output files.

In [None]:
# Same time/area subset, this time written to the current directory as netCDF.
# The "simple" namer is selected for the output file names.
outputs = subset(
    ds=ds,
    time="2007-01-01T00:00:00/2200-12-30T00:00:00",
    area=(0.0, 10.0, 175.0, 90.0),
    output_type="nc",
    output_dir=".",
    split_method="time:auto",
    file_namer="simple",
)

In [None]:
# Re-open the netCDF file from disk to inspect the subsetted data.
subset_ds = xr.open_mfdataset(
    "./output_001.nc",
    combine="by_coords",
    use_cftime=True,
)
subset_ds

### Output to netCDF with standard namer

There is only one output because the file size is under the memory limit, so it does not need to be split.
This example uses the standard namer, which names output files according to the input file and how it has been subsetted.

In [None]:
# Same time/area subset written as netCDF, now with the "standard" namer
# selected for the output file names.
outputs = subset(
    ds=ds,
    time="2007-01-01T00:00:00/2200-12-30T00:00:00",
    area=(0.0, 10.0, 175.0, 90.0),
    output_type="nc",
    output_dir=".",
    split_method="time:auto",
    file_namer="standard",
)

### Subsetting by level

In [None]:
# Switch to the single o3 file for the level examples.
ds = xr.open_dataset(
    o3_file,
    use_cftime=True,
)

#### No subsetting applied

In [None]:
# Call subset() without any time, area or level argument, then show the
# coordinates of the single result for comparison with the next example.
result = subset(
    ds=ds,
    output_type="xarray",
)

result[0].coords

#### Subsetting over level

In [None]:
# Subset over the pressure level (plev) coordinate using the level interval
# string "600/100"; every other dimension is left untouched.
result = subset(
    ds=ds,
    level="600/100",
    output_type="xarray",
)

print(result[0].coords)
# Count the values along plev itself. len() of the whole `coords` mapping would
# report the number of coordinate variables, not the number of levels.
print(f"\nplev has been subsetted and now only has {len(result[0].coords['plev'])} values.")

### Use time components

In [None]:
# Go back to the multi-file tas dataset (ds currently holds the o3 data).
ds = xr.open_mfdataset(
    tas_files,
    combine="by_coords",
    use_cftime=True,
)

In [None]:
# Select by time components, given as a string listing years and months.
outputs = subset(
    ds=ds,
    time_components="year: 2010, 2020, 2030|month: 12, 1, 2",
    output_type="xarray",
)

print(f"There is only {len(outputs)} output.")
outputs[0]

### Using parameter classes

In [None]:
from roocs_utils.parameter.param_utils import (
    level_interval,
    level_series,
    time_components,
    time_interval,
    time_series,
)

In [None]:
# Open the tas files again so this section starts from the full dataset.
ds = xr.open_mfdataset(
    tas_files,
    combine="by_coords",
    use_cftime=True,
)

In [None]:
# Build the time interval and time components with the roocs_utils helper
# functions rather than passing them as strings.
outputs = subset(
    ds=ds,
    time=time_interval("2007-01-01T00:00:00", "2200-12-30T00:00:00"),
    time_components=time_components(month=["dec", "jan", "feb"]),
    output_type="xarray",
)

print(f"There is only {len(outputs)} output.")
outputs[0]