In [1]:
from pathlib import Path
from typing import List, Dict, Union

import pandas as pd
import xarray
from tqdm import tqdm

In [5]:

def preprocess_camels_br_dataset(data_dir: Path):
    """Preprocess CAMELS-BR data set and create per-basin files for more flexible and faster data loading.
    
    This function will read-in all time series text files and create per-basin csv files containing all timeseries 
    features at once in a new subfolder called "preprocessed". Will only consider the 897 basin for which streamflow and
    forcings exist. Note that simulated streamflow only exists for 593 out of 897 basins.
    
    Parameters
    ----------
    data_dir : Path
        Path to the CAMELS-BR data set containing the different subdirectories that can be downloaded as individual zip
        archives.

    Raises
    ------
    FileExistsError
        If a sub-folder called 'preprocessed' already exists in `data_dir`.
    FileNotFoundError
        If any of the subdirectories of CAMELS-BR is not found in `data_dir`, specifically the folders starting with 
        `03_*` up to `13_*`.
    """
    # check if data has already been pre-processed other-wise create dst folder
    dst_dir = data_dir / "preprocessed"
   # if dst_dir.is_dir():
   #     raise FileExistsError(
   #         "Subdirectory 'preprocessed' already exists. Delete this folder if you want to reprocess the data.")
   # dst_dir.mkdir()

    # Streamflow and forcing data are stored in different subdirectories that start with a numeric value each. The first
    # one is streamflow mm/d starting with 03 and the last is max temp starting with 13.
    timeseries_folders = [data_dir / subdirectory for subdirectory in _CAMELS_BR_TIMESERIES_SUBDIRS]
    if any([not p.is_dir() for p in timeseries_folders]):
        missing_subdirectories = [p.name for p in timeseries_folders if not p.is_dir()]
        raise FileNotFoundError(
            f"The following directories were expected in {data_dir} but do not exist: {missing_subdirectories}")

    # Since files is sorted, we can pick the first one, streamflow, and extract the basins names from there
    basins = [x.stem.split('_')[0] for x in timeseries_folders[0].glob('*.txt')]
    print(f"Found {len(basins)} basin files under {timeseries_folders[0].name}")

    for basin in tqdm(basins, desc="Combining timeseries data from different subdirectories into one file per basin"):
        data = {}
        for timeseries_folder in timeseries_folders:
            basin_file = list(timeseries_folder.glob(f'{basin}_*'))
            if basin_file:
                df = pd.read_csv(basin_file[0], sep=' ')
                df["date"] = pd.to_datetime(df.year.map(str) + "/" + df.month.map(str) + "/" + df.day.map(str),
                                            format="%Y/%m/%d")
                df = df.set_index('date')
                feat_cols = [c for c in df.columns if c not in ['year', 'month', 'day']]
                for col in feat_cols:
                    data[col] = df[col]
        df = pd.DataFrame(data)
        df.to_csv(dst_dir / f"{basin}.csv")

    print(f"Finished processing the CAMELS-BR data set. Resulting per-basin csv files have been stored at {dst_dir}")


In [3]:

_CAMELS_BR_TIMESERIES_SUBDIRS = [
    '03_CAMELS_BR_streamflow_selected_catchments',
#    '04_CAMELS_BR_streamflow_simulated',
    '05_CAMELS_BR_precipitation',
    '06_CAMELS_BR_actual_evapotransp',
    '07_CAMELS_BR_potential_evapotransp',
    '08_CAMELS_BR_reference_evapotransp',
    '09_CAMELS_BR_temperature',
    '10_CAMELS_BR_soil_moisture',
]

In [6]:
preprocess_camels_br_dataset(Path("../../DATA/1.Spatial_data/global/sw_surfacewater_streamflow_runoff_river_network_waterstress/camels-br/"))

FileExistsError: [Errno 17] File exists: '../../DATA/1.Spatial_data/global/sw_surfacewater_streamflow_runoff_river_network_waterstress/camels-br/preprocessed'