# Preprocessing for LSTM-based models

In [None]:
# general
from tqdm.notebook import tqdm
from pathlib import Path
import pandas as pd
import shutil
import glob
import os

# geospatial
import geopandas as gpd
import xarray as xr

# hydrology
from neuralhydrology.datasetzoo import camelscl

# Machine-specific absolute locations:
#   path_pmet - PatagoniaMet project folder (observed streamflow is read from here)
#   path_disk - pipeline folder holding the CLIMATE/ inputs and NEURAL/ outputs
path_pmet = "/home/rooda/OneDrive/Projects/PatagoniaMet/"
path_disk = "/home/rooda/Pipeline/DeepHydro/"

# Make the DeepHydro project root the working directory so that the relative
# "data/..." and "results/..." paths used in the cells below resolve.
os.chdir("/home/rooda/OneDrive/Projects/DeepHydro/")

## PMET basins

In [None]:
# Gauge points of the PMET-obs subset; the gauge_id column of this table is what
# the following cells use to pick basins out of the attribute and data tables.
selection_path = "data/GIS/Basins_PMETobs_points_subset.gpkg"
selection = gpd.read_file(selection_path)

In [None]:
# --- basin attributes (PMET-obs v1.1), restricted to the selected gauges -------
# Every basin is tagged with the same 1990-2019 record period before writing the
# tab-separated attribute table.
q_metadata = (
    pd.read_csv("data/Attributes_all_basins_pmet.csv", index_col=0)
    .assign(record_period_start="1990-01-01", record_period_end="2019-12-31")
    .loc[selection.gauge_id]
)
q_metadata.to_csv(f"{path_disk}NEURAL/data/historical_PMET/1_CAMELScl_attributes.txt", sep="\t")

# Basin area scaled by 1e6 (presumably km2 -> m2; confirm the unit of total_area).
basin_area_m2 = q_metadata.total_area * 1e6

# --- observed streamflow, converted to mm day-1 -------------------------------
q_obs = pd.read_csv(f"{path_pmet}data/Zenodo/v11/Q_PMETobs_1950_2020_v11d.csv", index_col=0)
in_period = (q_obs.index >= "1990-01-01") & (q_obs.index <= "2019-12-31")
q_obs = q_obs.loc[in_period, selection.gauge_id]
# The row labels are replaced by a daily calendar, so the file must contain
# exactly one row per day of 1990-2019 (otherwise the assignment raises).
q_obs.index = pd.date_range("1990-01-01", "2019-12-31", freq="D", name="date")
# x1000 and x86400 over area: assumes discharge in m3 s-1 (TODO confirm)
q_obs = (q_obs * 1000 * 86400) / basin_area_m2
q_obs.to_csv(f"{path_disk}NEURAL/data/historical_PMET/PMET_q_mm_day.csv")

# --- climate (PMET-sim v1.1): input file suffix -> output file name ------------
climate_outputs = {
    "PP": "PMET_precip_full_mm_day.csv",
    "T2M": "PMET_tmean_full_degC_day.csv",
    "TMAX": "PMET_tmax_full_degC_day.csv",
    "TMIN": "PMET_tmin_full_degC_day.csv",
    "PET": "PMET_pet_full_mm_day.csv",
}
kept_columns = ["date"] + q_metadata.index.tolist()

for suffix, output_file in climate_outputs.items():
    data = pd.read_parquet(f"{path_disk}CLIMATE/catchments/{suffix}_ref_PMET_basins_full.parquet")
    data = data[kept_columns]
    data.to_csv(f"{path_disk}NEURAL/data/historical_PMET/{output_file}", index=False)

# --- glacier melt from OGGM, same conversion as the streamflow -----------------
q_glacier = pd.read_csv("results/runoff/glacier_melt_historical_pmet.csv", index_col=0)[selection.gauge_id]
q_glacier.index = pd.date_range("1990-01-01", "2019-12-31", freq="D", name="date")
q_glacier = (q_glacier * 1000 * 86400) / basin_area_m2
q_glacier.to_csv(f"{path_disk}NEURAL/data/historical_PMET/PMET_glacier_melt_mm_day.csv", index=True)

In [None]:
# Run NeuralHydrology's CAMELS-CL preprocessing on the PMET-only data folder.
historical_pmet_dir = Path(path_disk) / "NEURAL" / "data" / "historical_PMET"
camelscl.preprocess_camels_cl_dataset(historical_pmet_dir)

## All basins

### Historical period

In [None]:
# --- basin attributes: selected PMET gauges first, then the remaining basins ---
q_metadata_pmet = pd.read_csv("data/Attributes_all_basins_pmet.csv", index_col=0).loc[selection.gauge_id]
q_metadata = pd.concat(
    [q_metadata_pmet, pd.read_csv("data/Attributes_all_basins.csv", index_col=0)]
).assign(record_period_start="1990-01-01", record_period_end="2019-12-31")
q_metadata.to_csv(f"{path_disk}NEURAL/data/historical_ALL/1_CAMELScl_attributes.txt", sep="\t")

# --- observed streamflow, converted to mm day-1 -------------------------------
q_obs = pd.read_csv(f"{path_pmet}data/Zenodo/v11/Q_PMETobs_1950_2020_v11d.csv", index_col=0)
in_period = (q_obs.index >= "1990-01-01") & (q_obs.index <= "2019-12-31")
# reindex (rather than select) so basins without an observed series still get
# a column, filled with NaN
q_obs = q_obs[in_period].reindex(columns=q_metadata.index.tolist())
# The row labels are replaced by a daily calendar, so the file must contain
# exactly one row per day of 1990-2019 (otherwise the assignment raises).
q_obs.index = pd.date_range("1990-01-01", "2019-12-31", freq="D", name="date")
# x1000 and x86400 over (area x 1e6): assumes discharge in m3 s-1 and
# total_area in km2 (TODO confirm)
q_obs = (q_obs * 1000 * 86400) / (q_metadata.total_area * 1e6)
q_obs.to_csv(f"{path_disk}NEURAL/data/historical_ALL/PMET_q_mm_day.csv")

In [None]:
def _read_daily_parquet(parquet_path):
    """Read a catchment parquet table and index it by its parsed "date" column."""
    table = pd.read_parquet(parquet_path).set_index("date")
    table.index = pd.to_datetime(table.index)
    return table


# --- climate: input file suffix -> output file name ----------------------------
climate_outputs = {
    "PP": "PMET_precip_full_mm_day.csv",
    "T2M": "PMET_tmean_full_degC_day.csv",
    "TMAX": "PMET_tmax_full_degC_day.csv",
    "TMIN": "PMET_tmin_full_degC_day.csv",
    "PET": "PMET_pet_full_mm_day.csv",
}
pmet_gauges = selection.gauge_id.tolist()

for suffix, output_file in climate_outputs.items():
    # selected PMET gauges and all remaining basins, joined side by side on date
    data_pmet = _read_daily_parquet(f"{path_disk}CLIMATE/catchments/{suffix}_ref_PMET_basins_full.parquet")[pmet_gauges]
    data_all = _read_daily_parquet(f"{path_disk}CLIMATE/catchments/{suffix}_ref_all_basins_full.parquet")

    data_combined = pd.concat([data_pmet, data_all], axis=1)
    data_combined.index.name = "date"
    data_combined.to_csv(f"{path_disk}NEURAL/data/historical_ALL/{output_file}")

# --- glacier melt from OGGM ----------------------------------------------------
q_glacier_pmet = pd.read_csv("results/runoff/glacier_melt_historical_pmet.csv", index_col=0)[pmet_gauges]
q_glacier = pd.concat(
    [q_glacier_pmet, pd.read_csv("results/runoff/glacier_melt_historical_all.csv", index_col=0)],
    axis=1,
)
# The row labels are replaced by a daily calendar, so both tables must contain
# exactly one row per day of 1990-2019 (otherwise the assignment raises).
q_glacier.index = pd.date_range("1990-01-01", "2019-12-31", freq="D", name="date")
# x1000 and x86400 over (area x 1e6): assumes melt in m3 s-1 and total_area in
# km2 (TODO confirm); q_metadata is the attribute table built in the previous cell
q_glacier = (q_glacier * 1000 * 86400) / (q_metadata.total_area * 1e6)
# NaN values are written as zero melt, rounded to three decimals
q_glacier = q_glacier.fillna(0).round(3)
q_glacier.to_csv(f"{path_disk}NEURAL/data/historical_ALL/PMET_glacier_melt_mm_day.csv", index=True)

In [None]:
# Run NeuralHydrology's CAMELS-CL preprocessing on the all-basins folder, then
# delete the intermediate CSV inputs written by the previous cells (the
# attribute .txt file is kept).
historical_all_dir = f"{path_disk}/NEURAL/data/historical_ALL"
camelscl.preprocess_camels_cl_dataset(Path(historical_all_dir))

for csv_file in glob.glob(f"{historical_all_dir}/*.csv"):
    os.remove(csv_file)

### Future period

In [None]:
# Time window of the future simulations (inclusive bounds used for slicing)
start_date, end_date = "2021-01-01", "2098-12-31"

# Scenario grid: one run folder is produced per (GCM, SSP) pair
gcm_list = [
    "GFDL-ESM4",
    "IPSL-CM6A-LR",
    "MIROC6",
    "MPI-ESM1-2-LR",
    "MRI-ESM2-0",
]
ssp_list = ["ssp126", "ssp585"]

In [None]:
# Build one NeuralHydrology input folder per (SSP, GCM) scenario:
#   1. copy the historical attribute table,
#   2. write the scenario's climate forcing for start_date..end_date,
#   3. write the OGGM glacier melt converted to mm day-1,
#   4. run the CAMELS-CL preprocessing and delete the intermediate CSVs.

# Basin attributes (selected PMET gauges first, then the remaining basins).
# In this cell only total_area is read, for the glacier-melt unit conversion.
q_metadata_pmet = pd.read_csv("data/Attributes_all_basins_pmet.csv", index_col=0)
q_metadata_pmet = q_metadata_pmet.loc[selection.gauge_id]
q_metadata = pd.read_csv("data/Attributes_all_basins.csv", index_col=0)
q_metadata = pd.concat([q_metadata_pmet, q_metadata])
q_metadata["record_period_start"] = "1990-01-01"
q_metadata["record_period_end"] = "2019-12-31"

# Daily on-glacier melt for every scenario, clipped to the future window; the
# rgi_id dimension is renamed so its labels line up with the basin ids.
glacier_melt = xr.open_dataset("results/runoff/glacier_melt_future_all.nc").sel(time=slice(start_date, end_date))
glacier_melt = glacier_melt.rename({"rgi_id": "gauge_id"}).melt_on_glacier_daily

# Input file prefixes and the output file each one is written to. These do not
# depend on the scenario, so they are defined once instead of on every iteration.
climate_vars = ['PP', 'T2M', 'TASMAX', 'TASMIN', 'PET']
output_files = ['PMET_precip_full_mm_day.csv', 'PMET_tmean_full_degC_day.csv', 'PMET_tmax_full_degC_day.csv', 'PMET_tmin_full_degC_day.csv', 'PMET_pet_full_mm_day.csv']

# future climate
for ssp in ssp_list:
    for gcm in gcm_list:
        path_run = path_disk + "NEURAL/data/future_ALL_" + gcm + "_" + ssp

        # exist_ok=True: re-running the cell after an interrupted run must not
        # fail here just because the scenario folder was already created.
        # NOTE(review): preprocess_camels_cl_dataset may itself refuse to run if
        # its output folder already exists - confirm before relying on re-runs.
        os.makedirs(path_run, exist_ok=True)
        shutil.copyfile(path_disk + "NEURAL/data/historical_ALL/1_CAMELScl_attributes.txt",
                        path_run + "/1_CAMELScl_attributes.txt")

        # (disabled) copy of the observed streamflow into the run folder
        #shutil.copyfile(path_disk + "data/historical_ALL/PMET_q_mm_day.csv",
        #                path_run + "/PMET_q_mm_day.csv")

        for var, output_file in zip(climate_vars, output_files):
            future_data = pd.read_parquet(f"{path_disk}CLIMATE/catchments/{var}_{gcm}_{ssp}_all_basins_full.parquet").set_index("date")
            future_data.index = pd.to_datetime(future_data.index)
            future_data = future_data.loc[start_date:end_date]
            future_data.to_csv(f"{path_run}/{output_file}")

        ## glacier melt from OGGM
        # long table for this scenario -> wide table (rows: time, columns: gauge_id)
        q_glacier = glacier_melt.sel(gcm=gcm).sel(ssp=ssp).to_dataframe()[["melt_on_glacier_daily"]].reset_index()
        q_glacier = q_glacier.pivot(index='time', columns='gauge_id', values='melt_on_glacier_daily')
        # x1000 and x86400 over (area x 1e6): assumes melt in m3 s-1 and
        # total_area in km2 (TODO confirm)
        q_glacier = (q_glacier * 1000 * 86400) / (q_metadata.total_area * 1e6)
        # NaN values are written as zero melt, rounded to three decimals
        q_glacier = q_glacier.fillna(0).round(3)
        q_glacier.index.names = ['date']
        q_glacier.to_csv(path_run + "/PMET_glacier_melt_mm_day.csv")

        ## process everything and remove intermediate files
        camelscl.preprocess_camels_cl_dataset(Path(path_run))

        for file in glob.glob(path_run + "/*.csv"):
            os.remove(file)

        print(gcm)