In [None]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

In [None]:
# Create every output folder this notebook writes to before any cell tries to save into it.
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs("data/network_input", exist_ok=True)
# generate_full_data stores the per-station min/max here.
os.makedirs("data/normalization", exist_ok=True)

In [None]:
def normalize_data(column):
    """Min-max scale every entry of ``column`` with one shared minimum and maximum.

    Parameters
    ----------
    column : pandas.Series
        Entries are scalars or NumPy arrays; the extrema are taken over all
        entries together, not per entry.

    Returns
    -------
    tuple
        ``(scaled, min_val, max_val)`` where ``scaled`` is a Series of the same
        length holding ``(entry - min_val) / (max_val - min_val)`` and
        ``min_val`` / ``max_val`` are the extrema that were used.
    """
    min_val = min(column.apply(lambda d: np.min(d)))
    max_val = max(column.apply(lambda d: np.max(d)))
    value_range = max_val - min_val
    if value_range == 0:
        # Constant data: dividing by zero would give NaN. Dividing by one maps
        # every entry to 0.0 and keeps each entry's shape.
        value_range = 1.0
    scaled = column.apply(lambda d: (d - min_val) / value_range)
    return scaled, min_val, max_val

In [None]:
def read_and_preprocess_numeric_data(data_dir, prefix):
    """Load a pickled frame, fill NaNs with the column mean, and min-max scale it.

    Parameters
    ----------
    data_dir : str
        Path passed to ``pd.read_pickle``. The frame must have ``"value"`` and
        ``"date"`` columns.
    prefix : str
        The ``"value"`` column is renamed to ``prefix + "_value"``.

    Returns
    -------
    tuple
        ``(frame, min_val, max_val)``: the frame sorted by and indexed on
        ``"date"``, plus the extrema used for scaling.
    """
    frame = pd.read_pickle(data_dir)
    # The column mean is the same for every row, so compute it once instead of
    # once per row inside the lambda.
    column_mean = np.nanmean(frame["value"])
    frame["value"] = frame["value"].apply(lambda x: np.nan_to_num(x=x, nan=column_mean))
    frame["value"], min_val, max_val = normalize_data(frame["value"])
    frame = frame.rename(columns={"value": prefix + "_value"})
    frame = frame.sort_values(by="date", ignore_index=True)
    frame = frame.set_index("date")
    return frame, min_val, max_val

In [None]:
def read_and_preprocess_data(data_dir, prefix):
    """Load a pickled frame of array values, fill NaNs per entry, and scale it.

    Each entry of the ``"value"`` column has its NaNs replaced by that entry's
    own ``np.nanmean``; the column is then min-max scaled by ``normalize_data``.
    The result has ``"value"`` renamed to ``prefix + "_value"`` and is sorted
    by and indexed on ``"date"``.
    """
    frame = pd.read_pickle(data_dir)

    def _fill_with_own_mean(values):
        # NaNs are replaced by the mean of the same entry, not of the column.
        return np.nan_to_num(x=values, nan=np.nanmean(values))

    filled = frame["value"].apply(_fill_with_own_mean)
    frame["value"] = filled
    scaled, _, _ = normalize_data(frame["value"])
    frame["value"] = scaled
    return (
        frame.rename(columns={"value": prefix + "_value"})
        .sort_values(by="date", ignore_index=True)
        .set_index("date")
    )

In [None]:
def read_and_preprocess_data_with_mask(data_dir, prefix):
    """Load a pickled frame that has both a ``"value"`` and a ``"mask"`` column.

    ``"value"`` entries get their NaNs replaced by the entry's own mean and are
    min-max scaled by ``normalize_data``; ``"mask"`` entries are cast to float.
    Both columns are renamed with ``prefix`` and the frame is returned sorted
    by and indexed on ``"date"``.
    """
    frame = pd.read_pickle(data_dir)

    def _fill_with_own_mean(values):
        # NaNs are replaced by the mean of the same entry, not of the column.
        return np.nan_to_num(x=values, nan=np.nanmean(values))

    frame["value"] = frame["value"].apply(_fill_with_own_mean)
    scaled, _, _ = normalize_data(frame["value"])
    frame["value"] = scaled
    frame["mask"] = frame["mask"].apply(lambda entry: entry.astype(float))

    renamed_columns = {"value": prefix + "_value", "mask": prefix + "_mask"}
    return (
        frame.rename(columns=renamed_columns)
        .sort_values(by="date", ignore_index=True)
        .set_index("date")
    )

In [None]:
def read_soil_lvls(lvl1_dir, lvl2_dir, lvl3_dir, lvl4_dir):
    """Load and preprocess the four soil-level pickles.

    Returns a 4-tuple of frames in level order (1 to 4), each produced by
    ``read_and_preprocess_data`` with the prefix ``"lvl1"`` ... ``"lvl4"``.
    """
    sources = (
        (lvl1_dir, "lvl1"),
        (lvl2_dir, "lvl2"),
        (lvl3_dir, "lvl3"),
        (lvl4_dir, "lvl4"),
    )
    return tuple(read_and_preprocess_data(path, prefix) for path, prefix in sources)

In [None]:
def fill_missing_value(nans, idx):
    """Return a Series of ``nans`` placeholder arrays, indexed by ``idx``.

    Each entry is its own ``(8, 13)`` array filled with the sentinel ``-999999``.
    """
    placeholders = [np.full(shape=(8, 13), fill_value=-999999) for _ in range(nans)]
    return pd.Series(placeholders, idx)

In [None]:
def fill_missing_nans(nans, idx):
    """Return a Series of ``nans`` float32 arrays, indexed by ``idx``.

    Each entry is its own ``(8, 13)`` array of ones whose first row is zero.
    """

    def _ones_with_zero_first_row():
        block = np.ones(shape=(8, 13), dtype=np.float32)
        block[0] = 0.0
        return block

    blocks = [_ones_with_zero_first_row() for _ in range(nans)]
    return pd.Series(blocks, idx)

In [None]:
def fillna_with_zero(a):
    """Overwrite every NaN in ``a`` with 0 in place and return the same array."""
    a[np.isnan(a)] = 0
    return a

Files generated using prepare_input.ipynb

In [None]:
# File names of the per-station measurement pickles; each one is later joined
# onto data_path/"measurements" when it is loaded.
# Entries marked "too little data" are defined here but left out of `stations`.
II_112_1 = "II_112_1.pickle"
II_113_1 = "II_113_1.pickle"
II_114_1 = "II_114_1.pickle"
II_115_1 = "II_115_1.pickle" # too little data
II_116_1 = "II_116_1.pickle" # too little data
II_131_1 = "II_131_1.pickle"
II_132_1 = "II_132_1.pickle"
II_292_1 = "II_292_1.pickle"
II_297_1 = "II_297_1.pickle"
II_298_1 = "II_298_1.pickle"
II_472_1 = "II_472_1.pickle"
II_922_1 = "II_922_1.pickle" # too little data
II_924_1 = "II_924_1.pickle"
II_931_1 = "II_931_1.pickle"
II_932_1 = "II_932_1.pickle" # too little data
II_936_1 = "II_936_1.pickle" # too little data
II_940_1 = "II_940_1.pickle"
II_949_1 = "II_949_1.pickle" # too little data
II_951_1 = "II_951_1.pickle" # too little data
II_952_1 = "II_952_1.pickle"
II_957_1 = "II_957_1.pickle" # too little data
II_1345_1 = "II_1345_1.pickle"
II_1346_1 = "II_1346_1.pickle"
II_1351_1 = "II_1351_1.pickle"
II_1352_1 = "II_1352_1.pickle"


In [None]:
# Station files that are actually processed; the ones flagged as having too
# few data points are deliberately absent.
stations = [
    II_112_1,
    II_113_1,
    II_114_1,
    II_131_1,
    II_132_1,
    II_292_1,
    II_297_1,
    II_298_1,
    II_472_1,
    II_924_1,
    II_931_1,
    II_940_1,
    II_952_1,
    II_1345_1,
    II_1346_1,
    II_1351_1,
    II_1352_1,
]

In [None]:
# Root folder that holds every input pickle.
data_path = "data"

# Evaporation and the four soil levels (files prefixed "era5_").
era_5_evap_dir = os.path.join(data_path, "era5_evaporation.pickle")
(
    era_5_soil_lvl1_dir,
    era_5_soil_lvl2_dir,
    era_5_soil_lvl3_dir,
    era_5_soil_lvl4_dir,
) = (
    os.path.join(data_path, "era5_vol_soil_lvl_1.pickle"),
    os.path.join(data_path, "era5_vol_soil_lvl_2.pickle"),
    os.path.join(data_path, "era5_vol_soil_lvl_3.pickle"),
    os.path.join(data_path, "era5_vol_soil_lvl_4.pickle"),
)

# Precipitation; the weekly variant is kept below as a commented-out alternative.
precip_dir = os.path.join(data_path, "gpm-imerg_df.pickle")
#precip_dir = os.path.join(data_path, "gpm-imerg_weekly_df.pickle")

# GRACE frame (loaded together with its mask column).
grace_dir = os.path.join(data_path, "grace_df.pickle")

In [None]:
# Load every feature frame once; the per-station generators reuse them.
evaporation_data = read_and_preprocess_data(data_dir=era_5_evap_dir, prefix="evap")
lvl1, lvl2, lvl3, lvl4 = read_soil_lvls(
    lvl1_dir=era_5_soil_lvl1_dir,
    lvl2_dir=era_5_soil_lvl2_dir,
    lvl3_dir=era_5_soil_lvl3_dir,
    lvl4_dir=era_5_soil_lvl4_dir,
)
precip_data = read_and_preprocess_data(data_dir=precip_dir, prefix="precip")
# GRACE also carries a mask column, so it goes through the mask-aware loader.
grace_data = read_and_preprocess_data_with_mask(data_dir=grace_dir, prefix="grace")

In [None]:
# Preview only the first rows; every cell holds a whole array, so showing the
# full frame bloats the notebook output.
grace_data.head()

Function for generating the full dataset (including all features).

In [None]:
def generate_full_data(data_path, station, evaporation_data, lvl1, lvl2, lvl3, lvl4, precip_data, grace_data):
    """Build and save the all-features network input for one station.

    The station's measurements are min-max scaled, joined column-wise with the
    feature frames on their date index, and cut down to the longest run of
    consecutive rows that have a target value. All feature cells of a row are
    then stacked and flattened into one 1-D ``"merged"`` array.

    Parameters
    ----------
    data_path : str
        Root data folder. The station file is read from
        ``data_path/measurements/station``; results are written to
        ``data_path/network_input`` and ``data_path/normalization``.
    station : str
        File name of the station pickle (needs ``"date"`` and ``"value"`` columns).
    evaporation_data, lvl1, lvl2, lvl3, lvl4, precip_data, grace_data : pandas.DataFrame
        Date-indexed feature frames whose cells are arrays.

    Raises
    ------
    ValueError
        If no row of the joined data has a target value.
    """
    target = pd.read_pickle(os.path.join(data_path, "measurements", station))
    target = target.rename(columns={"value": "target_value"})
    min_val = target["target_value"].min()
    max_val = target["target_value"].max()
    target["target_value"] = (target["target_value"] - min_val) / (max_val - min_val)
    target = target.sort_values(by="date", ignore_index=True)
    target = target.set_index("date")

    dfs = [evaporation_data, lvl1, lvl2, lvl3, lvl4, precip_data, grace_data, target]
    data = pd.concat(dfs, axis=1)

    missing = data["target_value"].isna()
    if missing.all():
        raise ValueError("no target values available for station " + station)
    # A new run id starts wherever the missing/present state flips.
    run_ids = (missing != missing.shift()).cumsum()
    valid_run_ids = run_ids[~missing]
    longest_run_id = valid_run_ids.groupby(valid_run_ids).size().idxmax()
    # Copy so the column assignments below act on an independent frame rather
    # than on a slice of `data` (avoids SettingWithCopyWarning).
    data = data[(run_ids == longest_run_id) & (~missing)].copy()

    # Computed before "merged" exists, so it selects exactly the feature columns.
    is_feature = data.columns != "target_value"
    # First hstack joins the per-feature arrays of a row side by side,
    # the second flattens the stacked result to 1-D.
    data["merged"] = data.apply(lambda row: np.hstack((row.loc[is_feature])), axis=1)
    data["merged"] = data["merged"].apply(lambda x: np.hstack((x)))
    columns_to_drop = [x for x in data.columns if x not in ["target_value", "merged"]]
    data = data.drop(columns=columns_to_drop)

    # Write below data_path (not a hard-coded "data/") and make sure both
    # output folders exist.
    network_input_dir = os.path.join(data_path, "network_input")
    normalization_dir = os.path.join(data_path, "normalization")
    os.makedirs(network_input_dir, exist_ok=True)
    os.makedirs(normalization_dir, exist_ok=True)

    data.to_pickle(os.path.join(network_input_dir, "full_" + station))
    # Stored so the target scaling can be undone later.
    normalization_values = pd.DataFrame({"min": [min_val], "max": [max_val]})
    normalization_values.to_pickle(os.path.join(normalization_dir, "normalization_values_" + station))

In [None]:
# Build and save the all-features dataset for every selected station.
for station in stations:
    generate_full_data(
        data_path=data_path,
        station=station,
        evaporation_data=evaporation_data,
        lvl1=lvl1,
        lvl2=lvl2,
        lvl3=lvl3,
        lvl4=lvl4,
        precip_data=precip_data,
        grace_data=grace_data,
    )

Function for generating a dataset with only the GRACE feature.

In [None]:
def generate_grace_data(data_path, station, grace_data):
    """Build and save the GRACE-only network input for one station.

    The station target is loaded through ``read_and_preprocess_numeric_data``,
    joined column-wise with ``grace_data`` on the date index, and cut down to
    the longest run of consecutive rows that have a target value. The GRACE
    cells of each row are then stacked and flattened into one 1-D ``"merged"``
    array.

    Parameters
    ----------
    data_path : str
        Root data folder. The station file is read from
        ``data_path/measurements/station``; the result is written to
        ``data_path/network_input``.
    station : str
        File name of the station pickle.
    grace_data : pandas.DataFrame
        Date-indexed GRACE frame whose cells are arrays.

    Raises
    ------
    ValueError
        If no row of the joined data has a target value.
    """
    target_dir = os.path.join(data_path, "measurements", station)
    # NOTE(review): this loader replaces NaN targets with the column mean, so
    # only dates absent from the station file show up as missing below.
    # generate_full_data does not impute; confirm the difference is intended.
    target, _, _ = read_and_preprocess_numeric_data(target_dir, "target")
    data = pd.concat([grace_data, target], axis=1)

    missing = data["target_value"].isna()
    if missing.all():
        raise ValueError("no target values available for station " + station)
    # A new run id starts wherever the missing/present state flips.
    run_ids = (missing != missing.shift()).cumsum()
    valid_run_ids = run_ids[~missing]
    longest_run_id = valid_run_ids.groupby(valid_run_ids).size().idxmax()
    # Copy so the column assignments below act on an independent frame rather
    # than on a slice of `data` (avoids SettingWithCopyWarning).
    data = data[(run_ids == longest_run_id) & (~missing)].copy()

    # Computed before "merged" exists, so it selects exactly the feature columns.
    is_feature = data.columns != "target_value"
    # First hstack joins the per-feature arrays of a row side by side,
    # the second flattens the stacked result to 1-D.
    data["merged"] = data.apply(lambda row: np.hstack((row.loc[is_feature])), axis=1)
    data["merged"] = data["merged"].apply(lambda x: np.hstack((x)))
    columns_to_drop = [x for x in data.columns if x not in ["target_value", "merged"]]
    data = data.drop(columns=columns_to_drop)

    # Write below data_path (not a hard-coded "data/") and make sure the
    # output folder exists.
    network_input_dir = os.path.join(data_path, "network_input")
    os.makedirs(network_input_dir, exist_ok=True)
    data.to_pickle(os.path.join(network_input_dir, "grace_" + station))

In [None]:
# Build and save the GRACE-only dataset for every selected station.
for station in stations:
    generate_grace_data(
        data_path=data_path,
        station=station,
        grace_data=grace_data,
    )