In [42]:
import pandas as pd
import numpy as np
import os

Define the paths to the pickled input data files.

In [70]:
# Directory that holds every input pickle used by this notebook.
data_path = "data"

# NOTE: despite the "_dir" suffix, each name below is the path of one pickle file.
era_5_evap_dir = os.path.join(data_path, "era5_evaporation.pickle")
(
    era_5_soil_lvl1_dir,
    era_5_soil_lvl2_dir,
    era_5_soil_lvl3_dir,
    era_5_soil_lvl4_dir,
) = (
    os.path.join(data_path, soil_file)
    for soil_file in (
        "era5_vol_soil_lvl_1.pickle",
        "era5_vol_soil_lvl_2.pickle",
        "era5_vol_soil_lvl_3.pickle",
        "era5_vol_soil_lvl_4.pickle",
    )
)
precip_dir = os.path.join(data_path, "gpm-imerg_df.pickle")
grace_dir = os.path.join(data_path, "grace_df.pickle")
target_dir = os.path.join(data_path, "II_297_1.pickle")

In [71]:
def read_and_preprocess_data(data_dir, prefix):
    """Load a pickled frame and pack each row's value and mask into one array.

    NOTE(review): a later cell defines another function with this same name
    (a variant that normalizes the values and does not touch the mask). When
    the cells run top to bottom that later definition replaces this one, so
    this version is not the one the loading cells further down end up calling.

    Parameters
    ----------
    data_dir
        Path of the pickle file, passed straight to ``pd.read_pickle``.
    prefix
        Text prepended to ``"_value"`` to name the resulting column.

    Returns
    -------
    The loaded frame, sorted by and indexed on ``"date"``, with the ``"mask"``
    column removed and ``"value"`` renamed to ``prefix + "_value"``.
    """
    # Unpickling can execute arbitrary code; only load files from a trusted source.
    pickle = pd.read_pickle(data_dir)
    # Replace NaNs inside each entry with the mean of that entry's non-NaN values.
    pickle["value"] = pickle["value"].apply(lambda x: np.nan_to_num(x=x, nan=np.nanmean(x)))
    # Cast each mask to float so it can share one array with the values.
    pickle["mask"] = pickle["mask"].apply(lambda x: x.astype(float))
    # Per row, stack value and mask along a new leading axis: [value, mask].
    pickle["value"] = pickle.apply(lambda x: np.array([x["value"], x["mask"]]), axis=1)
    pickle = pickle.drop(columns=['mask'])
    pickle = pickle.rename(columns={"value": prefix + "_value"})
    # Order chronologically, then use the date as the index.
    pickle = pickle.sort_values(by="date", ignore_index=True)
    pickle = pickle.set_index("date")
    return pickle

In [72]:
def normalize_data(column):
    """Min-max scale every entry of ``column`` with one shared minimum and maximum.

    The minimum and maximum are taken over all non-NaN numbers found in all
    entries, so each entry is scaled with the same two constants:
    ``(entry - min) / (max - min)``.

    NaN values are ignored when searching for the extremes. The built-in
    ``min``/``max`` give order-dependent results once a NaN is involved, which
    could turn the whole column into NaN; NaN values themselves stay NaN in
    the output.

    Parameters
    ----------
    column
        pandas Series whose entries are numbers or numeric numpy arrays.

    Returns
    -------
    A new Series with the scaled entries. If the column is empty or holds no
    non-NaN number it is returned as an unchanged copy. If all non-NaN numbers
    are equal they are mapped to 0 instead of dividing by zero.
    """

    def _finite_extreme(reducer):
        # Apply ``reducer`` (np.min / np.max) to the non-NaN numbers of every
        # entry, then to the per-entry results; NaN when there is nothing to reduce.
        per_entry = []
        for entry in column:
            values = np.asarray(entry, dtype=float).ravel()
            values = values[~np.isnan(values)]
            if values.size:
                per_entry.append(reducer(values))
        return reducer(per_entry) if per_entry else np.nan

    min_val = _finite_extreme(np.min)
    max_val = _finite_extreme(np.max)

    if np.isnan(min_val):
        # Nothing to scale against.
        return column.copy()

    value_range = max_val - min_val
    if value_range == 0:
        # Constant data: avoid 0/0 and report every value as the minimum (0).
        return column.apply(lambda d: d - min_val)

    return column.apply(lambda d: (d - min_val) / value_range)

In [73]:
def read_and_preprocess_data(data_dir, prefix):
    """Load a pickled frame, fill NaNs, normalize the values and index by date.

    Parameters
    ----------
    data_dir
        Path of the pickle file, passed straight to ``pd.read_pickle``.
    prefix
        Text prepended to ``"_value"`` to name the value column.

    Returns
    -------
    The frame sorted by and indexed on ``"date"``, with ``"value"`` renamed to
    ``prefix + "_value"``.
    """
    # Unpickling can execute arbitrary code; only load files from a trusted source.
    frame = pd.read_pickle(data_dir)

    # NaNs inside an entry become the mean of that entry's non-NaN values.
    filled = frame["value"].apply(lambda arr: np.nan_to_num(x=arr, nan=np.nanmean(arr)))
    frame["value"] = normalize_data(filled)

    return (
        frame.rename(columns={"value": prefix + "_value"})
        .sort_values(by="date", ignore_index=True)
        .set_index("date")
    )

In [74]:
def read_and_preprocess_data_with_mask(data_dir, prefix):
    """Load a pickled frame, normalize its values and keep the mask as a column.

    Parameters
    ----------
    data_dir
        Path of the pickle file, passed straight to ``pd.read_pickle``.
    prefix
        Text prepended to ``"_value"`` and ``"_mask"`` to name the two columns.

    Returns
    -------
    The frame sorted by and indexed on ``"date"``, with ``"value"`` and
    ``"mask"`` renamed to ``prefix + "_value"`` and ``prefix + "_mask"``.
    """
    # Unpickling can execute arbitrary code; only load files from a trusted source.
    frame = pd.read_pickle(data_dir)

    # NaNs inside an entry become the mean of that entry's non-NaN values.
    filled = frame["value"].apply(lambda arr: np.nan_to_num(x=arr, nan=np.nanmean(arr)))
    frame["value"] = normalize_data(filled)
    # Store each mask as floats.
    frame["mask"] = frame["mask"].apply(lambda arr: arr.astype(float))

    new_names = {"value": prefix + "_value", "mask": prefix + "_mask"}
    renamed = frame.rename(columns=new_names)
    ordered = renamed.sort_values(by="date", ignore_index=True)
    return ordered.set_index("date")

In [75]:
def read_soil_lvls(lvl1_dir, lvl2_dir, lvl3_dir, lvl4_dir):
    """Load the four soil-level pickles with ``read_and_preprocess_data``.

    Returns a 4-tuple of frames in level order, whose value columns are
    prefixed ``"lvl1"`` to ``"lvl4"`` respectively.
    """
    level_paths = (lvl1_dir, lvl2_dir, lvl3_dir, lvl4_dir)
    level_prefixes = ("lvl1", "lvl2", "lvl3", "lvl4")
    return tuple(
        read_and_preprocess_data(path, level_prefix)
        for path, level_prefix in zip(level_paths, level_prefixes)
    )

In [117]:
# Load every predictor through the preprocessing helpers; each resulting frame
# is indexed by "date" and has its value column named "<prefix>_value".
evaporation_data = read_and_preprocess_data(era_5_evap_dir, "evap")
lvl1, lvl2, lvl3, lvl4 = read_soil_lvls(era_5_soil_lvl1_dir, era_5_soil_lvl2_dir, era_5_soil_lvl3_dir, era_5_soil_lvl4_dir)
# Precipitation is the only input loaded with its mask kept as a separate column.
precip_data = read_and_preprocess_data_with_mask(precip_dir, "precip")
grace_data = read_and_preprocess_data(grace_dir,"grace")

In [118]:
# Load the target through the same helper as the predictors.
# read_and_preprocess_data already reads the pickle, renames "value" to
# "target_value", sorts by date and sets the date index, so no separate manual
# read/rename/sort of the same file is needed before this call.
target = read_and_preprocess_data(target_dir, "target")

In [119]:
# Preview the first rows of the preprocessed target frame.
target.head()

Unnamed: 0_level_0,target_value
date,Unnamed: 1_level_1
2002-01-01,0.363889
2002-02-01,0.116667
2002-03-01,0.075
2002-04-01,0.222222
2002-05-01,0.366667


In [120]:
# Frames to combine column-wise; each one is indexed by "date".
dfs = [evaporation_data, lvl1, lvl2, lvl3, lvl4, precip_data, grace_data, target]

In [121]:
# Uncomment the lines below to use only the GRACE data as input.

#dfs = [grace_data, target]
#dfs

In [122]:
# Join all frames side by side on their shared "date" index. pd.concat's default
# outer join keeps every date and leaves NaN where a frame has no row for it.
data = pd.concat(dfs, axis = 1)

In [123]:
# Flag the rows whose target is missing.
mask = data['target_value'].isna()

# Give every run of consecutive equal flags its own integer id: the id grows by
# one each time the flag differs from the previous row.
groups = mask.ne(mask.shift()).cumsum()

# Keep only the run ids of rows that do have a target.
valid_groups = groups.loc[~mask]

# Number of rows in each of those runs.
group_lengths = valid_groups.groupby(valid_groups).size()

# Id of the longest run.
longest_group_index = group_lengths.idxmax()

# Boolean selector for the rows that belong to that run.
longest_group_mask = groups.eq(longest_group_index) & ~mask

# Rows of the longest uninterrupted stretch with a target value.
longest_group = data.loc[longest_group_mask]


In [124]:
# Continue with only the longest consecutive run of rows that have a target value.
data = longest_group

In [125]:
# Flag the rows whose grace_value is missing.
mask = data['grace_value'].isna()

# Give every run of consecutive equal flags its own integer id: the id grows by
# one each time the flag differs from the previous row.
groups = mask.ne(mask.shift()).cumsum()

# Keep only the run ids of rows that do have a grace_value.
valid_groups = groups.loc[~mask]

# Number of rows in each of those runs.
group_lengths = valid_groups.groupby(valid_groups).size()

# Id of the longest run.
longest_group_index = group_lengths.idxmax()

# Boolean selector for the rows that belong to that run.
longest_group_mask = groups.eq(longest_group_index) & ~mask

# Rows of the longest uninterrupted stretch with a grace_value.
longest_group_grace = data.loc[longest_group_mask]

In [126]:
# Continue with only the longest consecutive run of rows that have a grace_value.
data = longest_group_grace

In [127]:
#data = data.iloc[:239]

In [128]:
# Rows whose grace_value is missing. When the cells above have run, `data` holds
# only rows with a non-missing grace_value, so this is expected to be empty.
nan = data.loc[data["grace_value"].isna()]

In [129]:
def fill_missing_value(nans, idx, shape=(8, 13), fill_value=-999999):
    """Build a Series of constant placeholder arrays, one per label in ``idx``.

    Parameters
    ----------
    nans
        Number of placeholder arrays to create; it has to match the length of
        ``idx`` (pandas raises otherwise).
    idx
        Index labels for the returned Series.
    shape
        Shape of every placeholder array. Defaults to ``(8, 13)``.
    fill_value
        Value written to every cell. Defaults to the sentinel ``-999999``.

    Returns
    -------
    pandas Series indexed by ``idx`` whose entries are independent arrays of
    the given shape filled with ``fill_value``.
    """
    # A fresh array per entry so that editing one placeholder never affects another.
    placeholders = [np.full(shape=shape, fill_value=fill_value) for _ in range(nans)]
    return pd.Series(placeholders, idx)

In [130]:
def fill_missing_nans(nans, idx, shape=(8, 13)):
    """Build a Series of float32 arrays of ones whose first row is zero.

    Parameters
    ----------
    nans
        Number of arrays to create; it has to match the length of ``idx``
        (pandas raises otherwise).
    idx
        Index labels for the returned Series.
    shape
        Shape of every array. Defaults to ``(8, 13)``.

    Returns
    -------
    pandas Series indexed by ``idx`` whose entries are independent float32
    arrays of the given shape, all ones except for ``arr[0]``, which is 0.
    """
    arrays = []
    for _ in range(nans):
        arr = np.ones(shape=shape, dtype=np.float32)
        # Zero out the first slice along axis 0; everything else stays 1.
        arr[0] = 0.0
        arrays.append(arr)
    return pd.Series(arrays, idx)

In [131]:
# Uncomment the lines below if some rows of grace_value are still NaN.

#idx = data['grace_value'].loc[data['grace_value'].isna()].index 
#data.loc[data['grace_value'].isna(), "grace_value"] = fill_missing_value(36, idx)
#data.loc[data['grace_mask'].isna(), "grace_mask"] = fill_missing_nans(36, idx)

In [132]:
# Replace the "date" index with a plain 0..n-1 index and discard the dates.
# drop=True does this in one step and stays safe when the cell is re-run: the
# two-step reset_index() + drop(columns=["date"]) would add an "index" column
# and then raise KeyError on a second run.
data = data.reset_index(drop=True)

In [133]:
def fillna_with_zero(a):
    """Overwrite every NaN in ``a`` with 0 and return ``a``.

    The replacement happens in place: the array passed in is modified and the
    very same object is returned.
    """
    a[np.isnan(a)] = 0
    return a

In [134]:
# Zero out any NaN left inside the grace_value entries (the helper edits each entry in place).
data["grace_value"] = data["grace_value"].apply(fillna_with_zero)

In [135]:
# For each row, np.hstack the entries of every column except "target_value"
# into one array and store it in a new "merged" column.
# NOTE(review): the column selector is evaluated against data.columns at call
# time, so re-running this cell once "merged" exists would also stack the
# merged column itself.
data["merged"] = data.apply(lambda row: np.hstack((row.loc[data.columns != "target_value"])), axis=1)

In [136]:
# Keep only the target and the merged predictors; every other column goes.
columns_to_drop = [name for name in data.columns if name not in ("target_value", "merged")]
data = data.drop(columns=columns_to_drop)

In [137]:
# Keep only the first element along axis 0 of each merged array.
# NOTE(review): presumably this strips a leading dimension from the stacked
# arrays. Confirm the array shapes: if they are plain 2-D grids this keeps
# only their first row.
data["merged"] = data["merged"].apply(lambda x: x[0])

In [138]:
# Display the final two-column dataset (target_value and merged).
data

Unnamed: 0,target_value,merged
0,0.758333,"[0.40286391973495483, 0.40286391973495483, 0.4..."
1,0.883333,"[0.612511396408081, 0.612511396408081, 0.63432..."
2,0.972222,"[0.7316049337387085, 0.7316049337387085, 0.764..."
3,1.000000,"[0.8035080432891846, 0.8035080432891846, 0.806..."
4,0.947222,"[0.9123387336730957, 0.9123387336730957, 0.912..."
...,...,...
85,0.483333,"[0.3601520359516144, 0.3601520359516144, 0.354..."
86,0.338889,"[0.5864202976226807, 0.5864202976226807, 0.588..."
87,0.144444,"[0.7909024357795715, 0.7909024357795715, 0.794..."
88,0.366667,"[0.8571354150772095, 0.8571354150772095, 0.860..."


In [139]:
# Save the final dataset in the same data_path directory the inputs were read
# from, instead of a separately hard-coded "data/" prefix.
data.to_pickle(os.path.join(data_path, "ready_dataset_.pickle"))