In [30]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

Define dir to data pickles

In [31]:
def normalize_data(column):
    min_val = min(column.apply(lambda d: np.min(d)))
    max_val = max(column.apply(lambda d: np.max(d)))
    column = column.apply(lambda d: (d - min_val) / (max_val - min_val))
    return column

In [102]:
def read_and_preprocess_numeric_data(data_dir, prefix):
    pickle = pd.read_pickle(data_dir)
    pickle["value"] = pickle["value"].apply(lambda x: np.nan_to_num(x=x, nan=np.nanmean(pickle["value"])))
    pickle["value"] = normalize_data(pickle["value"])
    pickle = pickle.rename(columns={"value": prefix + "_value"})
    pickle = pickle.sort_values(by="date", ignore_index=True)
    pickle = pickle.set_index("date")
    return pickle

In [109]:
def read_and_preprocess_data(data_dir, prefix):
    pickle = pd.read_pickle(data_dir)
    pickle["value"] = pickle["value"].apply(lambda x: np.nan_to_num(x=x, nan=np.nanmean(x)))
    pickle["value"] = normalize_data(pickle["value"])
    pickle = pickle.rename(columns={"value": prefix + "_value"})
    pickle = pickle.sort_values(by="date", ignore_index=True)
    pickle = pickle.set_index("date")
    return pickle

In [110]:
def read_and_preprocess_data_with_mask(data_dir, prefix):
    pickle = pd.read_pickle(data_dir)
    pickle["value"] = pickle["value"].apply(lambda x: np.nan_to_num(x=x, nan=np.nanmean(x)))
    pickle["value"] = normalize_data(pickle["value"])
    pickle["mask"] = pickle["mask"].apply(lambda x: x.astype(float))
    pickle = pickle.rename(columns={"value": prefix + "_value", "mask": prefix + "_mask"})
    pickle = pickle.sort_values(by="date", ignore_index=True)
    pickle = pickle.set_index("date")
    return pickle

In [111]:
def read_soil_lvls(lvl1_dir, lvl2_dir, lvl3_dir, lvl4_dir):
    return read_and_preprocess_data(lvl1_dir, "lvl1"), read_and_preprocess_data(lvl2_dir, "lvl2"), read_and_preprocess_data(lvl3_dir, "lvl3"), read_and_preprocess_data(lvl4_dir, "lvl4")

In [112]:
def fill_missing_value(nans, idx):
    l = []
    for i in range(nans):
        arr = np.full(shape=(8, 13), fill_value=-999999)
        l.append(arr)
    l = pd.Series(l, idx)
    return l

In [113]:
def fill_missing_nans(nans, idx):
    l = []
    for i in range(nans):
        arr = np.ones(shape=(8, 13), dtype=np.float32)
        arr[0] = 0.0
        l.append(arr)
    l = pd.Series(l, idx)
    return l

In [114]:
def fillna_with_zero(a):
    where_are_NaNs = np.isnan(a)
    a[where_are_NaNs] = 0
    return a

In [115]:
II_112_1 = "II_112_1.pickle"
II_113_1 = "II_113_1.pickle"
II_114_1 = "II_114_1.pickle"
II_115_1 = "II_115_1.pickle" # too few data
II_116_1 = "II_116_1.pickle" # too few data
II_131_1 = "II_131_1.pickle"
II_132_1 = "II_132_1.pickle"
II_292_1 = "II_292_1.pickle"
II_297_1 = "II_297_1.pickle"
II_298_1 = "II_298_1.pickle"
II_472_1 = "II_472_1.pickle"
II_922_1 = "II_922_1.pickle" # too few data
II_924_1 = "II_924_1.pickle"
II_931_1 = "II_931_1.pickle"
II_932_1 = "II_932_1.pickle" # too few data
II_936_1 = "II_936_1.pickle" # too few data
II_940_1 = "II_940_1.pickle"
II_949_1 = "II_949_1.pickle" # too few data
II_951_1 = "II_951_1.pickle" # too few data
II_952_1 = "II_952_1.pickle"
II_957_1 = "II_957_1.pickle" # too few data
II_1345_1 = "II_1345_1.pickle"
II_1346_1 = "II_1346_1.pickle"
II_1351_1 = "II_1351_1.pickle"
II_1352_1 = "II_1352_1.pickle"

In [116]:
stations = [II_112_1, II_113_1, II_114_1, II_115_1, II_116_1, II_131_1, II_132_1, II_292_1, II_297_1, II_298_1, II_472_1, II_922_1, II_924_1, II_931_1, II_932_1, II_936_1, II_940_1, II_949_1, II_951_1, II_952_1, II_957_1, II_1345_1, II_1346_1, II_1351_1, II_1352_1]

In [117]:
data_path = "data"
station = II_472_1

era_5_evap_dir = os.path.join(data_path,"era5_evaporation.pickle")
era_5_soil_lvl1_dir = os.path.join(data_path, "era5_vol_soil_lvl_1.pickle")
era_5_soil_lvl2_dir = os.path.join(data_path, "era5_vol_soil_lvl_2.pickle")
era_5_soil_lvl3_dir = os.path.join(data_path, "era5_vol_soil_lvl_3.pickle")
era_5_soil_lvl4_dir = os.path.join(data_path, "era5_vol_soil_lvl_4.pickle")
precip_dir = os.path.join(data_path, "gpm-imerg_df.pickle")
grace_dir = os.path.join(data_path, "grace_df.pickle")
target_dir = os.path.join(data_path, "measurements", station)

In [118]:
evaporation_data = read_and_preprocess_data(era_5_evap_dir, "evap")
lvl1, lvl2, lvl3, lvl4 = read_soil_lvls(era_5_soil_lvl1_dir, era_5_soil_lvl2_dir, era_5_soil_lvl3_dir, era_5_soil_lvl4_dir)
precip_data = read_and_preprocess_data(precip_dir, "precip")
grace_data = read_and_preprocess_data_with_mask(grace_dir,"grace")

In [119]:
evaporation_data

Unnamed: 0_level_0,evap_value
date,Unnamed: 1_level_1
2002-01-01,"[[0.94307655, 0.94307655, 0.9516526, 0.9516526..."
2002-02-01,"[[0.7962416, 0.7962416, 0.807213, 0.807213, 0...."
2002-03-01,"[[0.691287, 0.691287, 0.6995739, 0.6995739, 0...."
2002-04-01,"[[0.5706117, 0.5706117, 0.5845813, 0.5845813, ..."
2002-05-01,"[[0.38151842, 0.38151842, 0.41020945, 0.410209..."
...,...
2022-08-01,"[[0.309327, 0.309327, 0.28793886, 0.28793886, ..."
2022-09-01,"[[0.61232007, 0.61232007, 0.60523677, 0.605236..."
2022-10-01,"[[0.76756763, 0.76756763, 0.7663758, 0.7663758..."
2022-11-01,"[[0.92170465, 0.92170465, 0.92287344, 0.922873..."


In [120]:
def generate_full_data(data_path, station, evaporation_data, lvl1, lvl2, lvl3, lvl4, precip_data, grace_data):
    target_dir = os.path.join(data_path, "measurements", station)
    target = read_and_preprocess_numeric_data(target_dir, "target")
    dfs = [evaporation_data, lvl1, lvl2, lvl3, lvl4, precip_data, grace_data, target]
    data = pd.concat(dfs, axis = 1)
    mask = data['target_value'].isna()
    groups = (mask != mask.shift()).cumsum()

    valid_groups = groups[~mask]
    group_lengths = valid_groups.groupby(valid_groups).size()
    longest_group_index = group_lengths.idxmax()
    longest_group_mask = (groups == longest_group_index) & (~mask)
    longest_group = data[longest_group_mask]

    data = longest_group

    nan = data.loc[data["grace_value"].isna()]
    nans = len(nan)
    idx = data['grace_value'].loc[data['grace_value'].isna()].index 
    data.loc[data['grace_value'].isna(), "grace_value"] = fill_missing_value(nans, idx)
    data.loc[data['grace_mask'].isna(), "grace_mask"] = fill_missing_nans(nans, idx)

    nan = data.loc[data["precip_value"].isna()]
    nans = len(nan)
    idx = data['precip_value'].loc[data['precip_value'].isna()].index
    data.loc[data['precip_value'].isna(), "precip_value"] = fill_missing_value(nans, idx)

    data["grace_value"] = data["grace_value"].apply(lambda x: fillna_with_zero(x))
    data["merged"] = data.apply(lambda row: np.hstack((row.loc[data.columns != "target_value"])), axis=1)
    data["merged"] = data["merged"].apply(lambda x: np.hstack((x))) 
    columns_to_drop = [x for x in data.columns if x not in ["target_value", "merged"]]
    data = data.drop(columns=columns_to_drop)
    data.to_pickle("data/network_input/full_" + station)

In [84]:
generate_full_data(data_path, station, evaporation_data, lvl1, lvl2, lvl3, lvl4, precip_data, grace_data)

In [121]:
for station in stations:
    generate_full_data(data_path, station, evaporation_data, lvl1, lvl2, lvl3, lvl4, precip_data, grace_data)

In [59]:
target = pd.read_pickle(target_dir)
target = target.rename(columns = {"value": "target_value"})
target = target.sort_values(by="date", ignore_index=True)
target = target.set_index("date")

In [498]:
dfs = [evaporation_data, lvl1, lvl2, lvl3, lvl4, precip_data, grace_data, target]

In [510]:
#Uncomment if you only want to use grace

dfs = [grace_data, target]
#dfs

In [511]:
data = pd.concat(dfs, axis = 1)

In [512]:
mask = data['target_value'].isna()

# stwórz serię liczb całkowitych, które będą używane do stworzenia grup
groups = (mask != mask.shift()).cumsum()

# stwórz grupy, dla których w kolumnie nie ma wartości NaN
valid_groups = groups[~mask]

# policz długości grup
group_lengths = valid_groups.groupby(valid_groups).size()

# znajdź indeks grupy o największej długości
longest_group_index = group_lengths.idxmax()

# stwórz maskę dla wierszy należących do grupy o największej długości
longest_group_mask = (groups == longest_group_index) & (~mask)

# wybierz wiersze należące do grupy o największej długości
longest_group = data[longest_group_mask]


In [513]:
data = longest_group

In [514]:
data

Unnamed: 0_level_0,grace_value,grace_mask,target_value
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2005-12-01,"[[0.4456423141383357, 0.4456423141383357, 0.44...","[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",292.645
2006-01-01,"[[0.5044058812055382, 0.5044058812055382, 0.50...","[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",292.635
2006-02-01,"[[0.5954299501469893, 0.5954299501469893, 0.59...","[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",292.640
2006-03-01,"[[0.7143235390068454, 0.7143235390068454, 0.71...","[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",292.715
2006-04-01,"[[0.6785855390736845, 0.6785855390736845, 0.67...","[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",292.820
...,...,...,...
2021-07-01,"[[0.18272711489209306, 0.18272711489209306, 0....","[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",291.795
2021-08-01,"[[0.20195293831187738, 0.20195293831187738, 0....","[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",291.745
2021-09-01,"[[0.24025719462417888, 0.24025719462417888, 0....","[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",291.760
2021-10-01,"[[0.10029274231134166, 0.10029274231134166, 0....","[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",291.775


In [515]:
nan = data.loc[data["grace_value"].isna()]
nan

Unnamed: 0_level_0,grace_value,grace_mask,target_value
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [87]:
mask = data['grace_value'].isna()

# stwórz serię liczb całkowitych, które będą używane do stworzenia grup
groups = (mask != mask.shift()).cumsum()

# stwórz grupy, dla których w kolumnie nie ma wartości NaN
valid_groups = groups[~mask]

# policz długości grup
group_lengths = valid_groups.groupby(valid_groups).size()

# znajdź indeks grupy o największej długości
longest_group_index = group_lengths.idxmax()

# stwórz maskę dla wierszy należących do grupy o największej długości
longest_group_mask = (groups == longest_group_index) & (~mask)

# wybierz wiersze należące do grupy o największej długości
longest_group_grace = data[longest_group_mask]

In [73]:
data = longest_group_grace

In [504]:
data.index = pd.to_datetime(data.index)

In [408]:
data.drop(data["2014-06-01"], inplace=True)

KeyError: '2014-06-01'

In [505]:
data = data[:"2012-12-31"]

In [506]:
#Use it if you have nan values in grace
nans_count = 4
idx = data['grace_value'].loc[data['grace_value'].isna()].index 
data.loc[data['grace_value'].isna(), "grace_value"] = fill_missing_value(nans_count, idx)
data.loc[data['grace_mask'].isna(), "grace_mask"] = fill_missing_nans(nans_count, idx)

In [507]:
data = data.reset_index()
data = data.drop(columns=["date"])

In [516]:
data["grace_value"] = data["grace_value"].apply(lambda x: fillna_with_zero(x))
data["merged"] = data.apply(lambda row: np.hstack((row.loc[data.columns != "target_value"])), axis=1)
data["merged"] = data["merged"].apply(lambda x: np.hstack((x))) 
columns_to_drop = [x for x in data.columns if x not in ["target_value", "merged"]]
data = data.drop(columns=columns_to_drop)
#data["merged"] = data["merged"].apply(lambda x: x[0])

In [509]:
data.to_pickle("data/network_input/full_" + station)

In [517]:
data.to_pickle("data/network_input/grace_" + station)

In [39]:
if not os.path.exists("data/network_input"):
    os.makedirs("data/network_input")

In [70]:
data.head()

Unnamed: 0_level_0,evap_value,lvl1_value,lvl2_value,lvl3_value,lvl4_value,precip_value,grace_value,grace_mask,target_value
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2003-07-01,"[[0.40286392, 0.40286392, 0.4144625, 0.4144625...","[[0.3094942, 0.3094942, 0.31299728, 0.31299728...","[[0.22086294, 0.22086294, 0.22063385, 0.220633...","[[0.17892064, 0.17892064, 0.16340464, 0.163404...","[[0.24920931, 0.24920931, 0.27178395, 0.271783...","[[0.16390909, 0.16390909, 0.16390909, 0.163909...","[[0.4193193697120277, 0.4193193697120277, 0.41...","[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.368852
2003-08-01,"[[0.6125114, 0.6125114, 0.6343278, 0.6343278, ...","[[0.18331586, 0.18331586, 0.19718522, 0.197185...","[[0.10998205, 0.10998205, 0.13011064, 0.130110...","[[0.13073571, 0.13073571, 0.080931954, 0.08093...","[[0.21442965, 0.21442965, 0.23381208, 0.233812...","[[0.13415152, 0.13415152, 0.13415152, 0.134151...","[[0.2719976390490679, 0.2719976390490679, 0.27...","[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.327869
2003-09-01,"[[0.73160493, 0.73160493, 0.7645573, 0.7645573...","[[0.23141678, 0.23141678, 0.22531618, 0.225316...","[[0.13553885, 0.13553885, 0.14117235, 0.141172...","[[0.19470946, 0.19470946, 0.13006665, 0.130066...","[[0.19243126, 0.19243126, 0.20778099, 0.207780...","[[0.17336364, 0.17336364, 0.17336364, 0.173363...","[[0.18942667945219285, 0.18942667945219285, 0....","[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.273224
2003-10-01,"[[0.80350804, 0.80350804, 0.8068523, 0.8068523...","[[0.5462916, 0.5462916, 0.56005967, 0.56005967...","[[0.52794707, 0.52794707, 0.54884624, 0.548846...","[[0.31100032, 0.31100032, 0.3415058, 0.3415058...","[[0.19457708, 0.19457708, 0.21484247, 0.214842...","[[0.13151515, 0.13151515, 0.13151515, 0.131515...","[[0.22333428845309217, 0.22333428845309217, 0....","[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.300546
2003-11-01,"[[0.91233873, 0.91233873, 0.91296357, 0.912963...","[[0.53694487, 0.53694487, 0.5525598, 0.5525598...","[[0.5784553, 0.5784553, 0.5964648, 0.5964648, ...","[[0.41986528, 0.41986528, 0.46055716, 0.460557...","[[0.19233415, 0.19233415, 0.21290518, 0.212905...","[[0.07757576, 0.07757576, 0.07757576, 0.077575...","[[0.25046798331017417, 0.25046798331017417, 0....","[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.314208


In [122]:
dd = pd.read_pickle("data/network_input/full_" + station)

In [123]:
dd.head()


Unnamed: 0_level_0,target_value,merged
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2002-01-01,0.510857,"[0.9430765509605408, 0.9430765509605408, 0.951..."
2002-02-01,0.510857,"[0.7962415814399719, 0.7962415814399719, 0.807..."
2002-03-01,0.510857,"[0.6912869811058044, 0.6912869811058044, 0.699..."
2002-04-01,0.510857,"[0.5706117153167725, 0.5706117153167725, 0.584..."
2002-05-01,0.510857,"[0.3815184235572815, 0.3815184235572815, 0.410..."
