In [1]:
import pandas as pd
import numpy as np
import os

Define dir to data pickles

In [2]:
data_path = "data"

era_5_evap_dir = os.path.join(data_path,"era5_evaporation.pickle")
era_5_soil_lvl1_dir = os.path.join(data_path, "era5_vol_soil_lvl_1.pickle")
era_5_soil_lvl2_dir = os.path.join(data_path, "era5_vol_soil_lvl_2.pickle")
era_5_soil_lvl3_dir = os.path.join(data_path, "era5_vol_soil_lvl_3.pickle")
era_5_soil_lvl4_dir = os.path.join(data_path, "era5_vol_soil_lvl_4.pickle")
precip_dir = os.path.join(data_path, "gpm-imerg_df.pickle")
grace_dir = os.path.join(data_path, "grace_df.pickle")
target_dir = os.path.join(data_path, "II_297_1.pickle")

In [3]:
def read_and_preprocess_data(data_dir, prefix):
    pickle = pd.read_pickle(data_dir)
    pickle["value"] = pickle["value"].apply(lambda x: np.nan_to_num(x=x, nan=np.nanmean(x)))
    pickle["mask"] = pickle["mask"].apply(lambda x: x.astype(float))
    pickle["value"] = pickle.apply(lambda x: np.array([x["value"], x["mask"]]), axis=1)
    pickle = pickle.drop(columns=['mask'])
    pickle = pickle.rename(columns={"value": prefix + "_value"})
    pickle = pickle.sort_values(by="date", ignore_index=True)
    pickle = pickle.set_index("date")
    return pickle

In [93]:
def normalize_data(column):
    min_val = min(column.apply(lambda d: np.min(d)))
    max_val = max(column.apply(lambda d: np.max(d)))
    column = column.apply(lambda d: (d - min_val) / (max_val - min_val))
    return column

In [94]:
def read_and_preprocess_data(data_dir, prefix):
    pickle = pd.read_pickle(data_dir)
    pickle["value"] = pickle["value"].apply(lambda x: np.nan_to_num(x=x, nan=np.nanmean(x)))
    pickle["value"] = normalize_data(pickle["value"])
    pickle = pickle.rename(columns={"value": prefix + "_value"})
    pickle = pickle.sort_values(by="date", ignore_index=True)
    pickle = pickle.set_index("date")
    return pickle

In [128]:
def read_and_preprocess_target_data(data_dir, prefix):
    pickle = pd.read_pickle(data_dir)
    pickle["value"] = pickle["value"].apply(lambda x: np.nan_to_num(x=x, nan=np.nanmean(x)))
    pickle = pickle.rename(columns={"value": prefix + "_value"})
    pickle = pickle.sort_values(by="date", ignore_index=True)
    pickle = pickle.set_index("date")
    return pickle

In [95]:
def read_and_preprocess_data_with_mask(data_dir, prefix):
    pickle = pd.read_pickle(data_dir)
    pickle["value"] = pickle["value"].apply(lambda x: np.nan_to_num(x=x, nan=np.nanmean(x)))
    pickle["value"] = normalize_data(pickle["value"])
    pickle["mask"] = pickle["mask"].apply(lambda x: x.astype(float))
    pickle = pickle.rename(columns={"value": prefix + "_value", "mask": prefix + "_mask"})
    pickle = pickle.sort_values(by="date", ignore_index=True)
    pickle = pickle.set_index("date")
    return pickle

In [96]:
def read_soil_lvls(lvl1_dir, lvl2_dir, lvl3_dir, lvl4_dir):
    return read_and_preprocess_data(lvl1_dir, "lvl1"), read_and_preprocess_data(lvl2_dir, "lvl2"), read_and_preprocess_data(lvl3_dir, "lvl3"), read_and_preprocess_data(lvl4_dir, "lvl4")

In [97]:
evaporation_data = read_and_preprocess_data(era_5_evap_dir, "evap")
lvl1, lvl2, lvl3, lvl4 = read_soil_lvls(era_5_soil_lvl1_dir, era_5_soil_lvl2_dir, era_5_soil_lvl3_dir, era_5_soil_lvl4_dir)
precip_data = read_and_preprocess_data(precip_dir, "precip")
grace_data = read_and_preprocess_data_with_mask(grace_dir, "grace")
grace_data = grace_data.drop(columns=["grace_mask"])
grace_data["grace_value"] = grace_data["grace_value"].apply(lambda x: np.mean(x))

In [200]:
grace_data.head()

Unnamed: 0_level_0,grace_value
date,Unnamed: 1_level_1
2002-04-01,0.629225
2002-05-01,0.588709
2002-08-01,0.493414
2002-09-01,0.430228
2002-10-01,0.426086


In [105]:
grace_data.tail()

Unnamed: 0_level_0,grace_value,grace_mask
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-10-01,"[[0.10029274231134166, 0.10029274231134166, 0....","[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2021-11-01,"[[0.14576012421472964, 0.14576012421472964, 0....","[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2021-12-01,"[[0.2611625929848304, 0.2611625929848304, 0.26...","[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2022-01-01,"[[0.3256286322875005, 0.3256286322875005, 0.32...","[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2022-02-01,"[[0.39533738268446017, 0.39533738268446017, 0....","[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [201]:
target = pd.read_pickle(target_dir)
target = target.rename(columns = {"value": "target"})
target = target.sort_values(by="date", ignore_index=True)
target = target.set_index("date")
target = read_and_preprocess_data(target_dir, "target")

In [153]:
target.head()

Unnamed: 0_level_0,target_value
date,Unnamed: 1_level_1
2002-01-01,0.363889
2002-02-01,0.116667
2002-03-01,0.075
2002-04-01,0.222222
2002-05-01,0.366667


In [154]:
dfs = [evaporation_data, lvl1, lvl2, lvl3, lvl4, precip_data, grace_data, target]

In [202]:
dfs = [grace_data, target]
dfs

[            grace_value
 date                   
 2002-04-01     0.629225
 2002-05-01     0.588709
 2002-08-01     0.493414
 2002-09-01     0.430228
 2002-10-01     0.426086
 ...                 ...
 2021-10-01     0.100293
 2021-11-01     0.145760
 2021-12-01     0.261163
 2022-01-01     0.325629
 2022-02-01     0.395337
 
 [206 rows x 1 columns],
             target_value
 date                    
 2002-01-01      0.363889
 2002-02-01      0.116667
 2002-03-01      0.075000
 2002-04-01      0.222222
 2002-05-01      0.366667
 ...                  ...
 2021-07-01      0.650000
 2021-08-01      0.758333
 2021-09-01      0.541667
 2021-10-01      0.500000
 2021-11-01      0.611111
 
 [239 rows x 1 columns]]

In [203]:
data = pd.concat(dfs, axis = 1)

In [157]:
data[["evap_value", "lvl1_value", "lvl2_value", "lvl3_value", "lvl4_value"]]

Unnamed: 0_level_0,evap_value,lvl1_value,lvl2_value,lvl3_value,lvl4_value
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2002-01-01,"[[0.94307655, 0.94307655, 0.9516526, 0.9516526...","[[0.61723346, 0.61723346, 0.6119431, 0.6119431...","[[0.6690769, 0.6690769, 0.6644847, 0.6644847, ...","[[0.63975567, 0.63975567, 0.63791287, 0.637912...","[[0.4109005, 0.4109005, 0.43473163, 0.43473163..."
2002-02-01,"[[0.7962416, 0.7962416, 0.807213, 0.807213, 0....","[[0.64723086, 0.64723086, 0.64631337, 0.646313...","[[0.69498914, 0.69498914, 0.6936823, 0.6936823...","[[0.6909029, 0.6909029, 0.694481, 0.694481, 0....","[[0.4817328, 0.4817328, 0.5045536, 0.5045536, ..."
2002-03-01,"[[0.691287, 0.691287, 0.6995739, 0.6995739, 0....","[[0.5666201, 0.5666201, 0.5670789, 0.5670789, ...","[[0.62679094, 0.62679094, 0.62629116, 0.626291...","[[0.6465567, 0.6465567, 0.64411217, 0.64411217...","[[0.51466644, 0.51466644, 0.52735037, 0.527350..."
2002-04-01,"[[0.5706117, 0.5706117, 0.5845813, 0.5845813, ...","[[0.37791637, 0.37791637, 0.3680863, 0.3680863...","[[0.42826465, 0.42826465, 0.4252344, 0.4252344...","[[0.4570999, 0.4570999, 0.44886914, 0.44886914...","[[0.49274075, 0.49274075, 0.5015108, 0.5015108..."
2002-05-01,"[[0.38151842, 0.38151842, 0.41020945, 0.410209...","[[0.28876242, 0.28876242, 0.28571212, 0.285712...","[[0.22660086, 0.22660086, 0.205608, 0.205608, ...","[[0.22662674, 0.22662674, 0.22327426, 0.223274...","[[0.43381634, 0.43381634, 0.43917346, 0.439173..."
...,...,...,...,...,...
2022-08-01,"[[0.309327, 0.309327, 0.28793886, 0.28793886, ...","[[0.49716803, 0.49716803, 0.5326635, 0.5326635...","[[0.51383585, 0.51383585, 0.55009437, 0.550094...","[[0.17313202, 0.17313202, 0.25369915, 0.253699...","[[0.1446623, 0.1446623, 0.2150945, 0.2150945, ..."
2022-09-01,"[[0.61232007, 0.61232007, 0.60523677, 0.605236...","[[0.5516201, 0.5516201, 0.5700113, 0.5700113, ...","[[0.58051664, 0.58051664, 0.60430545, 0.604305...","[[0.2585704, 0.2585704, 0.3500116, 0.3500116, ...","[[0.13481297, 0.13481297, 0.2039547, 0.2039547..."
2022-10-01,"[[0.76756763, 0.76756763, 0.7663758, 0.7663758...","[[0.50906783, 0.50906783, 0.5342447, 0.5342447...","[[0.5522224, 0.5522224, 0.58302444, 0.58302444...","[[0.4144049, 0.4144049, 0.49458522, 0.49458522...","[[0.13338798, 0.13338798, 0.20589173, 0.205891..."
2022-11-01,"[[0.92170465, 0.92170465, 0.92287344, 0.922873...","[[0.52614, 0.52614, 0.54115313, 0.54115313, 0....","[[0.5677809, 0.5677809, 0.5879407, 0.5879407, ...","[[0.4326245, 0.4326245, 0.4948174, 0.4948174, ...","[[0.13397034, 0.13397034, 0.2104134, 0.2104134..."


In [204]:
mask = data['target_value'].isna()

# stwórz serię liczb całkowitych, które będą używane do stworzenia grup
groups = (mask != mask.shift()).cumsum()

# stwórz grupy, dla których w kolumnie nie ma wartości NaN
valid_groups = groups[~mask]

# policz długości grup
group_lengths = valid_groups.groupby(valid_groups).size()

# znajdź indeks grupy o największej długości
longest_group_index = group_lengths.idxmax()

# stwórz maskę dla wierszy należących do grupy o największej długości
longest_group_mask = (groups == longest_group_index) & (~mask)

# wybierz wiersze należące do grupy o największej długości
longest_group = data[longest_group_mask]


In [205]:
longest_group

Unnamed: 0_level_0,grace_value,target_value
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2002-04-01,0.629225,0.222222
2002-05-01,0.588709,0.366667
2002-08-01,0.493414,0.463889
2002-09-01,0.430228,0.666667
2002-10-01,0.426086,0.694444
...,...,...
2021-07-01,0.182727,0.650000
2021-08-01,0.201953,0.758333
2021-09-01,0.240257,0.541667
2021-10-01,0.100293,0.500000


In [206]:
data = longest_group

In [207]:
mask = data['grace_value'].isna()

# stwórz serię liczb całkowitych, które będą używane do stworzenia grup
groups = (mask != mask.shift()).cumsum()

# stwórz grupy, dla których w kolumnie nie ma wartości NaN
valid_groups = groups[~mask]

# policz długości grup
group_lengths = valid_groups.groupby(valid_groups).size()

# znajdź indeks grupy o największej długości
longest_group_index = group_lengths.idxmax()

# stwórz maskę dla wierszy należących do grupy o największej długości
longest_group_mask = (groups == longest_group_index) & (~mask)

# wybierz wiersze należące do grupy o największej długości
longest_group_grace = data[longest_group_mask]

In [208]:
data = longest_group_grace

In [163]:
#data = data.iloc[:239]

In [209]:
nan = data.loc[data["grace_value"].isna()]

In [210]:
nan

Unnamed: 0_level_0,grace_value,target_value
date,Unnamed: 1_level_1,Unnamed: 2_level_1


In [140]:
len(nan)

0

In [141]:
def fill_missing_value(nans, idx):
    l = []
    for i in range(nans):
        arr = np.full(shape=(8, 13), fill_value=-999999)
        l.append(arr)
    l = pd.Series(l, idx)
    return l

In [142]:
def fill_missing_nans(nans, idx):
    l = []
    for i in range(nans):
        arr = np.ones(shape=(8, 13), dtype=np.float32)
        arr[0] = 0.0
        l.append(arr)
    l = pd.Series(l, idx)
    return l

In [143]:
idx = data['grace_value'].loc[data['grace_value'].isna()].index 
data.loc[data['grace_value'].isna(), "grace_value"] = fill_missing_value(36, idx)

ValueError: Length of values (36) does not match length of index (0)

In [47]:
#data["grace_mask"].loc[data['grace_mask'].isna()] = fill_missing_nans(36, idx)
data.loc[data['grace_mask'].isna(), "grace_mask"] = fill_missing_nans(36, idx)

In [119]:
data.head()

Unnamed: 0_level_0,evap_value,lvl1_value,lvl2_value,lvl3_value,lvl4_value,precip_value,grace_value,grace_mask,target_value
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2003-07-01,"[[0.40286392, 0.40286392, 0.4144625, 0.4144625...","[[0.3094942, 0.3094942, 0.31299728, 0.31299728...","[[0.22086294, 0.22086294, 0.22063385, 0.220633...","[[0.17892064, 0.17892064, 0.16340464, 0.163404...","[[0.24920931, 0.24920931, 0.27178395, 0.271783...","[[0.16390909, 0.16390909, 0.16390909, 0.163909...","[[0.4193193697120277, 0.4193193697120277, 0.41...","[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.758333
2003-08-01,"[[0.6125114, 0.6125114, 0.6343278, 0.6343278, ...","[[0.18331586, 0.18331586, 0.19718522, 0.197185...","[[0.10998205, 0.10998205, 0.13011064, 0.130110...","[[0.13073571, 0.13073571, 0.080931954, 0.08093...","[[0.21442965, 0.21442965, 0.23381208, 0.233812...","[[0.13415152, 0.13415152, 0.13415152, 0.134151...","[[0.2719976390490679, 0.2719976390490679, 0.27...","[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.883333
2003-09-01,"[[0.73160493, 0.73160493, 0.7645573, 0.7645573...","[[0.23141678, 0.23141678, 0.22531618, 0.225316...","[[0.13553885, 0.13553885, 0.14117235, 0.141172...","[[0.19470946, 0.19470946, 0.13006665, 0.130066...","[[0.19243126, 0.19243126, 0.20778099, 0.207780...","[[0.17336364, 0.17336364, 0.17336364, 0.173363...","[[0.18942667945219285, 0.18942667945219285, 0....","[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.972222
2003-10-01,"[[0.80350804, 0.80350804, 0.8068523, 0.8068523...","[[0.5462916, 0.5462916, 0.56005967, 0.56005967...","[[0.52794707, 0.52794707, 0.54884624, 0.548846...","[[0.31100032, 0.31100032, 0.3415058, 0.3415058...","[[0.19457708, 0.19457708, 0.21484247, 0.214842...","[[0.13151515, 0.13151515, 0.13151515, 0.131515...","[[0.22333428845309217, 0.22333428845309217, 0....","[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",1.0
2003-11-01,"[[0.91233873, 0.91233873, 0.91296357, 0.912963...","[[0.53694487, 0.53694487, 0.5525598, 0.5525598...","[[0.5784553, 0.5784553, 0.5964648, 0.5964648, ...","[[0.41986528, 0.41986528, 0.46055716, 0.460557...","[[0.19233415, 0.19233415, 0.21290518, 0.212905...","[[0.07757576, 0.07757576, 0.07757576, 0.077575...","[[0.25046798331017417, 0.25046798331017417, 0....","[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.947222


In [211]:
data = data.reset_index()
data = data.drop(columns=["date"])

In [167]:
def fillna_with_zero(a):
    where_are_NaNs = np.isnan(a)
    a[where_are_NaNs] = 0
    return a

In [168]:
data["grace_value"] = data["grace_value"].apply(lambda x: fillna_with_zero(x))

In [169]:
data.head()

Unnamed: 0,evap_value,lvl1_value,lvl2_value,lvl3_value,lvl4_value,precip_value,grace_value,grace_mask,target_value
0,"[[0.40286392, 0.40286392, 0.4144625, 0.4144625...","[[0.3094942, 0.3094942, 0.31299728, 0.31299728...","[[0.22086294, 0.22086294, 0.22063385, 0.220633...","[[0.17892064, 0.17892064, 0.16340464, 0.163404...","[[0.24920931, 0.24920931, 0.27178395, 0.271783...","[[0.16390909, 0.16390909, 0.16390909, 0.163909...","[[0.4193193697120277, 0.4193193697120277, 0.41...","[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.758333
1,"[[0.6125114, 0.6125114, 0.6343278, 0.6343278, ...","[[0.18331586, 0.18331586, 0.19718522, 0.197185...","[[0.10998205, 0.10998205, 0.13011064, 0.130110...","[[0.13073571, 0.13073571, 0.080931954, 0.08093...","[[0.21442965, 0.21442965, 0.23381208, 0.233812...","[[0.13415152, 0.13415152, 0.13415152, 0.134151...","[[0.2719976390490679, 0.2719976390490679, 0.27...","[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.883333
2,"[[0.73160493, 0.73160493, 0.7645573, 0.7645573...","[[0.23141678, 0.23141678, 0.22531618, 0.225316...","[[0.13553885, 0.13553885, 0.14117235, 0.141172...","[[0.19470946, 0.19470946, 0.13006665, 0.130066...","[[0.19243126, 0.19243126, 0.20778099, 0.207780...","[[0.17336364, 0.17336364, 0.17336364, 0.173363...","[[0.18942667945219285, 0.18942667945219285, 0....","[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.972222
3,"[[0.80350804, 0.80350804, 0.8068523, 0.8068523...","[[0.5462916, 0.5462916, 0.56005967, 0.56005967...","[[0.52794707, 0.52794707, 0.54884624, 0.548846...","[[0.31100032, 0.31100032, 0.3415058, 0.3415058...","[[0.19457708, 0.19457708, 0.21484247, 0.214842...","[[0.13151515, 0.13151515, 0.13151515, 0.131515...","[[0.22333428845309217, 0.22333428845309217, 0....","[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",1.0
4,"[[0.91233873, 0.91233873, 0.91296357, 0.912963...","[[0.53694487, 0.53694487, 0.5525598, 0.5525598...","[[0.5784553, 0.5784553, 0.5964648, 0.5964648, ...","[[0.41986528, 0.41986528, 0.46055716, 0.460557...","[[0.19233415, 0.19233415, 0.21290518, 0.212905...","[[0.07757576, 0.07757576, 0.07757576, 0.077575...","[[0.25046798331017417, 0.25046798331017417, 0....","[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.947222


In [212]:
data["merged"] = data.apply(lambda row: np.hstack((row.loc[data.columns != "target_value"])), axis=1)

In [213]:
columns_to_drop = [x for x in data.columns if x not in ["target_value", "merged"]]
data = data.drop(columns=columns_to_drop)

In [216]:
data["merged"] = data["merged"].apply(lambda x: x[0])

In [217]:
data

Unnamed: 0,target_value,merged
0,0.222222,0.629225
1,0.366667,0.588709
2,0.463889,0.493414
3,0.666667,0.430228
4,0.694444,0.426086
...,...,...
198,0.650000,0.182727
199,0.758333,0.201953
200,0.541667,0.240257
201,0.500000,0.100293


In [187]:
data.to_pickle("data/ready_dataset_grace_single_297.pickle")

In [174]:
data.merged[0].shape

(8, 104)

In [34]:
data.iloc[0]["merged"]

array([[4.13716573, 4.13716573, 4.13716573, 6.82863511, 6.82863511,
        6.82863511, 3.12786471, 3.12786471, 3.12786471, 3.12786471,
        3.12786471, 3.12786471, 3.12786471, 1.        , 1.        ,
        1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [4.13716573, 4.13716573, 4.13716573, 6.82863511, 6.82863511,
        6.82863511, 3.12786471, 3.12786471, 3.12786471, 3.12786471,
        3.12786471, 3.12786471, 3.12786471, 1.        , 1.        ,
        1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [4.13716573, 4.13716573, 4.13716573, 6.82863511, 6.82863511,
        6.82863511, 3.12786471, 3.12786471, 3.12786471, 3.12786471,
        3.12786471, 3.12786471, 3.12786471, 1.        , 1.        ,
        1.        , 0.        , 0.        , 0.        , 0.        ,
      