In [58]:
import pandas as pd
import numpy as np
import os
from datetime import datetime

Define dir to data pickles

In [107]:
data_path = "data"

era_5_evap_dir = os.path.join(data_path,"era5_evaporation.pickle")
era_5_soil_lvl1_dir = os.path.join(data_path, "era5_vol_soil_lvl_1.pickle")
era_5_soil_lvl2_dir = os.path.join(data_path, "era5_vol_soil_lvl_2.pickle")
era_5_soil_lvl3_dir = os.path.join(data_path, "era5_vol_soil_lvl_3.pickle")
era_5_soil_lvl4_dir = os.path.join(data_path, "era5_vol_soil_lvl_4.pickle")
precip_dir = os.path.join(data_path, "gpm-imerg_df.pickle")
grace_dir = os.path.join(data_path, "grace_df.pickle")
target_dir = os.path.join(data_path, "II_113_1.pickle")

In [173]:
def read_and_preprocess_data(data_dir, prefix):
    pickle = pd.read_pickle(data_dir)
    pickle["value"] = pickle["value"].apply(lambda x: np.nan_to_num(x=x, nan=np.nanmean(x)))
    pickle["mask"] = pickle["mask"].apply(lambda x: x.astype(float))
    pickle = pickle.rename(columns={"value": prefix + "_value", "mask": prefix + "_mask"})
    pickle = pickle.sort_values(by="date", ignore_index=True)
    pickle = pickle.set_index("date")
    return pickle

In [174]:
def read_soil_lvls(lvl1_dir, lvl2_dir, lvl3_dir, lvl4_dir):
    return read_and_preprocess_data(lvl1_dir, "lvl1"), read_and_preprocess_data(lvl2_dir, "lvl2"), read_and_preprocess_data(lvl3_dir, "lvl3"), read_and_preprocess_data(lvl4_dir, "lvl4")

In [175]:
evaporation_data = read_and_preprocess_data(era_5_evap_dir, "evap")

In [176]:
lvl1, lvl2, lvl3, lvl4 = read_soil_lvls(era_5_soil_lvl1_dir, era_5_soil_lvl2_dir, era_5_soil_lvl3_dir, era_5_soil_lvl4_dir)

In [177]:
precip_data = read_and_preprocess_data(precip_dir, "precip")

In [178]:
precip_data.describe()

Unnamed: 0,precip_value,precip_mask
count,244,244
unique,244,244
top,"[50.97333, 50.97333, 50.97333, 50.97333, 50.97...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ..."
freq,1,1


In [179]:
precip_data.head()

Unnamed: 0_level_0,precip_value,precip_mask
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2002-01-01,"[50.97333, 50.97333, 50.97333, 50.97333, 50.97...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ..."
2002-02-01,"[85.69334, 85.69334, 85.69334, 85.69334, 85.69...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ..."
2002-03-01,"[10.026667, 10.026667, 10.026667, 10.026667, 1...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ..."
2002-04-01,"[21.533333, 21.533333, 21.533333, 21.533333, 2...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ..."
2002-05-01,"[246.52, 246.52, 246.52, 246.52, 246.52, 246.5...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ..."


In [182]:
grace_data = pd.read_pickle(grace_dir)
grace_data = grace_data.rename(columns = {"value": "grace_value"})
grace_data = grace_data.sort_values(by="date", ignore_index=True)

grace_data = grace_data.set_index("date")



In [142]:
grace_data.tail()

Unnamed: 0_level_0,grace_value
date,Unnamed: 1_level_1
2021-10-01,-12.427783376349586
2021-11-01,-10.98964275281652
2021-12-01,-7.413302826264958
2022-01-01,-5.304010680615704
2022-02-01,-3.205489802880176


In [180]:
target = pd.read_pickle(target_dir)
target = target.rename(columns = {"value": "target"})
target = target.sort_values(by="date", ignore_index=True)

target = target.set_index("date")

In [181]:
target

Unnamed: 0_level_0,target
date,Unnamed: 1_level_1
2002-01-01,31.940
2002-02-01,31.810
2002-03-01,31.740
2002-04-01,31.820
2002-05-01,31.830
...,...
2021-07-01,32.070
2021-08-01,32.115
2021-09-01,32.060
2021-10-01,32.000


In [183]:
dfs = [evaporation_data, lvl1, lvl2, lvl3, lvl4, precip_data, grace_data, target]

In [185]:
data = pd.concat(dfs, axis = 1)

In [189]:
data

Unnamed: 0_level_0,evap_value,evap_mask,lvl1_value,lvl1_mask,lvl2_value,lvl2_mask,lvl3_value,lvl3_mask,lvl4_value,lvl4_mask,precip_value,precip_mask,grace_value,target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2002-01-01,"[-0.00027116016, -0.00023580343, -0.0002240575...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.25207317, 0.25037944, 0.24847782, 0.2469939...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.25416452, 0.25248224, 0.2503441, 0.2483185,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.25356066, 0.25290644, 0.2512585, 0.2492615,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.2390303, 0.24438423, 0.24717659, 0.24784797...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[50.97333, 50.97333, 50.97333, 50.97333, 50.97...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...",,
2002-02-01,"[-0.00087652216, -0.0008312897, -0.0008104057,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.2616769, 0.26138318, 0.2607385, 0.25950253,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.2636571, 0.26317835, 0.26238298, 0.26104212...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.27171874, 0.27298903, 0.27304816, 0.2719476...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.25494355, 0.2600705, 0.2619683, 0.26132745,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[85.69334, 85.69334, 85.69334, 85.69334, 85.69...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...",,
2002-03-01,"[-0.0013092221, -0.0012750574, -0.0012615719, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.23586917, 0.23601604, 0.23496127, 0.2328746...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.23867369, 0.23849058, 0.23768187, 0.2360148...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.25597513, 0.25510728, 0.25522745, 0.2547906...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.26234245, 0.26519203, 0.26599693, 0.2647648...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[10.026667, 10.026667, 10.026667, 10.026667, 1...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...",,
2002-04-01,"[-0.0018067344, -0.0017491414, -0.0017337895, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.17545515, 0.17230803, 0.17280394, 0.1741772...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.16594648, 0.1648364, 0.16472197, 0.16433668...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.18871498, 0.18579292, 0.18596268, 0.1878051...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.2574166, 0.2593869, 0.2600659, 0.25919807, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[21.533333, 21.533333, 21.533333, 21.533333, 2...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...",4.361454842308495,
2002-05-01,"[-0.002586316, -0.0024680307, -0.0023922285, -...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.14691228, 0.14593571, 0.14469975, 0.1412016...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.092069924, 0.084379494, 0.0795005, 0.072298...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.10689336, 0.105703175, 0.1055544, 0.1039179...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.2441786, 0.24538213, 0.24580365, 0.24505216...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[246.52, 246.52, 246.52, 246.52, 246.52, 246.5...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...",2.9927513831220844,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-07-01 00:00:00,,,,,,,,,,,,,,32.070
2021-08-01 00:00:00,,,,,,,,,,,,,,32.115
2021-09-01 00:00:00,,,,,,,,,,,,,,32.060
2021-10-01 00:00:00,,,,,,,,,,,,,,32.000


In [191]:
data[["evap_value", "lvl1_value", "lvl2_value", "lvl3_value", "lvl4_value"]]

Unnamed: 0_level_0,evap_value,lvl1_value
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2002-01-01,"[-0.00027116016, -0.00023580343, -0.0002240575...","[0.25207317, 0.25037944, 0.24847782, 0.2469939..."
2002-02-01,"[-0.00087652216, -0.0008312897, -0.0008104057,...","[0.2616769, 0.26138318, 0.2607385, 0.25950253,..."
2002-03-01,"[-0.0013092221, -0.0012750574, -0.0012615719, ...","[0.23586917, 0.23601604, 0.23496127, 0.2328746..."
2002-04-01,"[-0.0018067344, -0.0017491414, -0.0017337895, ...","[0.17545515, 0.17230803, 0.17280394, 0.1741772..."
2002-05-01,"[-0.002586316, -0.0024680307, -0.0023922285, -...","[0.14691228, 0.14593571, 0.14469975, 0.1412016..."
...,...,...
2021-07-01 00:00:00,,
2021-08-01 00:00:00,,
2021-09-01 00:00:00,,
2021-10-01 00:00:00,,


In [None]:
def same_merge(x): 
    return ';'.join(x[x.notnull()].astype(str))

In [104]:
df_new = df.groupby(level=0, axis=1).apply(lambda x: x.apply(numpy.concatenate(x), axis=1))

KeyError: 'value'

In [None]:
0