In [1]:
# default_exp core

# core

> API details.

In [14]:
#hide
import pandas as pd
from nbdev.showdoc import *
from fastai.data.external import *
from fastcore.all import *
from pathlib import PosixPath
from fastcore.test import *
from fastai.tabular.all import *
import fastai

In [3]:
#hide
def str_to_path(file: str):
    """Converts a string or path-like object to a `Path`, expanding a leading `~`.

    `file` may be a `str` or an `os.PathLike`. A leading `~` / `~user` is replaced
    by the corresponding home directory for both kinds of input (previously only
    `str` inputs were expanded). Returns a `pathlib.Path`.
    """
    # `os.path.expanduser` accepts path-like objects and leaves the value untouched
    # when there is nothing to expand or the home directory cannot be resolved.
    return Path(os.path.expanduser(file))

In [4]:
#hide
# A `str` and a `Path` input must both come back as the same concrete `Path` type.
test_eq_type(Path(""), str_to_path(""))
test_eq_type(Path(""), str_to_path(Path("")))

In [5]:
#export
def read_hdf(file:PosixPath, key: str = "/powerdata", key_metadata=None):
    """Reads a hdf5 table based on the given key.

    `file` is converted with `str_to_path`. `key` is the name of the table to load;
    a leading "/" is added when it is missing. When `key_metadata` is given, every
    column of that table is added to the result, filled with the value of the
    metadata table's first row.

    Returns the loaded `DataFrame`, or an empty `DataFrame` when `key` is not
    present in the store.
    """
    file = str_to_path(file)
    # `store.keys()` reports absolute names ("/group/table"). Checking only for the
    # leading slash also normalises nested keys such as "group/table", which the
    # former `"/" not in key` test left unprefixed and therefore never matched.
    if not key.startswith("/"): key = "/" + key
    with pd.HDFStore(file, "r") as store:
        if key in store.keys():
            df = store[key]
            if key_metadata is not None:
                df_meta = store[key_metadata]
                # Broadcast the first metadata row onto every data row.
                for c in df_meta: df[c] = df_meta[c].values[0]
        else:
            # Best effort: a missing table yields an empty frame instead of an error.
            df = pd.DataFrame()
    return df

In [6]:
#hide
# Round-trip check: write a small frame to HDF5 and read it back through `read_hdf`.
# The key is passed without a leading "/"; `read_hdf` prepends it.
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
                  index=['a', 'b', 'c'])
df.to_hdf('data.h5', key='df', mode='w')
test_eq(df, read_hdf("data.h5", key="df"))

In [7]:
#export
def read_csv(file:PosixPath, sep:str =";"):
    """Loads a delimited text file into a `DataFrame`.

    `file` is converted with `str_to_path`; `sep` is the column delimiter.
    A column named "Unnamed: 0" is removed from the result when it is present.
    """
    csv_location = str(str_to_path(file))
    loaded = pd.read_csv(csv_location, sep=sep)
    # errors="ignore" makes the removal a no-op for files without that column.
    return loaded.drop(columns=["Unnamed: 0"], errors="ignore")

In [8]:
#hide
# Round-trip check for `read_csv`. `to_csv` also writes the index here; presumably
# that is the "Unnamed: 0" column which `read_csv` drops again.
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},)
df.to_csv('data.csv', sep=";")
test_eq(df, read_csv("data.csv", sep=";"))

In [9]:
def read_files(
    files:PosixPath,
    key:str ="/powerdata",
    key_metadata=None,
    sep:str=";"
) -> "L":
    """Reads a number of CSV or HDF5 files depending on file ending.

    `files` is a single path or a collection of paths (`str` or `Path`).
    `key` and `key_metadata` are forwarded to `read_hdf` for ".h5" files;
    `sep` is forwarded to `read_csv` for ".csv" files.

    Returns an `L` holding one `DataFrame` per input file, in input order.
    Raises `ValueError` for any other file ending.
    """
    dfs = L()
    for file in listify(files):
        if isinstance(file, str):
            file = str_to_path(file)

        if file.suffix == ".h5":
            df = read_hdf(file, key, key_metadata=key_metadata)
        elif file.suffix == ".csv":
            # Forward the caller's separator (it used to be hard-coded to ";").
            df = read_csv(file, sep=sep)
        else:
            # Raising a bare string is itself a TypeError; raise a real exception.
            raise ValueError(f"File ending of file {file} not supported.")

        # `append` keeps each frame as a single element of the result.
        dfs.append(df)

    return dfs

In [10]:
#hide
# df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
#                   index=['a', 'b', 'c'])
# df.to_hdf('data.h5', key='df', mode='w')
# test_eq(df, read_files("data.h5", key="df")[0])

# df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},)
# df.to_csv('data.csv', sep=";")
# test_eq(df, read_files("data.csv")[0])

In [11]:
# test_file = "/home/scribbler/data/DAF_ICON_Synthetic_Wind_Power_processed/00011.h5"

In [12]:
class AddSeasonalFeatures(TabularProc):
    "Adds `Month`, `Day` and `Hour` columns taken from the index of `to.items`."
    order=10
    def encodes(self, to):
        frame = to.items
        # assumes the index is datetime-like (exposes .month/.day/.hour) — TODO confirm
        stamps = frame.index
        for column, part in (("Month", "month"), ("Day", "day"), ("Hour", "hour")):
            frame[column] = getattr(stamps, part)

class DropYear(TabularProc):
    "Drops every row whose index lies before January 1st (UTC) of `year`."
    order = 10
    def __init__(self, year=2020):
        # Stored as a UTC timestamp so it can be compared against the index directly.
        self.year = pd.to_datetime(f"{str(year)}-01-01", utc=True)

    def encodes(self, to):
        frame = to.items
        too_early = frame.index < self.year
        frame.drop(frame.index[too_early], inplace=True)
        
class NormalizePerTask(TabularProc):
    "Collects the per-`TaskID` mean and standard deviation of the continuous columns."
    order = 10
    def setups(self, to:Tabular):
        # Statistics come from `to.train` when that attribute exists, else from `to`.
        source = getattr(to, 'train', to)
        # Relies on `to.cont_names` supporting `+` with a plain string
        # (presumably a fastcore `L`) — TODO confirm.
        per_task = source[to.cont_names + "TaskID"].groupby("TaskID")
        self.means = per_task.mean()
        # ddof=0 std; the epsilon keeps constant columns from producing a zero std.
        self.stds = per_task.std(ddof=0) + 1e-7
        # NOTE(review): this class defines no `encodes`, so only the statistics are
        # stored here; confirm whether the normalization itself is applied elsewhere.
        return self(to)

        
class DropCols(TabularProc):
    "Drops the given columns from `to.items`; names that are not present are ignored."
    order = 10
    def __init__(self, cols):
        # Accept a single name as well as a collection of names.
        self.cols = listify(cols)

    def encodes(self, to):
        to.items.drop(columns=self.cols, inplace=True, errors="ignore")
        
class FilterByCol(TabularProc):
    """Removes rows depending on the truthiness of the column `col_name`.

    With `keep=True` the rows whose value is truthy are removed; with `keep=False`
    the rows whose value is falsy are removed. When `drop_col_after_filter` is set,
    the column itself is dropped afterwards.
    """
    order = 10
    def __init__(self, col_name, keep=True, drop_col_after_filter=True):
        self.col_name = col_name
        self.keep = keep
        self.drop_col_after_filter = drop_col_after_filter

    def encodes(self, to):
        frame = to.items
        flagged = frame[self.col_name].astype(bool).values
        to_remove = flagged if self.keep else ~flagged
        # NOTE(review): rows are removed by index label, so duplicated labels would
        # be removed together — confirm the index is unique at this point.
        frame.drop(frame.index[to_remove], inplace=True)
        if self.drop_col_after_filter:
            frame.drop(columns=self.col_name, inplace=True, errors="ignore")

class FilterMonths(TabularProc):
    "Keeps only the rows whose index month is contained in `months`."
    order = 10
    def __init__(self, months=range(1,13)):
        self.months = listify(months)

    def encodes(self, to):
        frame = to.items
        # Rows from any month that was not requested are removed in place.
        unwanted = ~frame.index.month.isin(self.months)
        frame.drop(frame.index[unwanted], inplace=True)

In [35]:
class TabularRenewables(CollBase, GetAttr, FilteredBase):
    """Wraps several per-task dataframes in a single `TabularPandas` (`self.to`).

    Each dataframe in `dfs` is run through the `pre_process` procs, gets an integer
    `TaskID` column (its position in `dfs`), and all frames are then concatenated
    and handed to one `TabularPandas` together with `procs`.

    Continuous/categorical column names are inferred with `cont_cat_split` from the
    first pre-processed dataframe; `TaskID` is removed from the categorical names
    unless `include_task_id` is set. `add_y_to_x` appends the targets to the
    continuous inputs, `add_x_to_y` appends the continuous inputs to the targets.
    `splits`, when given, is called with the row positions of the merged dataframe.

    NOTE(review): the `cat_names` and `cont_names` arguments are accepted but never
    read; the inferred names are always used.
    """
    def __init__(self, dfs, procs=None, cat_names=None, cont_names=None, 
                 y_names=None, add_y_to_x=False, add_x_to_y=False, pre_process=None, 
                 include_task_id=False, splits=None):
        self.task_id_col = "TaskID"
        # Copy: `listify` may hand back the caller's own list, and `y_names` is
        # extended in place below when `add_x_to_y` is set.
        self.y_names = list(listify(y_names))
        # `procs` is not stored here; it is exposed read-only through `self.to.procs`.
        self.pre_process = pre_process

        self.dfs = L()
        for task_id,df in enumerate(dfs):
            # Run the per-file pre-processing and keep only the resulting dataframe.
            df = TabularPandas(df, y_names=self.y_names, procs=pre_process, 
                                      do_setup=True, reduce_memory=False).items
            df[self.task_id_col] = task_id

            self.dfs.append(df)

        self.cont_names, self.cat_names = cont_cat_split(self.dfs[0], dep_var=y_names, max_card=1000)
        if not include_task_id: self.cat_names = [c for c in self.cat_names if c!= self.task_id_col]

        if add_y_to_x:
            self.cont_names += self.y_names
        if add_x_to_y:
            self.y_names += self.cont_names

        merged_df = pd.concat(self.dfs, axis=0)
        if splits is not None: splits = splits(range_of(merged_df))

        self.to = TabularPandas(
            merged_df,
            procs=procs,
            cat_names=self.cat_names,
            cont_names=self.cont_names,
            y_names=self.y_names,
            splits=splits,
            do_setup=True,
            inplace=True,
            y_block=RegressionBlock(),
        )
        # Hands the wrapped `TabularPandas` to `CollBase` (presumably stored as `self.items`).
        super().__init__(self.to)

    def new(self, df):
        return type(self.to)(df, do_setup=False, reduce_memory=False, y_block=TransformBlock(),
                          **attrdict(self, 'procs','cat_names','cont_names','y_names', 'device'))

    def subset(self, i): return self.to.new(self.items[slice(0,self.to.split) if i==0 else slice(self.to.split,len(self.to))])
    def copy(self): self.items = self.to.copy(); return self
    def decode(self): return self.to.procs.decode(self.to)
    def decode_row(self, row): return self.to.new(pd.DataFrame(row).T).decode().items.iloc[0]
    def show(self, max_n=10, **kwargs): self.to.show(max_n, **kwargs)
    def setup(self): self.to.procs.setup(self)
    def process(self): self.to.procs(self)
    def loc(self): return self.items.loc
    # `_TabIloc` is a private name of `fastai.tabular.core`; a star import does not
    # bring it into scope, so it is referenced through the module (the bare name
    # raised `NameError` on access).
    def iloc(self): return fastai.tabular.core._TabIloc(self)
    def targ(self): return self.to.items[self.y_names]
    def x_names (self): return self.to.cat_names + self.to.cont_names
    def n_subsets(self): return 2
    def y(self): return self.to[self.to.y_names[0]]
    def new_empty(self): return self.new(pd.DataFrame({}, columns=self.to.items.columns))
    def to_device(self, d=None):
        self.device = d
        return self

    def procs(self):
        return self.to.procs

    def all_col_names (self):
        # Targets are only included when every target column exists in the data.
        ys = [n for n in self.to.y_names if n in self.to.items.columns]
        return self.to.x_names + self.to.y_names if len(ys) == len(self.to.y_names) else self.to.x_names
    
# Turn the zero-argument accessors defined on the class into properties
# (presumably read-only, via fastcore's `properties` helper).
properties(TabularRenewables,'loc','iloc','targ','all_col_names','n_subsets','x_names','y', "procs")

# Sketch of the helper called below, kept for reference only — presumably a copy of
# fastai's private `_add_prop`; verify against the installed fastai version.
# def _add_prop(cls, nm):
#     @property
#     def f(o): return o[list(getattr(o,nm+'_names'))]
#     @f.setter
#     def fset(o, v): o[getattr(o,nm+'_names')] = v
#     setattr(cls, nm+'s', f)
#     setattr(cls, nm+'s', fset)

# Register the column-group accessors through fastai's private helper; judging by
# the sketch above each call adds a `<name>s` property backed by `<name>_names`.
fastai.tabular.core._add_prop(TabularRenewables, 'cat')
fastai.tabular.core._add_prop(TabularRenewables, 'cont')
fastai.tabular.core._add_prop(TabularRenewables, 'y')
fastai.tabular.core._add_prop(TabularRenewables, 'x')
fastai.tabular.core._add_prop(TabularRenewables, 'all_col')

In [36]:
# List the HDF5 input files through the shell.
# NOTE(review): absolute, machine-specific path — this cell only runs where that
# directory exists.
files = !ls /home/scribbler/data/DAF_ICON_Synthetic_Wind_Power_processed/*.h5
files[0:2]

['/home/scribbler/data/DAF_ICON_Synthetic_Wind_Power_processed/00011.h5',
 '/home/scribbler/data/DAF_ICON_Synthetic_Wind_Power_processed/00090.h5']

In [37]:
# len(files)

In [38]:
# Three quarters of the available files ...
n_files = int(len(files)-len(files)/4)
# ... immediately overridden by a fixed count of 5, which is the value actually used.
n_files = 5

In [39]:
# Load the first `n_files` files; for HDF5 files the "metadata" table of each file is
# broadcast onto its rows by `read_hdf`.
dfs = read_files(files[0:n_files], key_metadata="metadata")
# dfs[0].head(2)

In [40]:
# Columns removed from every per-file dataframe before the tasks are merged.
cols_to_drop = L("long", "lat", "loc_id", "target_file_name", "input_file_name", "num_train_samples", "num_test_samples")
# `FilterByCol("TestFlag")` uses the default `keep=True`, i.e. it removes the rows
# whose `TestFlag` is truthy and then drops the `TestFlag` column.
to = TabularRenewables(dfs, y_names="PowerGeneration", 
                    pre_process=[DropCols(cols_to_drop), FilterByCol("TestFlag"), AddSeasonalFeatures],
#                     TODO: Normalize per task, add task embedding and implement normalization through task id
                    procs=[NormalizePerTask, Categorify], 
                    add_x_to_y=False, 
                    include_task_id=False
#                     splits=RandomSplitter(valid_pct=0.2)
                    )

In [43]:
# Grab the last proc of the pipeline; the following cells read its `means`/`stds`,
# so it is expected to be the `NormalizePerTask` instance.
a = to.procs[-1]

In [44]:
# Per-`TaskID` means collected by `NormalizePerTask.setups` (one row per task).
a.means

Unnamed: 0_level_0,T_HAG_2_M,RELHUM_HAG_2_M,PS_SFC_0_M,ASWDIFDS_SFC_0_M,ASWDIRS_SFC_0_M,WindSpeed58m,SinWindDirection58m,CosWindDirection58m,WindSpeed60m,SinWindDirection60m,...,SinWindDirection60mMinus_t_1,CosWindDirection60mMinus_t_1,WindSpeed58mPlus_t_1,SinWindDirection58mPlus_t_1,CosWindDirection58mPlus_t_1,WindSpeed60mPlus_t_1,SinWindDirection60mPlus_t_1,CosWindDirection60mPlus_t_1,rotor_diameter_m,nominal_power_kW
TaskID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,282.318512,76.775322,93288.75,66.925941,102.74099,3.736435,0.051207,0.136615,2.283678,0.076204,...,0.075849,0.200523,3.736496,0.050579,0.13536,2.282969,0.075791,0.199005,82.0,2350.0
1,283.811035,75.026268,98009.882812,61.249619,90.366798,5.071127,0.154919,0.178581,2.732925,0.229174,...,0.229078,0.133823,5.069435,0.155245,0.177251,2.731475,0.229282,0.132734,71.0,2300.0
2,284.938232,73.34951,100246.28125,62.667137,86.571457,3.908382,0.104777,0.094084,2.097298,0.018132,...,0.018527,0.074283,3.910568,0.102588,0.094172,2.097949,0.016884,0.077436,115.699997,3000.0
3,284.547668,74.820831,100950.335938,61.050297,88.107536,6.381981,0.100356,0.224855,3.505336,0.11165,...,0.111939,0.205856,6.37516,0.099794,0.224808,3.504071,0.110773,0.205849,101.0,3500.0
4,283.995758,81.607132,101335.679688,55.902283,87.985382,8.99711,0.10575,0.215004,6.766679,0.10645,...,0.107279,0.194779,8.995748,0.105085,0.215217,6.766384,0.105803,0.195737,101.0,3050.0


In [45]:
# Iterate over the task ids found in the statistics table.
# NOTE: `task_id` keeps its last value after the loop and is reused by later cells.
for task_id in a.means.index:
    print(task_id)
#     t = to.items[to.cont_names + ["TaskID"]]
#     t[t["TaskID"] == task_id] = t[t["TaskID"] == task_id] - a.means[task_id]

0
1
2
3
4


In [49]:
# Manually standardise the continuous columns of one task (the `task_id` left over
# from the loop above) with the stored statistics and summarise the result.
((to.conts[to.loc[:,"TaskID"] == task_id] - a.means.loc[task_id]) / a.stds.loc[task_id]).describe()

Unnamed: 0,T_HAG_2_M,RELHUM_HAG_2_M,PS_SFC_0_M,ASWDIFDS_SFC_0_M,ASWDIRS_SFC_0_M,WindSpeed58m,SinWindDirection58m,CosWindDirection58m,WindSpeed60m,SinWindDirection60m,...,SinWindDirection60mMinus_t_1,CosWindDirection60mMinus_t_1,WindSpeed58mPlus_t_1,SinWindDirection58mPlus_t_1,CosWindDirection58mPlus_t_1,WindSpeed60mPlus_t_1,SinWindDirection60mPlus_t_1,CosWindDirection60mPlus_t_1,rotor_diameter_m,nominal_power_kW
count,7969.0,7969.0,7969.0,7969.0,7969.0,7969.0,7969.0,7969.0,7969.0,7969.0,...,7969.0,7969.0,7969.0,7969.0,7969.0,7969.0,7969.0,7969.0,7969.0,7969.0
mean,-4.791169e-07,3.120793e-07,3e-06,4.224055e-08,3.47942e-08,9.198302e-08,-1.474116e-08,2.306134e-09,-3.608486e-08,5.582229e-09,...,1.128412e-08,1.198441e-08,1.853564e-08,-5.953723e-09,1.737294e-09,-4.646286e-08,-3.167161e-10,1.196399e-09,0.0,0.0
std,1.000063,1.000063,1.000063,1.000063,1.000063,1.000063,1.000063,1.000063,1.000063,1.000063,...,1.000063,1.000063,1.000063,1.000063,1.000063,1.000063,1.000063,1.000063,0.0,0.0
min,-2.577972,-4.808224,-3.108257,-1.760926,-1.085138,-2.212427,-1.86942,-1.57816,-2.090351,-1.856201,...,-1.858056,-1.54838,-2.208772,-1.86687,-1.579082,-2.087141,-1.854429,-1.550272,0.0,0.0
25%,-0.8451059,-0.6283629,-0.563055,-0.8910852,-0.9001289,-0.7114666,-0.7823349,-1.152534,-0.7430523,-0.7895472,...,-0.790995,-1.15204,-0.7109722,-0.7801612,-1.150823,-0.7428912,-0.7881695,-1.150773,0.0,0.0
50%,-0.1129582,0.1173974,-0.016025,0.02686629,-0.3718356,-0.07144273,0.05450924,0.4348159,-0.08336651,0.06695906,...,0.06657551,0.4051224,-0.06837742,0.05322807,0.4363664,-0.08717584,0.06801957,0.4035129,0.0,0.0
75%,0.906792,0.7692273,0.633815,0.7425537,0.8463575,0.6135024,0.8575842,0.9312022,0.6552916,0.8580122,...,0.8588368,0.9488213,0.615834,0.858963,0.9313463,0.6572144,0.86235,0.9484416,0.0,0.0
max,2.986438,2.047069,3.145025,3.050903,2.71136,4.119863,1.51185,1.019625,3.917355,1.499037,...,1.498019,1.04353,4.114028,1.511821,1.019767,3.91161,1.499567,1.042727,0.0,0.0


In [None]:
# `to.loc[]` was a syntax error; show the task-id column instead, using the same
# selection that the normalization check above relies on.
to.loc[:, "TaskID"].head()

In [None]:
# Centre the continuous columns of one task. The former version used an undefined
# `t` and indexed `a.means` by column; the per-task statistics are rows, hence `.loc`.
(to.conts[to.loc[:, "TaskID"] == task_id] - a.means.loc[task_id]).head()

In [None]:
# Display the object wrapped by `to` (the `TabularPandas` handed to `CollBase`).
to.items

In [None]:
# to.cont_names[20:]

In [None]:
# to.cat_names

In [None]:
# dls = to.dataloaders(bs=1024)
# learn = tabular_learner(dls, metrics=rmse)

In [None]:
# learn.fit_one_cycle(5)

In [None]:
# learn.fit_one_cycle(5)

In [None]:
# learn.fit_one_cycle(5)

In [None]:
# learn.model

In [None]:
# learn.model

In [None]:
# errors = L()
# for f in files[n_files:]:
#     df_test = read_files(f, key_metadata="metadata")
#     to_test = tp_from_dtypes(df_test, y_names="PowerGeneration", 
#                     pre_process=[DropCols(cols_to_drop), FilterByCol("TestFlag", keep=False), AddSeasonalFeatures],
# #                     TODO: Normalize per task, add task embedding and implement normalization through task id
#                     procs=[], 
#                     add_x_to_y=False, ignore_cols="TestFlag")
#     to_test_2 = to.new(to_test.items)
#     dl_test = learn.dls.test_dl(to_test.items, bs=64)
#     targ, preds = learn.get_preds(dl=dl_test)
#     e = (((targ-preds)**2).mean()**0.5)
#     errors += e

In [None]:
# import seaborn as sns
# sns.boxplot(errors)

In [None]:
# sns.distplot(errors)