In [None]:
# default_exp core

# core

> Core utilities: readers for HDF5/CSV data files and tabular preprocessing procs.

In [None]:
#hide
#export
import pandas as pd
from nbdev.showdoc import *
from fastai.data.external import *
from fastcore.all import *
from pathlib import PosixPath
from fastcore.test import *
from fastai.tabular.all import *
import fastai

In [None]:
#hide
#export
def str_to_path(file: str):
    "Converts `file` to a `Path`; a string containing `~` is user-expanded first."
    # Only plain strings are expanded; `Path` inputs are passed through as they are.
    has_tilde = isinstance(file, str) and "~" in file
    expanded = os.path.expanduser(file) if has_tilde else file
    return Path(expanded)

In [None]:
#hide
# `str_to_path` must return the same value and type as `Path("")`
# for both a plain string and an already constructed `Path`.
test_eq_type(Path(""), str_to_path(""))
test_eq_type(Path(""), str_to_path(Path("")))

In [None]:
#export
def read_hdf(file:PosixPath, key: str = "/powerdata", key_metadata=None):
    """Reads a hdf5 table based on the given key.

    Args:
        file: Path (or string, `~` is expanded) of the HDF5 file.
        key: Key of the table to read. A missing leading `/` is added.
        key_metadata: Optional key of a second table. For every column of that
            table, the value of its first row is added to the result as a
            constant column of the same name.

    Returns:
        The stored table, or an empty `DataFrame` when `key` is not in the file.
    """
    file = str_to_path(file)
    # `HDFStore.keys()` lists keys with a leading slash, so the membership test
    # below needs one too. Checking only the first character also normalises
    # nested keys such as "group/table", which contain a slash but do not start
    # with one.
    if not key.startswith("/"): key = "/" + key
    with pd.HDFStore(file, "r") as store:
        if key not in store.keys():
            return pd.DataFrame()
        df = store[key]
        if key_metadata is not None:
            df_meta = store[key_metadata]
            # Broadcast the first metadata row over all rows of the table.
            for c in df_meta: df[c] = df_meta[c].values[0]
    return df

In [None]:
#hide
# Round trip: write a small frame to `data.h5` under the key `df` and read it back.
# The key is passed without a leading slash on purpose.
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
                  index=['a', 'b', 'c'])
df.to_hdf('data.h5', key='df', mode='w')
test_eq(df, read_hdf("data.h5", key="df"))

In [None]:
#export
def read_csv(file:PosixPath, sep:str =";"):
    "Loads a delimited text file into a DataFrame and discards an `Unnamed: 0` column if present."
    csv_path = str(str_to_path(file))
    frame = pd.read_csv(csv_path, sep=sep)
    # `errors="ignore"` turns the drop into a no-op when the column does not exist.
    return frame.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
#hide
# Round trip through `data.csv` with `;` as separator. `to_csv` is called with its
# default arguments here; `read_csv` is expected to return a frame equal to `df`.
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},)
df.to_csv('data.csv', sep=";")
test_eq(df, read_csv("data.csv", sep=";"))

In [None]:
#export
def read_files(
    files:PosixPath,
    key:str ="/powerdata",
    key_metadata=None,
    sep:str=";",
    add_task_id=True
) -> L:
    """Reads a number of CSV or HDF5 files depending on file ending.

    Args:
        files: A single path/string or a collection of them.
        key: Table key, forwarded to `read_hdf` for `.h5` files.
        key_metadata: Metadata key, forwarded to `read_hdf` for `.h5` files.
        sep: Column separator, forwarded to `read_csv` for `.csv` files.
        add_task_id: If true, each frame gets a `TaskID` column holding the
            position of its file in `files`.

    Returns:
        An `L` with one `DataFrame` per file, in the order of `files`.

    Raises:
        ValueError: If a file has a suffix other than `.h5` or `.csv`.
    """
    files = listify(files)
    dfs=L()
    for task_id,file in enumerate(files):
        # `str_to_path` accepts strings as well as paths, so convert unconditionally.
        file = str_to_path(file)
        suffix = file.suffix

        if suffix == ".h5":
            df = read_hdf(file, key, key_metadata=key_metadata)
        elif suffix == ".csv":
            # Forward the caller's separator instead of a hard-coded ";".
            df = read_csv(file, sep=sep)
        else:
            # Raising a plain string is itself a TypeError in Python 3.
            raise ValueError(f"File ending of file {file} not supported.")
        if add_task_id:df["TaskID"]=task_id
        dfs += df

    return dfs

In [None]:
# hide
# `read_files` dispatches on the file suffix; check the HDF5 branch first ...
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
                  index=['a', 'b', 'c'])
df.to_hdf('data.h5', key='df', mode='w')
test_eq(df, read_files("data.h5", key="df", add_task_id=False)[0])

# ... and then the CSV branch. `add_task_id=False` keeps the frames comparable to `df`.
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},)
df.to_csv('data.csv', sep=";")
test_eq(df, read_files("data.csv", add_task_id=False)[0])

In [None]:
# test_file = "/home/scribbler/data/DAF_ICON_Synthetic_Wind_Power_processed/00011.h5"

In [None]:
#export
class AddSeasonalFeatures(TabularProc):
    "Adds `Month`, `Day` and `Hour` columns derived from the index of `to.items`."
    order=0
    def encodes(self, to):
        # NOTE(review): relies on the index exposing `.month`/`.day`/`.hour`,
        # i.e. presumably a DatetimeIndex - confirm against the callers.
        idx = to.items.index
        for col, attr in (("Month", "month"), ("Day", "day"), ("Hour", "hour")):
            to.items[col] = getattr(idx, attr)

class DropYear(TabularProc):
    "Drops all rows whose index timestamp lies in `year`."
    order = 10
    def __init__(self, year=2020):
        year = str(year)
        # Kept as a UTC timestamp of January 1st; only its `.year` is used below.
        self.year = pd.to_datetime(f"{year}-01-01", utc=True)

    def encodes(self, to):
        # Compare integer years with the integer year of the stored timestamp.
        # Comparing against the timestamp itself never matches, and with `!=`
        # that selected every row for removal.
        mask = to.items.index.year == self.year.year
        to.items.drop(to.items[mask].index, inplace=True)
        
class NormalizePerTask(TabularProc):
    """Normalize the continuous columns separately for every task id.

    `means` and `stds` are data frames indexed by task id with one column per
    continuous variable. They are computed in `setups` and extended in
    `encodes` when task ids show up that have not been seen before.
    """
    order = 1
    def __init__(self, task_id_col="TaskID"):
        self.task_id_col = task_id_col

    def _task_stats(self, to):
        "Per-task mean and (population) std + 1e-7 of the continuous columns of `to.train`, or of `to` itself without `train`."
        cols = list(to.cont_names) + [self.task_id_col]
        grouped = getattr(to, 'train', to)[cols].groupby(self.task_id_col)
        # The small constant avoids a division by zero for constant columns.
        return grouped.mean(), grouped.std(ddof=0)+1e-7

    def setups(self, to:Tabular):
        self.means, self.stds = self._task_stats(to)

        return self(to)

    def encodes(self, to):
        task_ids = to.items[self.task_id_col].unique()
        unseen = [task_id for task_id in task_ids if task_id not in self.means.index]
        if unseen:
            # Only add statistics of the new tasks. Appending the statistics of
            # all tasks would duplicate index entries, and `.loc[task_id]` would
            # then return several rows instead of one.
            means, stds = self._task_stats(to)
            self.means = pd.concat([self.means, means[means.index.isin(unseen)]])
            self.stds = pd.concat([self.stds, stds[stds.index.isin(unseen)]])

        for task_id in task_ids:
            mask = to.loc[:,self.task_id_col] == task_id

            to.loc[mask, to.cont_names] = ((to.conts[mask] - self.means.loc[task_id]) / self.stds.loc[task_id])

        
class DropCols(TabularProc):
    "Removes the given columns from `to.items`; names that do not exist are skipped."
    order = 10
    def __init__(self, cols):
        # Accept a single name as well as a collection of names.
        self.cols = listify(cols)

    def encodes(self, to):
        to.items.drop(columns=self.cols, inplace=True, errors="ignore")
        
class FilterByCol(TabularProc):
    """Drops the rows whose `col_name` value is truthy (falsy when `keep=False`).

    With `drop_col_after_filter` the column itself is removed afterwards.
    """
    order = 0
    def __init__(self, col_name, keep=True, drop_col_after_filter=True):
        self.col_name = col_name
        self.keep = keep
        self.drop_col_after_filter=drop_col_after_filter

    def encodes(self, to):
        items = to.items
        flagged = items[self.col_name].astype(bool).values
        selected = flagged if self.keep else ~flagged
        # Rows are removed by index label, so rows sharing a label with a
        # selected row are removed as well.
        items.drop(items[selected].index, inplace=True)
        if self.drop_col_after_filter:
            items.drop(self.col_name, axis=1, inplace=True, errors="ignore")

class FilterMonths(TabularProc):
    "Keeps only the rows whose index month is contained in `months`."
    order = 10
    def __init__(self, months=range(1,13)):
        self.months = listify(months)

    def encodes(self, to):
        items = to.items
        # Select everything outside the requested months and remove it in place.
        outside = ~items.index.month.isin(self.months)
        items.drop(items[outside].index, inplace=True)

In [None]:
# export
class TabularRenewables(TabularPandas):
    """A `TabularPandas` with an optional, separate pre-processing stage.

    When `do_setup` is true and `pre_process` is given, `dfs` is first run
    through a plain `TabularPandas` using `pre_process` as procs (kept as
    `self.prepared_to`). The resulting frame is then used to build this object
    with `procs`. Otherwise `dfs` is used directly.

    `splits` may be either a callable (e.g. a splitter) or already computed
    index lists. A callable is evaluated on the index range of the frame that
    is handed to `TabularPandas`, i.e. after pre-processing.

    `add_y_to_x`, `add_x_to_y` and `device` are accepted but not used by this
    constructor.
    """
    def __init__(self, dfs, procs=None, cat_names=None, cont_names=None, do_setup=True, reduce_memory=True,
                 y_names=None, add_y_to_x=False, add_x_to_y=False, pre_process=None, device=None, splits=None, y_block=RegressionBlock()):
        self.pre_process = pre_process
        if do_setup and pre_process is not None:
            self.prepared_to = TabularPandas(dfs, y_names=y_names, procs=pre_process, cont_names=cont_names,
                                          do_setup=True, reduce_memory=False)
            prepared_df = self.prepared_to.items
        else:
            prepared_df = dfs

        # Resolve a splitter in both branches and leave pre-computed splits alone.
        # Pre-processing may drop rows, so the splitter has to see the prepared frame.
        if callable(splits): splits = splits(range_of(prepared_df))

        super().__init__(prepared_df,
            procs=procs,
            cat_names=cat_names,
            cont_names=cont_names,
            y_names=y_names,
            splits=splits,
            do_setup=do_setup,
            inplace=True,
            y_block=y_block,
            reduce_memory=reduce_memory)
        # TODO add custom pre_process, e.g., for different test data with a different year         
#         def new(self, df):
#             return type(self)(df, do_setup=False, reduce_memory=False, y_block=TransformBlock(),
#                               **attrdict(self, 'procs','cat_names','cont_names','y_names', 'device'))


In [None]:
# # export
# def preproces_and_merge_dfs(dfs, task_id_col, y_names, pre_process, offset=0):
#     new_dfs = L()
#     for task_id,df in enumerate(dfs):
#             df = TabularPandas(df, y_names=y_names, procs=pre_process, 
#                                       do_setup=True, reduce_memory=False).items
            
#             df[task_id_col] = task_id+offset

#             new_dfs += df
        
#     merged_df = pd.concat(new_dfs, axis=0)
    
#     return merged_df
        
# class TabularRenewables(CollBase, GetAttr, FilteredBase):
#     def __init__(self, dfs, procs=None, cat_names=None, cont_names=None, 
#                  y_names=None, add_y_to_x=False, add_x_to_y=False, pre_process=None, 
#                  include_task_id=False, splits=None, device=None, do_setup=True):
#         self.task_id_col = "TaskID"
#         self.y_names = listify(y_names)
#         self.pre_process = pre_process
        
#         merged_df = preproces_and_merge_dfs(dfs, self.task_id_col, self.y_names, self.pre_process, offset=0)
        
#         self.cont_names, self.cat_names = cont_cat_split(merged_df, dep_var=y_names, max_card=1000)
#         if not include_task_id: self.cat_names = [c for c in self.cat_names if c!= self.task_id_col]
        
#         if add_y_to_x:
#             self.cont_names += self.y_names
#         if add_x_to_y:
#             self.y_names += self.cont_names

        
#         if splits is not None: splits = splits(range_of(merged_df))
#         self.split = len(df) if splits is None else len(splits[0])
#         self.to_device(device)
#         self.to = TabularPandas(
#             merged_df,
#             procs=procs,
#             cat_names=self.cat_names,
#             cont_names=self.cont_names,
#             y_names=self.y_names,
#             splits=splits,
#             do_setup=do_setup,
#             inplace=True,
#             y_block=RegressionBlock(),
#         )
#         super().__init__(self.to.items)
#     def new(self, df):
#         return type(self.to)(df, do_setup=False, reduce_memory=False, y_block=TransformBlock(),
#                           **attrdict(self, 'procs','cat_names','cont_names','y_names', 'device'))
        
# #     def new(self, df):
# #         # TODO: correct to TabularRenewables
# #         return type(self)(df, do_setup=False,**attrdict(self, 'procs','cat_names','cont_names','y_names', \
# #                                          'add_y_to_x', 'add_x_to_y','pre_process', 'include_task_id', 'device', ))
# #         self.c = copy(self)
# #         merged_dfs = preproces_and_merge_dfs(df, self.task_id_col, 
# #                                              self.y_names, self.pre_process, offset=self.items[self.task_id_col].max()+1)
# #         to_new = self.to.new(merged_dfs, )
# #         return to_new

# #     def subset(self, i): return self.to.new(self.items[slice(0,self.to.split) if i==0 else slice(self.to.split,len(self.to))])
#     def subset(self, i): return self.new(self.to.items[slice(0,self.split) if i==0 else slice(self.split,len(self))])
#     def copy(self): self.items = self.to.copy(); return self
#     def decode(self): return self.to.procs.decode(self.to)
#     def decode_row(self, row): return self.to.new(pd.DataFrame(row).T).decode().items.iloc[0]
#     def show(self, max_n=10, **kwargs): display_df(self.to.new(self.all_cols[:max_n]).decode().items)
#     #   TODO: fix self.to.new to self.new
# #     def show(self, max_n=10, **kwargs): display_df(self.to.new(self.all_cols[:max_n]).decode().items)
#     def setup(self): self.to.procs.setup(self.to)
#     def process(self): self.to.procs(self.to)
#     def loc(self): return self.items.loc
#     def iloc(self): return _TabIloc(self)
#     def targ(self): return self.to.items[self.y_names]
#     def x_names (self): return self.to.cat_names + self.to.cont_names
#     def n_subsets(self): return 2
#     def y(self): return self.to[self.to.y_names[0]]
#     def new_empty(self): return self.new(pd.DataFrame({}, columns=self.to.items.columns))
#     def to_device(self, d=None):
#         self.device = d
#         return self
    
#     def procs(self):
#         return self.to.procs

#     def all_col_names (self):
#         ys = [n for n in self.to.y_names if n in self.to.items.columns]
#         return self.to.x_names + self.to.y_names if len(ys) == len(self.to.y_names) else self.to.x_names
    
# properties(TabularRenewables,'loc','iloc','targ','all_col_names','n_subsets','x_names','y', "procs")

# fastai.tabular.core._add_prop(TabularRenewables, 'cat')
# fastai.tabular.core._add_prop(TabularRenewables, 'cont')
# fastai.tabular.core._add_prop(TabularRenewables, 'y')
# fastai.tabular.core._add_prop(TabularRenewables, 'x')
# fastai.tabular.core._add_prop(TabularRenewables, 'all_col')

In [None]:
# import sys, glob
# pd.options.mode.chained_assignment=None
# # sys.path.append("../")
# # from fastai.tabular.all import *
# # from fastrenewables.core import *


# files = glob.glob("../data/*.h5")
# len(files), files[0:2]
# n_files = 2
# dfs = read_files(files[0:n_files], key_metadata="metadata")
# cols_to_drop = L(
#     "long",
#     "lat",
#     "loc_id",
#     "target_file_name",
#     "input_file_name",
#     "num_train_samples",
#     "num_test_samples",
# )
# to = TabularRenewables(
#     dfs,
#     y_names="PowerGeneration",
#     pre_process=[DropCols(cols_to_drop), FilterByCol("TestFlag"), AddSeasonalFeatures],
#     procs=,
#     add_x_to_y=False,
#     include_task_id=False,
#     splits=RandomSplitter(valid_pct=0.2)
# )


In [None]:
# def preproces_and_merge_dfs(dfs, task_id_col, y_names, pre_process, offset=0):
#     new_dfs = L()
#     for task_id,df in enumerate(dfs):
# #             df = TabularPandas(df, y_names=y_names, procs=pre_process, 
# #                                       do_setup=True, reduce_memory=False).items
            
#             df[task_id_col] = task_id+offset

#             new_dfs += df
        
#     merged_df = pd.concat(new_dfs, axis=0)
    
#     return merged_df


In [None]:
# files = glob.glob("../data/*.h5")
# n_files = 2


# cont_names = ['T_HAG_2_M', 'RELHUM_HAG_2_M', 'PS_SFC_0_M', 'ASWDIFDS_SFC_0_M',
#        'ASWDIRS_SFC_0_M', 'WindSpeed58m',
#        'SinWindDirection58m', 'CosWindDirection58m', 'WindSpeed60m',
#        'SinWindDirection60m', 'CosWindDirection60m', 'WindSpeed58mMinus_t_1',
#        'SinWindDirection58mMinus_t_1', 'CosWindDirection58mMinus_t_1',
#        'WindSpeed60mMinus_t_1', 'SinWindDirection60mMinus_t_1',
#        'CosWindDirection60mMinus_t_1', 'WindSpeed58mPlus_t_1',
#        'SinWindDirection58mPlus_t_1', 'CosWindDirection58mPlus_t_1',
#        'WindSpeed60mPlus_t_1', 'SinWindDirection60mPlus_t_1',
#        'CosWindDirection60mPlus_t_1', ]

# cat_names = ["Month", "Day", "Hour", "rotor_diameter_m", "hub_height_m"]
# y_names = 'PowerGeneration'
# pd.options.mode.chained_assignment=None
# pre_process=[AddSeasonalFeatures, 
#              FilterByCol("TestFlag", drop_col_after_filter=False), 
# #              DropYear(year=2020),
# #              FilterMonths([1,2,3,4]), 
#              NormalizePerTask]

# procs = [NormalizePerTask, Categorify]
# procs = [Categorify]

# dfs = read_files(files[0:n_files], key_metadata="metadata")

# to = TabularRenewables(pd.concat(dfs, axis=0), 
#                   cont_names = cont_names, 
#                   cat_names=cat_names, 
#                   y_names=y_names, 
#                   pre_process=pre_process, 
#                     procs=procs,
#                 splits=RandomSplitter(valid_pct=0.2))
# print(to.items.index)
# dls = to.dataloaders(bs=256)
# dls.show_batch()
# learner = tabular_learner(dls, metrics=rmse)
# print(learner.dls.train_ds.cat_names)
# print(learner.model)
# learner.fit_one_cycle(10)

(15826, 40)
DatetimeIndex(['2019-10-27 15:00:00+00:00', '2019-02-03 19:00:00+00:00',
               '2019-04-03 14:00:00+00:00', '2019-02-08 02:00:00+00:00',
               '2019-02-27 20:00:00+00:00', '2019-10-20 11:00:00+00:00',
               '2019-07-27 00:00:00+00:00', '2019-07-19 08:00:00+00:00',
               '2019-05-11 01:00:00+00:00', '2019-09-24 11:00:00+00:00',
               ...
               '2019-10-24 15:00:00+00:00', '2019-05-04 02:00:00+00:00',
               '2019-03-05 10:00:00+00:00', '2019-11-27 22:00:00+00:00',
               '2019-02-28 17:00:00+00:00', '2019-05-05 17:00:00+00:00',
               '2019-02-28 09:00:00+00:00', '2019-07-09 21:00:00+00:00',
               '2019-05-05 14:00:00+00:00', '2019-07-27 06:00:00+00:00'],
              dtype='datetime64[ns, UTC]', name='TimeUTC', length=15826, freq=None)


Unnamed: 0,Month,Day,Hour,rotor_diameter_m,hub_height_m,T_HAG_2_M,RELHUM_HAG_2_M,PS_SFC_0_M,ASWDIFDS_SFC_0_M,ASWDIRS_SFC_0_M,WindSpeed58m,SinWindDirection58m,CosWindDirection58m,WindSpeed60m,SinWindDirection60m,CosWindDirection60m,WindSpeed58mMinus_t_1,SinWindDirection58mMinus_t_1,CosWindDirection58mMinus_t_1,WindSpeed60mMinus_t_1,SinWindDirection60mMinus_t_1,CosWindDirection60mMinus_t_1,WindSpeed58mPlus_t_1,SinWindDirection58mPlus_t_1,CosWindDirection58mPlus_t_1,WindSpeed60mPlus_t_1,SinWindDirection60mPlus_t_1,CosWindDirection60mPlus_t_1,PowerGeneration
0,5,2,10,115.7,92,0.468416,-1.194797,-0.502458,0.502105,0.797864,-0.231152,-0.779812,1.156806,0.227371,-0.671907,1.17552,-0.194975,-0.870952,1.108169,0.314154,-0.759932,1.125164,-0.402073,-0.959248,1.044332,-3e-06,-0.840573,1.062503,0.004
1,10,28,0,71.0,114,-0.704064,0.659828,0.711843,-1.635641,-1.118504,-0.75594,-0.819562,1.046224,-1.068783,0.193894,1.128889,-0.89192,-0.871803,1.025146,-1.118487,0.454499,1.011089,-0.531675,-1.075785,0.918494,-0.939393,-0.584809,1.207543,0.003
2,7,27,4,71.0,114,1.008298,-0.477016,-1.083498,-0.254646,1.551122,-0.158465,-0.606647,-1.616294,-0.895593,-0.524751,-1.591566,-0.235503,-0.638582,-1.609063,-0.808917,-0.759177,-1.548602,0.009939,-0.702879,-1.587947,-0.69636,-0.907832,-1.501788,0.0
3,4,17,8,71.0,114,0.048179,-1.05482,0.783855,-0.343809,0.93647,-0.508757,-0.210341,-1.659963,-0.056437,-0.341358,-1.600702,-0.235833,0.069045,-1.631911,0.267286,-0.077069,-1.57721,-0.947675,-0.348903,-1.65373,-0.875131,-0.571864,-1.584077,0.003
4,2,8,11,71.0,114,-0.698318,0.126111,0.007155,-1.247913,-1.015346,2.743097,0.974236,0.591287,2.84102,0.879641,0.649184,2.578131,0.973026,0.591782,2.686041,0.868003,0.664287,2.272893,1.063665,0.46668,2.296342,0.980645,0.506622,0.397
5,2,19,8,115.7,92,-1.216702,1.246966,0.389476,-1.205278,-0.406978,-0.984563,0.992411,0.716313,-1.03411,1.068016,-1.004337,-0.941445,1.261816,-0.180552,-0.792406,-0.451879,-1.491956,-1.11432,0.944928,0.782557,-0.850432,-1.33529,-0.575878,0.0
6,7,5,9,71.0,114,1.298436,-1.770229,0.350207,0.426632,1.640235,-0.689306,-0.42734,1.145344,-0.220591,-0.568248,1.206932,-0.624902,-0.446515,1.143421,-0.141817,-0.580628,1.206177,-0.815181,-0.442173,1.145114,-0.38598,-0.560779,1.210804,0.088
7,8,14,19,71.0,114,0.582884,-0.746091,0.14873,1.054999,1.35569,-0.9967,1.248183,-0.510412,-0.761042,0.719979,-1.197783,-0.662964,1.087523,-0.928524,-0.626952,0.878093,-1.029554,-0.644197,1.141105,0.326916,-0.830321,1.149683,0.083748,0.004
8,10,30,2,71.0,114,-1.12334,1.017866,1.225238,-0.627595,-0.813618,-0.102076,-0.563579,-1.625949,-0.790489,-0.877896,-1.51432,-0.130413,-0.567948,-1.62575,-0.791916,-0.965537,-1.480791,-0.137887,-0.562799,-1.623908,-0.839386,-0.811226,-1.533313,0.001
9,3,21,13,71.0,114,0.222105,-1.187809,1.861269,0.067636,0.362551,-1.569966,-1.099708,-1.40435,-1.46806,-1.80948,-0.61544,-1.49115,-1.539891,-0.955255,-1.348715,-1.826128,-0.560117,-1.512342,0.721765,-1.338484,-1.429774,0.239576,-1.489553,0.007


['Month', 'Day', 'Hour', 'rotor_diameter_m', 'hub_height_m']
TabularModel(
  (embeds): ModuleList(
    (0): Embedding(13, 7)
    (1): Embedding(32, 11)
    (2): Embedding(25, 10)
    (3): Embedding(3, 3)
    (4): Embedding(3, 3)
  )
  (emb_drop): Dropout(p=0.0, inplace=False)
  (bn_cont): BatchNorm1d(23, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): LinBnDrop(
      (0): BatchNorm1d(57, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Linear(in_features=57, out_features=200, bias=False)
      (2): ReLU(inplace=True)
    )
    (1): LinBnDrop(
      (0): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Linear(in_features=200, out_features=100, bias=False)
      (2): ReLU(inplace=True)
    )
    (2): LinBnDrop(
      (0): Linear(in_features=100, out_features=1, bias=True)
    )
  )
)


epoch,train_loss,valid_loss,_rmse,time
0,0.032086,0.021251,0.145778,00:00
1,0.019747,0.013956,0.118136,00:00
2,0.01511,0.013652,0.116842,00:00
3,0.012132,0.012196,0.110436,00:00
4,0.010425,0.011824,0.108736,00:00
5,0.009349,0.010812,0.103981,00:00
6,0.008182,0.010515,0.102541,00:00
7,0.007496,0.010145,0.10072,00:00
8,0.006889,0.010254,0.101264,00:00
9,0.006429,0.010182,0.100906,00:00
