In [1]:
# default_exp core

# core

> API details.

In [2]:
#hide
#export
import pandas as pd
from nbdev.showdoc import *
from fastai.data.external import *
from fastcore.all import *
from pathlib import PosixPath
from fastcore.test import *
from fastai.tabular.all import *
import fastai

In [3]:
#hide
#export
def str_to_path(file: str):
    "Convers a string to a Posixpath."
    if isinstance(file, str) and "~" in file:
        file = os.path.expanduser(file)

    file = Path(file)
    
    return file

In [4]:
#hide
test_eq_type(Path(""), str_to_path(""))
test_eq_type(Path(""), str_to_path(Path("")))

In [5]:
#export
def read_hdf(file:PosixPath, key: str = "/powerdata", key_metadata=None):
    "Reads a hdf5 table based on the given key."
    file = str_to_path(file)
    if "/" not in key: key = "/" + key
    with pd.HDFStore(file, "r") as store:
        if key in store.keys():
            df = store[key]
            if key_metadata is not None:
                df_meta = store[key_metadata]
                for c in df_meta: df[c] = df_meta[c].values[0]
        else:
            df = pd.DataFrame()
    return df

In [6]:
#hide
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
                  index=['a', 'b', 'c'])
df.to_hdf('data.h5', key='df', mode='w')
test_eq(df, read_hdf("data.h5", key="df"))

In [7]:
#export
def read_csv(file:PosixPath, sep:str =";"):
    "Reads a csv file."
    file = str_to_path(file)
    df = pd.read_csv(str(file), sep=sep)
    df.drop(["Unnamed: 0"], inplace=True, axis=1, errors="ignore")
    return df

In [8]:
#hide
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},)
df.to_csv('data.csv', sep=";")
test_eq(df, read_csv("data.csv", sep=";"))

In [9]:
#export
def read_files(
    files:PosixPath,
    key:str ="/powerdata",
    key_metadata=None,
    sep:str=";"
) -> pd.DataFrame:
    "Reads a number of CSV or HDF5 files depending on file ending."
    
    files = listify(files)
    dfs=L()
    for file in files:
        if isinstance(file, str):
            file = str_to_path(file)

        if file.suffix == ".h5":
            df = read_hdf(file, key, key_metadata=key_metadata)
        elif file.suffix == ".csv":
            df = read_csv(file, sep=";")
        else:
            raise f"File ending of file {file} not supported."

        dfs += df
        
    return dfs

In [10]:
# hide
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
                  index=['a', 'b', 'c'])
df.to_hdf('data.h5', key='df', mode='w')
test_eq(df, read_files("data.h5", key="df")[0])

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},)
df.to_csv('data.csv', sep=";")
test_eq(df, read_files("data.csv")[0])

In [11]:
# test_file = "/home/scribbler/data/DAF_ICON_Synthetic_Wind_Power_processed/00011.h5"

In [304]:
#export
class AddSeasonalFeatures(TabularProc):
    order=10
    def encodes(self, to):
        to.items["Month"] = to.items.index.month
        to.items["Day"] = to.items.index.day
        to.items["Hour"] = to.items.index.hour

class DropYear(TabularProc):
    "Drops a complete year."
    order = 10
    def __init__(self, year=2020):
        year = str(year)
        self.year = pd.to_datetime(f"{year}-01-01", utc=True)
        
    def encodes(self, to): 
        mask = to.items.index < self.year
        to.items.drop(to.items[mask].index, inplace=True)
        
class NormalizePerTask(TabularProc):
    "Normalize per TaskId"
    order = 10
    def __init__(self, task_id_col="TaskID"):
        self.task_id_col = task_id_col
    def setups(self, to:Tabular):
        self.means = getattr(to, 'train', to)[to.cont_names + "TaskID"].groupby("TaskID").mean()
        self.stds = getattr(to, 'train', to)[to.cont_names + "TaskID"].groupby("TaskID").std(ddof=0)+1e-7

        return self(to)

    def encodes(self, to):
        for task_id in to.items[self.task_id_col].unique():
            # in case this is a new task, we update the means and stds
            if task_id not in self.means.index:
                print("new one")
                mu = getattr(to, 'train', to)[to.cont_names + "TaskID"].groupby("TaskID").mean()
                print("**************")
                print(mu)
                print("**************")
                self.means= self.means.append(mu)
                self.stds = self.stds.append(getattr(to, 'train', to)[to.cont_names + "TaskID"].groupby("TaskID").std(ddof=0)+1e-7)
                
                print(self.means)
                print("**************")
                
            mask = to.loc[:,self.task_id_col] == task_id
                   
            to.loc[mask, to.cont_names] = ((to.conts[mask] - self.means.loc[task_id]) / self.stds.loc[task_id])

        
class DropCols(TabularProc):
    "Drops rows by column name."
    order = 10
    def __init__(self, cols):
        self.cols = listify(cols)
        
    def encodes(self, to): 
        to.items.drop(self.cols, axis=1, inplace=True, errors="ignore")
        
class FilterByCol(TabularProc):
    "Drops rows by column."
    order = 10
    def __init__(self, col_name, keep=True, drop_col_after_filter=True):
        self.col_name = col_name
        self.keep = keep
        self.drop_col_after_filter=drop_col_after_filter
        
    def encodes(self, to): 
        mask = to.items[self.col_name].astype(bool).values
        if not self.keep: mask = ~mask
        to.items.drop(to.items[mask].index, inplace=True)
        if self.drop_col_after_filter: to.items.drop(self.col_name, axis=1, inplace=True, errors="ignore")

class FilterMonths(TabularProc):
    "Filter dataframe for specific months."
    order = 10
    def __init__(self, months=range(1,13)):
        self.months = listify(months)
        
    def encodes(self, to): 
        mask = ~to.items.index.month.isin(self.months)
        to.items.drop(to.items[mask].index, inplace=True)

In [305]:
# export
def preproces_and_merge_dfs(dfs, task_id_col, y_names, pre_process, offset=0):
    new_dfs = L()
    for task_id,df in enumerate(dfs):
            df = TabularPandas(df, y_names=y_names, procs=pre_process, 
                                      do_setup=True, reduce_memory=False).items
            
            df[task_id_col] = task_id+offset

            new_dfs += df
        
    merged_df = pd.concat(new_dfs, axis=0)
    
    return merged_df
        
class TabularRenewables(CollBase, GetAttr, FilteredBase):
    def __init__(self, dfs, procs=None, cat_names=None, cont_names=None, 
                 y_names=None, add_y_to_x=False, add_x_to_y=False, pre_process=None, 
                 include_task_id=False, splits=None, device=None, do_setup=True):
        self.task_id_col = "TaskID"
        self.y_names = listify(y_names)
        self.pre_process = pre_process
        
        merged_df = preproces_and_merge_dfs(dfs, self.task_id_col, self.y_names, self.pre_process, offset=0)
        
        self.cont_names, self.cat_names = cont_cat_split(merged_df, dep_var=y_names, max_card=1000)
        if not include_task_id: self.cat_names = [c for c in self.cat_names if c!= self.task_id_col]
        
        if add_y_to_x:
            self.cont_names += self.y_names
        if add_x_to_y:
            self.y_names += self.cont_names

        
        if splits is not None: splits = splits(range_of(merged_df))
        self.to_device(device)
        self.to = TabularPandas(
            merged_df,
            procs=procs,
            cat_names=self.cat_names,
            cont_names=self.cont_names,
            y_names=self.y_names,
            splits=splits,
            do_setup=do_setup,
            inplace=True,
            y_block=RegressionBlock(),
        )
        super().__init__(self.to.items)
        
    def new(self, df):
        # TODO: correct to TabularRenewables
#         return type(self)(df, do_setup=False,**attrdict(self, 'procs','cat_names','cont_names','y_names', \
#                                          'add_y_to_x', 'add_x_to_y','pre_process', 'include_task_id', 'device', ))
        self.c = copy(self)
        merged_dfs = preproces_and_merge_dfs(df, self.task_id_col, 
                                             self.y_names, self.pre_process, offset=self.items[self.task_id_col].max()+1)
        to_new = self.to.new(merged_dfs, )
        return to_new

    def subset(self, i): return self.to.new(self.items[slice(0,self.to.split) if i==0 else slice(self.to.split,len(self.to))])
    def copy(self): self.items = self.to.copy(); return self
    def decode(self): return self.to.procs.decode(self.to)
    def decode_row(self, row): return self.to.new(pd.DataFrame(row).T).decode().items.iloc[0]
#     def show(self, max_n=10, **kwargs): self.to.show(max_n, **kwargs)
    #   TODO: fix self.to.new to self.new
    def show(self, max_n=10, **kwargs): display_df(self.to.new(self.all_cols[:max_n]).decode().items)
    def setup(self): self.to.procs.setup(self.to)
    def process(self): self.to.procs(self.to)
    def loc(self): return self.items.loc
    def iloc(self): return _TabIloc(self)
    def targ(self): return self.to.items[self.y_names]
    def x_names (self): return self.to.cat_names + self.to.cont_names
    def n_subsets(self): return 2
    def y(self): return self.to[self.to.y_names[0]]
    def new_empty(self): return self.new(pd.DataFrame({}, columns=self.to.items.columns))
    def to_device(self, d=None):
        self.device = d
        return self
    
    def procs(self):
        return self.to.procs

    def all_col_names (self):
        ys = [n for n in self.to.y_names if n in self.to.items.columns]
        return self.to.x_names + self.to.y_names if len(ys) == len(self.to.y_names) else self.to.x_names
    
properties(TabularRenewables,'loc','iloc','targ','all_col_names','n_subsets','x_names','y', "procs")

fastai.tabular.core._add_prop(TabularRenewables, 'cat')
fastai.tabular.core._add_prop(TabularRenewables, 'cont')
fastai.tabular.core._add_prop(TabularRenewables, 'y')
fastai.tabular.core._add_prop(TabularRenewables, 'x')
fastai.tabular.core._add_prop(TabularRenewables, 'all_col')

In [306]:
import sys, glob

# sys.path.append("../")
# from fastai.tabular.all import *
# from fastrenewables.core import *


files = glob.glob("../data/*.h5")
len(files), files[0:2]
n_files = 2
dfs = read_files(files[0:n_files], key_metadata="metadata")
cols_to_drop = L(
    "long",
    "lat",
    "loc_id",
    "target_file_name",
    "input_file_name",
    "num_train_samples",
    "num_test_samples",
)
to = TabularRenewables(
    dfs,
    y_names="PowerGeneration",
    pre_process=[DropCols(cols_to_drop), FilterByCol("TestFlag"), AddSeasonalFeatures],
    procs=[NormalizePerTask, Categorify],
    add_x_to_y=False,
    include_task_id=False,
                        splits=RandomSplitter(valid_pct=0.2)
)


  warn("Using inplace with splits will trigger a pandas error. Set `pd.options.mode.chained_assignment=None` to avoid it.")


In [307]:
2 in to.procs.normalize_per_task.means.index

False

In [308]:
to.items.TaskID.max()

1

In [309]:
a = to.new(read_files(files[n_files], key_metadata="metadata"))

In [310]:
a.process()

new one
**************
         T_HAG_2_M  RELHUM_HAG_2_M   PS_SFC_0_M  ASWDIFDS_SFC_0_M  \
TaskID                                                              
2       282.318524       76.775322  93288.75035         66.925939   

        ASWDIRS_SFC_0_M  WindSpeed58m  SinWindDirection58m  \
TaskID                                                       
2            102.740988      3.736435             0.051207   

        CosWindDirection58m  WindSpeed60m  SinWindDirection60m  ...  \
TaskID                                                          ...   
2                  0.136615      2.283678             0.076204  ...   

        SinWindDirection60mMinus_t_1  CosWindDirection60mMinus_t_1  \
TaskID                                                               
2                           0.075849                      0.200523   

        WindSpeed58mPlus_t_1  SinWindDirection58mPlus_t_1  \
TaskID                                                      
2                   3.736496       

In [311]:
a.items

Unnamed: 0_level_0,T_HAG_2_M,RELHUM_HAG_2_M,PS_SFC_0_M,ASWDIFDS_SFC_0_M,ASWDIRS_SFC_0_M,PowerGeneration,WindSpeed58m,SinWindDirection58m,CosWindDirection58m,WindSpeed60m,...,SinWindDirection60mPlus_t_1,CosWindDirection60mPlus_t_1,turbine,hub_height_m,rotor_diameter_m,nominal_power_kW,Month,Day,Hour,TaskID
TimeUTC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-01 00:00:00+00:00,-0.858319,1.423129,1.751965,-1.686394,-1.186579,0.000,-1.084286,-1.158532,0.642395,-1.116730,...,0.498562,1.175568,0,0,0.0,0.0,1,1,1,2
2019-01-01 01:00:00+00:00,-0.880154,1.422517,1.809663,-1.708480,-1.188843,0.000,-0.628330,-1.214216,0.518541,-0.714204,...,-1.281344,0.294197,0,0,0.0,0.0,1,1,2,2
2019-01-01 02:00:00+00:00,-0.903045,1.420250,1.745841,-1.728925,-1.190974,0.000,-0.715126,-1.111120,0.731233,-0.909445,...,-1.218690,0.474283,0,0,0.0,0.0,1,1,3,2
2019-01-01 03:00:00+00:00,-0.961507,1.423068,1.678533,-1.747831,-1.192912,0.000,-1.062709,-0.230798,1.392244,-0.761583,...,-0.315483,1.348358,0,0,0.0,0.0,1,1,4,2
2019-01-01 04:00:00+00:00,-1.092634,1.422394,1.650093,-1.765399,-1.194764,0.000,-0.784807,0.939937,0.796086,-0.568621,...,0.709075,0.988219,0,0,0.0,0.0,1,1,5,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-11-30 20:00:00+00:00,-1.010224,0.371668,0.213161,-1.189413,-1.043226,0.027,-0.625993,-0.181384,-1.843882,-0.848956,...,-0.234990,-2.046295,0,0,0.0,0.0,11,30,21,2
2019-11-30 21:00:00+00:00,-1.063638,0.387232,0.140241,-1.212736,-1.047669,0.005,-0.903508,0.261339,-1.797070,-1.188718,...,-0.589427,-1.923247,0,0,0.0,0.0,11,30,22,2
2019-11-30 22:00:00+00:00,-1.084534,0.419094,0.068745,-1.235089,-1.051931,0.009,-0.890779,0.941646,-1.238209,-0.774355,...,0.949988,-1.321480,0,0,0.0,0.0,11,30,23,2
2019-11-30 23:00:00+00:00,-1.100852,0.506348,0.047185,-1.256472,-1.056035,0.007,-0.752556,0.947480,-1.228916,-0.653795,...,0.935241,0.666731,0,0,0.0,0.0,11,30,24,2


In [None]:
# !nbdev_build_lib