In [1]:
# default_exp core

# core

> API details.

In [2]:
#hide
#export
import pandas as pd
from nbdev.showdoc import *
from fastai.data.external import *
from fastcore.all import *
from pathlib import PosixPath
from fastcore.test import *
from fastai.tabular.all import *
import fastai

  return torch._C._cuda_getDeviceCount() > 0


In [3]:
#hide
#export
def str_to_path(file: str):
    """Converts `file` to a `Path`, expanding `~` for string input.

    Strings that contain "~" are passed through `os.path.expanduser` first.
    Anything else (including an existing `Path`) is wrapped in `Path` as is.
    """
    needs_expansion = isinstance(file, str) and "~" in file
    return Path(os.path.expanduser(file) if needs_expansion else file)

In [4]:
#hide
# `str_to_path` must hand back a `Path` for string input as well as for `Path` input.
test_eq_type(Path(""), str_to_path(""))
test_eq_type(Path(""), str_to_path(Path("")))

In [5]:
#export
def read_hdf(file:PosixPath, key: str = "/powerdata", key_metadata=None):
    """Reads the table stored under `key` from an HDF5 file.

    Args:
        file: Path (or string, `~` is expanded) of the HDF5 file.
        key: Key of the table. A missing leading "/" is added.
        key_metadata: Optional key of a metadata table. For every column of
            that table, the value of its first row is added to the result as
            a constant column.

    Returns:
        The table as a `DataFrame`, or an empty `DataFrame` if `key` is not
        in the store.

    Raises:
        KeyError: if `key_metadata` is given but not present in the store.
    """
    file = str_to_path(file)
    # `HDFStore.keys()` lists absolute keys ("/group/table"). Testing for *any* "/"
    # would leave a nested key such as "group/table" unprefixed and never matched.
    if not key.startswith("/"): key = "/" + key
    with pd.HDFStore(file, "r") as store:
        if key not in store.keys(): return pd.DataFrame()
        df = store[key]
        if key_metadata is not None:
            df_meta = store[key_metadata]
            # broadcast the first metadata row over all rows of the table
            for c in df_meta: df[c] = df_meta[c].values[0]
    return df

In [6]:
#hide
# Round trip: write a small frame to HDF5 and read it back.
# The key is passed without the leading "/" on purpose.
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
                  index=['a', 'b', 'c'])
df.to_hdf('data.h5', key='df', mode='w')
test_eq(df, read_hdf("data.h5", key="df"))

In [7]:
#export
def read_csv(file:PosixPath, sep:str =";"):
    """Loads a csv file into a `DataFrame`.

    `file` may be a string or a path; `sep` is the column separator. A column
    named "Unnamed: 0" is removed from the result if it exists.
    """
    csv_path = str(str_to_path(file))
    frame = pd.read_csv(csv_path, sep=sep)
    # presumably the index column written by `DataFrame.to_csv`, which pandas reads back without a name
    frame.drop(columns=["Unnamed: 0"], inplace=True, errors="ignore")
    return frame

In [8]:
#hide
# Round trip through CSV: `to_csv` also writes the index, which `read_csv` has to drop again
# for the frames to compare equal.
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},)
df.to_csv('data.csv', sep=";")
test_eq(df, read_csv("data.csv", sep=";"))

In [9]:
#export
def read_files(
    files:PosixPath,
    key:str ="/powerdata",
    key_metadata=None,
    sep:str=";"
) -> pd.DataFrame:
    """Reads a number of CSV or HDF5 files depending on file ending.

    Args:
        files: A single file or a collection of files (strings or paths).
        key: Table key used for ".h5" files.
        key_metadata: Optional metadata key used for ".h5" files.
        sep: Column separator used for ".csv" files.

    Returns:
        An `L` with one `DataFrame` per file, in input order. The return
        annotation is kept as it was for compatibility, but the value is a
        collection and not a single frame.

    Raises:
        ValueError: if a file has a suffix other than ".h5" or ".csv".
    """
    dfs = L()
    for file in listify(files):
        if isinstance(file, str):
            file = str_to_path(file)

        if file.suffix == ".h5":
            df = read_hdf(file, key, key_metadata=key_metadata)
        elif file.suffix == ".csv":
            # forward the caller's separator instead of a hard-coded ";"
            df = read_csv(file, sep=sep)
        else:
            # raising a plain string is itself a TypeError; raise a real exception
            raise ValueError(f"File ending of file {file} not supported.")

        # append the frame as one item; never iterate over it
        dfs.append(df)

    return dfs

In [10]:
# hide
# `read_files` picks the reader by suffix; check the HDF5 branch first ...
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
                  index=['a', 'b', 'c'])
df.to_hdf('data.h5', key='df', mode='w')
test_eq(df, read_files("data.h5", key="df")[0])

# ... and then the CSV branch with the default separator.
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},)
df.to_csv('data.csv', sep=";")
test_eq(df, read_files("data.csv")[0])

In [11]:
# test_file = "/home/scribbler/data/DAF_ICON_Synthetic_Wind_Power_processed/00011.h5"

In [12]:
#export
class AddSeasonalFeatures(TabularProc):
    "Adds the columns `Month`, `Day` and `Hour`, taken from the index of `to.items`."
    order=10
    def encodes(self, to):
        # the index has to expose `.month`, `.day` and `.hour` (e.g. a `DatetimeIndex`)
        stamps = to.items.index
        for col, attr in (("Month", "month"), ("Day", "day"), ("Hour", "hour")):
            to.items[col] = getattr(stamps, attr)

class DropYear(TabularProc):
    "Drops, in place, every row whose index lies before January 1st (UTC) of `year`."
    order = 10
    def __init__(self, year=2020):
        # stored under the name `year`, although it holds the cutoff timestamp
        self.year = pd.to_datetime(f"{str(year)}-01-01", utc=True)

    def encodes(self, to):
        # NOTE(review): the cutoff is tz-aware (UTC); presumably the index is tz-aware as well - confirm
        index = to.items.index
        too_early = index[index < self.year]
        to.items.drop(too_early, inplace=True)
        
class NormalizePerTask(TabularProc):
    """Normalizes the continuous columns separately for every task id.

    `setups` computes one mean and one (population) standard deviation per
    task and continuous column. `encodes` subtracts the mean and divides by
    the standard deviation of the task each row belongs to. A task id that was
    not present during setup raises a `KeyError` in `encodes`.
    """
    order = 10
    def __init__(self, task_id_col="TaskID"):
        self.task_id_col = task_id_col

    def _cont_cols(self, to):
        "Continuous column names of `to`, never including the task id column itself."
        return [c for c in to.cont_names if c != self.task_id_col]

    def setups(self, to:Tabular):
        "Computes the per-task statistics, from the training split if `to` has one."
        train = getattr(to, 'train', to)
        cols = self._cont_cols(to)
        # group by the configured column instead of a hard-coded "TaskID"
        grouped = train.items[cols + [self.task_id_col]].groupby(self.task_id_col)
        self.means = grouped.mean()
        # the small epsilon avoids a division by zero for columns that are constant within a task
        self.stds = grouped.std(ddof=0) + 1e-7
        return self(to)

    # Has to be named `encodes` (not `encode`), otherwise the transform is never applied.
    def encodes(self, to):
        "Replaces the continuous columns of `to.items` with their per-task normalized values."
        cols = self._cont_cols(to)
        if not cols: return
        task_ids = to.items[self.task_id_col].to_numpy()
        # one row of statistics per data row, aligned by position
        means = self.means.loc[task_ids, cols].to_numpy()
        stds = self.stds.loc[task_ids, cols].to_numpy()
        # write through `to.items`; `to.conts[...]` would only modify a temporary frame
        to.items[cols] = (to.items[cols].to_numpy() - means) / stds

        
class DropCols(TabularProc):
    "Removes the given columns from `to.items` in place; names that do not exist are ignored."
    order = 10
    def __init__(self, cols):
        self.cols = listify(cols)

    def encodes(self, to):
        to.items.drop(columns=self.cols, inplace=True, errors="ignore")
        
class FilterByCol(TabularProc):
    """Drops rows, in place, based on the truthiness of the column `col_name`.

    With `keep=True` (default) the rows whose value is truthy are dropped,
    with `keep=False` the rows whose value is falsy are dropped. Afterwards the
    column itself is removed if `drop_col_after_filter` is set.

    Rows are dropped by index label, so any row that shares its label with a
    dropped row is removed as well.
    """
    # NOTE(review): the name `keep` reads inverted relative to what is dropped - confirm the intent with callers.
    order = 10
    def __init__(self, col_name, keep=True, drop_col_after_filter=True):
        self.col_name = col_name
        self.keep = keep
        self.drop_col_after_filter = drop_col_after_filter

    def encodes(self, to):
        flags = to.items[self.col_name].astype(bool).values
        drop_mask = flags if self.keep else ~flags
        to.items.drop(to.items.index[drop_mask], inplace=True)
        if self.drop_col_after_filter:
            to.items.drop(columns=self.col_name, inplace=True, errors="ignore")

class FilterMonths(TabularProc):
    "Keeps only the rows whose index month is contained in `months`; all other rows are dropped in place."
    order = 10
    def __init__(self, months=range(1,13)):
        self.months = listify(months)

    def encodes(self, to):
        index = to.items.index
        outside = index[~index.month.isin(self.months)]
        to.items.drop(outside, inplace=True)

In [13]:
# export
class TabularRenewables(CollBase, GetAttr, FilteredBase):
    """Combines one dataframe per task into a single `TabularPandas` (`self.to`).

    Every dataframe in `dfs` is first run through `pre_process` on its own and
    gets a `TaskID` column holding its position in `dfs`. The frames are then
    concatenated and wrapped in a `TabularPandas` that applies `procs`.

    Args:
        dfs: Iterable of dataframes, one per task.
        procs: Procs applied to the merged data.
        cat_names, cont_names: Accepted for interface compatibility.
            NOTE(review): both are currently overwritten by the result of
            `cont_cat_split` on the first dataframe - confirm this is intended.
        y_names: Name(s) of the target column(s).
        add_y_to_x: Also use the targets as continuous inputs.
        add_x_to_y: Also use the continuous inputs as targets.
        pre_process: Procs applied to every dataframe separately, before merging.
        include_task_id: Keep `TaskID` in the categorical inputs.
        splits: Optional callable; it is called with the row positions of the
            merged frame and its result is passed to `TabularPandas`.
    """
    def __init__(self, dfs, procs=None, cat_names=None, cont_names=None, 
                 y_names=None, add_y_to_x=False, add_x_to_y=False, pre_process=None, 
                 include_task_id=False, splits=None):
        self.task_id_col = "TaskID"
        # copy: `listify` returns the caller's own list, which must not be extended below
        self.y_names = list(listify(y_names))
        self.pre_process = pre_process
        # `new` forwards `device`; default it so `new`/`new_empty` work before `to_device` is called
        self.device = None

        self.dfs = L()
        for task_id,df in enumerate(dfs):
            df = TabularPandas(df, y_names=self.y_names, procs=pre_process, 
                               do_setup=True, reduce_memory=False).items
            df[self.task_id_col] = task_id
            self.dfs.append(df)

        # column roles are inferred from the first (pre-processed) dataframe only
        self.cont_names, self.cat_names = cont_cat_split(self.dfs[0], dep_var=y_names, max_card=1000)
        if not include_task_id: self.cat_names = [c for c in self.cat_names if c!= self.task_id_col]

        # Snapshot both lists first: with both flags set, extending y by the already
        # extended x list would put every target into `y_names` twice.
        base_conts, base_ys = list(self.cont_names), list(self.y_names)
        if add_y_to_x: self.cont_names = base_conts + base_ys
        if add_x_to_y: self.y_names = base_ys + base_conts

        merged_df = pd.concat(self.dfs, axis=0)
        if splits is not None: splits = splits(range_of(merged_df))

        self.to = TabularPandas(
            merged_df,
            procs=procs,
            cat_names=self.cat_names,
            cont_names=self.cont_names,
            y_names=self.y_names,
            splits=splits,
            do_setup=True,
            inplace=True,
            y_block=RegressionBlock(),
        )
        # the wrapped `TabularPandas` becomes `self.items`
        super().__init__(self.to)

    def new(self, df):
        "Wraps `df` in a new object of the type of `self.to`, reusing procs, column names and device, without running setup."
        return type(self.to)(df, do_setup=False, reduce_memory=False, y_block=TransformBlock(),
                          **attrdict(self, 'procs','cat_names','cont_names','y_names', 'device'))

    # subset 0 holds the rows before `self.to.split`, every other index the rows from `self.to.split` on
    def subset(self, i): return self.to.new(self.items[slice(0,self.to.split) if i==0 else slice(self.to.split,len(self.to))])
    # stores the result of `self.to.copy()` as `self.items` and returns this object
    def copy(self): self.items = self.to.copy(); return self
    def decode(self): return self.to.procs.decode(self.to)
    def decode_row(self, row): return self.to.new(pd.DataFrame(row).T).decode().items.iloc[0]
    def show(self, max_n=10, **kwargs): self.to.show(max_n, **kwargs)
    def setup(self): self.to.procs.setup(self)
    def process(self): self.to.procs(self)
    # The accessors below are turned into properties by the `properties(...)` call that follows the class.
    def loc(self): return self.items.loc
    # `_TabIloc` is private to `fastai.tabular.core` and not part of the star import, so use the qualified name
    def iloc(self): return fastai.tabular.core._TabIloc(self)
    def targ(self): return self.to.items[self.y_names]
    def x_names (self): return self.to.cat_names + self.to.cont_names
    def n_subsets(self): return 2
    # first target column only
    def y(self): return self.to[self.to.y_names[0]]
    def new_empty(self): return self.new(pd.DataFrame({}, columns=self.to.items.columns))
    def to_device(self, d=None):
        "Stores `d` as `self.device` and returns this object."
        self.device = d
        return self

    def procs(self):
        return self.to.procs

    def all_col_names (self):
        "Input names plus target names; the targets are left out unless all of them are columns of the data."
        ys = [n for n in self.to.y_names if n in self.to.items.columns]
        return self.to.x_names + self.to.y_names if len(ys) == len(self.to.y_names) else self.to.x_names
    
# Turn the listed zero-argument methods of `TabularRenewables` into properties
# (fastcore's `properties` helper), so they are read as attributes, e.g. `obj.loc`.
properties(TabularRenewables,'loc','iloc','targ','all_col_names','n_subsets','x_names','y', "procs")

# NOTE(review): `_add_prop` is a private fastai helper. Presumably it adds the plural
# accessors (`cats`, `conts`, `ys`, `xs`, `all_cols`) that select the columns listed in
# the matching `*_names` attribute - confirm against the installed fastai version.
fastai.tabular.core._add_prop(TabularRenewables, 'cat')
fastai.tabular.core._add_prop(TabularRenewables, 'cont')
fastai.tabular.core._add_prop(TabularRenewables, 'y')
fastai.tabular.core._add_prop(TabularRenewables, 'x')
fastai.tabular.core._add_prop(TabularRenewables, 'all_col')

In [14]:
!nbdev_build_lib

Converted 00_core.ipynb.
Converted index.ipynb.
