In [None]:
# default_exp core

# core

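> Helpers for reading HDF5 and CSV files into DataFrames and for building fastai `TabularPandas` objects with custom preprocessing procs.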

In [None]:
#hide
import pandas as pd
from nbdev.showdoc import *
from fastai.data.external import *
from fastcore.all import *
from pathlib import PosixPath
from fastcore.test import *
from fastai.tabular.all import *

  return torch._C._cuda_getDeviceCount() > 0


In [None]:
#hide
def str_to_path(file: str):
    "Converts a string to a `Path`; strings containing `~` are passed through `os.path.expanduser` first."
    # Only plain strings are expanded; anything else is handed to `Path` unchanged.
    needs_expansion = isinstance(file, str) and "~" in file
    return Path(os.path.expanduser(file) if needs_expansion else file)

In [None]:
#hide
# `str_to_path` must return an equal object of the same type as `Path("")`,
# both for a plain string and for an input that already is a `Path`.
test_eq_type(Path(""), str_to_path(""))
test_eq_type(Path(""), str_to_path(Path("")))

In [None]:
#export
def read_hdf(file:PosixPath, key: str = "/powerdata", key_metadata=None):
    """Reads a hdf5 table based on the given key.

    Parameters
    ----------
    file : str or Path
        Location of the HDF5 file; converted with `str_to_path`.
    key : str
        Key of the table to read. A missing leading "/" is added.
    key_metadata : str, optional
        Key of a second table in the same file. For every column of that table
        the value of its first row is written into a column of the same name
        in the returned frame.

    Returns
    -------
    pd.DataFrame
        The stored table, or an empty DataFrame when `key` is not in the file.

    Raises
    ------
    KeyError
        Raised by pandas when `key_metadata` is given but not stored in the file.
    """
    file = str_to_path(file)
    # `HDFStore.keys()` lists absolute paths ("/group/name"). Checking only for a
    # leading slash also normalises nested keys such as "group/name", which the
    # previous `"/" not in key` test left unchanged and therefore never found.
    if not key.startswith("/"): key = "/" + key
    with pd.HDFStore(str(file), "r") as store:
        if key not in store.keys():
            return pd.DataFrame()
        df = store[key]
        if key_metadata is not None:
            df_meta = store[key_metadata]
            # Broadcast the first metadata row to every row of the data table.
            for c in df_meta: df[c] = df_meta[c].values[0]
    return df

In [None]:
#hide
# Round trip: store a small frame in data.h5 under the key 'df' and read it back.
# The key is passed without a leading "/", which `read_hdf` prepends itself.
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
                  index=['a', 'b', 'c'])
df.to_hdf('data.h5', key='df', mode='w')
test_eq(df, read_hdf("data.h5", key="df"))

In [None]:
#export
def read_csv(file:PosixPath, sep:str =";"):
    "Loads a csv file into a DataFrame and discards an `Unnamed: 0` column if one is present."
    csv_location = str(str_to_path(file))
    frame = pd.read_csv(csv_location, sep=sep)
    # A missing "Unnamed: 0" column is not an error, hence errors="ignore".
    frame.drop(columns=["Unnamed: 0"], errors="ignore", inplace=True)
    return frame

In [None]:
#hide
# Round trip through data.csv. `to_csv` also writes the index as a column without
# a header; `read_csv` drops the resulting "Unnamed: 0" column, so the frame read
# back is expected to equal the original.
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},)
df.to_csv('data.csv', sep=";")
test_eq(df, read_csv("data.csv", sep=";"))

In [None]:
def read_files(
    files:PosixPath,
    key:str ="/powerdata",
    key_metadata=None,
    sep:str=";"
) -> pd.DataFrame:
    """Reads a number of CSV or HDF5 files depending on file ending.

    Parameters
    ----------
    files : str, Path or a collection of them
        File(s) to read; a single entry is wrapped with `listify`.
    key : str
        Table key forwarded to `read_hdf` for ".h5" files.
    key_metadata : str, optional
        Metadata key forwarded to `read_hdf` for ".h5" files.
    sep : str
        Column separator forwarded to `read_csv` for ".csv" files.

    Returns
    -------
    pd.DataFrame
        All files concatenated along the rows (`axis=0`), in input order.

    Raises
    ------
    ValueError
        If a file has a suffix other than ".h5" or ".csv". `pd.concat` also
        raises it when `files` is empty.
    """
    dfs = []
    for file in listify(files):
        # `str_to_path` accepts strings as well as paths, so no type check is needed.
        file = str_to_path(file)

        if file.suffix == ".h5":
            df = read_hdf(file, key, key_metadata=key_metadata)
        elif file.suffix == ".csv":
            # Forward the caller's separator instead of a hard-coded ";".
            df = read_csv(file, sep=sep)
        else:
            # Raising a bare string is itself a TypeError in Python 3 and hides the message.
            raise ValueError(f"File ending of file {file} not supported.")

        # `append` keeps every frame as one list entry, independent of how a
        # collection type would treat `+=` with a DataFrame.
        dfs.append(df)

    return pd.concat(dfs, axis=0)

In [None]:
#hide
# df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
#                   index=['a', 'b', 'c'])
# df.to_hdf('data.h5', key='df', mode='w')
# test_eq(df, read_files("data.h5", key="df")[0])

# df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},)
# df.to_csv('data.csv', sep=";")
# test_eq(df, read_files("data.csv")[0])

In [None]:
# test_file = "/home/scribbler/data/DAF_ICON_Synthetic_Wind_Power_processed/00011.h5"

In [None]:
def tp_from_dtypes(df: pd.DataFrame, y_names: list, pre_process:list, procs: list, 
                   add_y_to_x=False, add_x_to_y=False, ignore_cols="", 
                   add_seasonal_feautres=True,splits=None):
    """Builds a `TabularPandas` whose categorical/continuous columns are inferred with `cont_cat_split`.

    The frame is first passed through a `TabularPandas` that only applies
    `pre_process`. Column types are then inferred on the preprocessed frame, and
    a second `TabularPandas` with `procs` is created from it.

    Parameters
    ----------
    df : pd.DataFrame
        Input data.
    y_names : str or list
        Name(s) of the target column(s). A list passed in is not modified.
    pre_process : list
        Procs applied in the first pass, before the column types are inferred.
    procs : list
        Procs of the returned `TabularPandas`.
    add_y_to_x : bool
        If True, the target names are appended to the continuous names.
    add_x_to_y : bool
        If True, the inferred continuous names are appended to the target names.
    ignore_cols : str or list
        Names removed from the categorical, continuous and target name lists.
    add_seasonal_feautres : bool
        Currently unused; kept (including its spelling) so existing calls keep working.
    splits : callable, optional
        Called with `range_of(df)` on the preprocessed frame; its result is
        passed as `splits` to `TabularPandas`.

    Returns
    -------
    TabularPandas
        Created with `inplace=True` and `y_block=RegressionBlock()`.
    """
    # `listify` returns a list argument as-is; copy it so nothing below can alter
    # the caller's list.
    y_names = list(listify(y_names))
    ignore_cols = listify(ignore_cols)
    
    to = TabularPandas(df, y_names=y_names, procs=pre_process, do_setup=True, reduce_memory=False)
    df = to.items
    
    x_columns, cat_columns = cont_cat_split(df, dep_var=y_names, max_card=1000)
    
    # Both lists are derived from the unmodified inputs. Extending them in place one
    # after the other put the target names into the targets twice when both flags
    # were set.
    cont_names = x_columns + y_names if add_y_to_x else x_columns
    target_names = y_names + x_columns if add_x_to_y else y_names
        
    if splits is not None: splits = splits(range_of(df))
            
    to = TabularPandas(
        df,
        procs=procs,
        cat_names=[c for c in cat_columns if c not in ignore_cols],
        cont_names=[c for c in cont_names if c not in ignore_cols],
        y_names=[c for c in target_names if c not in ignore_cols],
        splits=splits,
        do_setup=True,
        inplace=True,
        y_block=RegressionBlock(),
    )

    return to

In [None]:
class AddSeasonalFeatures(TabularProc):
    "Adds the columns `Month`, `Day` and `Hour`, read from the index of `to.items`."
    order=1
    def encodes(self, to):
        # The index has to expose `.month`, `.day` and `.hour`.
        stamps = to.items.index
        for column, attribute in (("Month", "month"), ("Day", "day"), ("Hour", "hour")):
            to.items[column] = getattr(stamps, attribute)

class DropYear(TabularProc):
    "Drops all rows whose index lies before January 1st (UTC) of `year`."
    order = 1
    def __init__(self, year=2020):
        # Kept as a UTC timestamp of the first day of `year`.
        self.year = pd.to_datetime(f"{year!s}-01-01", utc=True)

    def encodes(self, to):
        before_cutoff = to.items.index < self.year
        outdated_labels = to.items.index[before_cutoff]
        to.items.drop(outdated_labels, inplace=True)
        
class DropCols(TabularProc):
    "Drops the given columns from `to.items`; names that are not present are skipped."
    order = 0
    def __init__(self, cols):
        self.cols = listify(cols)

    def encodes(self, to):
        to.items.drop(columns=self.cols, errors="ignore", inplace=True)
        
class FilterByCol(TabularProc):
    """Drops rows depending on the truthiness of the column `col_name`.

    The column is cast to `bool`. With `keep=True` (default) the rows with a
    truthy value are dropped; with `keep=False` the rows with a falsy value are
    dropped. `to.items` is modified in place.
    """
    order = 0
    def __init__(self, col_name, keep=True):
        self.col_name = col_name
        self.keep = keep
        
    def encodes(self, to): 
        drop_mask = to.items[self.col_name].astype(bool).values
        if not self.keep: drop_mask = ~drop_mask
        frame = to.items
        labels = frame.index
        # Drop by position rather than by index label: with a non-unique index
        # (e.g. several files concatenated row-wise) a label-based drop would also
        # remove unmasked rows that share a label with a masked one.
        frame.index = pd.RangeIndex(len(frame))
        frame.drop(frame.index[drop_mask], inplace=True)
        # Restore the original labels of the rows that remain.
        frame.index = labels[~drop_mask]

class FilterMonths(TabularProc):
    "Keeps only the rows whose index month is contained in `months`."
    order = 2
    def __init__(self, months=range(1,13)):
        self.months = listify(months)

    def encodes(self, to):
        stamps = to.items.index
        in_selected_months = stamps.month.isin(self.months)
        to.items.drop(stamps[~in_selected_months], inplace=True)

In [None]:
# files = !ls /home/scribbler/data/DAF_ICON_Synthetic_Wind_Power_processed/*.h5
# files[0:2]

In [None]:
# len(files)

In [None]:
# len(files)
# n_files = int(len(files)-len(files)/4)
# n_files

In [None]:
# dfs = read_files(files[0:n_files], key_metadata="metadata")
# dfs.head(2)

In [None]:
# cols_to_drop = L("long", "lat", "loc_id", "target_file_name", "input_file_name", "num_train_samples", "num_test_samples")
# to = tp_from_dtypes(dfs, y_names="PowerGeneration", 
#                     pre_process=[DropCols(cols_to_drop), FilterByCol("TestFlag"), AddSeasonalFeatures],
# #                     TODO: Normalize per task, add task embedding and implement normalization through task id
#                     procs=[Normalize, Categorify], 
#                     add_x_to_y=False, ignore_cols="TestFlag", splits=RandomSplitter(valid_pct=0.2))

In [None]:
# to.items

In [None]:
# to.cont_names[20:]

In [None]:
# to.cat_names

In [None]:
# dls = to.dataloaders(bs=1024)
# learn = tabular_learner(dls, metrics=rmse)

In [None]:
# learn.fit_one_cycle(5)

In [None]:
# learn.fit_one_cycle(5)

In [None]:
# learn.fit_one_cycle(5)

In [None]:
# learn.model

In [None]:
# learn.model

In [None]:
# errors = L()
# for f in files[n_files:]:
#     df_test = read_files(f, key_metadata="metadata")
#     to_test = tp_from_dtypes(df_test, y_names="PowerGeneration", 
#                     pre_process=[DropCols(cols_to_drop), FilterByCol("TestFlag", keep=False), AddSeasonalFeatures],
# #                     TODO: Normalize per task, add task embedding and implement normalization through task id
#                     procs=[], 
#                     add_x_to_y=False, ignore_cols="TestFlag")
#     to_test_2 = to.new(to_test.items)
#     dl_test = learn.dls.test_dl(to_test.items, bs=64)
#     targ, preds = learn.get_preds(dl=dl_test)
#     e = (((targ-preds)**2).mean()**0.5)
#     errors += e

In [None]:
# import seaborn as sns
# sns.boxplot(errors)

In [None]:
# sns.distplot(errors)