In [24]:
from pandas import HDFStore

# Reader functions

In [25]:
def HDF_reader(file_location, df_name):
    """Load a single stored object from an HDF5 file.

    Parameters
    ----------
    file_location : str
        Path of the HDF5 file to open.
    df_name : str
        Key under which the object is stored in the file.

    Returns
    -------
    object
        Whatever ``HDFStore.get`` returns for ``df_name``.

    Raises
    ------
    Any exception raised by ``HDFStore`` or ``HDFStore.get`` propagates
    unchanged (for example when the key is absent).
    """
    # The store is opened with HDFStore's default mode. The context manager
    # guarantees the file handle is released even if the lookup fails.
    with HDFStore(file_location) as store:
        return store.get(df_name)

In [26]:
import pandas as pd
def csv_reader(file_location):
    """Parse the CSV at ``file_location`` with ``pd.read_csv`` (default options).

    Parameters
    ----------
    file_location : str or file-like
        Passed straight through to ``pd.read_csv``.

    Returns
    -------
    The object produced by ``pd.read_csv``.
    """
    return pd.read_csv(file_location)

# Writer functions

In [193]:
def HDF_writer(df, file_location, reference):
    """Store ``df`` in an HDF5 file under the key ``reference``.

    Parameters
    ----------
    df : object
        The object to store (a DataFrame in this notebook's usage).
    file_location : str
        Path of the HDF5 file; opened with ``HDFStore``'s default mode.
    reference : str
        Key under which ``df`` is written.

    Returns
    -------
    None

    Raises
    ------
    Any exception raised by ``HDFStore`` or ``HDFStore.put`` propagates
    unchanged; the store is closed first.
    """
    # Constructing HDFStore already opens the file, so no explicit open() is
    # needed. The context manager closes the handle even when put() raises.
    with HDFStore(file_location) as store:
        # 't' selects the table format (it was previously passed positionally).
        store.put(reference, df, format='t')

# Operator definition

In [28]:
from sklearn.model_selection import train_test_split

class Splitter():
    """Split a CSV dataset into a train part and a test part.

    The input is read with ``csv_reader``, split with ``train_test_split``,
    and the two parts are written with ``HDF_writer`` to the same output file
    under the keys ``'train'`` and ``'test'``.

    Parameters
    ----------
    file_location : str
        Path of the input CSV file.
    output_location : str
        Path of the HDF5 file that receives both parts.
    test_size : float or int
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : optional
        Forwarded to ``train_test_split`` as ``random_state``. The default
        ``None`` keeps the previous unseeded behaviour; pass a fixed value to
        make the split reproducible.
    """

    def __init__(self, file_location, output_location, test_size, random_state=None):
        self.file_location = file_location
        self.output_location = output_location  # can be given default value
        self.test_size = test_size  # to be unpacked from node configuration yaml
        self.random_state = random_state

    def input(self):
        """Read the input CSV and return its contents."""
        return csv_reader(self.file_location)

    def output(self):
        """Return a dict mapping ``'train'`` and ``'test'`` to writer callables.

        Each callable takes the data to store and writes it to
        ``self.output_location`` under the corresponding key.
        """
        return {'train': lambda x: HDF_writer(x, self.output_location, 'train'),
                'test': lambda x: HDF_writer(x, self.output_location, 'test')}

    def run(self):
        """Read the input once, split it, and write both parts."""
        # Read the CSV a single time and reuse the frame for the split.
        df = self.input()

        train, test = train_test_split(df,
                                       test_size=self.test_size,
                                       random_state=self.random_state)

        # Build the writer mapping once instead of once per part.
        writers = self.output()
        writers['train'](train)
        writers['test'](test)

In [29]:
# Configure a split of the Iris CSV with a test_size of 0.2; both parts go to chk.h5.
# NOTE(review): the paths are absolute and machine-specific.
splitter = Splitter(
    file_location='/home/sanjay/Documents/Iris/Iris_data/Iris.csv',
    output_location='/home/sanjay/Documents/Iris/Iris_data/chk.h5',
    test_size=0.2
)

In [30]:
# Execute the split: reads the CSV and writes the 'train' and 'test' tables.
splitter.run()

In [31]:
# Open the HDF5 file that splitter.run() wrote, to inspect its contents.
# NOTE(review): this handle is never closed in the visible cells.
store = HDFStore('/home/sanjay/Documents/Iris/Iris_data/chk.h5')

In [32]:
# Show a summary of the keys held in the store (displayed as the cell output).
store.info()

u"<class 'pandas.io.pytables.HDFStore'>\nFile path: /home/sanjay/Documents/Iris/Iris_data/chk.h5\n/test             frame_table  (typ->appendable,nrows->30,ncols->6,indexers->[index]) \n/train            frame_table  (typ->appendable,nrows->120,ncols->6,indexers->[index])"

In [194]:
from sklearn.model_selection import KFold

class KFold_splitter():
    """Split a CSV dataset into K shuffled train/test folds.

    The input is read with ``csv_reader`` and partitioned with ``KFold``
    (``shuffle=True``). For fold ``i`` the two parts are written with
    ``HDF_writer`` to the same output file under ``'train_<i>'`` and
    ``'test_<i>'``.

    Parameters
    ----------
    file_location : str
        Path of the input CSV file.
    output_location : str
        Path of the HDF5 file that receives every fold.
    n_splits : int
        Number of folds; forwarded to ``KFold`` as ``n_splits``.
    random_state : optional
        Forwarded to ``KFold`` as ``random_state``. The default ``None`` keeps
        the previous unseeded shuffling; pass a fixed value to make the folds
        reproducible.
    """

    def __init__(self, file_location, output_location, n_splits, random_state=None):
        self.file_location = file_location
        self.output_location = output_location  # can be given default value
        self.n_splits = n_splits  # to be unpacked from node configuration yaml
        self.random_state = random_state

    def input(self):
        """Read the input CSV and return its contents."""
        return csv_reader(self.file_location)

    def output(self):
        """Return a dict mapping each fold key to a writer callable.

        Keys are ``'train_<i>'`` and ``'test_<i>'`` for ``i`` in
        ``range(self.n_splits)``. Each callable takes the data to store and
        writes it to ``self.output_location`` under its key.
        """
        out = {}

        for i in range(self.n_splits):
            for part in ('train', 'test'):
                key = '{}_{}'.format(part, i)
                # Bind the key as a default argument so every lambda keeps its
                # own key rather than the loop's final value.
                out[key] = lambda x, ref=key: HDF_writer(x, self.output_location, ref)

        return out

    def run(self):
        """Read the input, build the folds, and write every train/test part."""
        df = self.input()
        kf = KFold(n_splits=self.n_splits, shuffle=True,
                   random_state=self.random_state)

        # The writer mapping does not change between folds, so build it once.
        writers = self.output()

        for fold, (train_index, test_index) in enumerate(kf.split(df)):
            writers['train_{}'.format(fold)](df.iloc[train_index])
            writers['test_{}'.format(fold)](df.iloc[test_index])

In [195]:
# Configure a 2-fold split of the Iris CSV; every fold goes to chk1.h5.
# NOTE(review): the paths are absolute and machine-specific.
k_splitter = KFold_splitter(
    file_location='/home/sanjay/Documents/Iris/Iris_data/Iris.csv',
    output_location='/home/sanjay/Documents/Iris/Iris_data/chk1.h5',
    n_splits=2
)

In [196]:
# Execute the K-fold split: reads the CSV and writes train_<i>/test_<i> tables.
k_splitter.run()

In [197]:
# Open the HDF5 file that k_splitter.run() wrote, to inspect its contents.
# NOTE(review): this rebinds `store` without closing the handle it previously
# referred to, and this handle is not closed in the visible cells either.
store = HDFStore('/home/sanjay/Documents/Iris/Iris_data/chk1.h5')

In [198]:
# Show a summary of the keys held in the store (displayed as the cell output).
store.info()

u"<class 'pandas.io.pytables.HDFStore'>\nFile path: /home/sanjay/Documents/Iris/Iris_data/chk1.h5\n/test_0             frame_table  (typ->appendable,nrows->75,ncols->6,indexers->[index])\n/test_1             frame_table  (typ->appendable,nrows->75,ncols->6,indexers->[index])\n/train_0            frame_table  (typ->appendable,nrows->75,ncols->6,indexers->[index])\n/train_1            frame_table  (typ->appendable,nrows->75,ncols->6,indexers->[index])"