In [1]:
from src.static_variables import (HDF5_DATA_FILENAME, X_TRAIN_DIR, X_TEST_DIR, Y_TRAIN_DIR, Y_TEST_DIR, SPLIT_DATASETS_DIR)
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import h5py
import pickle
from scipy.io import arff

In [2]:
from src.files_io import FilesIO
from src.models_container import ModelsContainer
from src.run_experiments import RunExperiments

## Run experiments for one dataset only

In [3]:
# Read the abalone ARFF file straight into a DataFrame. loadarff returns a
# (records, metadata) pair; only the records (index 0) are kept.
abalone = pd.DataFrame(arff.loadarff('Delgado_data/abalone/abalone.arff')[0])
# Store the 'clase' column as integers.
abalone['clase'] = pd.Series(abalone['clase'], dtype=int)

In [4]:
# Create the FilesIO instance used throughout the single-dataset section.
# The argument is the path of the file it works with ('data/test.hdf5');
# presumably an HDF5 store, judging by the extension -- see src.files_io to confirm.
f = FilesIO('data/test.hdf5')

In [5]:
# Save the original dataset through FilesIO.save_datasets.
# `metadata` holds exactly one record per saved dataset, in the same order as
# `datasets` / `dataset_names`. Only abalone is saved in this section, so only
# abalone gets a record (a stray 'wine_quality' record used to be listed here
# without a matching dataset).
# NOTE(review): 'class_name' presumably names the label column -- confirm in src.files_io.
metadata = [
    {'class_name': 'clase', 'dataset_name': 'abalone'},
]
f.save_datasets(datasets=[abalone],
                dataset_names=['abalone'],
                dts_metadata=metadata)

In [6]:
# Split the stored 'abalone' dataset, holding out 33% of it for testing.
X_train, X_test, y_train, y_test = f.split_dataset('abalone', test_size=0.33)
# Convert both label sets to flat 1-D integer arrays.
y_train, y_test = (
    np.array(labels, dtype=int).ravel()
    for labels in (y_train, y_test)
)

In [7]:
# Optional: create the train/test splits and save them to the database
#f.split_and_save(dataset_paths=['abalone','wine_quality'], save_loc='split_datasets')

In [8]:
# Build the model container and instantiate the requested strategies.
# Both strategies are requested with None as their value.
models_container = ModelsContainer()
instantiated_models = models_container.instantiate_models(
    RandomForestClassifier=None,
    SVM=None,
)

In [9]:
# Train every instantiated model on the abalone training split.
# run_experiments returns a pair; only the first element (the trained models)
# is kept, the second is discarded.
experiments = RunExperiments()
trained_models, _ = experiments.run_experiments(
    X_train=X_train,
    y_train=y_train,
    model_container=instantiated_models,
)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:    9.0s finished


Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:    8.4s finished


Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    3.8s finished


## Run experiments on multiple datasets

In [10]:
from src.test_orchestrator import TestOrchestrator

In [11]:
# FilesIO instance for the multi-dataset run, pointing at its own file
# ('data/orchestrator_test.hdf5') rather than the one used in the single-dataset section.
files_io = FilesIO('data/orchestrator_test.hdf5')

In [12]:
# Load the datasets into pandas DataFrames and save them through `files_io`.
def load_arff_dataset(path, class_column='clase'):
    """Load an ARFF file into a DataFrame with an integer class column.

    Parameters
    ----------
    path : str
        Location of the .arff file.
    class_column : str, optional
        Name of the column to convert to integers (default 'clase').

    Returns
    -------
    pandas.DataFrame
        The records of the ARFF file, with ``class_column`` stored as int.
    """
    # loadarff returns (records, metadata); only the records are needed here.
    records = arff.loadarff(path)[0]
    dataset = pd.DataFrame(records)
    dataset[class_column] = pd.Series(dataset[class_column], dtype=int)
    return dataset


abalone = load_arff_dataset('Delgado_data/abalone/abalone.arff')
wine_quality_red = load_arff_dataset('Delgado_data/wine-quality-red/wine-quality-red.arff')
wine_quality_white = load_arff_dataset('Delgado_data/wine-quality-white/wine-quality-white.arff')

# A single list of names drives both `dataset_names` and the metadata records,
# so the two cannot drift apart. Its order matches the `datasets` list below.
dataset_names = ['abalone', 'wine_quality_red', 'wine_quality_white']
metadata = [
    {'class_name': 'clase', 'dataset_name': dataset_name}
    for dataset_name in dataset_names
]
files_io.save_datasets(datasets=[abalone, wine_quality_red, wine_quality_white],
                       dataset_names=dataset_names,
                       dts_metadata=metadata)

In [13]:
# Split each saved dataset into train and test parts; the save location
# passed to split_and_save is 'split_datasets'.
files_io.split_and_save(
    dataset_paths=['abalone', 'wine_quality_white', 'wine_quality_red'],
    save_loc='split_datasets',
)

In [14]:
# Instantiate the strategies again for the multi-dataset run, from a new
# ModelsContainer. Both strategies are requested with None as their value.
models_container = ModelsContainer()
instantiated_models = models_container.instantiate_models(
    RandomForestClassifier=None,
    SVM=None,
)

In [15]:
# Run the experiments on every dataset listed under 'split_datasets'.
# The `files_io` instance created earlier in this section is reused; a second,
# unused FilesIO for the same file used to be created here and also overwrote
# the `f` from the single-dataset section.
datasets = files_io.list_datasets('split_datasets')
experiments = RunExperiments()
# NOTE(review): the two None arguments are positional placeholders whose meaning
# is not visible from this notebook -- check TestOrchestrator's signature.
test_orchestrator = TestOrchestrator('split_datasets', files_io, None, experiments, None)
# NOTE(review): the recorded run of this cell ended with
# "RuntimeError: Unable to create link (name already exists)". That is the error
# h5py raises when an object is created under a name that already exists, so
# presumably something from an earlier run is still in the file -- confirm, and
# start from a fresh file or make the write path overwrite-safe.
test_orchestrator.run_experiments(datasets, instantiated_models)


RuntimeError: Unable to create link (name already exists)