In [None]:
from download_delgado.delgado_datasets import DownloadAndConvertDelgadoDatasets
from mlaut.data import Data
from mlaut.estimators.estimators import instantiate_default_estimators
from mlaut.experiments import Orchestrator
from mlaut.analyze_results import AnalyseResults
from download_delgado.delgado_datasets import DownloadAndConvertDelgadoDatasets
from mlaut.analyze_results.scores import ScoreAccuracy
import pandas as pd
import numpy as np
from mlaut.estimators.generic_estimator import Generic_Estimator

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from mlaut.estimators.nn_estimators import Deep_NN_Classifier


In [None]:
# Both HDF5 stores are opened read-only (mode='r'): the original datasets and
# the saved outputs of the deep-classification experiment.
data = Data()
input_io = data.open_hdf5('data/delgado.h5', mode='r')
out_io = data.open_hdf5('data/delgado-classification-deep.h5', mode='r')

# Results analyser wired to the two stores and the HDF5 groups it should read.
analyze = AnalyseResults(
    hdf5_input_io=input_io,
    hdf5_output_io=out_io,
    input_h5_original_datasets_group='openml/',
    output_h5_predictions_group='experiments/predictions/',
)

In [None]:
from mlaut.estimators.nn_estimators import Deep_NN_Classifier
# Hyperparameter grid handed to the Deep_NN_Classifier instances below via
# their `hyperparameters` argument.
# NOTE(review): a batch_size of 0 looks unusual - confirm how the estimator
# interprets it.
hyperparameters = {
    'epochs': [50, 100],
    'batch_size': [0, 50, 100],
}
def keras_model1(num_classes, input_dim):
    """Build and compile the thin network with a single dropout layer.

    Layer stack, in order: Dense(288) -> Dense(144) -> Dropout(0.5) ->
    Dense(12) -> Dense(num_classes, softmax). Hidden layers use ReLU.

    Args:
        num_classes: number of units in the final softmax layer.
        input_dim: forwarded as `input_dim` to the first Dense layer.

    Returns:
        The compiled model (Adam with lr=0.001, mean squared error loss,
        accuracy metric).

    NOTE(review): OverwrittenSequentialClassifier, Dense, Dropout and
    optimizers are not imported in the visible cells - confirm they are in
    scope on a fresh kernel.
    """
    network = OverwrittenSequentialClassifier()
    layer_stack = [
        Dense(288, input_dim=input_dim, activation='relu'),
        Dense(144, activation='relu'),
        Dropout(0.5),
        Dense(12, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ]
    for layer in layer_stack:
        network.add(layer)

    network.compile(
        loss='mean_squared_error',
        optimizer=optimizers.Adam(lr=0.001),
        metrics=['accuracy'],
    )
    return network

# Estimator wrapping keras_model1. Unlike the other networks in this cell, no
# `hyperparameters` argument is passed here.
# NOTE(review): presumably the estimator falls back to its own defaults - confirm.
deep_nn_4_layer_thin_dropout = Deep_NN_Classifier(
    properties={'name': 'NN-4-layer_thin_dropout'},
    keras_model=keras_model1,
)


def keras_model2(num_classes, input_dim):
    """Build and compile the wide network without dropout.

    Layer stack, in order: Dense(2500) -> Dense(2000) -> Dense(1500) ->
    Dense(num_classes, softmax). Hidden layers use ReLU.

    Args:
        num_classes: number of units in the final softmax layer.
        input_dim: forwarded as `input_dim` to the first Dense layer.

    Returns:
        The compiled model (Adam with lr=0.001, mean squared error loss,
        accuracy metric).
    """
    network = OverwrittenSequentialClassifier()

    # Input layer carries input_dim; the remaining hidden layers only differ in width.
    network.add(Dense(2500, input_dim=input_dim, activation='relu'))
    for width in (2000, 1500):
        network.add(Dense(width, activation='relu'))
    network.add(Dense(num_classes, activation='softmax'))

    adam = optimizers.Adam(lr=0.001)
    network.compile(loss='mean_squared_error', optimizer=adam, metrics=['accuracy'])
    return network

# Estimator wrapping keras_model2, tuned over the shared `hyperparameters` grid.
deep_nn_4_layer_wide_no_dropout = Deep_NN_Classifier(
    keras_model=keras_model2,
    hyperparameters=hyperparameters,
    properties={'name': 'NN-4-layer_wide_no_dropout'},
)


def keras_model3(num_classes, input_dim):
    """Build and compile the wide network with one dropout layer.

    Layer stack, in order: Dense(2500) -> Dense(2000) -> Dropout(0.5) ->
    Dense(1500) -> Dense(num_classes, softmax). Hidden layers use ReLU.

    Args:
        num_classes: number of units in the final softmax layer.
        input_dim: forwarded as `input_dim` to the first Dense layer.

    Returns:
        The compiled model (Adam with lr=0.001, mean squared error loss,
        accuracy metric).
    """
    network = OverwrittenSequentialClassifier()
    layer_stack = (
        Dense(2500, input_dim=input_dim, activation='relu'),
        Dense(2000, activation='relu'),
        Dropout(0.5),
        Dense(1500, activation='relu'),
        Dense(num_classes, activation='softmax'),
    )
    for layer in layer_stack:
        network.add(layer)

    network.compile(
        loss='mean_squared_error',
        optimizer=optimizers.Adam(lr=0.001),
        metrics=['accuracy'],
    )
    return network

# Estimator wrapping keras_model3, tuned over the shared `hyperparameters` grid.
deep_nn_4_layer_wide_with_dropout = Deep_NN_Classifier(
    keras_model=keras_model3,
    hyperparameters=hyperparameters,
    properties={'name': 'NN-4-layer_wide_with_dropout'},
)


def keras_model4(num_classes, input_dim):
    """Build and compile the deepest network: twelve Dense layers, three dropouts.

    Hidden widths shrink from 5000 to 250 (ReLU throughout); a Dropout(0.5)
    follows the 4000-, 2500- and 1000-unit layers. The last layer is
    Dense(num_classes, softmax).

    Args:
        num_classes: number of units in the final softmax layer.
        input_dim: forwarded as `input_dim` to the first Dense layer.

    Returns:
        The compiled model (Adam with lr=0.001, mean squared error loss,
        accuracy metric).
    """
    dropout_after = (4000, 2500, 1000)
    later_hidden_widths = (4500, 4000, 3500, 3000, 2500, 2000, 1500, 1000, 500, 250)

    network = OverwrittenSequentialClassifier()
    network.add(Dense(5000, input_dim=input_dim, activation='relu'))
    for width in later_hidden_widths:
        network.add(Dense(width, activation='relu'))
        # Each group of three hidden layers ends with a dropout layer.
        if width in dropout_after:
            network.add(Dropout(0.5))
    network.add(Dense(num_classes, activation='softmax'))

    adam = optimizers.Adam(lr=0.001)
    network.compile(loss='mean_squared_error', optimizer=adam, metrics=['accuracy'])
    return network

# Estimator wrapping keras_model4, tuned over the shared `hyperparameters` grid.
deep_nn_12_layer_wide_with_dropout = Deep_NN_Classifier(
    keras_model=keras_model4,
    hyperparameters=hyperparameters,
    properties={'name': 'NN-12-layer_wide_with_dropout'},
)


# Start from the four custom deep networks defined in this cell.
estimators = [
    deep_nn_4_layer_thin_dropout,
    deep_nn_4_layer_wide_no_dropout,
    deep_nn_4_layer_wide_with_dropout,
    deep_nn_12_layer_wide_with_dropout,
]

# Append every default classification estimator except the one named
# 'NeuralNetworkDeepClassifier'.
# The name must be compared by value (`!=`): `is not` tests object identity,
# which only matches a string literal by accident of interning and triggers a
# SyntaxWarning on Python 3.8+.
estim = instantiate_default_estimators(['Classification'])
for e in estim:
    if e.properties['name'] != 'NeuralNetworkDeepClassifier':
        estimators.append(e)

In [None]:
# Score the stored predictions of every estimator with the accuracy metric.
score_accuracy = ScoreAccuracy()

# prediction_errors returns three objects, unpacked here in order.
prediction_error_results = analyze.prediction_errors(score_accuracy, estimators)
errors_per_estimator = prediction_error_results[0]
errors_per_dataset_per_estimator = prediction_error_results[1]
errors_per_dataset_per_estimator_df = prediction_error_results[2]

In [None]:
# Collect, per estimator name, one run time (in seconds) for every dataset
# listed under 'run_times' in the output store. A dataset with no usable
# entry for an estimator contributes NaN, so all lists keep the same length.
_, dts_run_times_full_path = data.list_datasets('run_times', out_io)
estimator_dict = {estimator.properties['name']: [] for estimator in estimators}

for dts in dts_run_times_full_path:
    # NOTE(review): the result is tuple-unpacked although return_metadata=False
    # is passed - confirm load_dataset_pd still returns a pair in that case.
    run_times_per_estimator, _ = out_io.load_dataset_pd(dataset_path=dts, return_metadata=False)
    run_times_estimator_names = run_times_per_estimator['strategy_name'].tolist()

    for strat in estimator_dict:
        strat_run_time = run_times_per_estimator.loc[run_times_per_estimator['strategy_name'] == strat]
        try:
            # `np.float` was removed from NumPy (1.24); the previous bare
            # `except:` swallowed the resulting AttributeError and turned
            # every run time into NaN. `.item()` requires exactly one row.
            in_sec = float(strat_run_time['total_seconds'].item())
        except (KeyError, ValueError, TypeError):
            # No row, several rows, a missing column or a non-numeric value.
            in_sec = np.nan
        estimator_dict[strat].append(in_sec)
        
#         estimator_dict[strat].append(in_sec)
    #check whether we have data on all estimators that were passed as an argument
#     run_times_all_estimators_exist = (set(run_times_estimator_names) == set(estimator_dict.keys()))
#     if exact_match and not run_times_all_estimators_exist:
#         continue
#     #TODO come up with a more efficient solution to avoid loop
#     for i in range(run_times_per_estimator.shape[0]):

#         strategy_name = run_times_per_estimator.iloc[i]['strategy_name']
#         total_seconds = run_times_per_estimator.iloc[i]['total_seconds']
#         estimator_dict[strategy_name].append(total_seconds)
# #the long notation is necessary to handle situations where there is an unequal number of observations per estimator
# training_time_per_dataset = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in estimator_dict.items() ]))
# training_time_per_dataset = training_time_per_dataset.round(3)
# avg_training_time = pd.DataFrame(training_time_per_dataset.mean(axis=0))
# avg_training_time.columns = ['avg training time (in sec)']
# avg_training_time = avg_training_time.sort_values('avg training time (in sec)',ascending=True).round(3)
# return avg_training_time, training_time_per_dataset

In [None]:
# Scratch check: element-wise comparison of the 'strategy_name' column with the
# literal 'dd', using the frame left over from the last iteration of the loop above.
run_times_per_estimator['strategy_name'] == 'dd'

In [None]:
# Scratch check: run time in seconds for the last strategy handled by the loop
# above. `np.float` was removed from NumPy (1.24), so the builtin float is
# used; `.item()` requires the selection to hold exactly one row.
float(strat_run_time['total_seconds'].item())

In [None]:
# Tabulate the collected run times: the keys of estimator_dict (estimator
# names) become the columns, with one row per entry in each list.
a = pd.DataFrame.from_dict(estimator_dict)

In [None]:
# Mean run time per column (estimator), restricted to numeric columns.
a.mean(axis=0, numeric_only=True)

In [None]:
# Display the run-times frame left over from the last iteration of the loop above.
run_times_per_estimator

In [None]:
# Rows of the last loaded run-times frame whose 'strategy_name' equals 'SVC'.
b = run_times_per_estimator.loc[run_times_per_estimator['strategy_name']=='SVC']

In [None]:
# Scratch check: run time in seconds of the 'SVC' row selected into `b`.
# `np.float` was removed from NumPy (1.24), so the builtin float is used;
# `.item()` requires the selection to hold exactly one row.
float(b['total_seconds'].item())