In [1]:
from sklearn.model_selection import GridSearchCV
from demo_utils.general import get_data
from sklearn.tree import DecisionTreeClassifier
from demo_utils.learning import get_model
import numpy as np
from IPython.display import Markdown as md
from demo_utils.general import SUPPORTED_DATASETS

In [2]:
from scipy.linalg import LinAlgError

In [3]:
import warnings
# Install a global 'ignore' filter: every warning raised from here on is
# suppressed for the rest of the kernel session.
# NOTE(review): this also hides deprecation/convergence warnings emitted
# during the grid searches below -- confirm that is intended.
warnings.filterwarnings('ignore')

In [4]:
%%javascript
// Replace the output area's `_should_scroll` hook with a function that
// always returns false, regardless of the number of lines.
// NOTE(review): presumably this keeps long cell outputs fully expanded
// instead of being placed in a scroll box -- confirm for the notebook
// front-end in use.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [5]:
# Run configuration.
# `testing_dataset` is a module-level global; `test_dataset()` overwrites it
# with the name of the dataset currently being processed.
#testing_dataset = 'segment'
testing_dataset = None
# Passed to `get_data(..., n_ins=dts_size)` inside `find_hparams`.
# NOTE(review): presumably the number of instances to load -- confirm
# against `demo_utils.general.get_data`.
dts_size = 1000

In [6]:
# Keyword arguments forwarded verbatim to `get_model(**model_params)`.
# The `test_*` helpers mutate this dict in place ('model_name',
# 'sampler_name', 'pca_bool', 'pca_first') before every grid search.
# An alternative value that was tried for 'box_type' is 'none'.
model_params = dict(
    model_name='dt',
    sampler_name='identity',
    pca_bool=False,
    pca_first=True,
    box_type='black_bag',
    n_estim=15,
)

In [7]:
def _powers_of_ten(first_exp, last_exp):
    """Return a new list ``[10**e]`` for ``e`` from first_exp to last_exp inclusive.

    Negative exponents give floats and non-negative exponents give ints,
    exactly as the ``10**e`` expression does.
    """
    return [10 ** exponent for exponent in range(first_exp, last_exp + 1)]


# Name of the hyper-parameter that is tuned for each base model.
hp_names = dict(dt='min_impurity_decrease', logit='C', linear_svc='C')

# Candidate values for that hyper-parameter, keyed by model name.
hp_options = dict(
    dt=_powers_of_ten(-10, 0),
    logit=_powers_of_ten(-5, 3),
    linear_svc=_powers_of_ten(-5, 3),
)

# Candidate `gamma` values used when the sampler is 'rbf' / 'nystroem'.
rbf_gamma_options = _powers_of_ten(-5, 1)
nystroem_gamma_options = _powers_of_ten(-5, 1)

In [8]:
def find_hparams(model_params, options, dts_name, max_attempts=10):
    """Grid-search the hyper-parameters of one model on one dataset.

    The estimator is obtained from ``get_model(**model_params)``. The search
    grid always contains the model hyper-parameter named by
    ``hp_names[model_params['model_name']]``; when the sampler is ``'rbf'`` or
    ``'nystroem'`` the sampler's ``gamma`` is searched as well and the
    sampler's ``n_components`` is fixed to 500. The search uses 10-fold
    cross-validation on the ``'data_train'`` / ``'target_train'`` entries
    returned by ``get_data(dts_name, n_ins=dts_size)``.

    The model name and the best parameters are printed, followed by a blank
    line.

    Parameters
    ----------
    model_params : dict
        Keyword arguments for ``get_model``. Must contain ``'model_name'``
        and ``'sampler_name'``.
    options : list
        Candidate values for the model hyper-parameter.
    dts_name : str
        Dataset name handed to ``get_data``.
    max_attempts : int, optional
        Maximum number of times the fit is tried when it fails with
        ``LinAlgError``. Must be at least 1.

    Returns
    -------
    dict
        ``best_params_`` of the fitted grid search.

    Raises
    ------
    ValueError
        If ``max_attempts`` is smaller than 1.
    LinAlgError
        If every one of the ``max_attempts`` fits fails with this error.
    """
    if max_attempts < 1:
        raise ValueError('max_attempts must be >= 1, got {}'.format(max_attempts))

    n_comp = 500
    # NOTE(review): this prefix presumably matches only when get_model wraps
    # the base estimator in an ensemble (box_type other than 'none');
    # 'model__' and 'base_estimator__' were earlier candidates -- confirm
    # against demo_utils.learning.get_model.
    prefix = 'model__base_estimator__'
    tunning_params = {prefix + hp_names[model_params['model_name']]: options}
    model = get_model(**model_params)

    if model_params['sampler_name'] == 'rbf':
        tunning_params['sampler__gamma'] = rbf_gamma_options
        model.set_params(sampler__n_components=n_comp)
    elif model_params['sampler_name'] == 'nystroem':
        tunning_params['sampler__gamma'] = nystroem_gamma_options
        model.set_params(sampler__n_components=n_comp)

    # `iid` was deprecated in scikit-learn 0.22 and removed in 0.24, where
    # passing it raises TypeError. Keep the explicit iid=False for old
    # releases and fall back to the plain call on newer ones.
    try:
        clf = GridSearchCV(model, tunning_params, cv=10, iid=False)
    except TypeError:
        clf = GridSearchCV(model, tunning_params, cv=10)

    data = get_data(dts_name, n_ins=dts_size)
    data_train = data['data_train']
    target_train = data['target_train']

    # Retry the fit on LinAlgError, but only a bounded number of times so a
    # failure that repeats on every attempt cannot hang the notebook.
    # NOTE(review): retrying presumably helps because the pipeline is
    # randomised between fits -- confirm.
    for attempt in range(max_attempts):
        try:
            clf.fit(data_train, target_train)
            break
        except LinAlgError:
            print('Error detectado, intento {}'.format(attempt))
            if attempt == max_attempts - 1:
                raise
    bp = clf.best_params_

    print(model_params['model_name'])
    print(bp)
    print()
    return bp

In [9]:
def test_dataset(dts_name):
    """Run every hyper-parameter search configuration on one dataset.

    Renders the dataset name as a top-level heading, stores it in the
    module-level ``testing_dataset`` (read by the other ``test_*`` helpers),
    then runs the simple and simple-PCA searches followed by the sampler
    searches for 'rbf' and 'nystroem'.
    """
    global testing_dataset
    display(md('# ' + dts_name))
    testing_dataset = dts_name

    test_simple()
    test_simple_pca()

    sampler_sections = (('## RFF', 'rbf'), ('## Nystroem', 'nystroem'))
    for heading, sampler in sampler_sections:
        display(md(heading))
        test_sampler(sampler)

In [10]:
############
def test_simple():
    """Search hyper-parameters for every model with no PCA and the 'identity' sampler.

    Mutates the shared ``model_params`` dict in place and reads the
    module-level ``testing_dataset``.
    """
    display(md('## Simple'))
    model_params.update(pca_bool=False, sampler_name='identity')
    for name in hp_names.keys():
        model_params['model_name'] = name
        print('testing_dataset es {}'.format(testing_dataset))
        find_hparams(
            model_params,
            options=hp_options[name],
            dts_name=testing_dataset,
        )
############

def test_simple_pca():
    """Search hyper-parameters for every model with PCA enabled and the 'identity' sampler.

    Mutates the shared ``model_params`` dict in place and reads the
    module-level ``testing_dataset``.
    """
    display(md('## Simple PCA'))
    model_params.update(pca_bool=True, sampler_name='identity')
    for name in hp_names.keys():
        model_params['model_name'] = name
        find_hparams(
            model_params,
            options=hp_options[name],
            dts_name=testing_dataset,
        )

In [11]:
##############
def test_sampler(sampler_name):
    """Run the three PCA variants (none, PCA first, PCA last) for one sampler."""
    variants = (
        test_sampler_no_pca,
        test_sampler_pca_first,
        test_sampler_pca_last,
    )
    for run_variant in variants:
        run_variant(sampler_name)
##############

In [12]:
############
def test_sampler_no_pca(sampler_name):
    """Search hyper-parameters for every model using `sampler_name` without PCA.

    Mutates the shared ``model_params`` dict in place and reads the
    module-level ``testing_dataset``.
    """
    display(md('### No PCA'))
    model_params.update(pca_bool=False, sampler_name=sampler_name)
    for name in hp_names.keys():
        model_params['model_name'] = name
        find_hparams(
            model_params,
            options=hp_options[name],
            dts_name=testing_dataset,
        )
############

def test_sampler_pca_first(sampler_name):
    """Search hyper-parameters for every model using `sampler_name` with `pca_first=True`.

    Mutates the shared ``model_params`` dict in place and reads the
    module-level ``testing_dataset``.
    """
    display(md('### PCA First'))
    model_params.update(pca_bool=True, pca_first=True, sampler_name=sampler_name)
    for name in hp_names.keys():
        model_params['model_name'] = name
        find_hparams(
            model_params,
            options=hp_options[name],
            dts_name=testing_dataset,
        )
        
def test_sampler_pca_last(sampler_name):
    """Search hyper-parameters for every model using `sampler_name` with `pca_first=False`.

    Mutates the shared ``model_params`` dict in place and reads the
    module-level ``testing_dataset``.
    """
    display(md('### PCA Last'))
    model_params.update(pca_bool=True, pca_first=False, sampler_name=sampler_name)
    for name in hp_names.keys():
        model_params['model_name'] = name
        find_hparams(
            model_params,
            options=hp_options[name],
            dts_name=testing_dataset,
        )

In [13]:
# Full sweep: run every search configuration on each supported dataset.
# The output captured below ends in a KeyboardInterrupt, i.e. this run was
# stopped by hand before finishing.
for sd in SUPPORTED_DATASETS:
    test_dataset(sd)

# segment

## Simple

testing_dataset es segment
dt
{'model__base_estimator__min_impurity_decrease': 1e-10}

testing_dataset es segment
logit
{'model__base_estimator__C': 1000}

testing_dataset es segment
linear_svc
{'model__base_estimator__C': 1}



## Simple PCA

dt
{'model__base_estimator__min_impurity_decrease': 1e-07}

logit
{'model__base_estimator__C': 100}

linear_svc
{'model__base_estimator__C': 100}



## RFF

### No PCA

dt
{'model__base_estimator__min_impurity_decrease': 0.0001, 'sampler__gamma': 0.0001}

logit
{'model__base_estimator__C': 1000, 'sampler__gamma': 0.01}

linear_svc
{'model__base_estimator__C': 1000, 'sampler__gamma': 0.01}



### PCA First

dt
{'model__base_estimator__min_impurity_decrease': 1e-10, 'sampler__gamma': 1e-05}



KeyboardInterrupt: 

# Partial sweep: skip the first three entries of SUPPORTED_DATASETS.
# NOTE(review): presumably used to resume an interrupted full sweep -- the
# hard-coded index depends on the ordering of SUPPORTED_DATASETS; confirm.
for sd in SUPPORTED_DATASETS[3:]:
    test_dataset(sd)

# Partial sweep: skip the first five entries of SUPPORTED_DATASETS.
# NOTE(review): presumably used to resume an interrupted sweep -- the
# hard-coded index depends on the ordering of SUPPORTED_DATASETS; confirm.
for sd in SUPPORTED_DATASETS[5:]:
    test_dataset(sd)