In [1]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.neural_network import MLPRegressor
from scipy.stats import uniform, randint
from time import time
import pandas as pd
import os

In [2]:
# Base directory for every relative path in this notebook: the directory of
# this file when `__file__` is defined, otherwise the current working
# directory (presumably the case inside a notebook kernel).
current_folder = (
    os.path.dirname(os.path.abspath(__file__))
    if '__file__' in locals()
    else os.getcwd()
)

In [3]:
def _quoted_notebook_path(*parts):
    """Join `parts` onto `current_folder` and wrap the result in double quotes.

    The quoted strings are the ones handed to the `%run $variable` cells; the
    notebook file names contain spaces, so the quotes presumably keep each
    path a single argument.
    """
    return '"{}"'.format(os.path.join(current_folder, *parts))


# Notebooks that later cells execute through %run.
merge_features = _quoted_notebook_path('..', 'Features', 'Merge features.ipynb')
calcular_auc = _quoted_notebook_path('..', 'Calcular AUC.ipynb')
set_de_entrenamiento_testing_y_prediccion = _quoted_notebook_path(
    '..', 'Set de entrenamiento, testing y predicción.ipynb'
)

# CSV path for the perceptron hyper-parameters (plain path, not quoted).
hiperparametros_csv = os.path.join(current_folder, 'hiperparametros', 'perceptron.csv')

In [4]:
# Turn off pandas' chained-assignment warning before running the feature notebook.
pd.options.mode.chained_assignment = None
# Execute the "Merge features" notebook. Later cells use `df` and `df_features`,
# which presumably are defined by it — TODO confirm.
%run $merge_features

KeyboardInterrupt: La limpieza ya corrió en este Kernel

KeyboardInterrupt: La limpieza ya corrió en este Kernel

KeyboardInterrupt: La limpieza ya corrió en este Kernel

In [5]:
# Sanity check: the feature table has exactly as many rows as there are
# distinct values in df['person'].
assert len(df_features) == len(df['person'].unique())

Cargo los sets de entrenamiento, testing y predicción.

In [6]:
# Execute the notebook that loads the training, testing and prediction sets.
# Later cells use `labels_training`, `labels_test` and `labels`, which
# presumably are defined by it — TODO confirm.
%run $set_de_entrenamiento_testing_y_prediccion

## Entrenamiento rápido

In [7]:
# Attach the per-person features to the training labels; the inner join keeps
# only the persons present in both tables.
training = pd.merge(labels_training, df_features, on='person', how='inner')

Entreno un `MLPRegressor` con los hiperparámetros por defecto:

In [8]:
# Quick baseline: an MLPRegressor with the library's default settings.
# Every column of `training` except 'label' is used as a feature.
features_train = training.drop(columns='label')
target_train = training['label']

regr = MLPRegressor()
regr.fit(features_train, target_train)

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(100,), learning_rate='constant',
             learning_rate_init=0.001, max_fun=15000, max_iter=200,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=None, shuffle=True, solver='adam',
             tol=0.0001, validation_fraction=0.1, verbose=False,
             warm_start=False)

Predigo:

In [9]:
# Attach the per-person features to the test labels (inner join on 'person').
# NOTE(review): this rebinds `labels_test`, so the cell is not idempotent.
labels_test = pd.merge(labels_test, df_features, on='person', how='inner')

In [10]:
# Predict from every column except the target. 'label_predicted' is dropped as
# well (ignored when absent) so that re-running this cell does not feed the
# previous predictions back to the model as an extra feature column, which
# would no longer match the columns the model was fitted on.
features_test = (
    labels_test
    .drop(columns='label')
    .drop(columns='label_predicted', errors='ignore')
)
labels_test['label_predicted'] = regr.predict(features_test)

In [11]:
# Execute the "Calcular AUC" notebook, which prints the AUC score — presumably
# computed from labels_test['label'] and labels_test['label_predicted'];
# TODO confirm against that notebook.
%run $calcular_auc

auc score: 0.5161498305219658


## Hiperparámetros

In [12]:
# Full labelled set joined with its features (inner join on 'person'); this is
# the data the hyper-parameter search is cross-validated on.
labels_with_features = pd.merge(labels, df_features, on='person', how='inner')

# Fresh, unfitted estimator to hand to the search.
regr = MLPRegressor()

In [13]:
# Search space for the randomized hyper-parameter search.
param_dist = {
    # Ten randomly generated candidate architectures, each with 1-9 hidden
    # layers of 1-99 units. The unit count starts at 1 (not 0) because a hidden
    # layer with 0 units is not a valid MLP layer and makes the fit fail.
    # NOTE(review): this sampling is unseeded, so the candidates differ on
    # every run.
    "hidden_layer_sizes": [
        [randint(1, 100).rvs() for _ in range(randint(1, 10).rvs())]
        for _ in range(10)
    ],
    "activation": ['identity', 'logistic', 'tanh', 'relu'],
    "solver": ['lbfgs', 'adam'],
    "alpha": uniform(0, 1),  # drawn uniformly from [0, 1] by the search
    "learning_rate": ['constant', 'invscaling', 'adaptive']
}

splits = 10  # number of cross-validation splits
n_iter_search = 2  # number of sampled combinations; splits * n_iter_search models are fitted in total

Nota: hay más info en la consola desde la cual se corre jupyter.

Se puede aumentar *n_jobs* para que corra más procesos en paralelo, pero se corre el riesgo de que se cuelgue por falta de memoria. Recomiendo que prueben ir aumentando *n_jobs* con un *n_iter_search* bajo hasta encontrar el mayor *n_jobs* que se banque su compu.

In [14]:
# Randomized hyper-parameter search, scored with ROC AUC computed on the
# regressor's predictions.
# `iid` is deliberately not passed: it was deprecated in scikit-learn 0.22 and
# removed in 0.24, where passing it raises a TypeError. From 0.22 on, omitting
# it behaves like the previous iid=False.
# NOTE(review): no random_state is set, so the sampled candidates are not
# reproducible between runs.
random_search = RandomizedSearchCV(
    regr,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    scoring=make_scorer(roc_auc_score),
    cv=splits,
    refit=True,
    return_train_score=True,
    verbose=10,
    n_jobs=2,
)

# Time the whole search (all candidates x all folds, plus the final refit).
start = time()
random_search.fit(labels_with_features.drop('label', axis=1), labels_with_features['label'])
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))

Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:   14.4s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:   28.3s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:  1.1min
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:  1.8min
[Parallel(n_jobs=2)]: Done  20 out of  20 | elapsed:  3.2min remaining:    0.0s
[Parallel(n_jobs=2)]: Done  20 out of  20 | elapsed:  3.2min finished


RandomizedSearchCV took 219.95 seconds for 2 candidates parameter settings.


El **mejor** Perceptron fue:

In [15]:
# Report the best mean cross-validated score, then display the winning
# parameter combination as the cell's output.
best_cv_score = random_search.best_score_
print('score: {}'.format(best_cv_score))
random_search.best_params_

score: 0.4997046412063925


{'activation': 'logistic',
 'alpha': 0.7099651384960377,
 'hidden_layer_sizes': [44, 3, 89, 76, 85, 56, 64, 94, 50],
 'learning_rate': 'invscaling',
 'solver': 'adam'}

El resultado de la búsqueda lo podemos importar a un DataFrame de Pandas y analizarlo.

In [16]:
# One row per sampled candidate: timings, parameters and per-split scores.
stats_training = pd.DataFrame(random_search.cv_results_)
# Show only the first two rows.
stats_training.iloc[:2]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_activation,param_alpha,param_hidden_layer_sizes,param_learning_rate,param_solver,params,split0_test_score,split1_test_score,...,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
0,13.18862,0.332529,0.009513,0.003443,relu,0.691999,[64],constant,lbfgs,"{'activation': 'relu', 'alpha': 0.691998997936...",0.382296,0.425328,...,0.427377,0.454029,0.436363,0.529495,0.485704,0.482536,0.544792,0.550133,0.50264,0.492648,0.490572,0.04063
1,24.206446,6.778347,0.05228,0.009449,logistic,0.709965,"[44, 3, 89, 76, 85, 56, 64, 94, 50]",invscaling,adam,"{'activation': 'logistic', 'alpha': 0.70996513...",0.500543,0.499457,...,0.5,0.5,0.5,0.5,0.5,0.50003,0.5,0.49994,0.49997,0.5,0.499994,2.3e-05
