# Random Forest

In [1]:
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint
from time import time
import pandas as pd
import os

In [2]:
# Resolve the directory used as the base for every relative path below.
# `__file__` only exists when this code runs as a script; in an interactive
# kernel the name is undefined, so the working directory is used instead.
try:
    current_folder = os.path.dirname(os.path.abspath(__file__))
except NameError:
    current_folder = os.getcwd()

In [3]:
# Paths of sibling notebooks, each wrapped in double quotes (the file names
# contain spaces, presumably so they survive `%run $variable` expansion).
merge_features = '"%s"' % os.path.join(current_folder, '..', 'Features', 'Merge features.ipynb')
calcular_auc = '"%s"' % os.path.join(current_folder, '..', 'Calcular AUC.ipynb')
set_de_entrenamiento_testing_y_prediccion = '"%s"' % os.path.join(
    current_folder, '..', 'Set de entrenamiento, testing y predicción.ipynb'
)
# Unquoted path of the CSV under ./hiperparametros for this algorithm.
hiperparametros_csv = os.path.join(current_folder, 'hiperparametros', 'random_forest.csv')

In [4]:
pd.options.mode.chained_assignment = None
%run $merge_features

KeyboardInterrupt: La limpieza ya corrió en este Kernel

KeyboardInterrupt: La limpieza ya corrió en este Kernel

KeyboardInterrupt: La limpieza ya corrió en este Kernel

In [5]:
# Sanity check: the feature table has exactly one row per distinct `person` in `df`.
assert len(df_features) == len(df['person'].unique())

Cargo los sets de entrenamiento, testing y predicción.

In [6]:
%run $set_de_entrenamiento_testing_y_prediccion

labels_with_features = labels.merge(df_features, how='inner', on='person')
data = labels_with_features.drop('label', axis=1)
target = labels_with_features['label']

In [42]:
# Number of columns fed to the model.
data.shape[1]

128

## Entrenamiento rápido

Obtenemos las métricas con cross validation.

In [36]:
# Hyperparameters for the quick training run (they match the best_params_
# output recorded in the hyperparameter-search section of this notebook).
param = {
    'bootstrap': True,
    'max_depth': 15,
    'max_features': 124,
    'min_samples_leaf': 74,
    'min_samples_split': 6,
    'n_estimators': 126
}

cv_splits = 10  # number of cross-validation splits

# A fixed random_state makes the bootstrap samples and feature sub-sampling
# repeatable, so re-running the notebook reproduces the same scores and
# feature importances instead of drifting from run to run.
regr = RandomForestRegressor(random_state=42, **param)

In [35]:
%%time
scores = cross_val_score(regr, data, target, cv=cv_splits, scoring='roc_auc')
print("Accuracy: %0.6f (+/- %0.6f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.877334 (+/- 0.022223)
CPU times: user 1min 3s, sys: 4 ms, total: 1min 3s
Wall time: 1min 3s


In [37]:
%%time
scores = cross_val_score(regr, data, target, cv=cv_splits, scoring='roc_auc')
print("Accuracy: %0.6f (+/- %0.6f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.877983 (+/- 0.019889)
CPU times: user 4min 6s, sys: 108 ms, total: 4min 6s
Wall time: 4min 6s


## Feature importance

In [38]:
# Fit on the full labelled set so `feature_importances_` becomes available;
# the trailing semicolon hides the estimator repr.
regr.fit(X=data, y=target);

In [39]:
# One row per input column, indexed by column name ('columna'), holding the
# importance the fitted forest assigned to it ('importancia').
feature_importance = pd.DataFrame(
    {'importancia': regr.feature_importances_},
    index=pd.Index(data.columns, name='columna'),
)

In [40]:
# Column-name prefixes whose matching columns are summed into a single row of
# the feature-importance table in the next cell.
hashing_features = ['feature_hashing_timestamp_days']

In [41]:
# Collapse every group of columns sharing a prefix in `hashing_features` into
# one row carrying the summed importance of the group.
for hashing_feature in hashing_features:
    # Compute the prefix mask once and reuse it for both the sum and the filter.
    is_hashed = feature_importance.index.str.startswith(hashing_feature)
    hashed_total = feature_importance.loc[is_hashed, 'importancia'].sum()
    # Copy the filtered frame before adding a row: writing into a boolean-indexed
    # slice is the pattern pandas flags with SettingWithCopyWarning.
    feature_importance = feature_importance.loc[~is_hashed].copy()
    feature_importance.loc[hashing_feature, 'importancia'] = hashed_total
# Most important features first.
feature_importance.sort_values('importancia', ascending=False)

Unnamed: 0_level_0,importancia
columna,Unnamed: 1_level_1
dias ultimo checkout,0.454694
screen_resolution_width std,0.101755
dias ultima compra,0.038657
screen_resolution_height std,0.028084
Cant visitas con Computadoras,0.026298
feature_hashing_timestamp_days,0.024810
diferencia de precio porcentual,0.022871
viewed product,0.021234
checkout,0.019402
screen_resolution_width mean,0.013663


# Hiperparámetros

En esta sección vamos a buscar los hiperparámetros de random forest con un Random Search y cross validation. Para construir este Random Search se usó como base el código de sklearn https://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html#sphx-glr-auto-examples-model-selection-plot-randomized-search-py.

Hiperparámetros a probar. 

In [19]:
# Search space sampled by the randomized search: lists are sampled uniformly,
# scipy distributions are sampled through their rvs() method.
param_dist = {
    'n_estimators': list(range(1, 150, 5)),
    'max_depth': list(range(5, 80, 5)),
    # scipy.stats.randint excludes its upper bound, so "+ 1" is needed for the
    # search to be able to pick max_features == number of columns.
    'max_features': randint(1, data.shape[1] + 1),
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(2, 100),
    'bootstrap': [True, False]
}

cv_splits = 10  # number of cross-validation splits
n_iter_search = 20  # sampled candidates; cv_splits * n_iter_search forests are fitted in total

# Fixed seed so every candidate forest is built reproducibly.
regr = RandomForestRegressor(random_state=42)

Nota: hay más info en la consola desde la cual se corre jupyter.

Se puede aumentar *n_jobs* para que corra más procesos en paralelo, pero se corre el riesgo de que se cuelgue por falta de memoria. Recomiendo que prueben ir aumentando *n_jobs* con un *n_iter_search* bajo hasta encontrar el mayor *n_jobs* que se banque su compu.

In [26]:
# Randomized search over `param_dist`, scored by cross-validated ROC AUC.
# random_state pins which candidates are drawn, so a re-run explores the same
# settings instead of reporting a different "best" model each time.
# NOTE(review): `iid` only exists in older scikit-learn releases (removed in
# 0.24, as far as I know); drop it when upgrading, confirm against the
# installed version.
random_search = RandomizedSearchCV(
    regr,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    cv=cv_splits,
    scoring='roc_auc',
    iid=False,
    refit=True,
    return_train_score=True,
    verbose=10,
    n_jobs=2,
    random_state=42,
)

# Time the whole search, including the final refit on the full data.
start = time()
random_search.fit(data, target)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    4.0s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:    9.9s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:   26.7s
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:  1.0min
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:  2.1min
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:  2.2min
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:  2.4min
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  4.6min
[Parallel(n_jobs=2)]: Done  57 tasks      | elapsed:  6.2min
[Parallel(n_jobs=2)]: Done  68 tasks      | elapsed:  7.2min
[Parallel(n_jobs=2)]: Done  81 tasks      | elapsed:  7.9min
[Parallel(n_jobs=2)]: Done  94 tasks      | elapsed:  9.4min
[Parallel(n_jobs=2)]: Done 109 tasks      | elapsed: 10.4min
[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed: 10.8min
[Parallel(n_jobs=2)]: Done 141 tasks      | elapsed: 11.8min
[Parallel(

RandomizedSearchCV took 1075.44 seconds for 20 candidates parameter settings.


El **mejor** Random Forest fue:

In [28]:
# Best mean cross-validated score found by the search, followed by the
# parameter setting that produced it (shown as the cell output).
best_score = random_search.best_score_
print('score: %s' % best_score)
random_search.best_params_

score: 0.8787456657639391


{'bootstrap': True,
 'max_depth': 15,
 'max_features': 124,
 'min_samples_leaf': 74,
 'min_samples_split': 6,
 'n_estimators': 126}

In [21]:
# Best mean cross-validated score found by the search, followed by the
# parameter setting that produced it (shown as the cell output).
best_score = random_search.best_score_
print('score: %s' % best_score)
random_search.best_params_

score: 0.8794724031906196


{'bootstrap': True,
 'max_depth': 10,
 'max_features': 81,
 'min_samples_leaf': 49,
 'min_samples_split': 8,
 'n_estimators': 56}

El resultado de la búsqueda lo podemos importar a un DataFrame de Pandas y analizarlo.

In [16]:
# Per-candidate search results (fit times, per-split and mean scores, sampled
# parameters) as a DataFrame; preview the first two rows.
cv_results = random_search.cv_results_
stats_training = pd.DataFrame(cv_results)
stats_training.head(2)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_bootstrap,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,rank_test_score,...,split6_test_score,split6_train_score,split7_test_score,split7_train_score,split8_test_score,split8_train_score,split9_test_score,split9_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,10.734313,0.022918,0.872385,0.953729,True,10,86,2,10,66,"{'min_samples_leaf': 2, 'max_depth': 10, 'max_...",1,...,0.894631,0.950173,0.880974,0.955364,0.886034,0.950994,0.875967,0.953807,0.78419,0.005731,0.013883,0.002279


Escribo el mejor resultado en un archivo.

In [22]:
# Summary of the best search result: algorithm name, winning parameters,
# number of CV splits, best AUC and the feature columns used.
# Presumably consumed by write_hyperparameters.py, which runs next; verify there.
hyperparameter_data = dict(
    algorithm='random_forest',
    hyperparameters=random_search.best_params_,
    cv_splits=cv_splits,
    auc=random_search.best_score_,
    features=data.columns,
)

In [23]:
# Run the helper script inside this notebook's namespace (-i) so it can see the
# variables defined above. The script path is relative to the kernel's working directory.
%run -i write_hyperparameters.py