# Random Forest

In [1]:
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint
from time import time
import pandas as pd
import os

In [2]:
if '__file__' in locals():
    current_folder = os.path.dirname(os.path.abspath(__file__))
else:
    current_folder = os.getcwd()

In [3]:
merge_features = '"{}"'.format(os.path.join(current_folder, '..', 'Features', 'Merge features.ipynb'))
calcular_auc = '"{}"'.format(os.path.join(current_folder, '..', 'Calcular AUC.ipynb'))
set_de_entrenamiento_testing_y_prediccion = '"{}"'.format(os.path.join(
    current_folder,
    '..',
    'Set de entrenamiento, testing y predicción.ipynb'
))
hiperparametros_csv = os.path.join(current_folder, 'hiperparametros', 'random_forest.csv')

In [4]:
pd.options.mode.chained_assignment = None
%run $merge_features

KeyboardInterrupt: La limpieza ya corrió en este Kernel

KeyboardInterrupt: La limpieza ya corrió en este Kernel

KeyboardInterrupt: La limpieza ya corrió en este Kernel

In [5]:
assert(df_features.shape[0] == df['person'].unique().shape[0])

Cargo los sets de entrenamiento, testing y predicción.

In [6]:
%run $set_de_entrenamiento_testing_y_prediccion

labels_with_features = labels.merge(df_features, how='inner', on='person')
data = labels_with_features.drop('label', axis=1)
target = labels_with_features['label']

In [7]:
len(data.columns)

138

## Entrenamiento rápido

Obtenemos las métricas con cross validation.

In [8]:
param = {
    'bootstrap': True,
    'max_depth': 15,
    'max_features': 124,
    'min_samples_leaf': 74,
    'min_samples_split': 6,
    'n_estimators': 126
}

cv_splits = 10 # cantidad de splits en el cross validation

regr = RandomForestRegressor(**param)

In [9]:
%%time
scores = cross_val_score(regr, data, target, cv=cv_splits, scoring='roc_auc')
print("Accuracy: %0.6f (+/- %0.6f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.877153 (+/- 0.019050)
CPU times: user 6min 16s, sys: 169 ms, total: 6min 16s
Wall time: 6min 16s


In [10]:
%%time
scores = cross_val_score(regr, data, target, cv=cv_splits, scoring='roc_auc')
print("Accuracy: %0.6f (+/- %0.6f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.877668 (+/- 0.019484)
CPU times: user 7min 6s, sys: 358 ms, total: 7min 7s
Wall time: 7min 7s


## Feature importance

In [11]:
regr.fit(data, target);

In [12]:
feature_importance = pd.DataFrame(data={
    'columna':data.columns,
    'importancia':regr.feature_importances_
}).set_index('columna')

In [13]:
hashing_features = ['feature_hashing_timestamp_days']

In [14]:
for hashing_feature in hashing_features:
    hashing_importance = feature_importance[feature_importance.index.str.startswith(hashing_feature)].sum()
    feature_importance = feature_importance[~feature_importance.index.str.startswith(hashing_feature)]
    feature_importance.loc[hashing_feature] = hashing_importance
feature_importance.sort_values('importancia', ascending=False)

Unnamed: 0_level_0,importancia
columna,Unnamed: 1_level_1
dias ultimo checkout,0.451045
screen_resolution_width std,0.092280
dias ultima compra,0.038472
screen_resolution_height std,0.033460
checkout,0.024643
...,...
interest,0.000000
lead,0.000000
compras Enero,0.000000
compras Abril,0.000000


# Hiperparámetros

En esta sección vamos a buscar los hiperparámetros de random forest con un Random Search y cross validation. Para construir este Random Search se usó como base el código de sklearn https://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html#sphx-glr-auto-examples-model-selection-plot-randomized-search-py.

Hiperparámetros a probar. 

In [15]:
param_dist = {
    'n_estimators': list(range(1,150,5)),
    'max_depth': list(range(5,80,5)),
    'max_features': randint(1, data.shape[1]),
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(2, 100),
    'bootstrap': [True, False]
}

cv_splits = 10 # cantidad de splits en el cross validation
n_iter_search = 20 # cantidad de puntos, en total splits*n_iter_search RF a probar

regr = RandomForestRegressor()

Nota: hay más info en la consola desde la cual se corre jupyter.

Se puede aumentar *n_jobs* para que corra más procesos en paralelo, pero se corre el riesgo de que se cuelgue por falta de memoria. Recomiendo que prueben ir aumentando *n_jobs* con un *n_iter_search* bajo hasta encontrar el mayor *n_jobs* que se banque su compu.

In [16]:
random_search = RandomizedSearchCV(regr, param_distributions=param_dist, iid=False, refit=True, verbose=10,
                                   return_train_score=True, n_iter=n_iter_search, cv=cv_splits,
                                   scoring='roc_auc', n_jobs=2);

start = time()
random_search.fit(data, target)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:   35.2s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:  1.2min
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:  2.8min
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:  3.9min
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:  5.6min
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:  5.8min
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:  7.7min
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  9.3min
[Parallel(n_jobs=2)]: Done  57 tasks      | elapsed: 11.3min
[Parallel(n_jobs=2)]: Done  68 tasks      | elapsed: 17.8min
[Parallel(n_jobs=2)]: Done  81 tasks      | elapsed: 20.1min
[Parallel(n_jobs=2)]: Done  94 tasks      | elapsed: 20.5min
[Parallel(n_jobs=2)]: Done 109 tasks      | elapsed: 25.5min
[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed: 26.9min
[Parallel(n_jobs=2)]: Done 141 tasks      | elapsed: 28.9min
[Parallel(

RandomizedSearchCV took 2433.66 seconds for 20 candidates parameter settings.


El **mejor** Random Forest fue:

In [17]:
print('score: {}'.format(random_search.best_score_))
random_search.best_params_

score: 0.8774373099031154


{'bootstrap': True,
 'max_depth': 65,
 'max_features': 98,
 'min_samples_leaf': 69,
 'min_samples_split': 5,
 'n_estimators': 91}

In [18]:
print('score: {}'.format(random_search.best_score_))
random_search.best_params_

score: 0.8774373099031154


{'bootstrap': True,
 'max_depth': 65,
 'max_features': 98,
 'min_samples_leaf': 69,
 'min_samples_split': 5,
 'n_estimators': 91}

El resultado de la búsqueda la podemos importar a un DataFrame de Pandas y analizarlo.

In [19]:
stats_training = pd.DataFrame(data=random_search.cv_results_)
stats_training.head(2)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,...,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
0,32.907773,3.875361,0.025259,0.008271,True,15,122,41,7,86,"{'bootstrap': True, 'max_depth': 15, 'max_feat...",0.873626,...,0.938307,0.937028,0.938874,0.937734,0.936894,0.938358,0.937021,0.937677,0.937373,0.93763,0.937689,0.000621
1,33.289508,2.279432,0.018642,0.003461,False,25,131,42,8,46,"{'bootstrap': False, 'max_depth': 25, 'max_fea...",0.843863,...,0.94129,0.943594,0.945471,0.94455,0.943853,0.944026,0.941173,0.941788,0.943397,0.941932,0.943107,0.001399


Escribo el mejor resultado en un archivo.

In [20]:
hyperparameter_data = {
    'algorithm': 'random_forest',
    'hyperparameters': random_search.best_params_,
    'cv_splits': cv_splits,
    'auc': random_search.best_score_,
    'features': data.columns
} 

In [21]:
%run -i write_hyperparameters.py