# Random Forest

In [1]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint
from time import time
import pandas as pd
import os

In [2]:
if '__file__' in locals():
    current_folder = os.path.dirname(os.path.abspath(__file__))
else:
    current_folder = os.getcwd()

In [3]:
merge_features = '"{}"'.format(os.path.join(current_folder, '..', 'Features', 'Merge features.ipynb'))
calcular_auc = '"{}"'.format(os.path.join(current_folder, '..', 'Calcular AUC.ipynb'))
set_de_entrenamiento_testing_y_prediccion = '"{}"'.format(os.path.join(
    current_folder,
    '..',
    'Set de entrenamiento, testing y predicción.ipynb'
))
hiperparametros_csv = os.path.join(current_folder, 'hiperparametros', 'random_forest.csv')

In [4]:
pd.options.mode.chained_assignment = None
%run $merge_features

[4.12437, 3.07057]


KeyboardInterrupt: La limpieza ya corrió en este Kernel

KeyboardInterrupt: La limpieza ya corrió en este Kernel

KeyboardInterrupt: La limpieza ya corrió en este Kernel

In [5]:
assert(df_features.shape[0] == df['person'].unique().shape[0])

Cargo los sets de entrenamiento, testing y predicción.

In [6]:
%run $set_de_entrenamiento_testing_y_prediccion

## Entrenamiento rápido

In [7]:
training = labels_training.merge(df_features, how='inner', on='person')

Hiperparámetros:

In [8]:
param = {'n_estimators':100, 'max_depth':10}

In [9]:
regr = RandomForestRegressor(**param)
regr.fit(training.drop('label', axis=1), training['label'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

Predigo:

In [10]:
labels_test = labels_test.merge(df_features, how='inner', on='person')

In [11]:
labels_test['label_predicted'] = regr.predict(labels_test.drop('label', axis=1))

In [12]:
%run $calcular_auc

auc score: 0.832395409891599


## Feature importance

In [13]:
feature_importance = pd.DataFrame(data={
    'columna':training.drop('label', axis=1).columns,
    'importancia':regr.feature_importances_
}).set_index('columna')
feature_importance.sort_values('importancia', ascending=False)

Unnamed: 0_level_0,importancia
columna,Unnamed: 1_level_1
checkout,0.150033
days until 31-05 mean,0.077674
days until 31-05 std,0.052075
event_count,0.033119
viewed product,0.032051
dias ultima compra,0.028888
Cant visitas con Computadoras,0.027832
brand listing,0.026347
Cant visitas con smartphone,0.024597
eventos Martes,0.024556


# Hiperparámetros

En esta sección vamos a buscar los hiperparámetros de random forest con un Random Search y cross validation. Para construir este Random Search se usó como base el código de sklearn https://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html#sphx-glr-auto-examples-model-selection-plot-randomized-search-py.

Hiperparámetros a probar. 

In [14]:
labels_with_features = labels.merge(df_features, how='inner', on='person')
regr = RandomForestRegressor()

In [15]:
param_dist = {
    "n_estimators": randint(1, 1000),
    "max_depth": [3, 9, 12, 15, None],
    "max_features": randint(1, labels_with_features.shape[1]),
    "min_samples_split": randint(2, 11),
    "min_samples_leaf": randint(2, 100),
    "bootstrap": [True, False]
}

splits = 10 # cantidad de splits en el cross validation
n_iter_search = 20 # cantidad de combinaciones, en total splits*n_iter_search RF a probar

Nota: hay más info en la consola desde la cual se corre jupyter.

Se puede aumentar *n_jobs* para que corra más procesos en paralelo, pero se corre el riesgo de que se cuelgue por falta de memoria. Recomiendo que prueben ir aumentando *n_jobs* con un *n_iter_search* bajo hasta encontrar el mayor *n_jobs* que se banque su compu.

In [16]:
random_search = RandomizedSearchCV(regr, param_distributions=param_dist, iid=False, refit=True, verbose=10,
                                   return_train_score=True, n_iter=n_iter_search, cv=splits,
                                   scoring=make_scorer(roc_auc_score), n_jobs=2);

start = time()
random_search.fit(labels_with_features.drop('label', axis=1), labels_with_features['label'])
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:   28.9s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:   56.7s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:  2.2min
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:  3.4min
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:  5.2min
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:  5.8min
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:  6.2min
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  6.4min
[Parallel(n_jobs=2)]: Done  57 tasks      | elapsed:  8.1min
[Parallel(n_jobs=2)]: Done  68 tasks      | elapsed: 13.9min
[Parallel(n_jobs=2)]: Done  81 tasks      | elapsed: 23.7min
[Parallel(n_jobs=2)]: Done  94 tasks      | elapsed: 27.9min
[Parallel(n_jobs=2)]: Done 109 tasks      | elapsed: 31.2min
[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed: 33.3min
[Parallel(n_jobs=2)]: Done 141 tasks      | elapsed: 37.8min
[Parallel(

RandomizedSearchCV took 3598.42 seconds for 20 candidates parameter settings.


El **mejor** Random Forest fue:

In [17]:
print('score: {}'.format(random_search.best_score_))
random_search.best_params_

score: 0.8605316495686459


{'bootstrap': False,
 'max_depth': None,
 'max_features': 55,
 'min_samples_leaf': 67,
 'min_samples_split': 10,
 'n_estimators': 253}

El resultado de la búsqueda la podemos importar a un DataFrame de Pandas y analizarlo.

In [18]:
stats_training = pd.DataFrame(data=random_search.cv_results_)
stats_training.head(2)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_bootstrap,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,rank_test_score,...,split6_test_score,split6_train_score,split7_test_score,split7_train_score,split8_test_score,split8_train_score,split9_test_score,split9_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,25.352844,0.150411,0.856557,0.975477,True,,94,11,7,371,"{'min_samples_leaf': 11, 'n_estimators': 371, ...",9,...,0.875373,0.975423,0.855958,0.97518,0.864083,0.975996,0.850868,0.975413,1.176354,0.043885,0.012797,0.000304
1,32.88061,0.293911,0.859146,0.901806,True,15.0,69,71,6,953,"{'min_samples_leaf': 71, 'n_estimators': 953, ...",3,...,0.877337,0.900315,0.865598,0.901689,0.865165,0.901685,0.849889,0.901744,0.396681,0.040997,0.013597,0.001033


Escribo el mejor resultado en un archivo.

In [19]:
data = random_search.best_params_.copy()
data['features'] = ','.join([f for f in labels_with_features.columns if f != 'label'])
data['auc'] = random_search.best_score_
data['cv'] = splits

In [20]:
mejores_resultados = pd.read_csv(hiperparametros_csv, index_col='fecha')
mejores_resultados.head()

Unnamed: 0_level_0,bootstrap,max_depth,max_features,min_samples_leaf,min_samples_split,n_estimators,cv,auc,features
fecha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-11-24 19:50,False,9,65,91,7,104,10,0.859541,"screen_resolution_height,screen_resolution_wid..."


In [21]:
mejor_resultado = pd.DataFrame(data=data, index=[pd.datetime.now().strftime("%Y-%m-%d %H:%M")])
mejor_resultado.index.name = 'fecha'
mejores_resultados = mejores_resultados.append(mejor_resultado, sort=False)

In [22]:
mejores_resultados.to_csv(hiperparametros_csv)