# Parameter Tunning RF

In [13]:
import pandas as pd
import numpy as np
from xgboost.sklearn import XGBClassifier
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

from sklearn.metrics import SCORERS
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import mean_squared_error

import xgboost as xgb

# Going to use these 5 base models for the stacking
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

In [14]:
labels_f = pd.read_csv('../data/labels_f_filtrado.csv')
datos = pd.read_csv('../data/datos_filtrado.csv')

In [15]:
atributos = labels_f.columns.tolist()

In [16]:
atributos.remove('person')
atributos.remove('label')

In [17]:
y = labels_f['label'].ravel()
X = labels_f.loc[:,atributos]

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.1, random_state= 123)

In [19]:
def aplicarRus(X_train_p, y_train_p):
    rus= RandomUnderSampler(return_indices=True)
    #id_rus son los índices
    X_rus_train, y_rus_train, id_rus = rus.fit_sample(X_train_p, y_train_p)
    return(X_rus_train, y_rus_train)

In [20]:
X_train, y_train = aplicarRus(X_train,y_train)

In [21]:
modelo = RandomForestClassifier(n_estimators =  500, max_depth = 6)

modelo.fit(X_train, y_train)
    
preds = modelo.predict(X_test)
preds_prob = modelo.predict_proba(X_test)[:,1]
train_accuracy = accuracy_score(y_train, modelo.predict(X_train))
test_accuracy = accuracy_score(y_test, preds)
area_debajo_de_curva = roc_auc_score(y_test, preds_prob)
matriz_de_confusion = confusion_matrix(y_test, preds)
print('train acurracy: ')
print(train_accuracy)
print('test acurracy: ')
print(test_accuracy)
print('Matriz de confusión: ')
print(matriz_de_confusion)
print('Área bajo la curva: ')
print(area_debajo_de_curva)
print()

train acurracy: 
0.840293453724605
test acurracy: 
0.7749742533470649
Matriz de confusión: 
[[1431  417]
 [  20   74]]
Área bajo la curva: 
0.8494462098185502



In [22]:
parametros = {'bootstrap': True,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [24]:
model = RandomForestClassifier()
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1600, num = 10)]

# Number of features to consider at every split
#max_features = ['auto']

# Maximum number of levels in tree
max_depth = [8,7,9,10]#[int(x) for x in np.linspace(10, 110, num = 11)]

max_depth.append(None)

# Minimum number of samples required to split a node
#min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
#min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
#bootstrap = [True]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               #'max_features': max_features,
               'max_depth': max_depth}#,
               #'min_samples_split': min_samples_split,
               #'min_samples_leaf': min_samples_leaf,
               #'bootstrap': bootstrap}

clf = GridSearchCV(estimator=model, param_grid=random_grid,verbose = 10, scoring = 'roc_auc')
clf.fit(X_train, y_train)

print(clf.best_score_)
print(clf.best_estimator_.alpha)

preds = clf.predict(X_test)
preds_prob = clf.predict_proba(X_test)[:,1]
train_accuracy = accuracy_score(y_train, clf.predict(X_train))
test_accuracy = accuracy_score(y_test, preds)
area_debajo_de_curva = roc_auc_score(y_test, preds_prob)
matriz_de_confusion = confusion_matrix(y_test, preds)
print('train acurracy: ')
print(train_accuracy)
print('test acurracy: ')
print(test_accuracy)
print('Matriz de confusión: ')
print(matriz_de_confusion)
print('Área bajo la curva: ')
print(area_debajo_de_curva)
print()


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] max_depth=8, n_estimators=200 ...................................
[CV]  max_depth=8, n_estimators=200, score=0.8491485573411249, total=   0.4s
[CV] max_depth=8, n_estimators=200 ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


[CV]  max_depth=8, n_estimators=200, score=0.8575696638896869, total=   0.4s
[CV] max_depth=8, n_estimators=200 ...................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.9s remaining:    0.0s


[CV]  max_depth=8, n_estimators=200, score=0.8626486641769606, total=   0.4s
[CV] max_depth=8, n_estimators=355 ...................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.3s remaining:    0.0s


[CV]  max_depth=8, n_estimators=355, score=0.8526981373265157, total=   0.7s
[CV] max_depth=8, n_estimators=355 ...................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    2.1s remaining:    0.0s


[CV]  max_depth=8, n_estimators=355, score=0.8582821028440103, total=   0.7s
[CV] max_depth=8, n_estimators=355 ...................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.9s remaining:    0.0s


[CV]  max_depth=8, n_estimators=355, score=0.8595920712438954, total=   0.7s
[CV] max_depth=8, n_estimators=511 ...................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    3.6s remaining:    0.0s


[CV]  max_depth=8, n_estimators=511, score=0.8528465120525932, total=   1.0s
[CV] max_depth=8, n_estimators=511 ...................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    4.7s remaining:    0.0s


[CV]  max_depth=8, n_estimators=511, score=0.8604309106578569, total=   1.0s
[CV] max_depth=8, n_estimators=511 ...................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    5.8s remaining:    0.0s


[CV]  max_depth=8, n_estimators=511, score=0.8623843723068084, total=   1.0s
[CV] max_depth=8, n_estimators=666 ...................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    6.9s remaining:    0.0s


[CV]  max_depth=8, n_estimators=666, score=0.8524812819576333, total=   1.3s
[CV] max_depth=8, n_estimators=666 ...................................
[CV]  max_depth=8, n_estimators=666, score=0.8591554151106003, total=   1.3s
[CV] max_depth=8, n_estimators=666 ...................................
[CV]  max_depth=8, n_estimators=666, score=0.8620396437805229, total=   1.3s
[CV] max_depth=8, n_estimators=822 ...................................
[CV]  max_depth=8, n_estimators=822, score=0.8527552045288532, total=   1.6s
[CV] max_depth=8, n_estimators=822 ...................................
[CV]  max_depth=8, n_estimators=822, score=0.8593277793737432, total=   1.6s
[CV] max_depth=8, n_estimators=822 ...................................
[CV]  max_depth=8, n_estimators=822, score=0.8606722206262568, total=   1.6s
[CV] max_depth=8, n_estimators=977 ...................................
[CV]  max_depth=8, n_estimators=977, score=0.8526524835646458, total=   1.9s
[CV] max_depth=8, n_estimators=977 

[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:  5.1min finished


0.859321697831282


AttributeError: 'RandomForestClassifier' object has no attribute 'alpha'

In [26]:
print(clf.best_score_)
print(clf.best_estimator_)

preds = clf.predict(X_test)
preds_prob = clf.predict_proba(X_test)[:,1]
train_accuracy = accuracy_score(y_train, clf.predict(X_train))
test_accuracy = accuracy_score(y_test, preds)
area_debajo_de_curva = roc_auc_score(y_test, preds_prob)
matriz_de_confusion = confusion_matrix(y_test, preds)
print('train acurracy: ')
print(train_accuracy)
print('test acurracy: ')
print(test_accuracy)
print('Matriz de confusión: ')
print(matriz_de_confusion)
print('Área bajo la curva: ')
print(area_debajo_de_curva)
print()



0.859321697831282
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=9, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1444, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
train acurracy: 
0.9255079006772009
test acurracy: 
0.770854788877446
Matriz de confusión: 
[[1422  426]
 [  19   75]]
Área bajo la curva: 
0.8526641797918394

