[View in Colaboratory](https://colab.research.google.com/github/Kneeplay/Classification_RandomForest/blob/Machine-Learning/Random_Forest_Hypertune.ipynb)

In [1]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving SPYV3.csv to SPYV3.csv
User uploaded file "SPYV3.csv" with length 3186566 bytes


In [2]:
import pandas as pd
import io
spy = pd.read_csv(io.StringIO(uploaded[fn].decode('utf-8')), sep=',', usecols=['CLASIFICADOR','1','45','68','75','139','171','179','187'])
                                                       
                                                       
spy.head()

Unnamed: 0,CLASIFICADOR,1,45,68,75,139,171,179,187
0,1,2.16,1.5526,0.95,15.58,0.9,-0.07,0.98,14.64
1,1,2.13,1.5573,0.99,14.55,1.0,-0.07,1.05,14.64
2,1,2.1,1.5436,0.99,14.4,1.01,-0.06,1.05,14.64
3,1,2.07,1.6172,0.97,14.33,1.0,-0.06,0.97,14.64
4,1,2.05,1.5535,0.97,14.42,1.02,-0.06,0.86,14.64


In [6]:
# División del conjunto en train y test

p_train = 0.75 # Porcentaje de train. Modificar para obtener diferentes conjuntos.

train = spy[:int((len(spy))*p_train)]
test = spy[int((len(spy))*p_train):]

print("Ejemplos usados para entrenar: ", len(train))
print("Ejemplos usados para test: ", len(test))
print("\n")

features = spy.columns[1:]
x_train = train[features]
y_train = train['CLASIFICADOR']

x_test = test[features]
y_test = test['CLASIFICADOR']

Ejemplos usados para entrenar:  2086
Ejemplos usados para test:  696




In [7]:
# Utilización de RandomizedSearchCV para busqueda de hiperparámetros

import numpy as np
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, make_scorer
import warnings

warnings.filterwarnings('ignore') 

X, y = x_train, y_train # Datos de entrenamiento

clf = RandomForestClassifier(n_jobs=-1) # Construcción del clasificador

#Construcción de la métrica

metrica = make_scorer(precision_score, greater_is_better=True, average="binary") 
                     
def report(results, n_top=1): # Función para mostrar resultados
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

# Parámetros y distribuciones para muestrear
param_grid = { 
    'n_estimators': [128],
    'max_features': ['auto', 'sqrt', 'log2',7,6,5,4,3,2,1, None],
    "max_depth": [20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,None],
    "min_samples_split": sp_randint(2, 130),
    "min_samples_leaf": sp_randint(1, 130),
    'bootstrap': ['True', 'False'], 'class_weight': ['balanced', None],
    'criterion' :['gini', 'entropy'], 'n_jobs': [-1],
    "random_state": [15], "min_weight_fraction_leaf": [0,0.05,0.10,0.15,0.20,0.25,0.30, 0.35,0.40,0.45,0.50],
    "max_leaf_nodes": [20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,None]}

n_iter_search = 512 # Ejecución
random_search = RandomizedSearchCV(clf, scoring= metrica, 
                                   param_distributions=param_grid, 
                                   n_iter=n_iter_search)
                                   

random_search.fit(X, y)
report(random_search.cv_results_)

Model with rank: 1
Mean validation score: 0.830 (std: 0.061)
Parameters: {'bootstrap': 'True', 'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 16, 'max_features': 5, 'max_leaf_nodes': 20, 'min_samples_leaf': 97, 'min_samples_split': 68, 'min_weight_fraction_leaf': 0, 'n_estimators': 128, 'n_jobs': -1, 'random_state': 15}



In [8]:
# Creación del modelo Random Forest con los parámetros obtenidos

clf_rf = RandomForestClassifier(n_estimators = 1024, criterion = 'entropy', 
                                max_depth =16, max_features = 5, 
                                min_samples_leaf = 97, min_samples_split = 68, 
                                bootstrap=True, oob_score=False, n_jobs=-1, 
                                class_weight= 'balanced', random_state=15, 
                                max_leaf_nodes=20,
                                min_weight_fraction_leaf= 0)

clf_rf.fit(x_train, y_train) # Construcción del modelo

preds_rf = clf_rf.predict(x_test) # Test del modelo

# Visualización de resultados

from sklearn.metrics import classification_report

print("Random Forest: \n" 
      +classification_report(y_true=test['CLASIFICADOR'], y_pred=preds_rf))

# Matriz de confusión

print("Matriz de confusión:\n")
matriz = pd.crosstab(test['CLASIFICADOR'], preds_rf, rownames=['actual'], colnames=['preds'])
print(matriz)

Random Forest: 
             precision    recall  f1-score   support

          0       0.49      0.79      0.61       166
          1       0.92      0.74      0.82       530

avg / total       0.82      0.75      0.77       696

Matriz de confusión:

preds     0    1
actual          
0       131   35
1       136  394
