In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

# load X and y
# NOTE BorutaPy accepts numpy arrays only, hence the .values attribute
X = pd.read_csv('examples/test_X.csv', index_col=0).values
y = pd.read_csv('examples/test_y.csv', header=None, index_col=0).values
y = y.ravel()


In [2]:
# simple encapsulating class to demonstrate the concept
# BorutaPy would be more sklearn consistent if it implemented these routines directly
class BorutaWithMore(BorutaPy):
    def fit(self, X, y):
        rtn = super(BorutaWithMore, self).fit(X, y)
        self.estimator.fit(self.transform(X), y)
        return rtn
    def score(self, X, y, sample_weight=None):
        return self.estimator.score(self.transform(X), y, sample_weight)
    def predict(self, X):
        return self.estimator.predict(self.transform(X))
        

rf = RandomForestClassifier(n_jobs=-1, class_weight='auto', max_depth=5)

# define Boruta feature selection method
feat_selector = BorutaWithMore(rf, n_estimators='auto', verbose=2, random_state=1)



In [3]:
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25)

In [4]:
feat_selector.fit(train_X, train_y)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	5
Tentative: 	2
Rejected: 	3
Iteration: 	9 / 100
Confirmed: 	5
Tentative: 	2
Rejected: 	3
Iteration: 	10 / 100
Confirmed: 	5
Tentative: 	2
Rejected: 	3
Iteration: 	11 / 100
Confirmed: 	5
Tentative: 	2
Rejected: 	3
Iteration: 	12 / 100
Confirmed: 	5
Tentative: 	2
Rejected: 	3
Iteration: 	13 / 100
Confirmed: 	5
Tentative: 	2
Rejected: 	3
Iteration: 	14 / 100
Confirmed: 	5
Tentative: 	2
Rejected: 	3
Iteration: 	15 / 100
Confirmed: 	5
Tentative: 	2
Rejected: 	3
Iteration: 	16 / 100
Confirmed: 	5
Tentative: 	1
Rejected: 	4
Iteration:

  hits = np.where(cur_imp[0] > imp_sha_max)[0]


BorutaWithMore(alpha=0.05,
        estimator=RandomForestClassifier(bootstrap=True, class_weight='auto', criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=69, n_jobs=-1, oob_score=False,
            random_state=<mtrand.RandomState object at 0x1156183c0>,
            verbose=0, warm_start=False),
        max_iter=100, n_estimators='auto', perc=100,
        random_state=<mtrand.RandomState object at 0x1156183c0>,
        two_step=True, verbose=2)

In [5]:
feat_selector.score(test_X, test_y)

0.98799999999999999

In [6]:
feat_selector.predict(test_X)

array([0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1])

In [7]:
from sklearn import model_selection
cv_score = model_selection.cross_val_score(feat_selector, X=X, y=y, cv=5)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	5
Tentative: 	0
Rejected: 	5


BorutaPy finished running.

Iteration: 	9 / 100
Confirmed: 	5
Tentative: 	0
Rejected: 	5
Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentativ

In [8]:
cv_score

array([ 1.   ,  0.985,  0.98 ,  0.99 ,  0.985])