Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PR for packaging, Python3 Support, and Issues #3 and #4 #5

Merged
merged 6 commits into from
Apr 10, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
10 changes: 9 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
*.pyc
*~
dev/

# Compiled python modules.
*.pyc

# Setuptools distribution folder.
/dist/

# Python egg metadata, regenerated from source files by setuptools.
/*.egg-info
2 changes: 2 additions & 0 deletions boruta/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .boruta_py import BorutaPy
from .boruta_py2 import BorutaPyPlus
33 changes: 23 additions & 10 deletions boruta_py.py → boruta/boruta_py.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@

License: BSD 3 clause
"""
from __future__ import print_function, division

import numpy as np
import scipy as sp
import pandas as pd
from statsmodels.sandbox.stats.multicomp import multipletests as multicor
from sklearn.utils import check_X_y
from bottleneck import nanrankdata
Expand Down Expand Up @@ -228,6 +230,8 @@ def fit_transform(self, X, y, weak=False):
def _fit(self, X, y):
# check input params
self._check_params(X, y)
# if pandas cast to numpy
X = self._check_pandas(X)

# setup variables for Boruta
n_sample, n_feat = X.shape
Expand Down Expand Up @@ -313,13 +317,16 @@ def _fit(self, X, y):
iter_ranks = nanrankdata(imp_history_rejected, axis=1)
rank_medians = np.nanmedian(iter_ranks, axis=0)
ranks = nanrankdata(rank_medians)
# set smallest rank to 3 if there are tentative feats
if tentative.shape[0] > 0:
ranks = ranks - np.min(ranks) + 3
else:
# and 2 otherwise
ranks = ranks - np.min(ranks) + 2
self.ranking_[not_selected] = ranks

# update rank for not_selected features
if not_selected.shape[0] > 0:
# set smallest rank to 3 if there are tentative feats
if tentative.shape[0] > 0:
ranks = ranks - np.min(ranks) + 3
else:
# and 2 otherwise
ranks = ranks - np.min(ranks) + 2
self.ranking_[not_selected] = ranks

# notify user
if self.verbose > 0:
Expand All @@ -339,6 +346,12 @@ def _transform(self, X, weak=False):
X = X[:, self.support_]
return X

def _check_pandas(self, X):
    """
    Return X as a numpy array if it is a pandas DataFrame.

    Parameters
    ----------
    X : pandas.DataFrame or any other object
        The feature matrix passed in by _fit.

    Returns
    -------
    The values of the DataFrame as a numpy.ndarray, or X itself,
    untouched, when it is not a DataFrame.
    """
    if isinstance(X, pd.DataFrame):
        # DataFrame.as_matrix() was deprecated in pandas 0.23 and removed
        # in pandas 1.0; .values returns the same ndarray on both old and
        # new pandas versions.
        return X.values
    return X

def _check_params(self, X, y):
X, y = check_X_y(X, y)
multi_corr_methods = ['bonferroni', 'sidak', 'holm-sidak', 'holm',
Expand Down Expand Up @@ -373,7 +386,7 @@ def _print_results(self, dec_reg, iter, flag):
content = map(str, [n_iter, n_confirmed, n_tentative, n_rejected])
result = '\n'.join([x[0] +'\t' + x[1] for x in zip(cols, content)])
output = "\n\nBorutaPy finished running.\n\n" + result
print output
print(output)


def _get_tree_num(self, n_feat):
Expand All @@ -383,9 +396,9 @@ def _get_tree_num(self, n_feat):
# how many times a feature should be considered on average
f_repr = 100
# 2 because the training matrix is extended with n shadow features
multi = ((n_feat * 2) / float(np.sqrt(n_feat * 2) * depth))
# depth must stay inside the denominator: trees ~ 2n / (sqrt(2n) * depth).
# Without the inner parentheses the quotient is multiplied by depth instead,
# which inflates the tree count by a factor of depth ** 2.
multi = ((n_feat * 2) / (np.sqrt(n_feat * 2) * depth))
n_estimators = int(multi * f_repr)
return int(n_estimators)
return n_estimators

def _get_imp(self, X, y):
try:
Expand Down
46 changes: 33 additions & 13 deletions boruta_py2.py → boruta/boruta_py2.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,17 @@

License: BSD 3 clause
"""
from __future__ import print_function, division

import numpy as np
import scipy as sp
import pandas as pd
from statsmodels.sandbox.stats.multicomp import multipletests as multicor
from sklearn.utils import check_X_y
from bottleneck import nanrankdata

class BorutaPy2(object):

class BorutaPyPlus(object):
"""
2nd improved Python implementation of the Boruta R package. This version
modified the core of the algorithm based on lots of benchmarking using
Expand Down Expand Up @@ -216,7 +219,10 @@ def fit_transform(self, X, y, weak=False):

def _fit(self, X, y):
# check input params

self._check_params(X, y)
# if pandas cast to numpy
X = self._check_pandas(X)

# setup variables for Boruta
n_sample, n_feat = X.shape
Expand All @@ -241,6 +247,7 @@ def _fit(self, X, y):
while np.any(dec_reg == 0) and iter < self.max_iter:
# find optimal number of trees and depth
if self.n_estimators == 'auto':
# number of features that aren't rejected
not_rejected = np.where(dec_reg >= 0)[0].shape[0]
n_tree = self._get_tree_num(not_rejected)
self.estimator.set_params(n_estimators=n_tree)
Expand Down Expand Up @@ -303,13 +310,16 @@ def _fit(self, X, y):
iter_ranks = nanrankdata(imp_history_rejected, axis=1)
rank_medians = np.nanmedian(iter_ranks, axis=0)
ranks = nanrankdata(rank_medians)
# set smallest rank to 3 if there are tentative feats
if tentative.shape[0] > 0:
ranks = ranks - np.min(ranks) + 3
else:
# and 2 otherwise
ranks = ranks - np.min(ranks) + 2
self.ranking_[not_selected] = ranks

# update rank for not_selected features
if not_selected.shape[0] > 0:
# set smallest rank to 3 if there are tentative feats
if tentative.shape[0] > 0:
ranks = ranks - np.min(ranks) + 3
else:
# and 2 otherwise
ranks = ranks - np.min(ranks) + 2
self.ranking_[not_selected] = ranks

# notify user
if self.verbose > 0:
Expand All @@ -335,10 +345,10 @@ def _get_tree_num(self, n_feat):
depth = 10
# how many times a feature should be considered on average
f_repr = 100
# 2 because the training matrix is extended with n shadow features
multi = ((n_feat * 2) / float(np.sqrt(n_feat * 2) * depth))
# n_feat * 2 because the training matrix is extended with n shadow features
multi = ((n_feat * 2) / (np.sqrt(n_feat * 2) * depth))
n_estimators = int(multi * f_repr)
return (n_estimators)
return n_estimators

def _get_imp(self, X, y):
try:
Expand Down Expand Up @@ -419,8 +429,18 @@ def _do_tests(self, dec_reg, hit_reg, iter):

return dec_reg

def _check_pandas(self, X):
    """
    Return X as a numpy array if it is a pandas DataFrame.

    Parameters
    ----------
    X : pandas.DataFrame or any other object
        The feature matrix passed in by _fit.

    Returns
    -------
    The values of the DataFrame as a numpy.ndarray, or X itself,
    untouched, when it is not a DataFrame.
    """
    if isinstance(X, pd.DataFrame):
        # DataFrame.as_matrix() was deprecated in pandas 0.23 and removed
        # in pandas 1.0; .values returns the same ndarray on both old and
        # new pandas versions.
        return X.values
    return X
def _check_params(self, X, y):
X, y = check_X_y(X, y)
"""
Check hyperparameters as well as X and y before proceeding with fit. Raise errors as needed
Input X, y
Returns None
"""
X, y = check_X_y(X, y) # check X and y are consistent len, X is Array and y is column

if self.perc <= 0 or self.perc > 100:
raise ValueError('The percentile should be between 0 and 100.')
Expand Down Expand Up @@ -449,4 +469,4 @@ def _print_results(self, dec_reg, iter, flag):
content = map(str, [n_iter, n_confirmed, n_tentative, n_rejected])
result = '\n'.join([x[0] +'\t' + x[1] for x in zip(cols, content)])
output = "\n\nBorutaPy finished running.\n\n" + result
print output
print(output)