#  Bagging and Random Forest for Imbalanced Classification

1. sklearn.cross_validation.cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, verbose=0, fit_params=None, pre_dispatch=‘2*n_jobs’): 交叉验证函数  
estimator: 数据对象  
X: 数据  
y: 预测数据  
scoring: 调用的方法  
cv: 交叉验证生成器或可迭代的次数  
n_jobs: 同时工作的cpu个数(-1带代表全部)  
verbose: 详细程度  
fit_params: 传递给估计器的拟合方法的参数  
per_dispatch: 控制并行执行期间调度的作业数量。减少这个数量对于避免在CPU发送更多作业时CPU内存消耗的扩大是有用的，该参数可以是:   
    *  没有， 在这种情况下，所有的工作立即创建并产生。将其用于轻量级和快速运行的作业，以避免由于按需产生作业而导致延迟
    *   一个int, 给出所产生的总工作的确切数量
    *   一个字符串，给出一个表达式作为n_jobs的函数， 如'2 * n_jobs'
2. python sklearn中的random_state: 随机数种子。







In [0]:
#* bagged decision trees on an imbalanced classification problem
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC

#* generate dataset
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=4)
#* define model
model = BaggingClassifier().fit(X, y)
#* define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
#* evaluate model
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
#* summarize performance
print("Mean ROC AUC: {:.3f}".format(mean(scores)))

# model.predict([[0, 1]])

Mean ROC AUC: 0.869


In [0]:
#* bagged decision trees with random undersampling for imbalanced classification
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.ensemble import BalancedBaggingClassifier

#* generate dataset
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=4)
#* define model
model = BalancedBaggingClassifier().fit(X,y)
#* define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
#* evaluate model
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
#* summarize performace
print("Mean ROC AUC: {:.3f}".format(mean(scores)))


In [0]:
#* random forest for imbalanced classification
from  numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
#* generate dataset
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=4)
#* define model
model = RandomForestClassifier(n_estimators=10).fit(X, y)
#* define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
#* evaluate model
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
#* summarize performance
print('Mean ROC AUC: {:.3f}'.format(mean(scores)))

Mean ROC AUC: 0.869


In [12]:
#* class balanced random forest for imbalanced classification
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier

#* generate dataset
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=4)
#* define model
model = RandomForestClassifier(n_estimators=10, class_weight='balanced')
#* define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
#* evaluate model
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
#* summarize performance
print("Mean ROC AUC: {:.3f}".format(mean(scores)))

Mean ROC AUC: 0.876


In [11]:
#* bootstrap class balanced random forest for imbalanced classification
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier

#* generate dataset
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, n_clusters_per_class=1, weights=[0.99], flip_y=0,  random_state=4)
#* define model
model = RandomForestClassifier(n_estimators=10, class_weight='balanced_subsample')
#* define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
#* evaluate model
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
#* summarize performance
print("Mean ROC AUC: {:.3f}".format(mean(scores)))

Mean ROC AUC: 0.874


In [29]:
#* random forest with random undersampling for imbalanced classification
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

#* generate dataset
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=4)
#* define model
model = RandomForestClassifier(n_estimators=10).fit(X, y)
#* define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
#* evaluate model
Y = model.predict(X)
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
#* summarize performance
# print(scores)
print("Mean ROC AUC: {:.3f}".format(mean(scores)))

Mean ROC AUC: 0.874


In [31]:
#* easy ensemble for imbalanced classification
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.ensemble import EasyEnsembleClassifier

#* generate dataset
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=4)
#* define model
model = EasyEnsembleClassifier(n_estimators=10)
#* define evalualtion procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
#* evaluate model
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
#* summarize performance
print("Mean ROC AUC: {:.3f}".format(mean(scores)))

Mean ROC AUC: 0.960
