In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
import numpy as np

# Load the breast_cancer dataset once and reuse the returned bunch for both
# the feature matrix and the target, instead of reading the data a second time.
init_data = load_breast_cancer()
X = pd.DataFrame(data=init_data['data'], columns=init_data['feature_names'])
# Target as a Series named 'label' (no intermediate one-column DataFrame needed).
y = pd.Series(init_data['target'], name='label')

# Split X into training and testing sets (70% / 30%, fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Train a RandomForestClassifier on all features; its feature_importances_
# are what the feature-elimination step below ranks on.
forest = RandomForestClassifier(criterion='entropy',
                                n_estimators=200,
                                random_state=1,
                                n_jobs=2)
forest.fit(X_train, y_train)

# Rank the features from least to most important according to the fitted forest.
importances = forest.feature_importances_
indices = np.argsort(importances)

# The search stops as soon as accuracy / (number of remaining features),
# rounded to two decimals, equals this value.
TARGET_ACCURACY_PER_FEATURE = 0.44

# Backward elimination: drop one feature per iteration (least important first),
# re-split with the same seed, retrain an identically configured forest and
# evaluate it on the held-out 30%.
X_refine = X
# Stop one short of the total feature count so at least one column always
# remains; an empty feature set cannot be fitted and would divide by zero below.
for f in range(X_train.shape[1] - 1):
    # `columns=` keyword: pandas >= 2.0 no longer accepts `axis` positionally.
    X_refine = X_refine.drop(columns=X.columns[indices[f]])
    X_train_refine, X_test_refine, y_train_refine, y_test_refine = train_test_split(
        X_refine, y, test_size=0.3, random_state=0)
    forest_refine = RandomForestClassifier(criterion='entropy',
                                           n_estimators=200,
                                           random_state=1,
                                           n_jobs=2)
    forest_refine.fit(X_train_refine, y_train_refine)
    y_pred_refine = forest_refine.predict(X_test_refine)

    # Compute the score once and reuse it for both the check and the report.
    accuracy = accuracy_score(y_test_refine, y_pred_refine)
    accuracy_per_feature = accuracy / X_refine.shape[1]
    if round(accuracy_per_feature, 2) == TARGET_ACCURACY_PER_FEATURE:
        print('Accuracy: %.2f' % accuracy)
        print('Accuracy per feature: %.2f' % accuracy_per_feature)
        break

Accuracy: 0.89
Accuracy per feature: 0.44
