In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Load the breast cancer dataset once and build both frames from the same
# bunch (previously the loader was called twice for the same data).
init_data = load_breast_cancer()
X = pd.DataFrame(data=init_data['data'], columns=init_data['feature_names'])
y = pd.DataFrame(data=init_data['target'], columns=['label'])

# Split into training and testing sets (30% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Train a RandomForestClassifier on every available feature.
forest = RandomForestClassifier(criterion='entropy',
                                n_estimators=200,
                                random_state=1,
                                n_jobs=2)
# y_train is a single-column DataFrame; flatten it to the 1-D array passed to fit.
forest.fit(X_train, y_train.values.ravel())

y_pred = forest.predict(X_test)
# Score once and reuse the value for both reports.
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f' % accuracy)
# Accuracy divided by the number of input columns.
print('Accuracy per feature: %.2f' % (accuracy / X.shape[1]))

Accuracy: 0.98
Accuracy per feature: 0.03


In [2]:
from sklearn.feature_selection import SelectFromModel
# Importance score the fitted forest assigns to each input feature.
importances = forest.feature_importances_
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Wrap the already-fitted forest (prefit=True) in a selector with a fixed
# importance threshold, then reduce X to the columns it keeps.
sfm = SelectFromModel(forest, threshold=0.117, prefit=True)
X_data_selec = sfm.transform(X)

# Print the top-ranked features, one line per column kept by the selector.
n_selected = X_data_selec.shape[1]
for rank, feat_idx in enumerate(indices[:n_selected], start=1):
    print("%2d) %-*s %f" % (rank, 30,
                            X.columns.values[feat_idx],
                            importances[feat_idx]))

 1) worst concave points           0.141849
 2) mean concave points            0.117697


In [3]:
# Split the reduced feature matrix using the same test_size and random_state
# as the full-feature split.
X_data_selec_train, X_data_selec_test, y_data_selec_train, y_data_selec_test = train_test_split(
    X_data_selec, y, test_size=0.3, random_state=0)

# Train a separate RandomForestClassifier on the selected features only.
forest_data_selec = RandomForestClassifier(criterion='entropy',
                                           n_estimators=200,
                                           random_state=1,
                                           n_jobs=2)
# Fit the NEW estimator. The original code refit `forest` here, which left
# `forest_data_selec` unused and overwrote the full-feature model that the
# feature-selection cell depends on.
forest_data_selec.fit(X_data_selec_train, y_data_selec_train.values.ravel())

y_data_selec_pred = forest_data_selec.predict(X_data_selec_test)
# Score once and reuse the value for both reports.
accuracy_data_selec = accuracy_score(y_data_selec_test, y_data_selec_pred)
print('Accuracy after data selection: %.2f' % accuracy_data_selec)
# Accuracy divided by the number of selected columns.
print('Accuracy per feature after data selection: %.2f' % (accuracy_data_selec / X_data_selec.shape[1]))

Accuracy after data selection: 0.89
Accuracy per feature after data selection: 0.44
