In [1]:
# define a small classification dataset
from sklearn.datasets import make_classification

# define dataset
X, y = make_classification(n_samples=1000, n_features=5, n_informative=2, n_redundant=3, random_state=1)

# summarize the shape of the dataset
print(X.shape, y.shape)

(1000, 5) (1000,)


In [2]:
# evaluate a decision tree on the entire small dataset
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier

# define dataset
X, y = make_classification(n_samples=1000, n_features=3, n_informative=2, n_redundant=1, random_state=1)

# define model
model = DecisionTreeClassifier()

# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# evaluate model
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

# report result
print('Mean Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Mean Accuracy: 0.810 (0.032)


In [3]:
# feature selection by enumerating all possible subsets of features
from itertools import product
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier

# define dataset
X, y = make_classification(n_samples=1000, n_features=5, n_informative=2, n_redundant=3, random_state=1)

# determine the number of columns
n_cols = X.shape[1]

best_subset, best_score = None, 0.0

# enumerate all combinations of input features
for subset in product([True, False], repeat=n_cols):
    
    # convert into column indexes
    ix = [i for i, x in enumerate(subset) if x]
    
    # check for now column (all False)
    if len(ix) == 0:
        continue
        
    # select columns
    X_new = X[:, ix]
    
    # define model
    model = DecisionTreeClassifier()
    
    # define evaluation procedure
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    
    # evaluate model
    scores = cross_val_score(model, X_new, y, scoring='accuracy', cv=cv, n_jobs=-1)
    
    # summarize scores
    result = mean(scores)
    
    # report progress
    print('>f(%s) = %f ' % (ix, result))
    
    # check if it is better than the best so far
    if best_score is None or result >= best_score:
        
        # better result
        best_subset, best_score = ix, result
        
# report best
print('Done!')
print('f(%s) = %f' % (best_subset, best_score))

>f([0, 1, 2, 3, 4]) = 0.820000 
>f([0, 1, 2, 3]) = 0.827333 
>f([0, 1, 2, 4]) = 0.818667 
>f([0, 1, 2]) = 0.819667 
>f([0, 1, 3, 4]) = 0.821667 
>f([0, 1, 3]) = 0.825667 
>f([0, 1, 4]) = 0.808667 
>f([0, 1]) = 0.815333 
>f([0, 2, 3, 4]) = 0.824333 
>f([0, 2, 3]) = 0.827333 
>f([0, 2, 4]) = 0.826333 
>f([0, 2]) = 0.818667 
>f([0, 3, 4]) = 0.826667 
>f([0, 3]) = 0.822000 
>f([0, 4]) = 0.816000 
>f([0]) = 0.639333 
>f([1, 2, 3, 4]) = 0.822000 
>f([1, 2, 3]) = 0.819000 
>f([1, 2, 4]) = 0.822333 
>f([1, 2]) = 0.821333 
>f([1, 3, 4]) = 0.818667 
>f([1, 3]) = 0.822000 
>f([1, 4]) = 0.806667 
>f([1]) = 0.797000 
>f([2, 3, 4]) = 0.830667 
>f([2, 3]) = 0.755333 
>f([2, 4]) = 0.831667 
>f([2]) = 0.516667 
>f([3, 4]) = 0.826667 
>f([3]) = 0.514333 
>f([4]) = 0.777667 
Done!
f([2, 4]) = 0.831667


In [4]:
# define a large classification dataset
from sklearn.datasets import make_classification

# define dataset
X, y = make_classification(n_samples=10000, n_features=500, n_informative=10, n_redundant=490, random_state=1)

# summarize the shape of the dataset
print(X.shape, y.shape)

(10000, 500) (10000,)


In [5]:
# evaluate a decision tree on the entire larger dataset
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

# define dataset
X, y = make_classification(n_samples=10000, n_features=500, n_informative=10, n_redundant=490, random_state=1)

# define model
model = DecisionTreeClassifier()

# define evaluation procedure
cv = StratifiedKFold(n_splits=3)

# evaluate model
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

# report result
print('Mean Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Mean Accuracy: 0.916 (0.002)


In [6]:
# stochastic optimization for feature selection
from numpy import mean
from numpy.random import rand
from numpy.random import choice
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# objective function
def objective(X, y, subset):
    
    # convert into column indexes
    ix = [i for i, x in enumerate(subset) if x]
    
    # check for now column (all False)
    if len(ix) == 0:
        return 0.0
    
    # select columns
    X_new = X[:, ix]
    
    # define model
    model = DecisionTreeClassifier()
    
    # evaluate model
    scores = cross_val_score(model, X_new, y, scoring='accuracy', cv=3, n_jobs=-1)
    
    # summarize scores
    result = mean(scores)
    return result, ix

# mutation operator
def mutate(solution, p_mutate):
    
    # make a copy
    child = solution.copy()
    for i in range(len(child)):
        
        # check for a mutation
        if rand() < p_mutate:
            
            # flip the inclusion
            child[i] = not child[i]
    return child

# hill climbing local search algorithm
def hillclimbing(X, y, objective, n_iter, p_mutate):
    
    # generate an initial point
    solution = choice([True, False], size=X.shape[1])
    
    # evaluate the initial point
    solution_eval, ix = objective(X, y, solution)
    
    # run the hill climb
    for i in range(n_iter):
        
        # take a step
        candidate = mutate(solution, p_mutate)
        
        # evaluate candidate point
        candidate_eval, ix = objective(X, y, candidate)
        
        # check if we should keep the new point
        if candidate_eval >= solution_eval:
            
            # store the new point
            solution, solution_eval = candidate, candidate_eval
            
        # report progress
        print('>%d f(%s) = %f' % (i+1, len(ix), solution_eval))
    return solution, solution_eval

# define dataset
X, y = make_classification(n_samples=10000, n_features=500, n_informative=10, n_redundant=490, random_state=1)

# define the total iterations
n_iter = 100

# probability of including/excluding a column
p_mut = 10.0 / 500.0

# perform the hill climbing search
subset, score = hillclimbing(X, y, objective, n_iter, p_mut)

# convert into column indexes
ix = [i for i, x in enumerate(subset) if x]
print('Done!')
print('Best: f(%d) = %f' % (len(ix), score))

>1 f(260) = 0.907800
>2 f(255) = 0.907800
>3 f(265) = 0.907800
>4 f(267) = 0.907800
>5 f(262) = 0.907800
>6 f(262) = 0.908299
>7 f(264) = 0.908299
>8 f(260) = 0.908299
>9 f(256) = 0.908299
>10 f(260) = 0.908399
>11 f(260) = 0.911099
>12 f(259) = 0.911099
>13 f(263) = 0.911099
>14 f(255) = 0.911099
>15 f(255) = 0.911099
>16 f(258) = 0.911099
>17 f(260) = 0.911099
>18 f(260) = 0.911099
>19 f(266) = 0.911099
>20 f(261) = 0.911099
>21 f(263) = 0.911099
>22 f(263) = 0.911099
>23 f(253) = 0.911099
>24 f(260) = 0.911099
>25 f(254) = 0.911099
>26 f(262) = 0.911099
>27 f(260) = 0.911200
>28 f(260) = 0.911200
>29 f(261) = 0.911200
>30 f(258) = 0.911200
>31 f(260) = 0.911200
>32 f(264) = 0.912300
>33 f(265) = 0.912300
>34 f(265) = 0.912399
>35 f(263) = 0.912399
>36 f(267) = 0.912399
>37 f(262) = 0.912399
>38 f(260) = 0.912399
>39 f(265) = 0.912399
>40 f(262) = 0.912399
>41 f(261) = 0.912399
>42 f(261) = 0.912399
>43 f(261) = 0.912399
>44 f(272) = 0.912399
>45 f(266) = 0.912399
>46 f(262) = 0.9123