forked from scikit-image/scikit-image
-
Notifications
You must be signed in to change notification settings - Fork 0
/
plot_haar_extraction_selection_classification.py
171 lines (138 loc) · 6.94 KB
/
plot_haar_extraction_selection_classification.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
"""
======================================================
Face classification using Haar-like feature descriptor
======================================================
Haar-like feature descriptors were successfully used to implement the first
real-time face detector [1]_. Inspired by this application, we propose an
example illustrating the extraction, selection, and classification of Haar-like
features to detect faces vs. non-faces.

Notes
-----

This example relies on scikit-learn for feature selection and classification.

References
----------

.. [1] Viola, Paul, and Michael J. Jones. "Robust real-time face
       detection." International journal of computer vision 57.2
       (2004): 137-154.
       http://www.merl.com/publications/docs/TR2004-043.pdf
       :DOI:`10.1109/CVPR.2001.990517`
"""
import sys
from time import time
import numpy as np
import matplotlib.pyplot as plt
from dask import delayed
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from skimage.data import lfw_subset
from skimage.transform import integral_image
from skimage.feature import haar_like_feature
from skimage.feature import haar_like_feature_coord
from skimage.feature import draw_haar_like_feature
###########################################################################
# The procedure to extract the Haar-like features from an image is relatively
# simple. Firstly, a region of interest (ROI) is defined. Secondly, the
# integral image within this ROI is computed. Finally, the integral image is
# used to extract the features.
@delayed
def extract_feature_image(img, feature_type, feature_coord=None):
    """Extract the Haar-like features of a single image.

    The whole image is used as the region of interest: its integral image
    is computed and handed to ``haar_like_feature`` together with the
    full extent of the image.

    Parameters
    ----------
    img : ndarray
        Image to describe. Assumed to be 2D with shape (rows, columns)
        -- TODO confirm against callers.
    feature_type : str or sequence of str
        Haar-like feature type(s); forwarded unchanged to
        ``haar_like_feature``.
    feature_coord : optional
        Coordinates of the features to extract; forwarded unchanged to
        ``haar_like_feature``. Defaults to ``None``.

    Returns
    -------
    The result of ``haar_like_feature`` for the whole image. Because of the
    ``delayed`` decorator, calling this function only builds a lazy task;
    the value is produced when the task is computed.
    """
    ii = integral_image(img)
    n_rows, n_cols = ii.shape[0], ii.shape[1]
    # ``haar_like_feature`` expects (int_image, r, c, width, height): the
    # width is the number of columns and the height the number of rows.
    # This matches the ``width=images.shape[2], height=images.shape[1]``
    # convention used for ``haar_like_feature_coord`` in this example, so the
    # extracted features stay aligned with the feature coordinates even for
    # non-square images.
    return haar_like_feature(ii, 0, 0, n_cols, n_rows,
                             feature_type=feature_type,
                             feature_coord=feature_coord)
###########################################################################
# We use a subset of the CBCL dataset which is composed of 100 face images
# and 100 non-face images. Each image has been resized to a ROI of 19 by 19
# pixels. We select 75 images from each group to train a classifier and
# determine the most salient features. The remaining 25 images from each
# class are used to assess the performance of the classifier.

images = lfw_subset()
# NOTE(review): the loader is named `lfw_subset`; confirm whether the text
# above should refer to LFW rather than CBCL, and confirm the stated ROI size.

# Only two feature types are extracted, which keeps the example fast
feature_types = ['type-2-x', 'type-2-y']

# Describe the per-image extraction as a lazy Dask graph; nothing is
# computed yet, which lets several CPU cores be used at evaluation time
lazy_features = delayed(
    extract_feature_image(img, feature_types) for img in images
)

# Evaluate the graph with the threaded scheduler and time the evaluation
tic = time()
X = np.array(lazy_features.compute(scheduler='threads'))
time_full_feature_comp = time() - tic

# Labels: the first 100 images get label 1 and the last 100 get label 0
# (presumably faces first -- confirm against the ordering of `lfw_subset`)
labels = [1] * 100 + [0] * 100
y = np.array(labels)

# Stratified split: 150 training samples, the remainder is used for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=150, random_state=0, stratify=y
)

# Enumerate the coordinates and types of all possible features for the
# image size and the selected feature types
feature_coord, feature_type = haar_like_feature_coord(
    width=images.shape[2],
    height=images.shape[1],
    feature_type=feature_types,
)
###########################################################################
# A random forest classifier can be trained in order to select the most
# salient features, specifically for face classification. The idea is to
# determine which features are most often used by the ensemble of trees.
# By using only the most salient features in subsequent steps, we can
# drastically speed up the computation while retaining accuracy.

# Configure the random forest, then fit it on the full feature set while
# timing the training
forest_params = dict(n_estimators=1000, max_depth=None, max_features=100,
                     n_jobs=-1, random_state=0)
clf = RandomForestClassifier(**forest_params)
t_start = time()
clf.fit(X_train, y_train)
time_full_train = time() - t_start

# Score the held-out samples using column 1 of the predicted probabilities
test_scores = clf.predict_proba(X_test)[:, 1]
auc_full_features = roc_auc_score(y_test, test_scores)

# Feature indices ordered from most to least important
idx_sorted = np.argsort(clf.feature_importances_)[::-1]

# Draw each of the six most important features on top of the first image
fig, axes = plt.subplots(3, 2)
for rank, ax in enumerate(axes.ravel()):
    top_coord = feature_coord[idx_sorted[rank]]
    overlay = draw_haar_like_feature(images[0], 0, 0,
                                     images.shape[2],
                                     images.shape[1],
                                     [top_coord])
    ax.imshow(overlay)
    ax.set_xticks([])
    ax.set_yticks([])

fig.suptitle('The most important features')
###########################################################################
# We can select the most important features by checking the cumulative sum
# of the feature importance. In this example, we keep the features
# representing 70% of the cumulative value (a small fraction of the total
# number of features; the exact count and percentage are printed below).

# Normalized cumulative importance, accumulated from the most important
# feature to the least important one
sorted_importances = clf.feature_importances_[idx_sorted]
cdf_feature_importances = np.cumsum(sorted_importances)
cdf_feature_importances /= np.max(cdf_feature_importances)

# Number of leading features whose cumulative importance stays below 70%
below_threshold = cdf_feature_importances < 0.7
sig_feature_count = np.count_nonzero(below_threshold)
n_features_total = len(cdf_feature_importances)
sig_feature_percent = round(sig_feature_count / n_features_total * 100, 1)
message = ('{} features, or {}%, account for 70% of branch points in the '
           'random forest.')
print(message.format(sig_feature_count, sig_feature_percent))

# Keep the coordinates and types of the most informative features only
top_idx = idx_sorted[:sig_feature_count]
feature_coord_sel = feature_coord[top_idx]
feature_type_sel = feature_type[top_idx]
# Note: the same features could be picked directly from the columns of X;
# recomputing them here illustrates how `feature_coord` and `feature_type`
# are used to extract only a desired subset of features.

# Build the lazy Dask graph for the restricted feature set
lazy_subset = delayed(
    extract_feature_image(img, feature_type_sel, feature_coord_sel)
    for img in images
)

# Evaluate the graph and time the evaluation
t_start = time()
X = np.array(lazy_subset.compute(scheduler='threads'))
time_subs_feature_comp = time() - t_start

# Rebuild the labels and redo the same stratified split on the new features
labels = [1] * 100 + [0] * 100
y = np.array(labels)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=150, random_state=0, stratify=y
)
###########################################################################
# Once the features are extracted, we can train and test a new classifier.

# Refit the classifier on the restricted feature set and time the training
t_start = time()
clf.fit(X_train, y_train)
time_subs_train = time() - t_start

# Score the held-out samples using column 1 of the predicted probabilities
subset_scores = clf.predict_proba(X_test)[:, 1]
auc_subs_features = roc_auc_score(y_test, subset_scores)

# Report extraction time, training time and AUC for both feature sets
summary_template = ('Computing the full feature set took {:.3f}s, plus {:.3f}s '
                    'training, for an AUC of {:.2f}. Computing the restricted '
                    'feature set took {:.3f}s, plus {:.3f}s training, '
                    'for an AUC of {:.2f}.')
summary = summary_template.format(
    time_full_feature_comp, time_full_train, auc_full_features,
    time_subs_feature_comp, time_subs_train, auc_subs_features,
)
print(summary)

plt.show()