@@ -4,37 +4,35 @@
 =================================================

 This example shows how to perform univariate feature selection before running a
-SVC (support vector classifier) to improve the classification scores.
+SVC (support vector classifier) to improve the classification scores. We use
+the iris dataset (4 features) and add 36 non-informative features. We can find
+that our model achieves best performance when we select around 10% of features.
 """
 print(__doc__)

 import numpy as np
 import matplotlib.pyplot as plt
-from sklearn.datasets import load_digits
+from sklearn.datasets import load_iris
 from sklearn.feature_selection import SelectPercentile, chi2
 from sklearn.model_selection import cross_val_score
 from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
 from sklearn.svm import SVC


 # #############################################################################
 # Import some data to play with
-X, y = load_digits(return_X_y=True)
-# Throw away data, to be in the curse of dimension settings
-X = X[:200]
-y = y[:200]
-n_samples = len(y)
-X = X.reshape((n_samples, -1))
-# add 200 non-informative features
-X = np.hstack((X, 2 * np.random.random((n_samples, 200))))
+X, y = load_iris(return_X_y=True)
+# Add non-informative features
+np.random.seed(0)
+X = np.hstack((X, 2 * np.random.random((X.shape[0], 36))))

 # #############################################################################
-# Create a feature-selection transform and an instance of SVM that we
+# Create a feature-selection transform, a scaler and an instance of SVM that we
 # combine together to have an full-blown estimator

-transform = SelectPercentile(chi2)
-
-clf = Pipeline([('anova', transform), ('svc', SVC(gamma="auto"))])
+clf = Pipeline([('anova', SelectPercentile(chi2)),
+                ('scaler', StandardScaler()),
+                ('svc', SVC(gamma="auto"))])

 # #############################################################################
 # Plot the cross-validation score as a function of percentile of features
@@ -44,17 +42,15 @@

 for percentile in percentiles:
     clf.set_params(anova__percentile=percentile)
-    # Compute cross-validation score using 1 CPU
-    this_scores = cross_val_score(clf, X, y, cv=5, n_jobs=1)
+    this_scores = cross_val_score(clf, X, y, cv=5)
     score_means.append(this_scores.mean())
     score_stds.append(this_scores.std())

 plt.errorbar(percentiles, score_means, np.array(score_stds))
-
 plt.title(
     'Performance of the SVM-Anova varying the percentile of features selected')
+plt.xticks(np.linspace(0, 100, 11, endpoint=True))
 plt.xlabel('Percentile')
-plt.ylabel('Prediction rate')
-
+plt.ylabel('Accuracy Score')
 plt.axis('tight')
 plt.show()
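
A minimal standalone sketch of what the new version of the example sets up (not part of the commit; it reuses only the imports and objects visible in the diff above). It fixes the percentile at 10, the value the docstring singles out, and prints which of the 40 columns (4 iris features plus 36 noise columns) the chi2 selector keeps:

# Hedged sketch, not part of the commit: rebuild the data and pipeline from the
# new version of the file and inspect the columns retained at percentile=10.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

np.random.seed(0)
X, y = load_iris(return_X_y=True)
# 4 informative iris features (columns 0-3) plus 36 noise columns, as in the example
X = np.hstack((X, 2 * np.random.random((X.shape[0], 36))))

clf = Pipeline([('anova', SelectPercentile(chi2, percentile=10)),
                ('scaler', StandardScaler()),
                ('svc', SVC(gamma="auto"))])
clf.fit(X, y)

# get_support(indices=True) returns the column indices kept by the selector;
# 10% of 40 columns is 4 columns, ideally the original iris features 0-3.
print(clf.named_steps['anova'].get_support(indices=True))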