add example

scikit-learn · Nov 14, 2017 · 12cf3a9 · 12cf3a9
1 parent 616f9a2
commit 12cf3a9
Showing 1 changed file with 102 additions and 0 deletions.
diff --git a/examples/neighbors/plot_nca_dim_reduction.py b/examples/neighbors/plot_nca_dim_reduction.py
@@ -0,0 +1,102 @@
+"""
+==============================================================
+Dimensionality Reduction with Neighborhood Components Analysis
+==============================================================
+
+Sample usage of Neighborhood Components Analysis for dimensionality reduction.
+
+This example compares different (linear) dimensionality reduction methods
+applied on the Digits data set. The data set contains images of digits from
+0 to 9 with approximately 180 samples of each class. Each image is of
+dimension 8x8 = 64, and is reduced to a two-dimensional data point.
+
+Principal Component Analysis (PCA) applied to this data identifies the
+combination of attributes (principal components, or directions in the
+feature space) that account for the most variance in the data. Here we
+plot the different samples on the 2 first principal components.
+
+Linear Discriminant Analysis (LDA) tries to identify attributes that
+account for the most variance *between classes*. In particular,
+LDA, in contrast to PCA, is a supervised method, using known class labels.
+
+Neighborhood Components Analysis (NCA) tries to find a feature space such
+that a stochastic nearest neighbor algorithm will give the best accuracy.
+Like LDA, it is a supervised method.
+
+One can see that NCA enforces a clustering of the data that is visually
+meaningful even after the large dimensionality reduction.
+"""
+
+# License: BSD 3 clause
+
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn import datasets
+from sklearn.model_selection import train_test_split
+from sklearn.decomposition import PCA
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+from sklearn.neighbors import KNeighborsClassifier, \
+    NeighborhoodComponentsAnalysis
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import StandardScaler
+
+print(__doc__)
+
+n_neighbors = 3
+random_state = 0
+
+# Load Digits dataset
+digits = datasets.load_digits()
+X, y = digits.data, digits.target
+
+# Split into train/test
+X_train, X_test, y_train, y_test = \
+    train_test_split(X, y, test_size=0.5, stratify=y,
+                     random_state=random_state)
+
+dim = len(X[0])
+n_classes = len(np.unique(y))
+
+# Reduce dimension to 2 with PCA
+pca = make_pipeline(StandardScaler(),
+                    PCA(n_components=2, random_state=random_state))
+
+# Reduce dimension to 2 with LinearDiscriminantAnalysis
+lda = make_pipeline(StandardScaler(),
+                    LinearDiscriminantAnalysis(n_components=2))
+
+# Reduce dimension to 2 with NeighborhoodComponentAnalysis
+nca = make_pipeline(StandardScaler(),
+                    NeighborhoodComponentsAnalysis(n_features_out=2,
+                                                   verbose=1,
+                                                   random_state=random_state))
+
+# Use a nearest neighbor classifier to evaluate the methods
+knn = KNeighborsClassifier(n_neighbors=n_neighbors)
+
+# Make a list of the methods to be compared
+dim_reduction_methods = [('PCA', pca), ('LDA', lda), ('NCA', nca)]
+
+plt.figure()
+for i, (name, model) in enumerate(dim_reduction_methods):
+    plt.subplot(1, 3, i + 1)
+
+    # Fit the method's model
+    model.fit(X_train, y_train)
+
+    # Fit a nearest neighbor classifier on the embedded training set
+    knn.fit(model.transform(X_train), y_train)
+
+    # Compute the nearest neighbor accuracy on the embedded test set
+    acc_knn = knn.score(model.transform(X_test), y_test)
+
+    # Embed the data set in 2 dimensions using the fitted model
+    X_embedded = model.transform(X)
+
+    # Plot the embedding and show the evaluation score
+    plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y)
+    plt.title("{}, KNN (k={})".format(name, n_neighbors))
+    plt.text(0.9, 0.1, '{:.2f}'.format(acc_knn), size=15,
+             ha='center', va='center', transform=plt.gca().transAxes)
+
+plt.show()