WIP : added a module for "Label Propagation" #301

Closed
wants to merge 94 commits
Commits (94)
fe62153
added label propagation class
clayw Jul 18, 2011
c69a904
switch map and sum commands to numpy
clayw Jul 20, 2011
f801626
fixing up tests, adding "unlabeled_identifier"
clayw Jul 23, 2011
b0eeaa2
basic features of multiclass labeling up
clayw Jul 26, 2011
9a4a684
fixing the way labeling works
clayw Jul 28, 2011
436f0d3
checking in minor changes
clayw Jul 29, 2011
8c7cc15
added documentation, reworking tests
clayw Jul 30, 2011
071d14b
fixing up tests
clayw Aug 2, 2011
b1f1dcd
added a lot more to label propagation, explained algorithms and diffe…
clayw Aug 5, 2011
ea5a422
more documentation
clayw Aug 5, 2011
e664434
added beginning of examples
clayw Aug 5, 2011
49e6f5e
added "structure" example
clayw Aug 8, 2011
bcf48f9
tweaked structure plot
clayw Aug 8, 2011
d814dbf
finalized SVM comparison example
clayw Aug 8, 2011
840ef6e
all tests pass
clayw Aug 8, 2011
b1abfdc
removed some stuff from documentation
clayw Aug 8, 2011
2c63ba0
updated pydoc to make behaviour clearer
clayw Aug 8, 2011
a9eef34
passed PEP8, using already implemented kernel functions
clayw Aug 9, 2011
d52985e
making everything more numpy compatible
clayw Aug 9, 2011
aa77f82
graph construction and example more numpy-like
clayw Aug 9, 2011
e53ecc7
fixed other diagonal matrix construction
clayw Aug 9, 2011
877068c
rename misnamed "plot" example
clayw Aug 9, 2011
12fc1dc
example conforms to pep8
clayw Aug 9, 2011
6d4231c
other example conforms to pep8
clayw Aug 9, 2011
ac54d65
made test conform to pep8
clayw Aug 9, 2011
88dee75
predict() method now numpy friendly (100% numpy friendly now)
clayw Aug 10, 2011
fef95fc
more numpy integration
clayw Aug 10, 2011
882fa66
removed function kernel, switched to string for picklability
clayw Aug 12, 2011
803e8db
fixed a bug in the circle example
clayw Aug 12, 2011
8353bc1
moved label propagation examples to lower subfolder
clayw Aug 12, 2011
97b537c
more numpy friendliness
clayw Aug 12, 2011
24c0109
more numpy use,
clayw Aug 12, 2011
e7412c6
fine tuned some documentation
clayw Aug 12, 2011
0c03b91
added a snazzy label propagation versus SVM decision boundary plot
clayw Aug 14, 2011
1505c71
added more explanation to the plot
clayw Aug 14, 2011
208a70e
added semi_supervised directory
clayw Aug 14, 2011
6a99439
removed old, useless code
clayw Aug 14, 2011
9d1da53
removed unused imports
clayw Aug 14, 2011
7a0382c
added more documentation, another doctest for LabelSpreading
clayw Aug 14, 2011
8e06576
minor tweaks to the overall layout of the code
clayw Aug 14, 2011
5071a05
reverted plot_iris accidental commit
clayw Aug 14, 2011
0c2196a
added unlabeled_identifier explanation to docstrings
clayw Aug 14, 2011
1fcb4f8
Merge remote-tracking branch 'upstream/master'
clayw Aug 16, 2011
2cd1c83
fixed indentation problem in documentation rst
clayw Aug 16, 2011
2a7af09
conformance to pep8
clayw Aug 16, 2011
84090fc
fixed bug in tests causing gram matrix construction to not work prope…
clayw Aug 16, 2011
a3dc4e7
added two new examples, including an active learning demo with label …
clayw Aug 17, 2011
18dea8a
heavily downsampled digits examples (runtime a few seconds now) and r…
clayw Aug 18, 2011
2412daf
changed doc to remove long runningtime warning
clayw Aug 18, 2011
a31e639
rennamed active learning example so it won't be run for doc compilation
clayw Aug 22, 2011
38de418
changed subplot titles so the plot is more clear
clayw Aug 22, 2011
04f354f
Prettify structure example
vene Aug 23, 2011
1216546
DOC: minor style changes
vene Aug 23, 2011
fb906b4
DOC: tweaks
vene Aug 23, 2011
1aec073
Removed print in digits classification example
vene Aug 23, 2011
45f218c
DOC: fixed links and made examples build
vene Aug 23, 2011
f0caafe
Merge branch 'clayw-label_prop' of github.com:vene/scikit-learn into …
vene Aug 23, 2011
67d185e
DOC: clarified example titles
vene Aug 23, 2011
d1f38c1
fixed structure example
clayw Aug 23, 2011
c90b2d9
added vene's subplot adjustments
clayw Aug 23, 2011
e2cf62a
Merge branch 'new_lp'
clayw Aug 23, 2011
fa339f6
made convergence check function private
clayw Aug 23, 2011
3d6ee4c
fixed spelling error with variable name (indicies -> indices)
clayw Aug 26, 2011
4fd0a3c
optimized _build_graph with inplace methods, conform to standards wit…
clayw Sep 2, 2011
f0d88a8
one more optimization! avoids cast to numpy matrix and does in place …
clayw Sep 3, 2011
4cb9e70
fixed test cases to conform to api changes & new internal parameters
clayw Sep 3, 2011
1c75500
updated docs!
clayw Sep 4, 2011
00c3d4d
Merge git://github.com/scikit-learn/scikit-learn
clayw Sep 4, 2011
2eba5a4
localized a variable
clayw Sep 5, 2011
3e83638
fixed test suite, changed module to conform to new sklearn naming scheme
clayw Sep 5, 2011
472cf5b
fixed examples for new naming scheme
clayw Sep 5, 2011
732f18a
FIX: compat with numpy version lacking the out argument for dot
ogrisel Sep 5, 2011
a8c77e8
ENH: misc style / docstrings improvements
ogrisel Sep 5, 2011
59879d4
merged ogrisel's docs & optimization, also fixed active learning exam…
clayw Sep 5, 2011
ff4a3f3
more enhancements, variable names and test fixes
ogrisel Sep 13, 2011
1a06a46
changed a bunch of variable names, fixed some test cases
clayw Sep 14, 2011
ec149ed
all code works great, all tests pass, full coverage
clayw Sep 14, 2011
fba4c7c
changed a variable name to conform to scikits code
clayw Sep 14, 2011
431723c
correct variable names and added inline comments for active learning …
clayw Sep 14, 2011
3ee7314
added attributes text to explain named attributes
clayw Sep 14, 2011
efddf2a
STY: mostly style + avoid a zip in favor of an np.argsort
agramfort Sep 15, 2011
3a1e98c
STY : in label_propagation.py
agramfort Sep 15, 2011
082c873
ENH : using numpy broadcasting instead of dot_out
agramfort Sep 15, 2011
ac198aa
Merge branch 'master' of git://github.com/scikit-learn/scikit-learn
clayw Sep 21, 2011
2f7e997
Merge branch 'master' of https://github.com/scikit-learn/scikit-learn
clayw Oct 11, 2011
0ac341a
added support for sparse KNN graphs and tests
clayw Oct 12, 2011
5e7d3c9
finishing up sparse additions (need to complete todo)
clayw Oct 12, 2011
24ac2f5
sparse KNN graphs now work
clayw Oct 12, 2011
0faf870
ENH add label propagation algorithm
clayw Jul 18, 2011
fbc1bac
scikits.learn -> sklearn migration in label propagation
larsmans Jan 6, 2012
4c44755
BUG don't pass estimator params to fit in label propagation
larsmans Jan 6, 2012
68444c0
finalized KNN work, all tests pass properly
clayw Jan 10, 2012
fdfb531
Merge branch 'larsmans-label-propagation'
clayw Jan 10, 2012
af0feb9
Merge branch 'master' of https://github.com/scikit-learn/scikit-learn
clayw Jan 10, 2012
14 changes: 14 additions & 0 deletions doc/modules/classes.rst
@@ -415,6 +415,20 @@ From text
kernel_approximation.AdditiveChi2Sampler
kernel_approximation.SkewedChi2Sampler

Label propagation
=================

.. automodule:: scikits.learn.label_propagation
:no-members:
:no-inherited-members:

.. currentmodule:: scikits.learn

.. autosummary::
:toctree: generated/
:template: class.rst
label_propagation.LabelPropagation
label_propagation.LabelSpreading

.. _lda_ref:

50 changes: 50 additions & 0 deletions doc/modules/label_propagation.rst
@@ -0,0 +1,50 @@
.. _label_propagation:

===================================================
Label Propagation
===================================================

`sklearn.semisupervised.label_propagation` contains a few variations of
semi-supervised graph-inference algorithms. In the semi-supervised
classification setting, the learning algorithm is fed both labeled and
unlabeled data. By exploiting the unlabeled data during training, the
algorithm can better learn the overall structure of the data. These algorithms
generally do very well in practice even when given far fewer labeled points
than ordinary classification models require.

A few features available in these models:
* Can be used for classification and regression tasks
* Kernel methods to project data into alternate dimensional spaces

.. currentmodule:: scikits.learn.label_propagation
.. topic:: Input labels for semi-supervised learning

    It is important to assign an identifier to unlabeled points along with the
    labeled data when training the model with the `fit` method.
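A minimal sketch of this convention (assuming ``-1`` as the unlabeled marker,
as in the examples shipped with this pull request)::

    import numpy as np
    from sklearn import label_propagation

    X = np.array([[1.0, 1.0], [1.2, 0.8], [8.0, 8.0], [7.9, 8.1]])
    # the fourth point is unlabeled, marked with -1
    y = np.array([0, 0, 1, -1])

    model = label_propagation.LabelPropagation()
    model.fit(X, y)
    print model.transduction_  # inferred labels for every point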

This module provides two label propagation models: :class:`LabelPropagation` and
:class:`LabelSpreading`. Both work by constructing a similarity graph over all
items in the input dataset. They differ only in the definition of the matrix
that represents the graph and the clamp effect on the label distributions.
:class:`LabelPropagation` is far more intuitive than :class:`LabelSpreading`,
which is motivated by deeper mathematics.
Member
In practice, why would someone choose LabelSpreading over LabelPropagation? Do they have the same computational complexity / scalability behavior? Do they converge to the same solution? If not, what kinds of assumptions do they make about the data?

Also, is this model solving a convex problem with a unique solution, or a problem with potentially several global minima in the objective function, so that random initialization or the order of the samples can lead to different solutions?

Contributor Author
The new documentation should answer all of these questions. I also added some other reference material. Really, the Label Spreading algorithm should outperform Label Propagation in nearly every case, but Label Propagation is more intuitive and it's easier to understand how it works.

Do you think we should remove the Label Propagation class?

Member
If it can serve as a simple baseline for semi-supervised learning then I think we should keep it. A bit like k-NN for supervised learning: it can be used as a sanity check for more advanced algorithms.


Clamping
========

Clamping allows the algorithm to change the weight given to the ground truth
labeled data to some degree. The :class:`LabelPropagation` algorithm performs
hard clamping of input labels, which means :math:`\alpha=1`. This clamping
factor can be relaxed, to say :math:`\alpha=0.8`, which means that we will
always retain 80 percent of our original label distribution, but the algorithm
gets to change its confidence in the distribution within the remaining 20
percent.
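For instance, a minimal sketch of soft clamping (reusing ``X`` and ``y`` from
the sketch above, with ``-1`` marking unlabeled points)::

    from sklearn import label_propagation

    # alpha=0.8: retain 80% of the original label distribution at each step,
    # letting the algorithm redistribute the remaining 20%
    model = label_propagation.LabelSpreading(alpha=0.8)
    model.fit(X, y)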

Examples
========
* :ref:`example_label_propagation_plot_label_propagation_versus_svm_iris.py`
* :ref:`example_label_propagation_structure.py`


References
==========
[1] Yoshua Bengio, Olivier Delalleau, Nicolas Le Roux. "Label Propagation and
Quadratic Criterion." In Semi-Supervised Learning (2006), pp. 193-216.
Member
Do you know if there is an open-access resource (e.g. with a downloadable PDF) available online (typically on the paper's author page) to add as a secondary reference?

Member
Here they are:

BTW on the author page of the first paper it is said that Label Propagation is a precursor to this paper:

A graph-based semi-supervised learning algorithm that creates a graph over labeled and unlabeled examples. More similar examples are connected by edges with higher weights. The intuition is for the labels to propagate on the graph to unlabeled data. The solution can be found with simple matrix operations, and has strong connections to spectral graph theory. [ps.gz] [pdf] [Matlab code] [data]
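For reference, a rough numpy sketch of the iteration that abstract describes
(row-normalized weights, hard clamping of the labeled points; illustrative
only, not the implementation in this PR):

    import numpy as np

    def propagate(W, y, labeled, n_iter=100):
        # W: dense symmetric affinity matrix, y: integer labels,
        # labeled: boolean mask of the points whose labels are known
        classes = np.unique(y[labeled])
        Y = np.zeros((len(y), classes.size))
        Y[labeled, np.searchsorted(classes, y[labeled])] = 1.0
        Y_clamp = Y.copy()
        P = W / W.sum(axis=1)[:, np.newaxis]  # row-normalize: D^-1 W
        for _ in range(n_iter):
            Y = np.dot(P, Y)               # propagate label distributions
            Y[labeled] = Y_clamp[labeled]  # re-clamp the known labels
        return classes[Y.argmax(axis=1)]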

Member
We should really investigate the more recent algorithm, then, especially
given that spectral graph problems can be solved efficiently, and that we
are starting to build fairly good expertise on these problems.

Member
I had the semi-supervised learning book on my shelf but had not read it yet. Diving into it right now.

Member
Also, regarding scalability, it might be possible to improve it by using a graph Laplacian based on a sparse k-NN connectivity matrix instead of a dense n_samples * n_samples heat kernel. But we would need to review the literature first, and we can probably investigate this track later (after the merge of this PR).
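Something like the following could be a starting point (a rough sketch using
`sklearn.neighbors.kneighbors_graph`, untested against this branch):

    from sklearn.neighbors import kneighbors_graph

    # sparse k-NN connectivity matrix: still n_samples x n_samples, but with
    # only n_neighbors nonzeros per row instead of a dense heat kernel
    A = kneighbors_graph(X, n_neighbors=7)
    W = 0.5 * (A + A.T)  # symmetrize so the graph Laplacian is well defined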

1 change: 1 addition & 0 deletions doc/unsupervised_learning.rst
@@ -14,5 +14,6 @@ Unsupervised learning
modules/covariance
modules/outlier_detection
modules/hmm
modules/label_propagation


93 changes: 93 additions & 0 deletions examples/semi_supervised/label_propagation_digits_active_learning.py
@@ -0,0 +1,93 @@
"""
========================================
Label Propagation digits active learning
========================================

Demonstrates an active learning technique to learn handwritten digits
using label propagation.

We start by training a label propagation model with only 10 labeled points,
then we select the top five most uncertain points to label. Next, we train
with 15 labeled points (original 10 + 5 new ones). We repeat this process
four times to have a model trained with 30 labeled examples.

A plot will appear showing the top 5 most uncertain digits for each iteration
of training. These may or may not contain mistakes, but we will train the next
model with their true labels.
"""
print __doc__

import numpy as np
import pylab as pl

from scipy import stats

from sklearn import datasets
from sklearn import label_propagation

from sklearn.metrics import metrics
from sklearn.metrics.metrics import confusion_matrix

digits = datasets.load_digits()
X = digits.data[:330]
y = digits.target[:330]

n_total_samples = len(y)
n_labeled_points = 10

unlabeled_indices = np.arange(n_total_samples)[n_labeled_points:]
f = pl.figure()

for i in range(5):
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1

    lp_model = label_propagation.LabelSpreading(gamma=0.25, max_iter=5)
    lp_model.fit(X, y_train)

    predicted_labels = lp_model.transduction_[unlabeled_indices]
    true_labels = y[unlabeled_indices]

    cm = confusion_matrix(true_labels, predicted_labels,
                          labels=lp_model.unique_labels_)

    print "Label Spreading model: %d labeled & %d unlabeled (%d total)" % \
        (n_labeled_points, n_total_samples - n_labeled_points,
         n_total_samples)

    print metrics.classification_report(true_labels, predicted_labels)

    print "Confusion matrix"
    print cm

    # compute the entropies of transduced label distributions
    pred_entropies = stats.distributions.entropy(
        lp_model.label_distributions_.T)

    # select the five digit examples that the classifier is most uncertain about
    uncertainty_index = np.argsort(pred_entropies)[-5:]

    # keep track of indices that we get labels for
    delete_indices = np.array([], dtype=int)

    f.text(.05, (1 - (i + 1) * .183),
           "model %d\n\nfit with\n%d labels" % ((i + 1), i * 5 + 10), size=10)
    for index, image_index in enumerate(uncertainty_index):
        image = digits.images[image_index]

        sub = f.add_subplot(5, 5, index + 1 + (5 * i))
        sub.imshow(image, cmap=pl.cm.gray_r)
        sub.set_title('predict: %i\ntrue: %i' % (
            lp_model.transduction_[image_index], y[image_index]), size=10)
        sub.axis('off')

        # labeling 5 points, remote from labeled set
        delete_index, = np.where(unlabeled_indices == image_index)
        delete_indices = np.concatenate((delete_indices, delete_index))

    unlabeled_indices = np.delete(unlabeled_indices, delete_indices)
    n_labeled_points += 5

f.suptitle("Active learning with Label Propagation.\nRows show 5 most "
           "uncertain labels to learn with the next model.")
pl.subplots_adjust(0.12, 0.03, 0.9, 0.8, 0.2, 0.45)
pl.show()
96 changes: 96 additions & 0 deletions examples/semi_supervised/label_propagation_versus_svm_iris.py
@@ -0,0 +1,96 @@
"""
================================================
Label Propagation versus SVM on the Iris dataset
================================================

Performance comparison of Label Propagation in the semi-supervised setting
with SVM in the supervised setting on the iris dataset.

The first nine experiments, covering SVM (SVM), Label Propagation (LP), and
Label Spreading (LS), operate in the "inductive setting": the system is
trained with some percentage of the data and then queried on unseen
datapoints to infer a label.

The 10th and final experiment is in the transductive setting. Using a label
spreading algorithm, the system is trained with approximately 24 percent of
the data labeled, and during training, unlabeled points are transductively
assigned values. The test precision, recall, and F1 scores are based on these
transductively assigned labels.
"""
print __doc__

import numpy as np

from sklearn import datasets
from sklearn import svm
from sklearn import label_propagation

from sklearn.metrics.metrics import precision_score
from sklearn.metrics.metrics import recall_score
from sklearn.metrics.metrics import f1_score

rng = np.random.RandomState(0)

iris = datasets.load_iris()

X = iris.data
y = iris.target

# 80% data to keep
hold_80 = rng.rand(len(y)) < 0.8
train, = np.where(hold_80)

# 20% test data
test, = np.where(~hold_80)

X_all = X[train]
y_all = y[train]

svc = svm.SVC(kernel='rbf')
svc.fit(X_all, y_all)
print "Limited Label data example"
print "Test name\tprecision\trecall \tf1"
print "SVM 80.0pct\t%0.6f\t%0.6f\t%0.6f" %\
(precision_score(svc.predict(X[test]), y[test]),
recall_score(svc.predict(X[test]), y[test]),
f1_score(svc.predict(X[test]), y[test]))

print "-------"

for num in [0.2, 0.3, 0.4, 1.0]:
    lp = label_propagation.LabelPropagation()
    hold_new = rng.rand(len(train)) > num
    train_new, = np.where(hold_new)
    y_dup = np.copy(y_all)
    y_dup[train_new] = -1
    lp.fit(X_all, y_dup)
    y_lp_pred = lp.predict(X[test])
    print "LP %0.1fpct\t%0.6f\t%0.6f\t%0.6f" % \
        (80 * num, precision_score(y[test], y_lp_pred),
         recall_score(y[test], y_lp_pred),
         f1_score(y[test], y_lp_pred))

# label spreading
for num in [0.2, 0.3, 0.4, 1.0]:
    lspread = label_propagation.LabelSpreading()
    hold_new = rng.rand(len(train)) > num
    train_new, = np.where(hold_new)
    y_dup = np.copy(y_all)
    y_dup[train_new] = -1
    lspread.fit(X_all, y_dup)
    y_ls_pred = lspread.predict(X[test])
    print "LS %0.1fpct\t%0.6f\t%0.6f\t%0.6f" % \
        (80 * num, precision_score(y[test], y_ls_pred),
         recall_score(y[test], y_ls_pred),
         f1_score(y[test], y_ls_pred))

print "-------"
lspread = label_propagation.LabelSpreading(alpha=0.8)
y_dup = np.copy(y)
hold_new = rng.rand(len(train)) > 0.3
train_new, = np.where(hold_new)
y_dup = np.copy(y)
y_dup[train_new] = -1
lspread.fit(X, y)
trans_result = np.asarray(lspread.transduction_)
print "LS 20tran\t%0.6f\t%0.6f\t%0.6f" % \
(precision_score(trans_result[test], y[test]),
recall_score(trans_result[test], y[test]),
f1_score(trans_result[test], y[test]))
83 changes: 83 additions & 0 deletions examples/semi_supervised/plot_label_propagation_digits.py
@@ -0,0 +1,83 @@
"""
===================================================
Label Propagation digits: Demonstrating performance
===================================================

This example demonstrates the power of semi-supervised learning by
training a Label Spreading model to classify handwritten digits
with sets of very few labels.

The model is trained on the first 330 points of the 1797-point
handwritten digit dataset, but only 30 of them are labeled. The
results, in the form of a confusion matrix and a series of metrics
over each class, will be very good.

At the end, the top 10 most uncertain predictions will be shown.
"""
print __doc__

import numpy as np
import pylab as pl

from scipy import stats

from sklearn import datasets
from sklearn import label_propagation

from sklearn.metrics import metrics
from sklearn.metrics.metrics import confusion_matrix

digits = datasets.load_digits()
X = digits.data[:330]
y = digits.target[:330]

n_total_samples = len(y)
n_labeled_points = 30

indices = np.arange(n_total_samples)

unlabeled_set = indices[n_labeled_points:]

# mark all points beyond the first 30 as unlabeled
y_train = np.copy(y)
y_train[unlabeled_set] = -1

###############################################################################
# Learn with LabelSpreading
lp_model = label_propagation.LabelSpreading(gamma=0.25, max_iter=5)
lp_model.fit(X, y_train)
predicted_labels = lp_model.transduction_[unlabeled_set]
true_labels = y[unlabeled_set]

cm = confusion_matrix(true_labels, predicted_labels,
labels=lp_model.unique_labels_)

print "Label Spreading model: %d labeled & %d unlabeled points (%d total)" % \
(n_labeled_points, n_total_samples - n_labeled_points, n_total_samples)

print metrics.classification_report(true_labels, predicted_labels)

print "Confusion matrix"
print cm

# calculate uncertainty values for each transduced distribution
pred_entropies = stats.distributions.entropy(lp_model.label_distributions_.T)

# pick the top 10 most uncertain labels
uncertainty_index = np.argsort(pred_entropies)[-10:]

###############################################################################
# plot
f = pl.figure(figsize=(7, 5))
for index, image_index in enumerate(uncertainty_index):
    image = digits.images[image_index]

    sub = f.add_subplot(2, 5, index + 1)
    sub.imshow(image, cmap=pl.cm.gray_r)
    pl.xticks([])
    pl.yticks([])
    sub.set_title('predict: %i\ntrue: %i' % (
        lp_model.transduction_[image_index], y[image_index]))

f.suptitle('Learning with small amount of labeled data')
pl.show()