
[MRG+2] Adding return_std options for models in linear_model/bayes.py #7838

Merged Dec 1, 2016 (32 commits; this view shows the changes from 13 commits).

Commits:
c19e2c9  initial commit for return_std (sergeyf, Nov 7, 2016)
2fad2d5  initial commit for return_std (sergeyf, Nov 7, 2016)
a6c0bf3  adding tests, examples, ARD predict_std (sergeyf, Nov 7, 2016)
f92a860  adding tests, examples, ARD predict_std (sergeyf, Nov 7, 2016)
4bae33d  a smidge more documentation (sergeyf, Nov 7, 2016)
ea9fad4  a smidge more documentation (sergeyf, Nov 7, 2016)
25c457e  Missed a few PEP8 issues (sergeyf, Nov 7, 2016)
b905a23  Changing predict_std to return_std #1 (sergeyf, Nov 7, 2016)
0a3ccd2  Changing predict_std to return_std #2 (sergeyf, Nov 7, 2016)
5634ee2  Changing predict_std to return_std #3 (sergeyf, Nov 7, 2016)
e817de3  Changing predict_std to return_std final (sergeyf, Nov 7, 2016)
806818a  adding better plots via polynomial regression (sergeyf, Nov 8, 2016)
2f0bd32  trying to fix flake error (sergeyf, Nov 8, 2016)
21ba9d5  fix to ARD plotting issue (sergeyf, Nov 8, 2016)
df3038a  fixing some flakes (sergeyf, Nov 8, 2016)
5d9739d  Two blank lines part 1 (sergeyf, Nov 8, 2016)
542de0b  Two blank lines part 2 (sergeyf, Nov 8, 2016)
a552022  More newlines! (sergeyf, Nov 8, 2016)
b9c55df  Even more newlines (sergeyf, Nov 8, 2016)
b9f7319  adding info to the doc string for the two plot files (sergeyf, Nov 10, 2016)
0cd9f5c  Rephrasing "polynomial" for Bayesian Ridge Regression (sergeyf, Nov 16, 2016)
8eaa4c7  Updating "polynomia" for ARD (sergeyf, Nov 16, 2016)
ba1c2c6  Adding more formal references (sergeyf, Nov 16, 2016)
3599b57  Another asked-for improvement to doc string. (sergeyf, Nov 16, 2016)
6a615f1  Fixing flake8 errors (sergeyf, Nov 16, 2016)
0ded8b7  Cleaning up the tests a smidge. (sergeyf, Nov 20, 2016)
1e1392c  A few more flakes (sergeyf, Nov 21, 2016)
f7e31f1  requested fixes from Andy (sergeyf, Nov 24, 2016)
039ae83  Mini bug fix (sergeyf, Nov 24, 2016)
092a569  Final pep8 fix (sergeyf, Nov 30, 2016)
561ef01  pep8 fix round 2 (sergeyf, Nov 30, 2016)
5bb4080  Fix beta_ to alpha_ in the comments (sergeyf, Dec 1, 2016)
29 changes: 27 additions & 2 deletions examples/linear_model/plot_ard.py
@@ -54,8 +54,8 @@
ols.fit(X, y)

###############################################################################
-# Plot the true weights, the estimated weights and the histogram of the
-# weights
+# Plot the true weights, the estimated weights, the histogram of the
+# weights, and predictions with standard deviations
plt.figure(figsize=(6, 5))
plt.title("Weights of the model")
plt.plot(clf.coef_, color='darkblue', linestyle='-', linewidth=2,
@@ -81,4 +81,29 @@
plt.plot(clf.scores_, color='navy', linewidth=2)
plt.ylabel("Score")
plt.xlabel("Iterations")

# Plotting some predictions for polynomial regression
def f(x, noise_amount):
    """ function to approximate by polynomial interpolation"""
    y = np.sqrt(x) * np.sin(x)
    noise = np.random.normal(0, 1, len(x))
    return y + noise_amount * noise

degree = 30
X = np.linspace(0, 10, 100)
y = f(X, noise_amount=1)
clf_poly = ARDRegression()
clf_poly.fit(np.vander(X, degree), y)

X_plot = np.linspace(0, 11, 25)
y_plot = f(X_plot, noise_amount=0)
y_mean, y_std = clf_poly.predict(np.vander(X_plot, degree), return_std=True)
plt.figure(figsize=(6, 5))
plt.errorbar(X_plot, y_mean, y_std, color='navy',
             label="Polynomial Bayesian Ridge Regression", linewidth=2)
plt.plot(X_plot, y_plot, color='gold', linewidth=2,
         label="Ground Truth")
plt.ylabel("Output y")
plt.xlabel("Feature X")
plt.legend(loc="lower left")
plt.show()

Review comment (Member, on the errorbar label): Incorrect label for ARD. This example fits ARDRegression, yet the label reads "Polynomial Bayesian Ridge Regression".
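A minimal sketch of what the requested fix would look like (the exact label wording is my guess, not taken from the PR):

    plt.errorbar(X_plot, y_mean, y_std, color='navy',
                 label="Polynomial ARD Regression", linewidth=2)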
28 changes: 27 additions & 1 deletion examples/linear_model/plot_bayesian_ridge.py
@@ -51,7 +51,8 @@
ols.fit(X, y)

###############################################################################
-# Plot true weights, estimated weights and histogram of the weights
+# Plot true weights, estimated weights, histogram of the weights, and
+# predictions with standard deviations
lw = 2
plt.figure(figsize=(6, 5))
plt.title("Weights of the model")
@@ -77,4 +78,29 @@
plt.plot(clf.scores_, color='navy', linewidth=lw)
plt.ylabel("Score")
plt.xlabel("Iterations")

# Plotting some predictions for polynomial regression
def f(x, noise_amount):
    """ function to approximate by polynomial interpolation"""
    y = np.sqrt(x) * np.sin(x)
    noise = np.random.normal(0, 1, len(x))
    return y + noise_amount * noise

degree = 10
X = np.linspace(0, 10, 100)
y = f(X, noise_amount=0.1)
clf_poly = BayesianRidge()
clf_poly.fit(np.vander(X, degree), y)

X_plot = np.linspace(0, 11, 25)
y_plot = f(X_plot, noise_amount=0)
y_mean, y_std = clf_poly.predict(np.vander(X_plot, degree), return_std=True)
plt.figure(figsize=(6, 5))
plt.errorbar(X_plot, y_mean, y_std, color='navy',
             label="Polynomial Bayesian Ridge Regression", linewidth=lw)
plt.plot(X_plot, y_plot, color='gold', linewidth=lw,
         label="Ground Truth")
plt.ylabel("Output y")
plt.xlabel("Feature X")
plt.legend(loc="lower left")
plt.show()
84 changes: 83 additions & 1 deletion sklearn/linear_model/bayes.py
@@ -91,6 +91,9 @@ class BayesianRidge(LinearModel, RegressorMixin):
lambda_ : float
    estimated precision of the weights.

sigma_ : array, shape = (n_features, n_features)
    estimated variance-covariance matrix of the weights

scores_ : float
    if computed, value of the objective function (to be maximized)
@@ -144,6 +147,8 @@ def fit(self, X, y):
X, y = check_X_y(X, y, dtype=np.float64, y_numeric=True)
X, y, X_offset, y_offset, X_scale = self._preprocess_data(
X, y, self.fit_intercept, self.normalize, self.copy_X)
self.X_offset = X_offset
self.X_scale = X_scale

Review comment (Member): Should be X_offset_ and X_scale_ (scikit-learn's convention gives attributes estimated from data a trailing underscore).
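The requested rename would presumably look like this (a sketch, not the merged code):

    self.X_offset_ = X_offset
    self.X_scale_ = X_scale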
n_samples, n_features = X.shape

# Initialization of the values of the parameters
@@ -216,10 +221,48 @@ def fit(self, X, y):
self.alpha_ = alpha_
self.lambda_ = lambda_
self.coef_ = coef_
sigma_ = np.dot(Vh.T,
                Vh / (eigen_vals_ + lambda_ / alpha_)[:, None])
self.sigma_ = (1. / alpha_) * sigma_

Review comment (Member): I prefer np.newaxis over None.

self._set_intercept(X_offset, y_offset, X_scale)
return self
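An aside on the np.newaxis preference above: np.newaxis is literally an alias for None, so the two spellings behave identically; a minimal illustration:

    import numpy as np

    assert np.newaxis is None
    v = np.arange(3.0)
    # Both index expressions append an axis, turning shape (3,) into (3, 1).
    assert v[:, None].shape == v[:, np.newaxis].shape == (3, 1)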

def predict(self, X, return_std=False):
    """Predict using the linear model. In addition to the mean of the
    predictive distribution, also its standard deviation can be returned.

    See: http://www.utstat.toronto.edu/~rsalakhu/sta4273/notes/Lecture2.pdf
    Slide 15, titled "Predictive Distribution"
    Russ's beta is our self.beta_
    Russ's alpha is our self.lambda_

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = (n_samples, n_features)
        Samples.

    return_std : boolean, optional
        Whether to return the standard deviation of posterior prediction.

    Returns
    -------
    y_mean : array, shape = (n_samples,)
        Mean of predictive distribution of query points.

    y_std : array, shape = (n_samples,)
        Standard deviation of predictive distribution of query points.
    """
    y_mean = self._decision_function(X)
    if return_std is False:
        return y_mean
    else:
        if self.normalize:
            X = (X - self.X_offset) / self.X_scale
        sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1)
        y_std = np.sqrt(sigmas_squared_data + (1. / self.alpha_))
        return y_mean, y_std

Review comments (Member):
- On the docstring opening: this is not PEP 257 (https://www.python.org/dev/peps/pep-0257/); please add an empty line after the first sentence.
- On the lecture-notes link: please add the reference to the class docstring instead.
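For reference, the quantity computed by predict with return_std=True is the standard predictive distribution of Bayesian linear regression (my transcription of the cited slide, written with this class's attribute names):

    p(y_\ast \mid x_\ast, X, y) = \mathcal{N}\bigl(y_\ast \mid x_\ast^\top \mathtt{coef\_},\ \sigma^2(x_\ast)\bigr),
    \qquad
    \sigma^2(x_\ast) = \frac{1}{\mathtt{alpha\_}} + x_\ast^\top\, \mathtt{sigma\_}\, x_\ast

The code's sigmas_squared_data is the x_\ast^\top \mathtt{sigma\_}\, x_\ast term evaluated row-wise, and 1/alpha_ adds the estimated noise variance, so y_std is \sigma(x_\ast).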


###############################################################################
# ARD (Automatic Relevance Determination) regression
@@ -417,7 +460,7 @@ def fit(self, X, y):
s = (lambda_1 * np.log(lambda_) - lambda_2 * lambda_).sum()
s += alpha_1 * log(alpha_) - alpha_2 * alpha_
s += 0.5 * (fast_logdet(sigma_) + n_samples * log(alpha_) +
            np.sum(np.log(lambda_)))
(whitespace-only change: this continuation line was re-indented)
s -= 0.5 * (alpha_ * rmse_ + (lambda_ * coef_ ** 2).sum())
self.scores_.append(s)

@@ -434,3 +477,42 @@ def fit(self, X, y):
self.lambda_ = lambda_
self._set_intercept(X_offset, y_offset, X_scale)
return self

def predict(self, X, return_std=False):
    """Predict using the linear model. In addition to the mean of the
    predictive distribution, also its standard deviation can be returned.

    See: http://www.utstat.toronto.edu/~rsalakhu/sta4273/notes/Lecture2.pdf
    Slide 15, titled "Predictive Distribution"
    Russ's beta is our self.beta_
    Russ's alpha is our self.lambda_
    ARD is only a little different: only dimensions/features for which
    self.lambda_ < self.threshold_lambda are kept and the rest are
    discarded.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = (n_samples, n_features)
        Samples.

    return_std : boolean, optional
        Whether to return the standard deviation of posterior prediction.

    Returns
    -------
    y_mean : array, shape = (n_samples,)
        Mean of predictive distribution of query points.

    y_std : array, shape = (n_samples,)
        Standard deviation of predictive distribution of query points.
    """
    y_mean = self._decision_function(X)
    if return_std is False:
        return y_mean
    else:
        if self.normalize:
            X = (X - self.X_offset) / self.X_scale
        X = X[:, self.lambda_ < self.threshold_lambda]
        sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1)
        y_std = np.sqrt(sigmas_squared_data + (1. / self.alpha_))
        return y_mean, y_std

Review comments (Member):
- Same PEP 257 comment about the docstring as for BayesianRidge.predict.
- It would be great to have this and a more formal reference in the class docstring, formatted as we usually do for references.
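A minimal usage sketch of the new API (the data here is illustrative, not from the PR):

    import numpy as np
    from sklearn.linear_model import ARDRegression

    rng = np.random.RandomState(0)
    X = rng.random_sample((50, 5))
    y = np.dot(X, [1.0, 0.0, 1.0, -1.0, 0.0]) + 1.0 + 0.1 * rng.randn(50)

    clf = ARDRegression()
    clf.fit(X, y)

    # return_std=True makes predict return both the posterior mean and the
    # per-sample standard deviation of the predictive distribution.
    y_mean, y_std = clf.predict(X, return_std=True)

    # ARD keeps only features with lambda_ below threshold_lambda; the same
    # mask is applied inside predict before computing the variance term.
    kept = clf.lambda_ < clf.threshold_lambda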
52 changes: 52 additions & 0 deletions sklearn/linear_model/tests/test_bayes.py
@@ -56,3 +56,55 @@ def test_toy_ard_object():
# Check that the model could approximately learn the identity function
test = [[1], [3], [4]]
assert_array_almost_equal(clf.predict(test), [1, 3, 4], 2)


Review comment (Member, on test_return_std_bayesian): If the tests are the same (might have overlooked something?), why not do them both in the same test?

def test_return_std_bayesian():
    def f(X):
        return np.dot(X, w) + b

    def f_noise(X):
        return f(X) + np.random.randn(X.shape[0])*noise_mult

    d = 5
    n_train = 50
    n_test = 10

    noise_mult = 0.1
    w = np.array([1.0, 0.0, 1.0, -1.0, 0.0])
    b = 1.0

    X = np.random.random((n_train, d))
    X_test = np.random.random((n_test, d))
    y = f_noise(X)

    m1 = BayesianRidge()
    m1.fit(X, y)
    X_test = np.random.random((n_test, d))
    y_mean, y_std = m1.predict(X_test, return_std=True)
    assert_array_almost_equal(y_std, 0.1, decimal=1)


def test_return_std_ard():
    def f(X):
        return np.dot(X, w) + b

    def f_noise(X):
        return f(X) + np.random.randn(X.shape[0])*noise_mult

    d = 5
    n_train = 50
    n_test = 10

    noise_mult = 0.1
    w = np.array([1.0, 0.0, 1.0, -1.0, 0.0])
    b = 1.0

    X = np.random.random((n_train, d))
    X_test = np.random.random((n_test, d))
    y = f_noise(X)

    m1 = ARDRegression()
    m1.fit(X, y)
    X_test = np.random.random((n_test, d))
    y_mean, y_std = m1.predict(X_test, return_std=True)
    assert_array_almost_equal(y_std, 0.1, decimal=1)

Review thread (on the "*noise_mult" multiplication in f_noise):
Member: is that pep8 without space around *? hm
Contributor Author: [reply not loaded in this view]
Member: fair

Review thread (on noise_mult = 0.1):
Member: does this work if you do a for-loop over multiple noise_mult?
Contributor Author: Works fine for Bayesian Ridge, but unfortunately ARD behaves oddly because it gets rid of a bunch of dimensions. It gets it MOSTLY right: if you set noise_mult = 1.0, ARD will get 1.2 or 1.1, which is good enough as an estimate of the noise standard deviation, but I'd have to bump decimal up to 0 for the tests to still pass with such a large noise. I'll do that for the next commit unless you have another suggestion.
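A sketch of the consolidation the first reviewer suggests, one test looping over both estimators (my illustration of the idea, not the code that was eventually merged):

    import numpy as np
    from sklearn.linear_model import ARDRegression, BayesianRidge
    from sklearn.utils.testing import assert_array_almost_equal


    def test_return_std():
        d, n_train, n_test = 5, 50, 10
        w = np.array([1.0, 0.0, 1.0, -1.0, 0.0])
        b = 1.0
        noise_mult = 0.1

        X = np.random.random((n_train, d))
        y = np.dot(X, w) + b + np.random.randn(n_train) * noise_mult
        X_test = np.random.random((n_test, d))

        # Both models expose the same predict(..., return_std=True) API, so
        # one check covers each of them.
        for clf in [BayesianRidge(), ARDRegression()]:
            clf.fit(X, y)
            y_mean, y_std = clf.predict(X_test, return_std=True)
            # The predictive std should roughly recover the injected noise level.
            assert_array_almost_equal(y_std, noise_mult, decimal=1)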