Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP

Loading…

ENH method to index ParameterGrid points by parameter values #1842

Closed
wants to merge 4 commits into from

1 participant

@jnothman
Owner

This extension intends to solve #1020 for regular grids: it provides an index into the grid points output by ParameterGrid, such that results from GridSearchCV (particularly when returned as an array as in #1787) can be accessed by parameter value.

For example, I could get the mean test_score for each 'max_depth' value:

grid_search = GridSearchCV(clf, {'max_depth': range(1, 5), 'max_features': range(1, 3)})
grid_search.fit(...)
grid = ParameterGrid(grid_search.param_grid) # should we make it easier to get the ParameterGrid?
keys, index = grid.build_index(['max_depth'], ravel=True)
indexed = np.array(candidate.mean_validation_score for candidate in [grid_search.cv_scores_])[index]
print(indexed.mean(axis=1))

This PR aims to account for most of the functionality of #1034 within the current parameter search framework. The examples from there should be adopted.

@jnothman
Owner

Accessing this functionality from GridSearchCV should probably be less verbose...

@jnothman
Owner

Closing due to lack of interest.

@jnothman jnothman closed this
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
This page is out of date. Refresh to see the latest.
View
5 examples/svm/plot_rbf_parameters.py
@@ -28,7 +28,7 @@
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from sklearn.cross_validation import StratifiedKFold
-from sklearn.grid_search import GridSearchCV
+from sklearn.grid_search import GridSearchCV, ParameterGrid
##############################################################################
# Load and prepare data set
@@ -110,7 +110,8 @@
# We extract just the scores
scores = [x[1] for x in score_dict]
-scores = np.array(scores).reshape(len(C_range), len(gamma_range))
+_, index = ParameterGrid(grid.param_grid).build_index(['C', 'gamma'])
+scores = np.asarray(scores)[index]
# draw heatmap of accuracy as a function of gamma and C
pl.figure(figsize=(8, 6))
View
80 sklearn/grid_search.py
@@ -47,11 +47,20 @@ class ParameterGrid(object):
Examples
--------
>>> from sklearn.grid_search import ParameterGrid
- >>> param_grid = {'a':[1, 2], 'b':[True, False]}
- >>> list(ParameterGrid(param_grid)) #doctest: +NORMALIZE_WHITESPACE
+ >>> param_grid = ParameterGrid({'a':[1, 2], 'b':[True, False]})
+ >>> list(param_grid) #doctest: +NORMALIZE_WHITESPACE
[{'a': 1, 'b': True}, {'a': 1, 'b': False},
{'a': 2, 'b': True}, {'a': 2, 'b': False}]
+ Using `build_index` to access points by parameter value:
+ >>> import numpy
+ >>> order, index = param_grid.build_index(['b', 'a'])
+ >>> index #doctest: +NORMALIZE_WHITESPACE
+ array([[0, 2], [1, 3]])
+ >>> numpy.asarray(list(param_grid))[index] #doctest: +NORMALIZE_WHITESPACE
+ array([[{'a': 1, 'b': True}, {'a': 2, 'b': True}],
+ [{'a': 1, 'b': False}, {'a': 2, 'b': False}]], dtype=object)
+
See also
--------
:class:`GridSearchCV`:
@@ -77,18 +86,77 @@ def __iter__(self):
"""
for p in self.param_grid:
# Always sort the keys of a dictionary, for reproducibility
- items = sorted(p.items())
+ items = self._ordered_items(p)
keys, values = zip(*items)
for v in product(*values):
params = dict(zip(keys, v))
yield params
+ @staticmethod
+ def _ordered_items(p):
+ return sorted(p.items())
+
def __len__(self):
"""Number of points on the grid."""
# Product function that can handle iterables (np.product can't).
- product = partial(reduce, operator.mul)
- return sum(product(len(v) for v in p.values())
- for p in self.param_grid)
+ return sum(self._len(p) for p in self.param_grid)
+
+ @staticmethod
+ def _len(params, product=partial(reduce, operator.mul)):
+ """Number of points for a single param dict"""
+ return product(len(v) for v in params.values())
+
+ def build_index(self, order=(), grid=0, ravel=False):
+ """Build an index over grid points by parameter values.
+
+ Parameters
+ ----------
+ `order` : sequence of strings, optional
+ Parameter names corresponding to the first axes of the returned
+ index. Any remaining parameters will be returned in arbitrary
+ order.
+ `grid` : integer, default 0
+ The grid index if an array of grids are represented.
+ `ravel` : boolean, default False
+ When True, only parameters listed in `order` are assigned their own
+ dimension, so the output index has `len(order) + 1` dimensions.
+ This simplifies aggregating over the indexed data grouped by
+ selected parameters.
+
+ Returns
+ -------
+ `order` : sequence of strings
+ Parameter names corresponding to axes in `index`. Where `ravel` is
+ True, the final axis is not included.
+ `index` : array
+ An integer index into the list of grid points, such that each
+ parameter corresponds to an axis.
+ """
+ n = self._len(self.param_grid[grid])
+ offset = sum(self._len(p) for p in self.param_grid[:grid])
+ keys, values = zip(*self._ordered_items(self.param_grid[grid]))
+ index = np.arange(offset, offset + n).reshape([len(v) for v in values])
+
+ if order:
+ axis_order = []
+ for key in order:
+ try:
+ axis_order.append(keys.index(key))
+ except IndexError:
+ raise ValueError('Key {!r} unknown'.format(key))
+ if len(axis_order) > len(set(axis_order)):
+ raise ValueError('order contains duplicate keys')
+ axis_order.extend(i for i in xrange(len(keys))
+ if i not in axis_order)
+
+ keys = np.asarray(keys)[axis_order]
+ index = index.transpose(axis_order)
+
+ if ravel:
+ keys = keys[:len(order)]
+ index = index.reshape(index.shape[:len(order)] + (-1,))
+
+ return list(keys), index
class IterGrid(ParameterGrid):
View
70 sklearn/tests/test_grid_search.py
@@ -124,6 +124,76 @@ def test_parameter_grid():
for x, y in product(params2["foo"], params2["bar"])))
+def test_build_index():
+ params = {"foo": [4, 2],
+ "bar": ["ham", "spam", "eggs"]}
+ grid = ParameterGrid(params)
+
+ keys, index1a = grid.build_index(['foo', 'bar'])
+ assert_equal(keys, ['foo', 'bar'])
+ assert_equal(index1a.shape, (2, 3))
+
+ keys, index1b = grid.build_index(['foo'])
+ assert_equal(keys, ['foo', 'bar'])
+ assert_array_equal(index1a, index1b)
+
+ keys, index2a = grid.build_index(['bar', 'foo'])
+ assert_equal(keys, ['bar', 'foo'])
+ assert_equal(index2a.shape, (3, 2))
+
+ keys, index2b = grid.build_index(['bar'])
+ assert_equal(keys, ['bar', 'foo'])
+ assert_array_equal(index2a, index2b)
+
+ keys, index = grid.build_index() # dimensions are implementation-specific?
+ assert_true(keys == ['foo', 'bar'] or keys == ['bar', 'foo'])
+ assert_true(np.all(index == index1a) or np.all(index == index2a))
+
+ points = np.asarray(list(grid))
+ points[index1a]
+ assert_array_equal(
+ points[index1a].ravel(),
+ np.asarray([{"foo": x, "bar": y}
+ for x, y in product(params["foo"], params["bar"])]))
+ assert_array_equal(
+ points[index2a].ravel(),
+ np.asarray([{"foo": y, "bar": x}
+ for x, y in product(params["bar"], params["foo"])]))
+
+ # Test bad `order`s
+ assert_raises(ValueError, grid.build_index, ['foo', 'foo'])
+ assert_raises(ValueError, grid.build_index, ['argh'])
+
+ # Test a list of grids and the `grid` parameter
+
+ grid2 = ParameterGrid([params, {'foo': [3]}])
+
+ keys, index1c = grid2.build_index(['foo'])
+ assert_equal(keys, ['foo', 'bar'])
+ assert_array_equal(index1a, index1c)
+
+ keys, index1d = grid2.build_index(['foo'], grid=0)
+ assert_equal(keys, ['foo', 'bar'])
+ assert_array_equal(index1a, index1d)
+
+ keys, index3 = grid2.build_index(['foo'], grid=1)
+ assert_equal(keys, ['foo'])
+ assert_array_equal(index3, np.array([len(grid2) - 1]))
+
+ params3 = {"foo": [4, 2],
+ "bar": ["ham", "spam", "eggs"],
+ "jon": [2.0, 1.0]}
+ grid3 = ParameterGrid(params3)
+ keys, index = grid3.build_index(['foo'])
+ assert_true(keys == ['foo', 'bar', 'jon'] or keys == ['foo', 'jon', 'bar'])
+ keys, index_ravel = grid3.build_index(['foo'], ravel=True)
+ assert_equal(keys, ['foo'])
+ assert_equal(index.ndim, 3)
+ assert_equal(index_ravel.ndim, 2)
+ for row, row_ravel in zip(index, index_ravel):
+ assert_equal(sorted(row.flat), sorted(row_ravel))
+
+
def test_grid_search():
"""Test that the best estimator contains the right value for foo_param"""
clf = MockClassifier()
Something went wrong with that request. Please try again.