Merge pull request #98 from leesteinberg/master
Incorporate bounds into covering class, satisfying Issue #97. This also incorporates commits from PR #87.
Nathaniel Saul committed Jul 12, 2018
2 parents 974c9fb + 46a9c77 commit 27f7167
Showing 3 changed files with 170 additions and 91 deletions.
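The headline change: `Cover` now accepts a `limits` argument fixing the cover's bounds per dimension. A minimal usage sketch (the 1-d lens and values below are illustrative, not taken from this commit):

import numpy as np
from kmapper.cover import Cover

# A toy 1-d lens, with the index column that define_bins expects in column 0.
lens = np.random.rand(100, 1)
data = np.c_[np.arange(lens.shape[0]), lens]

# Pin the cover to [0, 1] regardless of the observed lens range; an
# np.float('inf') entry would instead fall back to that dimension's data min/max.
cover = Cover(n_cubes=10, perc_overlap=0.2, limits=np.array([[0.0, 1.0]]))
cubes = list(cover.define_bins(data))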
65 changes: 61 additions & 4 deletions kmapper/cover.py
@@ -16,7 +16,19 @@ def __init__(self,
perc_overlap=0.2,
# Deprecated parameters:
nr_cubes=None,
overlap_perc=None):
overlap_perc=None,
limits=None):
"""
limits: Numpy Array (n_dim,2)
(lower bound, upper bound) for every dimension
If a value is set to np.float('inf'), that bound is assumed to be the min/max value of the dimension
If limits == None, the limits are defined by the minimum and maximum values of the lens
in every dimension, i.e.
[[min_1, max_1],
[min_2, max_2],
[min_3, max_3]]
"""

self.n_cubes = nr_cubes if nr_cubes else n_cubes
self.perc_overlap = overlap_perc if overlap_perc else perc_overlap
@@ -25,6 +37,15 @@ def __init__(self,
warnings.warn(
"Arguements `overlap_perc` and `nr_cubes` have been replaced with `perc_overlap` and `n_cubes`. Use `perc_overlap` and `n_cubes` instead. They will be removed in future releases.", DeprecationWarning)

self.limits = limits

# Check limits can actually be handled and are set appropriately
NoneType = type(None)
assert isinstance(self.limits, (list, np.ndarray, NoneType)), 'limits should either be an array or None'
if isinstance(self.limits, (list, np.ndarray)):
self.limits = np.array(self.limits)
assert self.limits.shape[1] == 2, 'limits should be (n_dim,2) in shape'

def define_bins(self, data):
"""Returns an iterable of all bins in the cover.
@@ -36,8 +57,36 @@ def define_bins(self, data):
"""

indexless_data = data[:, 1:]
bounds = (np.min(indexless_data, axis=0),
np.max(indexless_data, axis=0))

# Find upper and lower bounds of bins using self.limits
# If array, use the values in the array
# If None, use the maximum and minimum values in the lens

# If self.limits is array-like
if isinstance(self.limits, np.ndarray):
# limits_array is used so we can swap np.float('inf') entries of self.limits for the data min/max
limits_array = np.zeros(self.limits.shape)
limits_array[:, 0] = np.min(indexless_data, axis=0)
limits_array[:, 1] = np.max(indexless_data, axis=0)
limits_array[self.limits != np.float('inf')] = 0
self.limits[self.limits == np.float('inf')] = 0
bounds_arr = self.limits + limits_array
""" bounds_arr[i,j] = self.limits[i,j] if self.limits[i,j] == inf
bounds_arr[i,j] = max/min(indexless_data[i]) if self.limits == inf """
bounds = (bounds_arr[:, 0], bounds_arr[:, 1])

# Check new bounds are actually sensible - do they cover the range of values in the dataset?
if not ((np.min(indexless_data, axis=0) >= bounds_arr[:, 0]).all() and
(np.max(indexless_data, axis=0) <= bounds_arr[:, 1]).all()):
warnings.warn('The limits given do not cover the entire range of the lens functions\n' + \
'Actual Minima: %s\tInput Minima: %s\n' % (
np.min(indexless_data, axis=0), bounds_arr[:, 0]) + \
'Actual Maxima: %s\tInput Maxima: %s\n' % (
np.max(indexless_data, axis=0), bounds_arr[:, 1]))

else: # It must be None, as we checked to see if it is array-like or None in __init__
bounds = (np.min(indexless_data, axis=0),
np.max(indexless_data, axis=0))

# We chop up the min-max column ranges into 'n_cubes' parts
self.chunk_dist = (bounds[1] - bounds[0]) / self.n_cubes
@@ -47,6 +96,8 @@ def define_bins(self, data):

# We find our starting point
self.d = bounds[0]
# And our ending point (for testing)
self.end = bounds[1]

# Use a dimension index array on the projected X
# (For now this uses the entire dimensionality, but we keep it for experimentation)
@@ -62,7 +113,6 @@ def define_bins(self, data):

coordinates = map(np.asarray, product(
*(range(i) for i in cubes)))

return coordinates

def find_entries(self, data, cube, verbose=0):
@@ -94,3 +144,10 @@ def find_entries(self, data, cube, verbose=0):
hypercube = data[np.invert(np.any(entries == False, axis=1))]

return hypercube


class CubicalCover(Cover):
"""
Explicit definition of a cubical cover as the default behavior of the cover class
"""
pass
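The zero-and-add trick above is compact but opaque; the following standalone sketch (not code from this commit) computes the same bounds with np.isinf:

import numpy as np

limits = np.array([[np.inf, np.inf], [-10.0, 100.0]])
data_min = np.array([0.0, 1.0])    # per-dimension lens minima (toy values)
data_max = np.array([38.0, 39.0])  # per-dimension lens maxima (toy values)

bounds_arr = limits.copy()
inf_lo = np.isinf(limits[:, 0])
inf_hi = np.isinf(limits[:, 1])
bounds_arr[inf_lo, 0] = data_min[inf_lo]  # inf lower bounds -> data minima
bounds_arr[inf_hi, 1] = data_max[inf_hi]  # inf upper bounds -> data maxima
# bounds_arr is now [[0., 38.], [-10., 100.]]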
59 changes: 30 additions & 29 deletions kmapper/kmapper.py
@@ -35,11 +35,20 @@ class KeplerMapper(object):
"""

def __init__(self, verbose=0):
"""
Inputs
======
verbose: int, default is 0
Logging level. Currently 3 levels (0,1,2) are supported.
- for no logging, set `verbose=0`,
- for some logging, set `verbose=1`,
- for complete logging, set `verbose=2`
"""


# TODO: move as many of the arguments from fit_transform and map into here.
self.verbose = verbose
self.chunk_dist = []
self.overlap_dist = []
self.d = []
self.projection = None
self.scaler = None
self.cover = None
@@ -59,7 +68,10 @@ def project(self, X, projection="sum", scaler=preprocessing.MinMaxScaler(), dist
Projection parameter is either a string, a Scikit-learn class with fit_transform, like manifold.TSNE(), or a list of dimension indices. A string from ["sum", "mean", "median", "max", "min", "std", "dist_mean", "l2norm", "knn_distance_n"]. If using knn_distance_n write the number of desired neighbors in place of n: knn_distance_5 for summed distances to 5 nearest neighbors. Default = "sum".
scaler :
Scikit-Learn API compatible scaler. Scaler of the data applied before mapping. Use None for no scaling. Default = preprocessing.MinMaxScaler() if None, do no scaling, else apply scaling to the projection. Default: Min-Max scaling distance_matrix: False or any of: ["braycurtis", "canberra", "chebyshev", "cityblock", "correlation", "cosine", "dice", "euclidean", "hamming", "jaccard", "kulsinski", "mahalanobis", "matching", "minkowski", "rogerstanimoto", "russellrao", "seuclidean", "sokalmichener", "sokalsneath", "sqeuclidean", "yule"]. If False do nothing, else create a squared distance matrix with the chosen metric, before applying the projection.
Scikit-Learn API compatible scaler applied to the data before mapping. Use None for no scaling. Default: preprocessing.MinMaxScaler() (Min-Max scaling).
distance_matrix:
False or any of: ["braycurtis", "canberra", "chebyshev", "cityblock", "correlation", "cosine", "dice", "euclidean", "hamming", "jaccard", "kulsinski", "mahalanobis", "matching", "minkowski", "rogerstanimoto", "russellrao", "seuclidean", "sokalmichener", "sokalsneath", "sqeuclidean", "yule"]. If False do nothing, else create a square distance matrix with the chosen metric, before applying the projection.
Returns
-------
@@ -139,26 +151,6 @@ def dist_mean(X, axis=1):
if projection in projection_funcs.keys():
X = projection_funcs[projection](X, axis=1).reshape((X.shape[0], 1))

# if projection == "sum": # sum of row
# X = np.sum(X, axis=1).reshape((X.shape[0], 1))
# if projection == "mean": # mean of row
# X = np.mean(X, axis=1).reshape((X.shape[0], 1))
# if projection == "median": # median of row
# X = np.median(X, axis=1).reshape((X.shape[0], 1))
# if projection == "max": # max of row
# X = np.max(X, axis=1).reshape((X.shape[0], 1))
# if projection == "min": # min of row
# X = np.min(X, axis=1).reshape((X.shape[0], 1))
# if projection == "std": # std of row
# X = np.std(X, axis=1).reshape((X.shape[0], 1))
# if projection == "l2norm":
# X = np.linalg.norm(X, axis=1).reshape((X.shape[0], 1))

# if projection == "dist_mean": # Distance of x to mean of X
# X_mean = np.mean(X, axis=0)
# X = np.sum(np.sqrt((X - X_mean)**2),
# axis=1).reshape((X.shape[0], 1))

if "knn_distance_" in projection:
n_neighbors = int(projection.split("_")[2])
if self.distance_matrix: # We use the distance matrix for finding neighbors
@@ -249,6 +241,7 @@ def map(self,
clusterer=cluster.DBSCAN(eps=0.5, min_samples=3),
cover=Cover(n_cubes=10, perc_overlap=0.1),
nerve=GraphNerve(),
precomputed=False,

# These arguments are all deprecated
overlap_perc=None,
@@ -273,15 +266,19 @@ def map(self,
nerve: kmapper.Nerve
Nerve builder implementing `__call__(nodes)` API
precomputed : Boolean
Tell Mapper whether the data that you are clustering on is a precomputed distance matrix. If set to
`True`, the assumption is that you are also telling your `clusterer` that `metric='precomputed'` (which
is an argument for DBSCAN among others); the clusterer will then be fit on the square distance
matrix restricted to the members of each hypercube.
nr_cubes: Int (Deprecated)
The number of intervals/hypercubes to create. Default = 10. (DeprecationWarning: define Cover explicitly in future versions)
overlap_perc: Float (Deprecated)
The percentage of overlap "between" the intervals/hypercubes. Default = 0.1. (DeprecationWarning: define Cover explicitly in future versions)
Returns
=======
simplicial_complex : dict
@@ -370,12 +367,16 @@ def map(self,

# If at least min_cluster_samples samples inside the hypercube
if hypercube.shape[0] >= min_cluster_samples:

# Cluster the data point(s) in the cube, skipping the id-column
# Note that we apply clustering on the inverse image (original data samples) that fall inside the cube.
ids = [int(nn) for nn in hypercube[:, 0]]
X_cube = X[ids]

X_cube = X[[int(nn) for nn in hypercube[:, 0]]]

clusterer.fit(X_cube[:, 1:])
fit_data = X_cube[:, 1:]
if precomputed:
fit_data = fit_data[:, ids]
clusterer.fit(fit_data)

if self.verbose > 1:
print("Found %s clusters in cube_%s\n" % (
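The new `precomputed` flag is meant to pair with a clusterer configured with metric='precomputed'. A hedged sketch of the intended call pattern (toy data; the exact top-level API is assumed here, not shown in this diff):

import numpy as np
from sklearn import cluster
from sklearn.metrics import pairwise_distances
import kmapper as km

X = np.random.rand(100, 3)
D = pairwise_distances(X, metric="euclidean")  # square distance matrix

mapper = km.KeplerMapper()
lens = mapper.project(D, projection="sum")
graph = mapper.map(lens, D,
                   clusterer=cluster.DBSCAN(eps=0.5, min_samples=3,
                                            metric="precomputed"),
                   precomputed=True)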
137 changes: 79 additions & 58 deletions test/test_coverer.py
@@ -7,6 +7,73 @@
from kmapper.cover import Cover




@pytest.mark.parametrize('CoverClass', [Cover])
class TestCoverBasic():
def test_cube_dim(self, CoverClass):

data = np.arange(30).reshape(10, 3)
c = CoverClass(n_cubes=10)
cubes = c.define_bins(data)

assert all(len(cube) == 2 for cube in cubes)

def test_cube_count(self, CoverClass):
data = np.arange(30).reshape(10, 3)
c = CoverClass(n_cubes=10)
cubes = c.define_bins(data)

assert len(list(cubes)) == 10**2, "idx column is ignored"

def test_single_dim(self, CoverClass):
data = np.arange(20).reshape(10, 2)
c = CoverClass(n_cubes=10)
cubes = c.define_bins(data)

assert all(len(cube) == 1 for cube in cubes)

def test_nr_dimensions(self, CoverClass):
data = np.arange(30).reshape(10, 3)

c = CoverClass(n_cubes=10)
_ = c.define_bins(data)
assert c.nr_dimensions == 2

def test_entries_even(self, CoverClass):
data = np.arange(40).reshape(20, 2)

cover = CoverClass(n_cubes=10)
cubes = cover.define_bins(data)

for cube in cubes:
entries = cover.find_entries(data, cube)
assert len(entries) >= 2

def test_cubes_overlap(self, CoverClass):
data = np.arange(40).reshape(20, 2)

cover = CoverClass(n_cubes=10)
cubes = cover.define_bins(data)

entries = []
for cube in cubes:
# turn singleton lists into individual elements
res = [i[0] for i in cover.find_entries(data, cube)]
entries.append(res)

for i, j in zip(range(9), range(1, 10)):
assert set(entries[i]).union(set(entries[j]))
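# NOTE: union() is truthy whenever either cube is non-empty; an
# intersection() check would assert that neighboring cubes actually share entries.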

def test_complete_pipeline(self, CoverClass):
# TODO: add a mock that asserts the cover was called appropriately.. or test number of cubes etc.
data, _ = datasets.make_circles()

data = data.astype(np.float64)
mapper = KeplerMapper()
graph = mapper.map(data, coverer=CoverClass())
mapper.visualize(graph)

class TestCover():
def test_diff_overlap_per_dim(self):
data = np.random.rand(100, 10)
@@ -24,27 +91,6 @@ def test_find_entries_runs_with_diff_bins(self):
cubes = list(c.define_bins(data))
_ = c.find_entries(data, cubes[0])

def test_cube_count(self):
data = np.arange(30).reshape(10, 3)
c = Cover(n_cubes=10)
cubes = c.define_bins(data)

assert len(list(cubes)) == 10**2, "idx column is ignored"

def test_cube_dim(self):

data = np.arange(30).reshape(10, 3)
c = Cover(n_cubes=10)
cubes = c.define_bins(data)

assert all(len(cube) == 2 for cube in cubes)

def test_single_dim(self):
data = np.arange(20).reshape(10, 2)
c = Cover(n_cubes=10)
cubes = c.define_bins(data)

assert all(len(cube) == 1 for cube in cubes)

def test_chunk_dist(self):
data = np.arange(20).reshape(10, 2)
@@ -55,31 +101,13 @@ def test_chunk_dist(self):
# TODO: this test is really fragile and has magic numbers, fix.
assert all(i == 1.8 for i in chunks)

def test_nr_dimensions(self):
data = np.arange(30).reshape(10, 3)

c = Cover(n_cubes=10)
_ = c.define_bins(data)
assert c.nr_dimensions == 2

def test_bound_is_min(self):
data = np.arange(30).reshape(10, 3)
cov = Cover(n_cubes=10)
_ = cov.define_bins(data)
bounds = list(zip(cov.d, range(1, 10)))
assert all(b[0] == b[1] for b in bounds)

def test_entries_even(self):
data = np.arange(40).reshape(20, 2)

cover = Cover(n_cubes=10)
cubes = cover.define_bins(data)

for cube in cubes:
entries = cover.find_entries(data, cube)

assert len(entries) >= 2

def test_entries_in_correct_cubes(self):
# TODO: this test is a little hacky

@@ -95,29 +123,22 @@ def test_entries_in_correct_cubes(self):
assert data[2 * i] in entries[i]
assert data[2 * i + 1] in entries[i]

def test_cubes_overlap(self):
data = np.arange(40).reshape(20, 2)

cover = Cover(n_cubes=10)
cubes = cover.define_bins(data)

entries = []
for cube in cubes:
# turn singleton lists into individual elements
res = [i[0] for i in cover.find_entries(data, cube)]
entries.append(res)
class TestCoverBounds:
def test_bounds(self):
data_vals = np.arange(40).reshape(20, 2)
data = np.zeros((20, 3))
data[:, 0] = np.arange(20, dtype=int)  # index column
data[:, 1:3] = data_vals

limits = np.array([[np.float('inf'), np.float('inf')], [-10, 100]])
cover = Cover(n_cubes=10, limits=limits)
cubes = cover.define_bins(data)

start = cover.d
end = cover.end
assert np.array_equal(np.array([start, end]), np.array([[0, -10], [38, 100]]))

def test_BasicCover():
# TODO: add a mock that asserts the cover was called appropriately.. or test number of cubes etc.
data, _ = datasets.make_circles()

data = data.astype(np.float64)
mapper = KeplerMapper()
graph = mapper.map(data)
mapper.visualize(graph)
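The out-of-range warning added to define_bins can be exercised directly; a minimal sketch (toy values, a hypothetical test that is not part of this commit):

import warnings
import numpy as np
from kmapper.cover import Cover

data = np.c_[np.arange(20), np.arange(20, dtype=float)]  # lens spans [0, 19]
cover = Cover(n_cubes=5, limits=np.array([[5.0, 15.0]]))  # deliberately too narrow

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    _ = list(cover.define_bins(data))
assert any("do not cover" in str(w.message) for w in caught)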

